MindsDB 25.1.2.1__py3-none-any.whl → 25.1.3.0__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.

This release of MindsDB has been flagged as potentially problematic.

Files changed (34)
  1. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.3.0.dist-info}/METADATA +251 -250
  2. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.3.0.dist-info}/RECORD +33 -27
  3. mindsdb/__about__.py +1 -1
  4. mindsdb/__main__.py +5 -3
  5. mindsdb/api/executor/sql_query/result_set.py +36 -21
  6. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +1 -1
  7. mindsdb/api/executor/sql_query/steps/join_step.py +4 -4
  8. mindsdb/api/executor/utilities/sql.py +2 -10
  9. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +7 -0
  10. mindsdb/integrations/handlers/file_handler/file_handler.py +1 -1
  11. mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py +82 -0
  12. mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +8 -1
  13. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +48 -16
  14. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +3 -3
  15. mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py +76 -27
  16. mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py +18 -1
  17. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +6 -2
  18. mindsdb/integrations/utilities/rag/settings.py +2 -0
  19. mindsdb/integrations/utilities/sql_utils.py +1 -1
  20. mindsdb/interfaces/knowledge_base/controller.py +33 -14
  21. mindsdb/interfaces/skills/retrieval_tool.py +10 -3
  22. mindsdb/utilities/cache.py +7 -4
  23. mindsdb/utilities/context.py +9 -0
  24. mindsdb/utilities/log.py +20 -2
  25. mindsdb/utilities/otel/__init__.py +206 -0
  26. mindsdb/utilities/otel/logger.py +25 -0
  27. mindsdb/utilities/otel/meter.py +19 -0
  28. mindsdb/utilities/otel/metric_handlers/__init__.py +25 -0
  29. mindsdb/utilities/otel/tracer.py +16 -0
  30. mindsdb/utilities/utils.py +34 -0
  31. mindsdb/utilities/otel.py +0 -72
  32. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.3.0.dist-info}/LICENSE +0 -0
  33. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.3.0.dist-info}/WHEEL +0 -0
  34. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.3.0.dist-info}/top_level.txt +0 -0

mindsdb/integrations/handlers/postgres_handler/postgres_handler.py CHANGED
@@ -161,7 +161,7 @@ class PostgresHandler(DatabaseHandler):
  'float8': 'float64'
  }
  columns = df.columns
- df = df.set_axis(range(len(columns)), axis=1)
+ df.columns = list(range(len(columns)))
  for column_index, column_name in enumerate(df.columns):
  col = df[column_name]
  if str(col.dtype) == 'object':
@@ -172,7 +172,7 @@ class PostgresHandler(DatabaseHandler):
  df[column_name] = col.astype(types_map[pg_type.name])
  except ValueError as e:
  logger.error(f'Error casting column {col.name} to {types_map[pg_type.name]}: {e}')
- return df.set_axis(columns, axis=1)
+ df.columns = columns

  @profiler.profile()
  def native_query(self, query: str, params=None) -> Response:
@@ -202,7 +202,7 @@ class PostgresHandler(DatabaseHandler):
  result,
  columns=[x.name for x in cur.description]
  )
- df = self._cast_dtypes(df, cur.description)
+ self._cast_dtypes(df, cur.description)
  response = Response(
  RESPONSE_TYPE.TABLE,
  df
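
The change above replaces the copy-returning set_axis calls with in-place column assignment, and native_query now relies on _cast_dtypes mutating the frame rather than returning it. A minimal standalone sketch of that pattern, with an assumed, abbreviated types_map and a plain list of Postgres type names standing in for the cursor description:

    import pandas as pd

    def cast_dtypes_inplace(df: pd.DataFrame, pg_types: list) -> None:
        types_map = {'int8': 'int64', 'float8': 'float64'}  # assumed subset of the handler's map
        original_columns = df.columns
        df.columns = list(range(len(original_columns)))  # positional names avoid duplicate-column issues
        for i, pg_type in enumerate(pg_types):
            if pg_type in types_map:
                df[i] = df[i].astype(types_map[pg_type])
        df.columns = original_columns  # restore the names; nothing is returned

    df = pd.DataFrame({'a': ['1', '2'], 'b': ['0.5', '1.5']})
    cast_dtypes_inplace(df, ['int8', 'float8'])
    print(df.dtypes)  # a: int64, b: float64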

mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py CHANGED
@@ -1,9 +1,9 @@
- from typing import Any, List, Optional, Dict
+ from typing import Any, List, Union, Optional, Dict

  from langchain_community.vectorstores import PGVector
  from langchain_community.vectorstores.pgvector import Base

- from pgvector.sqlalchemy import Vector
+ from pgvector.sqlalchemy import SPARSEVEC, Vector
  import sqlalchemy as sa
  from sqlalchemy.dialects.postgresql import JSON

@@ -15,9 +15,17 @@ _generated_sa_tables = {}

  class PGVectorMDB(PGVector):
  """
- langchain_community.vectorstores.PGVector adapted for mindsdb vector store table structure
+ langchain_community.vectorstores.PGVector adapted for mindsdb vector store table structure
  """

+ def __init__(self, *args, is_sparse: bool = False, vector_size: Optional[int] = None, **kwargs):
+ # todo get is_sparse and vector_size from kb vector table
+ self.is_sparse = is_sparse
+ if is_sparse and vector_size is None:
+ raise ValueError("vector_size is required when is_sparse=True")
+ self.vector_size = vector_size
+ super().__init__(*args, **kwargs)
+
  def __post_init__(
  self,
  ) -> None:
@@ -32,53 +40,94 @@ class PGVectorMDB(PGVector):
  __tablename__ = collection_name

  id = sa.Column(sa.Integer, primary_key=True)
- embedding: Vector = sa.Column('embeddings', Vector())
- document = sa.Column('content', sa.String, nullable=True)
- cmetadata = sa.Column('metadata', JSON, nullable=True)
+ embedding = sa.Column(
+ "embeddings",
+ SPARSEVEC() if self.is_sparse else Vector() if self.vector_size is None else
+ SPARSEVEC(self.vector_size) if self.is_sparse else Vector(self.vector_size)
+ )
+ document = sa.Column("content", sa.String, nullable=True)
+ cmetadata = sa.Column("metadata", JSON, nullable=True)

  _generated_sa_tables[collection_name] = EmbeddingStore

  self.EmbeddingStore = _generated_sa_tables[collection_name]

  def __query_collection(
- self,
- embedding: List[float],
- k: int = 4,
- filter: Optional[Dict[str, str]] = None,
+ self,
+ embedding: Union[List[float], Dict[int, float], str],
+ k: int = 4,
+ filter: Optional[Dict[str, str]] = None,
  ) -> List[Any]:
  """Query the collection."""
  with Session(self._bind) as session:
-
- results: List[Any] = (
- session.query(
- self.EmbeddingStore,
- self.distance_strategy(embedding).label("distance"),
- )
- .order_by(sa.asc("distance"))
- .limit(k)
- .all()
+ if self.is_sparse:
+ # Sparse vectors: expect string in format "{key:value,...}/size" or dictionary
+ if isinstance(embedding, dict):
+ from pgvector.utils import SparseVector
+ embedding = SparseVector(embedding, self.vector_size)
+ embedding_str = embedding.to_text()
+ elif isinstance(embedding, str):
+ # Use string as is - it should already be in the correct format
+ embedding_str = embedding
+ # Use inner product for sparse vectors
+ distance_op = "<#>"
+ # For inner product, larger values are better matches
+ order_direction = "DESC"
+ else:
+ # Dense vectors: expect string in JSON array format or list of floats
+ if isinstance(embedding, list):
+ embedding_str = f"[{','.join(str(x) for x in embedding)}]"
+ elif isinstance(embedding, str):
+ embedding_str = embedding
+ # Use cosine similarity for dense vectors
+ distance_op = "<=>"
+ # For cosine similarity, smaller values are better matches
+ order_direction = "ASC"
+
+ # Use SQL directly for vector comparison
+ query = sa.text(
+ f"""
+ SELECT t.*, t.embeddings {distance_op} '{embedding_str}' as distance
+ FROM {self.collection_name} t
+ ORDER BY distance {order_direction}
+ LIMIT {k}
+ """
  )
- for rec, _ in results:
- if not bool(rec.cmetadata):
- rec.cmetadata = {0: 0}
+ results = session.execute(query).all()
+
+ # Convert results to the expected format
+ formatted_results = []
+ for rec in results:
+ metadata = rec.metadata if bool(rec.metadata) else {0: 0}
+ embedding_store = self.EmbeddingStore()
+ embedding_store.document = rec.content
+ embedding_store.cmetadata = metadata
+ result = type(
+ 'Result', (), {
+ 'EmbeddingStore': embedding_store,
+ 'distance': rec.distance
+ }
+ )
+ formatted_results.append(result)

- return results
+ return formatted_results

  # aliases for different langchain versions
  def _PGVector__query_collection(self, *args, **kwargs):
+
  return self.__query_collection(*args, **kwargs)

  def _query_collection(self, *args, **kwargs):
  return self.__query_collection(*args, **kwargs)

  def create_collection(self):
- raise RuntimeError('Forbidden')
+ raise RuntimeError("Forbidden")

  def delete_collection(self):
- raise RuntimeError('Forbidden')
+ raise RuntimeError("Forbidden")

  def delete(self, *args, **kwargs):
- raise RuntimeError('Forbidden')
+ raise RuntimeError("Forbidden")

  def add_embeddings(self, *args, **kwargs):
- raise RuntimeError('Forbidden')
+ raise RuntimeError("Forbidden")
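
For reference, a minimal sketch of the query text the new __query_collection builds, following the formats its comments describe: a sparse embedding passed as a ready-made "{index:value,...}/size" literal is compared with the inner-product operator <#> and ordered DESC, while a dense embedding list becomes a bracketed array compared with the cosine operator <=> and ordered ASC. The table name is illustrative, and dict inputs are converted by pgvector's SparseVector helper in the real code:

    def build_vector_query(embedding, k=4, is_sparse=False, table='my_collection'):
        if is_sparse:
            # already a pgvector sparse literal, e.g. "{1:0.7,11:0.2}/30522"
            embedding_str = embedding
            distance_op, order = '<#>', 'DESC'  # inner product; larger treated as the better match
        else:
            embedding_str = '[' + ','.join(str(x) for x in embedding) + ']'
            distance_op, order = '<=>', 'ASC'   # cosine distance; smaller is the better match
        return (f"SELECT t.*, t.embeddings {distance_op} '{embedding_str}' AS distance "
                f"FROM {table} t ORDER BY distance {order} LIMIT {k}")

    print(build_vector_query('{1:0.7,11:0.2}/30522', is_sparse=True))
    print(build_vector_query([0.1, 0.2, 0.3]))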

mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py CHANGED
@@ -7,6 +7,7 @@ from pydantic import BaseModel

  from mindsdb.integrations.utilities.rag.settings import VectorStoreType, VectorStoreConfig
  from mindsdb.integrations.utilities.rag.loaders.vector_store_loader.MDBVectorStore import MDBVectorStore
+ from mindsdb.integrations.utilities.rag.loaders.vector_store_loader.pgvector import PGVectorMDB
  from mindsdb.utilities import log


@@ -28,6 +29,20 @@ class VectorStoreLoader(BaseModel):
  Loads the vector store based on the provided config and embeddings model
  :return:
  """
+ if self.config.is_sparse is not None and self.config.vector_size is not None and self.config.kb_table is not None:
+ # Only use PGVector store for sparse vectors.
+ db_handler = self.config.kb_table.get_vector_db()
+ db_args = db_handler.connection_args
+ # Assume we are always using PGVector & psycopg2.
+ connection_str = f"postgresql+psycopg2://{db_args.get('user')}:{db_args.get('password')}@{db_args.get('host')}:{db_args.get('port')}/{db_args.get('dbname', db_args.get('database'))}"
+
+ return PGVectorMDB(
+ connection_string=connection_str,
+ collection_name=self.config.kb_table._kb.vector_database_table,
+ embedding_function=self.embedding_model,
+ is_sparse=self.config.is_sparse,
+ vector_size=self.config.vector_size
+ )
  return MDBVectorStore(kb_table=self.config.kb_table)


@@ -56,5 +71,7 @@ class VectorStoreFactory:
  return PGVectorMDB(
  connection_string=settings.connection_string,
  collection_name=settings.collection_name,
- embedding_function=embedding_model
+ embedding_function=embedding_model,
+ is_sparse=settings.is_sparse,
+ vector_size=settings.vector_size
  )
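
The loader only takes the new PGVectorMDB path when is_sparse, vector_size, and kb_table are all set; otherwise it still returns MDBVectorStore. A small sketch of the SQLAlchemy URL it assembles from the handler's connection args, with the psycopg2 driver assumed as in the source comment and placeholder values:

    db_args = {'user': 'mdb', 'password': 'secret', 'host': 'localhost',
               'port': 5432, 'dbname': 'vectors'}

    connection_str = (
        f"postgresql+psycopg2://{db_args.get('user')}:{db_args.get('password')}"
        f"@{db_args.get('host')}:{db_args.get('port')}"
        f"/{db_args.get('dbname', db_args.get('database'))}"
    )
    print(connection_str)  # postgresql+psycopg2://mdb:secret@localhost:5432/vectors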

mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py CHANGED
@@ -169,7 +169,7 @@ Output:
  logger.info(f'SQL Retriever query {checked_sql_query} failed with error {error_msg}')
  if num_retries >= self.num_retries:
  logger.info('Using fallback retriever in SQL retriever.')
- return self.fallback_retriever._get_relevant_documents(retrieval_query, run_manager)
+ return self.fallback_retriever._get_relevant_documents(retrieval_query, run_manager=run_manager)
  query_to_retry = self._prepare_retry_query(checked_sql_query, error_msg, run_manager)
  query_to_retry_with_embeddings = query_to_retry.format(embeddings=str(embedded_query))
  # Handle LLM output that has the ```sql delimiter possibly.
@@ -185,4 +185,8 @@ Output:
  document_row.get('content', ''),
  metadata=document_row.get('metadata', {})
  ))
- return retrieved_documents
+ if retrieved_documents:
+ return retrieved_documents
+ # If the SQL query constructed did not return any documents, fallback.
+ logger.info('No documents returned from SQL retriever. using fallback retriever.')
+ return self.fallback_retriever._get_relevant_documents(retrieval_query, run_manager=run_manager)

mindsdb/integrations/utilities/rag/settings.py CHANGED
@@ -290,6 +290,8 @@ class VectorStoreConfig(BaseModel):
  collection_name: str = DEFAULT_COLLECTION_NAME
  connection_string: str = None
  kb_table: Any = None
+ is_sparse: bool = False
+ vector_size: Optional[int] = None

  class Config:
  arbitrary_types_allowed = True

mindsdb/integrations/utilities/sql_utils.py CHANGED
@@ -178,7 +178,7 @@ def project_dataframe(df, targets, table_columns):

  # adapt column names to projection
  if len(df_col_rename) > 0:
- df = df.rename(columns=df_col_rename)
+ df.rename(columns=df_col_rename, inplace=True)
  return df



mindsdb/interfaces/knowledge_base/controller.py CHANGED
@@ -642,11 +642,13 @@ class KnowledgeBaseController:
  storage: Identifier,
  params: dict,
  preprocessing_config: Optional[dict] = None,
- if_not_exists: bool = False,
+ if_not_exists: bool = False
  ) -> db.KnowledgeBase:
  """
  Add a new knowledge base to the database
  :param preprocessing_config: Optional preprocessing configuration to validate and store
+ :param is_sparse: Whether to use sparse vectors for embeddings
+ :param vector_size: Optional size specification for vectors, required when is_sparse=True
  """
  # Validate preprocessing config first if provided
  if preprocessing_config is not None:
@@ -654,6 +656,12 @@ class KnowledgeBaseController:
  params = params or {}
  params['preprocessing'] = preprocessing_config

+ # Check if vector_size is provided when using sparse vectors
+ is_sparse = params.get('is_sparse')
+ vector_size = params.get('vector_size')
+ if is_sparse and vector_size is None:
+ raise ValueError("vector_size is required when is_sparse=True")
+
  # get project id
  project = self.session.database_controller.get_project(project_name)
  project_id = project.id
@@ -693,7 +701,20 @@ class KnowledgeBaseController:
  cloud_pg_vector = os.environ.get('KB_PGVECTOR_URL')
  if cloud_pg_vector:
  vector_table_name = name
- vector_db_name = self._create_persistent_pgvector()
+ # Add sparse vector support for pgvector
+ vector_db_params = {}
+ # Check both explicit parameter and model configuration
+ is_sparse = is_sparse or model_record.learn_args.get('using', {}).get('sparse')
+ if is_sparse:
+ vector_db_params['is_sparse'] = True
+ if vector_size is not None:
+ vector_db_params['vector_size'] = vector_size
+ vector_db_name = self._create_persistent_pgvector(vector_db_params)
+
+ # create table in vectordb before creating KB
+ self.session.datahub.get(vector_db_name).integration_handler.create_table(
+ vector_table_name
+ )
  else:
  # create chroma db with same name
  vector_table_name = "default_collection"
@@ -707,15 +728,14 @@

  vector_database_id = self.session.integration_controller.get(vector_db_name)['id']

- # create table in vectordb
- if model_record.learn_args.get('using', {}).get('sparse') is not None:
- self.session.datahub.get(vector_db_name).integration_handler.create_table(
- vector_table_name, sparse=model_record.learn_args.get('using', {}).get('sparse')
- )
- else:
- self.session.datahub.get(vector_db_name).integration_handler.create_table(
- vector_table_name
- )
+ # Store sparse vector settings in params if specified
+ if is_sparse:
+ params = params or {}
+ params['vector_config'] = {
+ 'is_sparse': is_sparse
+ }
+ if vector_size is not None:
+ params['vector_config']['vector_size'] = vector_size

  kb = db.KnowledgeBase(
  name=name,
@@ -729,16 +749,15 @@
  db.session.commit()
  return kb

- def _create_persistent_pgvector(self):
+ def _create_persistent_pgvector(self, params=None):
  """Create default vector database for knowledge base, if not specified"""
-
  vector_store_name = "kb_pgvector_store"

  # check if exists
  if self.session.integration_controller.get(vector_store_name):
  return vector_store_name

- self.session.integration_controller.add(vector_store_name, 'pgvector', {})
+ self.session.integration_controller.add(vector_store_name, 'pgvector', params or {})
  return vector_store_name

  def _create_persistent_chroma(self, kb_name, engine="chromadb"):
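
A minimal sketch of the new parameter flow, assuming the knowledge-base params arrive as a plain dict (for example from a CREATE KNOWLEDGE BASE statement); the helper name below is hypothetical, but the checks mirror the controller code:

    def prepare_vector_params(params: dict) -> dict:
        is_sparse = params.get('is_sparse')
        vector_size = params.get('vector_size')
        if is_sparse and vector_size is None:
            raise ValueError("vector_size is required when is_sparse=True")
        vector_db_params = {}
        if is_sparse:
            vector_db_params['is_sparse'] = True
            if vector_size is not None:
                vector_db_params['vector_size'] = vector_size
            # the same settings are persisted on the KB record under 'vector_config'
            params['vector_config'] = dict(vector_db_params)
        return vector_db_params

    print(prepare_vector_params({'is_sparse': True, 'vector_size': 30522}))
    # {'is_sparse': True, 'vector_size': 30522}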

mindsdb/interfaces/skills/retrieval_tool.py CHANGED
@@ -43,10 +43,17 @@ def build_retrieval_tool(tool: dict, pred_args: dict, skill: db.Skills):
  raise ValueError(f"Knowledge base not found: {kb_name}")

  kb_table = executor.session.kb_controller.get_table(kb.name, kb.project_id)
+ vector_store_config = {
+ 'kb_table': kb_table
+ }
+ is_sparse = tools_config.pop('is_sparse', None)
+ vector_size = tools_config.pop('vector_size', None)
+ if is_sparse is not None:
+ vector_store_config['is_sparse'] = is_sparse
+ if vector_size is not None:
+ vector_store_config['vector_size'] = vector_size
  kb_params = {
- 'vector_store_config': {
- 'kb_table': kb_table
- }
+ 'vector_store_config': vector_store_config
  }

  # Get embedding model from knowledge base table
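
A small sketch of the same threading with plain dicts (keys and values illustrative); note that pop removes is_sparse and vector_size from tools_config, so they are only passed on inside vector_store_config:

    tools_config = {'is_sparse': True, 'vector_size': 30522}
    vector_store_config = {'kb_table': '<kb_table object>'}

    is_sparse = tools_config.pop('is_sparse', None)
    vector_size = tools_config.pop('vector_size', None)
    if is_sparse is not None:
        vector_store_config['is_sparse'] = is_sparse
    if vector_size is not None:
        vector_store_config['vector_size'] = vector_size

    kb_params = {'vector_store_config': vector_store_config}
    print(tools_config)  # {} -- the sparse settings were consumed
    print(kb_params['vector_store_config']['is_sparse'])  # True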

mindsdb/utilities/cache.py CHANGED
@@ -71,10 +71,13 @@ _CACHE_MAX_SIZE = 500


  def dataframe_checksum(df: pd.DataFrame):
-
- return str_checksum(str(
- df.set_axis(range(len(df.columns)), axis=1).to_records(index=False)
- ))
+ original_columns = df.columns
+ df.columns = list(range(len(df.columns)))
+ result = hashlib.sha256(
+ str(df.values).encode()
+ ).hexdigest()
+ df.columns = original_columns
+ return result


  def json_checksum(obj: t.Union[dict, list]):
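
A standalone sketch of the new checksum behaviour (it assumes hashlib is available in cache.py): the digest is taken over str(df.values), so column names do not affect it, and the temporary renumbering of the columns is undone before returning:

    import hashlib
    import pandas as pd

    def dataframe_checksum(df: pd.DataFrame) -> str:
        original_columns = df.columns
        df.columns = list(range(len(df.columns)))
        result = hashlib.sha256(str(df.values).encode()).hexdigest()
        df.columns = original_columns  # the caller's frame keeps its column names
        return result

    a = pd.DataFrame({'x': [1, 2], 'y': [3, 4]})
    b = pd.DataFrame({'p': [1, 2], 'q': [3, 4]})
    print(dataframe_checksum(a) == dataframe_checksum(b))  # True: the values are equal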

mindsdb/utilities/context.py CHANGED
@@ -52,6 +52,15 @@ class Context:
  def load(self, storage: dict) -> None:
  self._storage.set(storage)

+ def metadata(self, **kwargs) -> dict:
+ return {
+ 'user_id': self.user_id or "",
+ 'company_id': self.company_id or "",
+ 'session_id': self.session_id,
+ 'user_class': self.user_class,
+ **kwargs
+ }
+

  _context_var = ContextVar('mindsdb.context')
  context = Context(_context_var)
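
A brief usage sketch of the new metadata() helper (the extra keyword argument is illustrative): it returns the request-scoped identifiers with any keyword arguments merged in last, so callers can attach per-call fields such as telemetry attributes:

    from mindsdb.utilities.context import context

    # user_id, company_id, session_id and user_class are populated by the surrounding
    # request handling; keyword arguments are merged on top of them.
    meta = context.metadata(query_id='abc-123')
    # e.g. {'user_id': '', 'company_id': '', 'session_id': ..., 'user_class': ..., 'query_id': 'abc-123'}
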
mindsdb/utilities/log.py CHANGED
@@ -29,6 +29,23 @@ class ColorFormatter(logging.Formatter):
  return log_fmt.format(record)


+ def get_console_handler_config_level() -> int:
+ console_handler_config = app_config['logging']['handlers']['console']
+ return getattr(logging, console_handler_config["level"])
+
+
+ def get_file_handler_config_level() -> int:
+ file_handler_config = app_config['logging']['handlers']['file']
+ return getattr(logging, file_handler_config["level"])
+
+
+ def get_mindsdb_log_level() -> int:
+ console_handler_config_level = get_console_handler_config_level()
+ file_handler_config_level = get_file_handler_config_level()
+
+ return min(console_handler_config_level, file_handler_config_level)
+
+
  def configure_logging():
  handlers_config = {}
  console_handler_config = app_config['logging']['handlers']['console']
@@ -39,6 +56,7 @@ def configure_logging():
  "formatter": "f",
  "level": console_handler_config_level
  }
+
  file_handler_config = app_config['logging']['handlers']['file']
  file_handler_config_level = getattr(logging, file_handler_config["level"])
  if file_handler_config['enabled'] is True:
@@ -51,7 +69,7 @@ def configure_logging():
  "backupCount": file_handler_config["backupCount"]
  }

- mindsdb_log_level = min(console_handler_config_level, file_handler_config_level)
+ mindsdb_log_level = get_mindsdb_log_level()

  logging_config = dict(
  version=1,
@@ -65,7 +83,7 @@ def configure_logging():
  loggers={
  "": { # root logger
  "handlers": list(handlers_config.keys()),
- "level": logging.WARNING,
+ "level": mindsdb_log_level,
  },
  "__main__": {
  "level": mindsdb_log_level,

mindsdb/utilities/otel/__init__.py ADDED
@@ -0,0 +1,206 @@
+ import os
+ import typing
+
+ from opentelemetry import trace # noqa: F401
+ from opentelemetry import metrics # noqa: F401
+ from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter as OTLPLogExporterGRPC
+ from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter as OTLPLogExporterHTTP
+ from opentelemetry.sdk._logs._internal.export import LogExporter
+ from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as OTLPMetricExporterGRPC
+ from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as OTLPMetricExporterHTTP
+ from opentelemetry.sdk.metrics.export import MetricExporter, ConsoleMetricExporter
+ from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as OTLPSpanExporterGRPC
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as OTLPSpanExporterHTTP
+ from opentelemetry.sdk.trace.export import SpanExporter, ConsoleSpanExporter
+ from opentelemetry.sdk.resources import Resource
+ from opentelemetry.sdk.trace.sampling import TraceIdRatioBased
+
+ from mindsdb.utilities.otel.logger import setup_logger
+ from mindsdb.utilities.otel.meter import setup_meter
+ from mindsdb.utilities.otel.tracer import setup_tracer
+ from mindsdb.utilities.utils import parse_csv_attributes
+ from mindsdb.utilities import log
+
+ logger = log.getLogger(__name__)
+
+ # Check OpenTelemetry exporter type
+ OTEL_EXPORTER_TYPE = os.getenv("OTEL_EXPORTER_TYPE", "console") # console or otlp
+
+ # Define OpenTelemetry exporter protocol
+ OTEL_EXPORTER_PROTOCOL = os.getenv("OTEL_EXPORTER_PROTOCOL", "grpc") # grpc or http
+
+ # Define OTLP endpoint. If not set, the default OTLP endpoint will be used
+ OTEL_OTLP_ENDPOINT = os.getenv("OTEL_OTLP_ENDPOINT", "http://localhost:4317")
+
+ # Define OTLP logging endpoint. If not set, the default OTLP logging endpoint will be used
+ OTEL_OTLP_LOGGING_ENDPOINT = os.getenv("OTEL_OTLP_LOGGING_ENDPOINT", OTEL_OTLP_ENDPOINT)
+
+ # Define OTLP tracing endpoint. If not set, the default OTLP tracing endpoint will be used
+ OTEL_OTLP_TRACING_ENDPOINT = os.getenv("OTEL_OTLP_TRACING_ENDPOINT", OTEL_OTLP_ENDPOINT)
+
+ # Define OTLP metrics endpoint. If not set, the default OTLP metrics endpoint will be used
+ OTEL_OTLP_METRICS_ENDPOINT = os.getenv("OTEL_OTLP_METRICS_ENDPOINT", OTEL_OTLP_ENDPOINT)
+
+ # Define service name
+ OTEL_SERVICE_NAME = os.getenv("OTEL_SERVICE_NAME", "mindsdb")
+
+ # Define service instance ID
+ OTEL_SERVICE_INSTANCE_ID = os.getenv("OTEL_SERVICE_INSTANCE_ID", "mindsdb-instance")
+
+ # The name of the environment we're on, by default local for development; this is set differently per-env in our Helm
+ # chart values files
+ OTEL_SERVICE_ENVIRONMENT = os.getenv("OTEL_SERVICE_ENVIRONMENT", "local").lower()
+
+ # Define service release
+ OTEL_SERVICE_RELEASE = os.getenv("OTEL_SERVICE_RELEASE", "local").lower()
+
+ # Define how often to capture traces
+ OTEL_TRACE_SAMPLE_RATE = float(os.getenv("OTEL_TRACE_SAMPLE_RATE", "1.0"))
+
+ # Define extra attributes
+ OTEL_EXTRA_ATTRIBUTES = os.getenv("OTEL_EXTRA_ATTRIBUTES", "")
+
+ # By default, we have the OpenTelemetry SDK enabled on all envs, except for local, which is disabled by default.
+ OTEL_SDK_DISABLED = (os.getenv("OTEL_SDK_DISABLED", "false").lower() == "true"
+ or os.getenv("OTEL_SERVICE_ENVIRONMENT", "local").lower() == "local")
+
+ # Define if OpenTelemetry logging is disabled. By default, it is disabled.
+ OTEL_LOGGING_DISABLED = os.getenv("OTEL_LOGGING_DISABLED", "true").lower() == "true"
+
+ # Define if OpenTelemetry tracing is disabled. By default, it is enabled.
+ OTEL_TRACING_DISABLED = os.getenv("OTEL_TRACING_DISABLED", "false").lower() == "true"
+
+ # Define if OpenTelemetry metrics are disabled. By default, they are disabled.
+ OTEL_METRICS_DISABLED = os.getenv("OTEL_METRICS_DISABLED", "true").lower() == "true"
+
+ # If you want to enable OpenTelemetry on local for some reason, please set OTEL_SDK_FORCE_RUN to true
+ OTEL_SDK_FORCE_RUN = os.getenv("OTEL_SDK_FORCE_RUN", "false").lower() == "true"
+
+
+ def get_otel_attributes() -> dict:
+ """
+ Get OpenTelemetry attributes
+
+ Returns:
+ dict: OpenTelemetry attributes
+ """
+
+ base_attributes = {
+ "service.name": OTEL_SERVICE_NAME,
+ "service.instance.id": OTEL_SERVICE_INSTANCE_ID,
+ "environment": OTEL_SERVICE_ENVIRONMENT,
+ "release": OTEL_SERVICE_RELEASE,
+ }
+
+ extra_attributes = {}
+ try:
+ extra_attributes = parse_csv_attributes(OTEL_EXTRA_ATTRIBUTES)
+ except Exception as e:
+ logger.error(f"Failed to parse OTEL_EXTRA_ATTRIBUTES: {e}")
+
+ attributes = {**extra_attributes, **base_attributes} # Base attributes take precedence over extra attributes
+
+ return attributes
+
+
+ def get_logging_exporter() -> typing.Optional[LogExporter]:
+ """
+ Get OpenTelemetry logging exporter.
+
+ Returns:
+ OTLPLogExporter: OpenTelemetry logging exporter
+ """
+
+ if OTEL_EXPORTER_TYPE == "otlp":
+
+ if OTEL_EXPORTER_PROTOCOL == "grpc":
+ return OTLPLogExporterGRPC(
+ endpoint=OTEL_OTLP_LOGGING_ENDPOINT,
+ insecure=True
+ )
+
+ elif OTEL_EXPORTER_PROTOCOL == "http":
+ return OTLPLogExporterHTTP(
+ endpoint=OTEL_OTLP_LOGGING_ENDPOINT
+ )
+
+ return None
+
+
+ def get_span_exporter() -> SpanExporter:
+ """
+ Get OpenTelemetry span exporter
+
+ Returns:
+ OTLPSpanExporter: OpenTelemetry span exporter
+ """
+
+ if OTEL_EXPORTER_TYPE == "otlp":
+
+ if OTEL_EXPORTER_PROTOCOL == "grpc":
+ return OTLPSpanExporterGRPC(
+ endpoint=OTEL_OTLP_TRACING_ENDPOINT,
+ insecure=True
+ )
+
+ elif OTEL_EXPORTER_PROTOCOL == "http":
+ return OTLPSpanExporterHTTP(
+ endpoint=OTEL_OTLP_TRACING_ENDPOINT
+ )
+
+ return ConsoleSpanExporter()
+
+
+ def get_metrics_exporter() -> typing.Optional[MetricExporter]:
+ """
+ Get OpenTelemetry metrics exporter
+
+ Returns:
+ OTLPMetricExporter: OpenTelemetry metrics exporter
+ """
+
+ if OTEL_EXPORTER_TYPE == "otlp":
+
+ if OTEL_EXPORTER_PROTOCOL == "grpc":
+ return OTLPMetricExporterGRPC(
+ endpoint=OTEL_OTLP_METRICS_ENDPOINT,
+ insecure=True
+ )
+
+ elif OTEL_EXPORTER_PROTOCOL == "http":
+ return OTLPMetricExporterHTTP(
+ endpoint=OTEL_OTLP_METRICS_ENDPOINT
+ )
+
+ return ConsoleMetricExporter()
+
+
+ if not OTEL_SDK_DISABLED or OTEL_SDK_FORCE_RUN:
+ logger.info("OpenTelemetry enabled")
+ logger.info(f"OpenTelemetry exporter type: {OTEL_EXPORTER_TYPE}")
+ logger.info(f"OpenTelemetry service name: {OTEL_SERVICE_NAME}")
+ logger.info(f"OpenTelemetry service environment: {OTEL_SERVICE_ENVIRONMENT}")
+ logger.info(f"OpenTelemetry service release: {OTEL_SERVICE_RELEASE}")
+ logger.info(f"OpenTelemetry trace sample rate: {OTEL_TRACE_SAMPLE_RATE}")
+ logger.info(f"OpenTelemetry extra attributes: {OTEL_EXTRA_ATTRIBUTES}")
+
+ # Define OpenTelemetry resources (e.g., service name)
+ attributes = get_otel_attributes()
+
+ # Define OpenTelemetry sampler
+ sampler = TraceIdRatioBased(OTEL_TRACE_SAMPLE_RATE)
+
+ # Define OpenTelemetry resources (e.g., service name)
+ resource = Resource(attributes=attributes)
+
+ if not OTEL_LOGGING_DISABLED:
+ logger.info("OpenTelemetry Logging is enabled")
+ setup_logger(resource, get_logging_exporter())
+
+ if not OTEL_TRACING_DISABLED:
+ logger.info("OpenTelemetry Tracing is enabled")
+ setup_tracer(resource, sampler, get_span_exporter())
+
+ if not OTEL_METRICS_DISABLED:
+ logger.info("OpenTelemetry Metrics is enabled")
+ setup_meter(resource, get_metrics_exporter())
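
A minimal sketch of driving the new module through its environment variables (the values are examples only). The module reads them at import time, so they must be set before mindsdb.utilities.otel is imported; with an environment other than "local" the SDK is enabled, tracing is on by default, and logging and metrics stay off unless their flags are flipped:

    import os

    os.environ["OTEL_SERVICE_ENVIRONMENT"] = "staging"        # anything but "local" enables the SDK
    os.environ["OTEL_EXPORTER_TYPE"] = "otlp"                  # "console" (default) or "otlp"
    os.environ["OTEL_EXPORTER_PROTOCOL"] = "grpc"              # "grpc" (default) or "http"
    os.environ["OTEL_OTLP_ENDPOINT"] = "http://collector:4317"
    os.environ["OTEL_METRICS_DISABLED"] = "false"              # metrics are off by default

    import mindsdb.utilities.otel  # noqa: E402,F401 -- the import runs setup_tracer/setup_meter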