MindsDB 25.1.2.0__py3-none-any.whl → 25.1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.3.0.dist-info}/METADATA +255 -242
- {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.3.0.dist-info}/RECORD +38 -30
- {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.3.0.dist-info}/WHEEL +1 -1
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +5 -3
- mindsdb/api/executor/sql_query/result_set.py +36 -21
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +1 -1
- mindsdb/api/executor/sql_query/steps/join_step.py +4 -4
- mindsdb/api/executor/utilities/sql.py +2 -10
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +7 -0
- mindsdb/integrations/handlers/file_handler/file_handler.py +1 -1
- mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py +82 -0
- mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +8 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +47 -12
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +3 -3
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py +76 -27
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py +18 -1
- mindsdb/integrations/utilities/rag/pipelines/rag.py +11 -0
- mindsdb/integrations/utilities/rag/rag_pipeline_builder.py +16 -1
- mindsdb/integrations/utilities/rag/retrievers/__init__.py +3 -0
- mindsdb/integrations/utilities/rag/retrievers/multi_hop_retriever.py +85 -0
- mindsdb/integrations/utilities/rag/retrievers/retriever_factory.py +57 -0
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +46 -3
- mindsdb/integrations/utilities/rag/settings.py +160 -6
- mindsdb/integrations/utilities/sql_utils.py +1 -1
- mindsdb/interfaces/knowledge_base/controller.py +33 -9
- mindsdb/interfaces/skills/retrieval_tool.py +10 -3
- mindsdb/utilities/cache.py +7 -4
- mindsdb/utilities/context.py +9 -0
- mindsdb/utilities/log.py +20 -2
- mindsdb/utilities/otel/__init__.py +206 -0
- mindsdb/utilities/otel/logger.py +25 -0
- mindsdb/utilities/otel/meter.py +19 -0
- mindsdb/utilities/otel/metric_handlers/__init__.py +25 -0
- mindsdb/utilities/otel/tracer.py +16 -0
- mindsdb/utilities/utils.py +34 -0
- mindsdb/utilities/otel.py +0 -72
- {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.3.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.3.0.dist-info}/top_level.txt +0 -0
|
@@ -150,11 +150,118 @@ Here is the user input:
|
|
|
150
150
|
{input}
|
|
151
151
|
'''
|
|
152
152
|
|
|
153
|
+
DEFAULT_QUESTION_REFORMULATION_TEMPLATE = """Given the original question and the retrieved context,
|
|
154
|
+
analyze what additional information is needed for a complete, accurate answer.
|
|
155
|
+
|
|
156
|
+
Original Question: {question}
|
|
157
|
+
|
|
158
|
+
Retrieved Context:
|
|
159
|
+
{context}
|
|
160
|
+
|
|
161
|
+
Analysis Instructions:
|
|
162
|
+
1. Evaluate Context Coverage:
|
|
163
|
+
- Identify key entities and concepts from the question
|
|
164
|
+
- Check for temporal information (dates, periods, sequences)
|
|
165
|
+
- Verify causal relationships are explained
|
|
166
|
+
- Confirm presence of requested quantitative data
|
|
167
|
+
- Assess if geographic or spatial context is sufficient
|
|
168
|
+
|
|
169
|
+
2. Quality Assessment:
|
|
170
|
+
If the retrieved context is:
|
|
171
|
+
- Irrelevant or tangential
|
|
172
|
+
- Too general or vague
|
|
173
|
+
- Potentially contradictory
|
|
174
|
+
- Missing key perspectives
|
|
175
|
+
- Lacking proper evidence
|
|
176
|
+
Generate questions to address these specific gaps.
|
|
177
|
+
|
|
178
|
+
3. Follow-up Question Requirements:
|
|
179
|
+
- Questions must directly contribute to answering the original query
|
|
180
|
+
- Break complex relationships into simpler, sequential steps
|
|
181
|
+
- Maintain specificity rather than broad inquiries
|
|
182
|
+
- Avoid questions answerable from existing context
|
|
183
|
+
- Ensure questions build on each other logically
|
|
184
|
+
- Limit questions to 150 characters each
|
|
185
|
+
- Each question must be self-contained
|
|
186
|
+
- Questions must end with a question mark
|
|
187
|
+
|
|
188
|
+
4. Response Format:
|
|
189
|
+
- Return a JSON array of strings
|
|
190
|
+
- Use square brackets and double quotes
|
|
191
|
+
- Questions must be unique (no duplicates)
|
|
192
|
+
- If context is sufficient, return empty array []
|
|
193
|
+
- Maximum 3 follow-up questions
|
|
194
|
+
- Minimum length per question: 30 characters
|
|
195
|
+
- No null values or empty strings
|
|
196
|
+
|
|
197
|
+
Example:
|
|
198
|
+
Original: "How did the development of antibiotics affect military casualties in WWII?"
|
|
199
|
+
|
|
200
|
+
Invalid responses:
|
|
201
|
+
{'questions': ['What are antibiotics?']} // Wrong format
|
|
202
|
+
['What is WWII?'] // Too basic
|
|
203
|
+
['How did it impact things?'] // Too vague
|
|
204
|
+
['', 'Question 2'] // Contains empty string
|
|
205
|
+
['Same question?', 'Same question?'] // Duplicate
|
|
206
|
+
|
|
207
|
+
Valid response:
|
|
208
|
+
["What were military casualty rates from infections before widespread antibiotic use in 1942?",
|
|
209
|
+
"How did penicillin availability change throughout different stages of WWII?",
|
|
210
|
+
"What were the primary battlefield infections treated with antibiotics during WWII?"]
|
|
211
|
+
|
|
212
|
+
or [] if context fully answers the original question.
|
|
213
|
+
|
|
214
|
+
Your task: Based on the analysis of the original question and context,
|
|
215
|
+
output ONLY a JSON array of follow-up questions needed to provide a complete answer.
|
|
216
|
+
If no additional information is needed, output an empty array [].
|
|
217
|
+
|
|
218
|
+
Follow-up Questions:"""
|
|
219
|
+
|
|
220
|
+
DEFAULT_QUERY_RETRY_PROMPT_TEMPLATE = '''
|
|
221
|
+
{query}
|
|
222
|
+
|
|
223
|
+
The {dialect} query above failed with the error message: {error}.
|
|
224
|
+
|
|
225
|
+
<< TABLES YOU HAVE ACCESS TO >>
|
|
226
|
+
1. {embeddings_table} - Contains document chunks, vector embeddings, and metadata for documents.
|
|
227
|
+
|
|
228
|
+
Columns:
|
|
229
|
+
```json
|
|
230
|
+
{{
|
|
231
|
+
"id": {{
|
|
232
|
+
"type": "string",
|
|
233
|
+
"description": "Unique ID for this document chunk"
|
|
234
|
+
}},
|
|
235
|
+
"content": {{
|
|
236
|
+
"type": "string",
|
|
237
|
+
"description": "A document chunk (subset of the original document)"
|
|
238
|
+
}},
|
|
239
|
+
"embeddings": {{
|
|
240
|
+
"type": "vector",
|
|
241
|
+
"description": "Vector embeddings for the document chunk."
|
|
242
|
+
}},
|
|
243
|
+
"metadata": {{
|
|
244
|
+
"type": "jsonb",
|
|
245
|
+
"description": "Metadata for the document chunk."
|
|
246
|
+
}}
|
|
247
|
+
}}
|
|
248
|
+
|
|
249
|
+
{schema}
|
|
250
|
+
|
|
251
|
+
Rewrite the query so it works.
|
|
252
|
+
|
|
253
|
+
Output the final SQL query only.
|
|
254
|
+
|
|
255
|
+
SQL Query:
|
|
256
|
+
'''
|
|
257
|
+
|
|
258
|
+
# Default number of attempts at rewriting a failed SQL query.
DEFAULT_NUM_QUERY_RETRIES = 2
|
|
259
|
+
|
|
153
260
|
|
|
154
261
|
class LLMConfig(BaseModel):
    # Name of the model used for generation; falls back to DEFAULT_LLM_MODEL.
    model_name: str = Field(
        description='LLM model to use for generation',
        default=DEFAULT_LLM_MODEL,
    )
    # Provider of that model; falls back to DEFAULT_LLM_MODEL_PROVIDER.
    provider: str = Field(
        description='LLM model provider to use for generation',
        default=DEFAULT_LLM_MODEL_PROVIDER,
    )
    # Free-form extra parameters; default_factory builds a new empty dict for the default.
    params: Dict[str, Any] = Field(default_factory=dict)
|
|
158
265
|
|
|
159
266
|
|
|
160
267
|
class MultiVectorRetrieverMode(Enum):
|
|
@@ -183,17 +290,21 @@ class VectorStoreConfig(BaseModel):
|
|
|
183
290
|
collection_name: str = DEFAULT_COLLECTION_NAME
|
|
184
291
|
connection_string: str = None
|
|
185
292
|
kb_table: Any = None
|
|
293
|
+
is_sparse: bool = False
|
|
294
|
+
vector_size: Optional[int] = None
|
|
186
295
|
|
|
187
296
|
class Config:
|
|
188
297
|
arbitrary_types_allowed = True
|
|
189
298
|
extra = "forbid"
|
|
190
299
|
|
|
191
300
|
|
|
192
|
-
class RetrieverType(Enum):
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
301
|
+
class RetrieverType(str, Enum):
    """Retriever type for RAG pipeline"""

    # Members are also plain strings (str mixin), so they compare equal to their raw values.
    VECTOR_STORE = "vector_store"
    AUTO = "auto"
    MULTI = "multi"
    SQL = "sql"
    MULTI_HOP = "multi_hop"
|
|
197
308
|
|
|
198
309
|
|
|
199
310
|
class SearchType(Enum):
|
|
@@ -293,6 +404,14 @@ class SQLRetrieverConfig(BaseModel):
|
|
|
293
404
|
default=DEFAULT_QUERY_CHECKER_PROMPT_TEMPLATE,
|
|
294
405
|
description="Prompt template to use for double checking SQL queries before execution. Has 'query' and 'dialect' input variables."
|
|
295
406
|
)
|
|
407
|
+
query_retry_template: str = Field(
|
|
408
|
+
default=DEFAULT_QUERY_RETRY_PROMPT_TEMPLATE,
|
|
409
|
+
description="Prompt template to rewrite SQL query that failed. Has 'dialect', 'query', and 'error' input variables."
|
|
410
|
+
)
|
|
411
|
+
num_retries: int = Field(
|
|
412
|
+
default=DEFAULT_NUM_QUERY_RETRIES,
|
|
413
|
+
description="How many times for an LLM to try rewriting a failed SQL query before using the fallback retriever."
|
|
414
|
+
)
|
|
296
415
|
rewrite_prompt_template: str = Field(
|
|
297
416
|
default=DEFAULT_SEMANTIC_PROMPT_TEMPLATE,
|
|
298
417
|
description="Prompt template to rewrite user input to be better suited for retrieval. Has 'input' input variable."
|
|
@@ -336,6 +455,27 @@ class RerankerConfig(BaseModel):
|
|
|
336
455
|
num_docs_to_keep: Optional[int] = None
|
|
337
456
|
|
|
338
457
|
|
|
458
|
+
class MultiHopRetrieverConfig(BaseModel):
    """Configuration for multi-hop retrieval"""

    # Base retriever type; RetrieverType.VECTOR_STORE unless overridden.
    base_retriever_type: RetrieverType = Field(
        description="Type of base retriever to use for multi-hop retrieval",
        default=RetrieverType.VECTOR_STORE,
    )
    # Must be at least 1 (ge=1); defaults to 3.
    max_hops: int = Field(
        ge=1,
        description="Maximum number of follow-up questions to generate",
        default=3,
    )
    # Prompt used to derive follow-up questions; defaults to DEFAULT_QUESTION_REFORMULATION_TEMPLATE.
    reformulation_template: str = Field(
        description="Template for reformulating questions",
        default=DEFAULT_QUESTION_REFORMULATION_TEMPLATE,
    )
    # A new LLMConfig is built for the default via default_factory.
    llm_config: LLMConfig = Field(
        description="LLM configuration to use for generating follow-up questions",
        default_factory=LLMConfig,
    )
|
|
477
|
+
|
|
478
|
+
|
|
339
479
|
class RAGPipelineModel(BaseModel):
|
|
340
480
|
documents: Optional[List[Document]] = Field(
|
|
341
481
|
default=None,
|
|
@@ -462,6 +602,20 @@ class RAGPipelineModel(BaseModel):
|
|
|
462
602
|
description="Reranker configuration"
|
|
463
603
|
)
|
|
464
604
|
|
|
605
|
+
multi_hop_config: Optional[MultiHopRetrieverConfig] = Field(
    default=None,
    description="Configuration for multi-hop retrieval. Required when retriever_type is MULTI_HOP.",
    # Field validators are skipped for default values unless validate_default is set.
    # Without it the check below never runs when multi_hop_config is simply omitted.
    validate_default=True
)

@field_validator("multi_hop_config")
@classmethod
def validate_multi_hop_config(cls, v: Optional[MultiHopRetrieverConfig], info):
    """Validate that multi_hop_config is set when using multi-hop retrieval.

    Args:
        v: The value supplied for multi_hop_config (None when omitted).
        info: Validation info; info.data holds the fields validated so far.

    Returns:
        The unchanged value.

    Raises:
        ValueError: If retriever_type is RetrieverType.MULTI_HOP and v is None.
    """
    # NOTE(review): info.data only contains fields declared before this one, so this
    # relies on retriever_type being declared earlier in the model - confirm.
    values = info.data
    if v is None and values.get("retriever_type") == RetrieverType.MULTI_HOP:
        raise ValueError("multi_hop_config must be set when using multi-hop retrieval")
    return v
|
|
618
|
+
|
|
465
619
|
class Config:
|
|
466
620
|
arbitrary_types_allowed = True
|
|
467
621
|
extra = "forbid"
|
|
@@ -642,11 +642,13 @@ class KnowledgeBaseController:
|
|
|
642
642
|
storage: Identifier,
|
|
643
643
|
params: dict,
|
|
644
644
|
preprocessing_config: Optional[dict] = None,
|
|
645
|
-
if_not_exists: bool = False
|
|
645
|
+
if_not_exists: bool = False
|
|
646
646
|
) -> db.KnowledgeBase:
|
|
647
647
|
"""
|
|
648
648
|
Add a new knowledge base to the database
|
|
649
649
|
:param preprocessing_config: Optional preprocessing configuration to validate and store
|
|
650
|
+
:param is_sparse: Whether to use sparse vectors for embeddings
|
|
651
|
+
:param vector_size: Optional size specification for vectors, required when is_sparse=True
|
|
650
652
|
"""
|
|
651
653
|
# Validate preprocessing config first if provided
|
|
652
654
|
if preprocessing_config is not None:
|
|
@@ -654,6 +656,12 @@ class KnowledgeBaseController:
|
|
|
654
656
|
params = params or {}
|
|
655
657
|
params['preprocessing'] = preprocessing_config
|
|
656
658
|
|
|
659
|
+
# Check if vector_size is provided when using sparse vectors
|
|
660
|
+
is_sparse = params.get('is_sparse')
|
|
661
|
+
vector_size = params.get('vector_size')
|
|
662
|
+
if is_sparse and vector_size is None:
|
|
663
|
+
raise ValueError("vector_size is required when is_sparse=True")
|
|
664
|
+
|
|
657
665
|
# get project id
|
|
658
666
|
project = self.session.database_controller.get_project(project_name)
|
|
659
667
|
project_id = project.id
|
|
@@ -693,7 +701,20 @@ class KnowledgeBaseController:
|
|
|
693
701
|
cloud_pg_vector = os.environ.get('KB_PGVECTOR_URL')
|
|
694
702
|
if cloud_pg_vector:
|
|
695
703
|
vector_table_name = name
|
|
696
|
-
|
|
704
|
+
# Add sparse vector support for pgvector
|
|
705
|
+
vector_db_params = {}
|
|
706
|
+
# Check both explicit parameter and model configuration
|
|
707
|
+
is_sparse = is_sparse or model_record.learn_args.get('using', {}).get('sparse')
|
|
708
|
+
if is_sparse:
|
|
709
|
+
vector_db_params['is_sparse'] = True
|
|
710
|
+
if vector_size is not None:
|
|
711
|
+
vector_db_params['vector_size'] = vector_size
|
|
712
|
+
vector_db_name = self._create_persistent_pgvector(vector_db_params)
|
|
713
|
+
|
|
714
|
+
# create table in vectordb before creating KB
|
|
715
|
+
self.session.datahub.get(vector_db_name).integration_handler.create_table(
|
|
716
|
+
vector_table_name
|
|
717
|
+
)
|
|
697
718
|
else:
|
|
698
719
|
# create chroma db with same name
|
|
699
720
|
vector_table_name = "default_collection"
|
|
@@ -707,10 +728,14 @@ class KnowledgeBaseController:
|
|
|
707
728
|
|
|
708
729
|
vector_database_id = self.session.integration_controller.get(vector_db_name)['id']
|
|
709
730
|
|
|
710
|
-
#
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
731
|
+
# Store sparse vector settings in params if specified
|
|
732
|
+
if is_sparse:
|
|
733
|
+
params = params or {}
|
|
734
|
+
params['vector_config'] = {
|
|
735
|
+
'is_sparse': is_sparse
|
|
736
|
+
}
|
|
737
|
+
if vector_size is not None:
|
|
738
|
+
params['vector_config']['vector_size'] = vector_size
|
|
714
739
|
|
|
715
740
|
kb = db.KnowledgeBase(
|
|
716
741
|
name=name,
|
|
@@ -724,16 +749,15 @@ class KnowledgeBaseController:
|
|
|
724
749
|
db.session.commit()
|
|
725
750
|
return kb
|
|
726
751
|
|
|
727
|
-
def _create_persistent_pgvector(self):
|
|
752
|
+
def _create_persistent_pgvector(self, params=None):
|
|
728
753
|
"""Create default vector database for knowledge base, if not specified"""
|
|
729
|
-
|
|
730
754
|
vector_store_name = "kb_pgvector_store"
|
|
731
755
|
|
|
732
756
|
# check if exists
|
|
733
757
|
if self.session.integration_controller.get(vector_store_name):
|
|
734
758
|
return vector_store_name
|
|
735
759
|
|
|
736
|
-
self.session.integration_controller.add(vector_store_name, 'pgvector', {})
|
|
760
|
+
self.session.integration_controller.add(vector_store_name, 'pgvector', params or {})
|
|
737
761
|
return vector_store_name
|
|
738
762
|
|
|
739
763
|
def _create_persistent_chroma(self, kb_name, engine="chromadb"):
|
|
@@ -43,10 +43,17 @@ def build_retrieval_tool(tool: dict, pred_args: dict, skill: db.Skills):
|
|
|
43
43
|
raise ValueError(f"Knowledge base not found: {kb_name}")
|
|
44
44
|
|
|
45
45
|
kb_table = executor.session.kb_controller.get_table(kb.name, kb.project_id)
|
|
46
|
+
vector_store_config = {
|
|
47
|
+
'kb_table': kb_table
|
|
48
|
+
}
|
|
49
|
+
is_sparse = tools_config.pop('is_sparse', None)
|
|
50
|
+
vector_size = tools_config.pop('vector_size', None)
|
|
51
|
+
if is_sparse is not None:
|
|
52
|
+
vector_store_config['is_sparse'] = is_sparse
|
|
53
|
+
if vector_size is not None:
|
|
54
|
+
vector_store_config['vector_size'] = vector_size
|
|
46
55
|
kb_params = {
|
|
47
|
-
'vector_store_config':
|
|
48
|
-
'kb_table': kb_table
|
|
49
|
-
}
|
|
56
|
+
'vector_store_config': vector_store_config
|
|
50
57
|
}
|
|
51
58
|
|
|
52
59
|
# Get embedding model from knowledge base table
|
mindsdb/utilities/cache.py
CHANGED
|
@@ -71,10 +71,13 @@ _CACHE_MAX_SIZE = 500
|
|
|
71
71
|
|
|
72
72
|
|
|
73
73
|
def dataframe_checksum(df: pd.DataFrame):
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
74
|
+
original_columns = df.columns
|
|
75
|
+
df.columns = list(range(len(df.columns)))
|
|
76
|
+
result = hashlib.sha256(
|
|
77
|
+
str(df.values).encode()
|
|
78
|
+
).hexdigest()
|
|
79
|
+
df.columns = original_columns
|
|
80
|
+
return result
|
|
78
81
|
|
|
79
82
|
|
|
80
83
|
def json_checksum(obj: t.Union[dict, list]):
|
mindsdb/utilities/context.py
CHANGED
|
@@ -52,6 +52,15 @@ class Context:
|
|
|
52
52
|
def load(self, storage: dict) -> None:
|
|
53
53
|
self._storage.set(storage)
|
|
54
54
|
|
|
55
|
+
def metadata(self, **kwargs) -> dict:
    """Return the context identifiers as a dict, extended with ``kwargs``.

    A falsy user_id or company_id is reported as an empty string. Entries in
    ``kwargs`` are applied last and therefore override the context values.
    """
    payload = dict(
        user_id=self.user_id or "",
        company_id=self.company_id or "",
        session_id=self.session_id,
        user_class=self.user_class,
    )
    payload.update(kwargs)
    return payload
|
|
63
|
+
|
|
55
64
|
|
|
56
65
|
_context_var = ContextVar('mindsdb.context')
|
|
57
66
|
context = Context(_context_var)
|
mindsdb/utilities/log.py
CHANGED
|
@@ -29,6 +29,23 @@ class ColorFormatter(logging.Formatter):
|
|
|
29
29
|
return log_fmt.format(record)
|
|
30
30
|
|
|
31
31
|
|
|
32
|
+
def get_console_handler_config_level() -> int:
    """Return the ``logging`` attribute named by the console handler's configured level."""
    level_name = app_config['logging']['handlers']['console']["level"]
    return getattr(logging, level_name)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_file_handler_config_level() -> int:
    """Return the ``logging`` attribute named by the file handler's configured level."""
    level_name = app_config['logging']['handlers']['file']["level"]
    return getattr(logging, level_name)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_mindsdb_log_level() -> int:
    """Return the smaller of the configured console and file handler levels."""
    return min(
        get_console_handler_config_level(),
        get_file_handler_config_level(),
    )
|
|
47
|
+
|
|
48
|
+
|
|
32
49
|
def configure_logging():
|
|
33
50
|
handlers_config = {}
|
|
34
51
|
console_handler_config = app_config['logging']['handlers']['console']
|
|
@@ -39,6 +56,7 @@ def configure_logging():
|
|
|
39
56
|
"formatter": "f",
|
|
40
57
|
"level": console_handler_config_level
|
|
41
58
|
}
|
|
59
|
+
|
|
42
60
|
file_handler_config = app_config['logging']['handlers']['file']
|
|
43
61
|
file_handler_config_level = getattr(logging, file_handler_config["level"])
|
|
44
62
|
if file_handler_config['enabled'] is True:
|
|
@@ -51,7 +69,7 @@ def configure_logging():
|
|
|
51
69
|
"backupCount": file_handler_config["backupCount"]
|
|
52
70
|
}
|
|
53
71
|
|
|
54
|
-
mindsdb_log_level =
|
|
72
|
+
mindsdb_log_level = get_mindsdb_log_level()
|
|
55
73
|
|
|
56
74
|
logging_config = dict(
|
|
57
75
|
version=1,
|
|
@@ -65,7 +83,7 @@ def configure_logging():
|
|
|
65
83
|
loggers={
|
|
66
84
|
"": { # root logger
|
|
67
85
|
"handlers": list(handlers_config.keys()),
|
|
68
|
-
"level":
|
|
86
|
+
"level": mindsdb_log_level,
|
|
69
87
|
},
|
|
70
88
|
"__main__": {
|
|
71
89
|
"level": mindsdb_log_level,
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import typing
|
|
3
|
+
|
|
4
|
+
from opentelemetry import trace # noqa: F401
|
|
5
|
+
from opentelemetry import metrics # noqa: F401
|
|
6
|
+
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter as OTLPLogExporterGRPC
|
|
7
|
+
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter as OTLPLogExporterHTTP
|
|
8
|
+
from opentelemetry.sdk._logs._internal.export import LogExporter
|
|
9
|
+
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as OTLPMetricExporterGRPC
|
|
10
|
+
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as OTLPMetricExporterHTTP
|
|
11
|
+
from opentelemetry.sdk.metrics.export import MetricExporter, ConsoleMetricExporter
|
|
12
|
+
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as OTLPSpanExporterGRPC
|
|
13
|
+
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as OTLPSpanExporterHTTP
|
|
14
|
+
from opentelemetry.sdk.trace.export import SpanExporter, ConsoleSpanExporter
|
|
15
|
+
from opentelemetry.sdk.resources import Resource
|
|
16
|
+
from opentelemetry.sdk.trace.sampling import TraceIdRatioBased
|
|
17
|
+
|
|
18
|
+
from mindsdb.utilities.otel.logger import setup_logger
|
|
19
|
+
from mindsdb.utilities.otel.meter import setup_meter
|
|
20
|
+
from mindsdb.utilities.otel.tracer import setup_tracer
|
|
21
|
+
from mindsdb.utilities.utils import parse_csv_attributes
|
|
22
|
+
from mindsdb.utilities import log
|
|
23
|
+
|
|
24
|
+
logger = log.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
# OpenTelemetry exporter type: "console" or "otlp"
OTEL_EXPORTER_TYPE = os.getenv("OTEL_EXPORTER_TYPE", "console")

# OpenTelemetry exporter protocol: "grpc" or "http"
OTEL_EXPORTER_PROTOCOL = os.getenv("OTEL_EXPORTER_PROTOCOL", "grpc")

# Base OTLP endpoint; http://localhost:4317 is used when the variable is not set
OTEL_OTLP_ENDPOINT = os.getenv("OTEL_OTLP_ENDPOINT", "http://localhost:4317")

# Per-signal OTLP endpoints; each falls back to OTEL_OTLP_ENDPOINT when not set
OTEL_OTLP_LOGGING_ENDPOINT = os.getenv("OTEL_OTLP_LOGGING_ENDPOINT", OTEL_OTLP_ENDPOINT)
OTEL_OTLP_TRACING_ENDPOINT = os.getenv("OTEL_OTLP_TRACING_ENDPOINT", OTEL_OTLP_ENDPOINT)
OTEL_OTLP_METRICS_ENDPOINT = os.getenv("OTEL_OTLP_METRICS_ENDPOINT", OTEL_OTLP_ENDPOINT)

# Service name
OTEL_SERVICE_NAME = os.getenv("OTEL_SERVICE_NAME", "mindsdb")

# Service instance ID
OTEL_SERVICE_INSTANCE_ID = os.getenv("OTEL_SERVICE_INSTANCE_ID", "mindsdb-instance")

# The name of the environment we're on, by default local for development; this is set differently per-env in our Helm
# chart values files
OTEL_SERVICE_ENVIRONMENT = os.getenv("OTEL_SERVICE_ENVIRONMENT", "local").lower()

# Service release
OTEL_SERVICE_RELEASE = os.getenv("OTEL_SERVICE_RELEASE", "local").lower()

# How often to capture traces (passed to the trace sampler as a ratio)
OTEL_TRACE_SAMPLE_RATE = float(os.getenv("OTEL_TRACE_SAMPLE_RATE", "1.0"))

# Extra attributes, as a raw string parsed later
OTEL_EXTRA_ATTRIBUTES = os.getenv("OTEL_EXTRA_ATTRIBUTES", "")

# By default, we have Open Telemetry SDK enabled on all envs, except for local which is disabled by default.
# Reuses the already-normalized environment name instead of reading the variable a second time.
OTEL_SDK_DISABLED = (
    os.getenv("OTEL_SDK_DISABLED", "false").lower() == "true"
    or OTEL_SERVICE_ENVIRONMENT == "local"
)

# OpenTelemetry logging is disabled by default
OTEL_LOGGING_DISABLED = os.getenv("OTEL_LOGGING_DISABLED", "true").lower() == "true"

# OpenTelemetry tracing is enabled by default
OTEL_TRACING_DISABLED = os.getenv("OTEL_TRACING_DISABLED", "false").lower() == "true"

# OpenTelemetry metrics is disabled by default
OTEL_METRICS_DISABLED = os.getenv("OTEL_METRICS_DISABLED", "true").lower() == "true"

# If you want to enable Open Telemetry on local for some reason please set OTEL_SDK_FORCE_RUN to true
OTEL_SDK_FORCE_RUN = os.getenv("OTEL_SDK_FORCE_RUN", "false").lower() == "true"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def get_otel_attributes() -> dict:
    """
    Build the OpenTelemetry attributes.

    The attributes parsed from OTEL_EXTRA_ATTRIBUTES are combined with the service
    attributes (name, instance id, environment, release). If parsing fails the error is
    logged and no extra attributes are used.

    Returns:
        dict: OpenTelemetry attributes
    """

    try:
        extras = parse_csv_attributes(OTEL_EXTRA_ATTRIBUTES)
    except Exception as e:
        logger.error(f"Failed to parse OTEL_EXTRA_ATTRIBUTES: {e}")
        extras = {}

    merged = {**extras}
    # Applied last so the service attributes take precedence over extra attributes
    merged.update({
        "service.name": OTEL_SERVICE_NAME,
        "service.instance.id": OTEL_SERVICE_INSTANCE_ID,
        "environment": OTEL_SERVICE_ENVIRONMENT,
        "release": OTEL_SERVICE_RELEASE,
    })

    return merged
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_logging_exporter() -> typing.Optional[LogExporter]:
    """
    Get OpenTelemetry logging exporter.

    Returns:
        An OTLP log exporter (gRPC or HTTP) pointed at OTEL_OTLP_LOGGING_ENDPOINT when the
        exporter type is "otlp" and the protocol is "grpc" or "http"; otherwise None.
    """

    if OTEL_EXPORTER_TYPE != "otlp":
        return None

    if OTEL_EXPORTER_PROTOCOL == "grpc":
        return OTLPLogExporterGRPC(endpoint=OTEL_OTLP_LOGGING_ENDPOINT, insecure=True)

    if OTEL_EXPORTER_PROTOCOL == "http":
        return OTLPLogExporterHTTP(endpoint=OTEL_OTLP_LOGGING_ENDPOINT)

    # Unrecognized protocol: no exporter
    return None
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def get_span_exporter() -> SpanExporter:
    """
    Get OpenTelemetry span exporter

    Returns:
        An OTLP span exporter (gRPC or HTTP) pointed at OTEL_OTLP_TRACING_ENDPOINT when the
        exporter type is "otlp" and the protocol is "grpc" or "http"; otherwise a
        ConsoleSpanExporter.
    """

    use_otlp = OTEL_EXPORTER_TYPE == "otlp"

    if use_otlp and OTEL_EXPORTER_PROTOCOL == "grpc":
        return OTLPSpanExporterGRPC(endpoint=OTEL_OTLP_TRACING_ENDPOINT, insecure=True)

    if use_otlp and OTEL_EXPORTER_PROTOCOL == "http":
        return OTLPSpanExporterHTTP(endpoint=OTEL_OTLP_TRACING_ENDPOINT)

    # Any other type/protocol combination falls back to the console
    return ConsoleSpanExporter()
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def get_metrics_exporter() -> typing.Optional[MetricExporter]:
    """
    Get OpenTelemetry metrics exporter

    Returns:
        An OTLP metric exporter (gRPC or HTTP) pointed at OTEL_OTLP_METRICS_ENDPOINT when the
        exporter type is "otlp" and the protocol is "grpc" or "http"; otherwise a
        ConsoleMetricExporter. Every path returns an exporter instance.
    """

    use_otlp = OTEL_EXPORTER_TYPE == "otlp"

    if use_otlp and OTEL_EXPORTER_PROTOCOL == "grpc":
        return OTLPMetricExporterGRPC(endpoint=OTEL_OTLP_METRICS_ENDPOINT, insecure=True)

    if use_otlp and OTEL_EXPORTER_PROTOCOL == "http":
        return OTLPMetricExporterHTTP(endpoint=OTEL_OTLP_METRICS_ENDPOINT)

    # Any other type/protocol combination falls back to the console
    return ConsoleMetricExporter()
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# Runs at import time: telemetry is wired up unless the SDK is disabled, and
# OTEL_SDK_FORCE_RUN overrides the disable flag.
if OTEL_SDK_FORCE_RUN or not OTEL_SDK_DISABLED:
    logger.info("OpenTelemetry enabled")
    logger.info(f"OpenTelemetry exporter type: {OTEL_EXPORTER_TYPE}")
    logger.info(f"OpenTelemetry service name: {OTEL_SERVICE_NAME}")
    logger.info(f"OpenTelemetry service environment: {OTEL_SERVICE_ENVIRONMENT}")
    logger.info(f"OpenTelemetry service release: {OTEL_SERVICE_RELEASE}")
    logger.info(f"OpenTelemetry trace sample rate: {OTEL_TRACE_SAMPLE_RATE}")
    logger.info(f"OpenTelemetry extra attributes: {OTEL_EXTRA_ATTRIBUTES}")

    # Attributes describing this service (e.g., service name)
    attributes = get_otel_attributes()

    # Sampler built from the configured trace sample rate
    sampler = TraceIdRatioBased(OTEL_TRACE_SAMPLE_RATE)

    # Resource handed to every setup_* call below
    resource = Resource(attributes=attributes)

    # Each signal is configured only when it is not disabled
    if not OTEL_LOGGING_DISABLED:
        logger.info("OpenTelemetry Logging is enabled")
        setup_logger(resource, get_logging_exporter())

    if not OTEL_TRACING_DISABLED:
        logger.info("OpenTelemetry Tracing is enabled")
        setup_tracer(resource, sampler, get_span_exporter())

    if not OTEL_METRICS_DISABLED:
        logger.info("OpenTelemetry Metrics is enabled")
        setup_meter(resource, get_metrics_exporter())
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from opentelemetry._logs import set_logger_provider
|
|
4
|
+
from opentelemetry.sdk._logs._internal.export import LogExporter
|
|
5
|
+
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
|
|
6
|
+
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
|
|
7
|
+
from opentelemetry.sdk.resources import Resource
|
|
8
|
+
|
|
9
|
+
from mindsdb.utilities.log import get_mindsdb_log_level
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def setup_logger(resource: Resource, exporter: LogExporter) -> None:
    """
    Setup OpenTelemetry logging

    Registers a global LoggerProvider for ``resource``, sends records to ``exporter``
    through a BatchLogRecordProcessor and attaches a LoggingHandler to the root logger
    at the MindsDB log level.

    Args:
        resource: OpenTelemetry resource describing this service.
        exporter: Log exporter to send records to. When it is None nothing is
            configured and a warning is logged instead.
    """
    # Without an exporter the batch processor would have nothing to export to,
    # so leave the logging setup untouched.
    if exporter is None:
        logging.getLogger(__name__).warning(
            "OpenTelemetry logging exporter is not configured, skipping OpenTelemetry logging setup"
        )
        return

    mindsdb_log_level = get_mindsdb_log_level()

    logger_provider = LoggerProvider(resource=resource)
    set_logger_provider(logger_provider)

    logger_provider.add_log_record_processor(BatchLogRecordProcessor(exporter))
    handler = LoggingHandler(level=mindsdb_log_level, logger_provider=logger_provider)

    # Attach OTLP handler to root logger
    logging.getLogger().addHandler(handler)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from opentelemetry import metrics
|
|
2
|
+
from opentelemetry.sdk.metrics import MeterProvider
|
|
3
|
+
from opentelemetry.sdk.resources import Resource
|
|
4
|
+
from opentelemetry.sdk.metrics.export import (
|
|
5
|
+
MetricExporter,
|
|
6
|
+
PeriodicExportingMetricReader,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def setup_meter(resource: Resource, exporter: MetricExporter) -> None:
    """
    Setup OpenTelemetry metrics

    Wraps ``exporter`` in a PeriodicExportingMetricReader, builds a MeterProvider for
    ``resource`` with that reader and installs it as the global meter provider.
    """

    # Sets the global default meter provider
    metrics.set_meter_provider(
        MeterProvider(
            resource=resource,
            metric_readers=[PeriodicExportingMetricReader(exporter=exporter)],
        )
    )
|