MindsDB 25.4.4.0__py3-none-any.whl → 25.5.3.0__py3-none-any.whl
This diff shows the changes between package versions as published to their respective public registries, and is provided for informational purposes only.
Potentially problematic release. This version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +107 -125
- mindsdb/api/executor/command_executor.py +14 -3
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +8 -0
- mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +2 -1
- mindsdb/api/executor/datahub/datanodes/system_tables.py +10 -13
- mindsdb/api/executor/planner/query_plan.py +1 -0
- mindsdb/api/executor/planner/query_planner.py +9 -1
- mindsdb/api/executor/sql_query/sql_query.py +24 -8
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +21 -3
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +3 -1
- mindsdb/api/http/initialize.py +20 -3
- mindsdb/api/http/namespaces/analysis.py +14 -1
- mindsdb/api/http/namespaces/config.py +19 -11
- mindsdb/api/http/namespaces/tree.py +1 -1
- mindsdb/api/http/start.py +7 -2
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +4 -8
- mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -4
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_formats.py +2 -2
- mindsdb/integrations/handlers/bigquery_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/gmail_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/google_analytics_handler/requirements.txt +2 -1
- mindsdb/integrations/handlers/google_books_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/google_calendar_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/google_content_shopping_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/google_fit_handler/requirements.txt +2 -0
- mindsdb/integrations/handlers/google_search_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/jira_handler/jira_handler.archived.py +75 -0
- mindsdb/integrations/handlers/jira_handler/jira_handler.py +113 -38
- mindsdb/integrations/handlers/jira_handler/jira_tables.py +229 -0
- mindsdb/integrations/handlers/jira_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/lightfm_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/lightwood_handler/lightwood_handler.py +0 -2
- mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
- mindsdb/integrations/handlers/lindorm_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/ms_one_drive_handler/requirements.txt +2 -0
- mindsdb/integrations/handlers/ms_teams_handler/requirements.txt +3 -1
- mindsdb/integrations/handlers/openai_handler/helpers.py +3 -5
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +25 -12
- mindsdb/integrations/handlers/snowflake_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/togetherai_handler/__about__.py +9 -0
- mindsdb/integrations/handlers/togetherai_handler/__init__.py +20 -0
- mindsdb/integrations/handlers/togetherai_handler/creation_args.py +14 -0
- mindsdb/integrations/handlers/togetherai_handler/icon.svg +15 -0
- mindsdb/integrations/handlers/togetherai_handler/model_using_args.py +5 -0
- mindsdb/integrations/handlers/togetherai_handler/requirements.txt +2 -0
- mindsdb/integrations/handlers/togetherai_handler/settings.py +33 -0
- mindsdb/integrations/handlers/togetherai_handler/togetherai_handler.py +234 -0
- mindsdb/integrations/handlers/vertex_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/youtube_handler/requirements.txt +1 -0
- mindsdb/integrations/utilities/files/file_reader.py +5 -2
- mindsdb/integrations/utilities/handler_utils.py +4 -0
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +360 -0
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +6 -346
- mindsdb/interfaces/agents/constants.py +14 -2
- mindsdb/interfaces/agents/langchain_agent.py +2 -4
- mindsdb/interfaces/database/projects.py +1 -7
- mindsdb/interfaces/functions/controller.py +14 -16
- mindsdb/interfaces/functions/to_markdown.py +9 -124
- mindsdb/interfaces/knowledge_base/controller.py +109 -92
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +28 -5
- mindsdb/interfaces/knowledge_base/utils.py +10 -15
- mindsdb/interfaces/model/model_controller.py +0 -2
- mindsdb/interfaces/query_context/context_controller.py +55 -15
- mindsdb/interfaces/query_context/query_task.py +19 -0
- mindsdb/interfaces/skills/sql_agent.py +33 -11
- mindsdb/interfaces/storage/db.py +2 -2
- mindsdb/interfaces/tasks/task_monitor.py +5 -1
- mindsdb/interfaces/tasks/task_thread.py +6 -0
- mindsdb/migrations/migrate.py +0 -2
- mindsdb/migrations/versions/2025-04-22_53502b6d63bf_query_database.py +27 -0
- mindsdb/utilities/config.py +15 -3
- mindsdb/utilities/context.py +2 -1
- mindsdb/utilities/functions.py +0 -36
- mindsdb/utilities/langfuse.py +19 -10
- mindsdb/utilities/otel/__init__.py +9 -193
- mindsdb/utilities/otel/metric_handlers/__init__.py +5 -1
- mindsdb/utilities/otel/prepare.py +198 -0
- mindsdb/utilities/sql.py +83 -0
- {mindsdb-25.4.4.0.dist-info → mindsdb-25.5.3.0.dist-info}/METADATA +662 -592
- {mindsdb-25.4.4.0.dist-info → mindsdb-25.5.3.0.dist-info}/RECORD +85 -69
- {mindsdb-25.4.4.0.dist-info → mindsdb-25.5.3.0.dist-info}/WHEEL +1 -1
- mindsdb/api/mysql/mysql_proxy/classes/sql_statement_parser.py +0 -151
- {mindsdb-25.4.4.0.dist-info → mindsdb-25.5.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.4.4.0.dist-info → mindsdb-25.5.3.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/functions/to_markdown.py

@@ -1,13 +1,10 @@
-import base64
 from io import BytesIO
 import os
 from typing import Union
 from urllib.parse import urlparse

-import
-from markitdown import MarkItDown
+from aipdf import ocr
 import mimetypes
-from openai import OpenAI
 import requests


@@ -15,41 +12,22 @@ class ToMarkdown:
     """
     Extracts the content of documents of various formats in markdown format.
     """
-    def __init__(self
+    def __init__(self):
         """
         Initializes the ToMarkdown class.
         """
-        # If use_llm is True, llm_client and llm_model must be provided.
-        if use_llm and (llm_client is None or llm_model is None):
-            raise ValueError('LLM client and model must be provided when use_llm is True.')

-
-        if not use_llm:
-            llm_client = None
-            llm_model = None
-
-        # Only OpenAI is supported for now.
-        # TODO: Add support for other LLMs.
-        if llm_client is not None and not isinstance(llm_client, OpenAI):
-            raise ValueError('Only OpenAI models are supported at the moment.')
-
-        self.use_llm = use_llm
-        self.llm_client = llm_client
-        self.llm_model = llm_model
-
-    def call(self, file_path_or_url: str) -> str:
+    def call(self, file_path_or_url: str, **kwargs) -> str:
         """
         Converts a file to markdown.
         """
         file_extension = self._get_file_extension(file_path_or_url)
-
+        file_content = self._get_file_content(file_path_or_url)

         if file_extension == '.pdf':
-            return self._pdf_to_markdown(
-        elif file_extension in ['.jpg', '.jpeg', '.png', '.gif']:
-            return self._image_to_markdown(file)
+            return self._pdf_to_markdown(file_content, **kwargs)
         else:
-
+            raise ValueError(f"Unsupported file type: {file_extension}.")

     def _get_file_content(self, file_path_or_url: str) -> str:
         """

@@ -90,105 +68,12 @@ class ToMarkdown:
         else:
             return os.path.splitext(file_path_or_url)[1]

-    def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes]) -> str:
+    def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes], **kwargs) -> str:
         """
         Converts a PDF file to markdown.
         """
-        if self.llm_client is None:
-            return self._pdf_to_markdown_no_llm(file_content)
-        else:
-            return self._pdf_to_markdown_llm(file_content)
-
-    def _pdf_to_markdown_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts a PDF file to markdown using LLM.
-        The LLM is used mainly for the purpose of generating descriptions of any images in the PDF.
-        """
         if isinstance(file_content, requests.Response):
             file_content = BytesIO(file_content.content)

-
-
-        markdown_content = []
-        for page_num in range(len(document)):
-            page = document.load_page(page_num)
-
-            # Get text blocks with coordinates.
-            page_content = []
-            blocks = page.get_text("blocks")
-            for block in blocks:
-                x0, y0, x1, y1, text, _, _ = block
-                if text.strip():  # Skip empty or whitespace blocks.
-                    page_content.append((y0, text.strip()))
-
-            # Extract images from the page.
-            image_list = page.get_images(full=True)
-            for img_index, img in enumerate(image_list):
-                xref = img[0]
-                base_image = document.extract_image(xref)
-                image_bytes = base_image["image"]
-
-                # Use actual image y-coordinate if available.
-                y0 = float(base_image.get("y", 0))
-                image_description = self._generate_image_description(image_bytes)
-                page_content.append((y0, f""))
-
-            # Sort the content by y0 coordinate
-            page_content.sort(key=lambda x: x[0])
-
-            # Add sorted content to the markdown
-            for _, text in page_content:
-                markdown_content.append(text)
-            markdown_content.append("\n")
-
-        document.close()
-
-        return "\n".join(markdown_content)
-
-    def _generate_image_description(self, image_bytes: bytes) -> str:
-        """
-        Generates a description of the image using LLM.
-        """
-        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-
-        response = self.llm_client.chat.completions.create(
-            model=self.llm_model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "Describe this image"},
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
-                    ],
-                }
-            ],
-        )
-        description = response.choices[0].message.content
-        return description
-
-    def _pdf_to_markdown_no_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts a PDF file to markdown without using LLM.
-        """
-        md = MarkItDown(enable_plugins=True)
-        result = md.convert(file_content)
-        return result.markdown
-
-    def _image_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts images to markdown.
-        """
-        if not self.use_llm or self.llm_client is None:
-            raise ValueError('LLM client must be enabled to convert images to markdown.')
-
-        md = MarkItDown(llm_client=self.llm_client, llm_model=self.llm_model, enable_plugins=True)
-        result = md.convert(file_content)
-        return result.markdown
-
-    def _other_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts other file formats to markdown.
-        """
-        md = MarkItDown(enable_plugins=True)
-        result = md.convert(file_content)
-        return result.markdown
+        markdown_pages = ocr(file_content, **kwargs)
+        return "\n\n---\n\n".join(markdown_pages)
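For orientation: the rework drops the MarkItDown/PyMuPDF/OpenAI paths entirely and delegates PDF extraction to aipdf, which returns per-page markdown strings. A minimal usage sketch, assuming a reachable PDF; the diff shows only that extra kwargs are forwarded to aipdf's `ocr()`, not which options it accepts:

```python
# Hypothetical usage of the slimmed-down ToMarkdown (per the diff, only PDFs
# are supported now and all extraction goes through aipdf.ocr()).
from mindsdb.interfaces.functions.to_markdown import ToMarkdown

converter = ToMarkdown()  # no LLM client/model arguments anymore

# Extra kwargs are forwarded to aipdf's ocr(); which options it accepts is
# not shown in the diff, so consult aipdf's documentation.
markdown = converter.call("https://example.com/sample.pdf")

# Non-PDF inputs now raise instead of falling back to MarkItDown:
# converter.call("photo.png")  -> ValueError: Unsupported file type: .png.
```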
mindsdb/interfaces/knowledge_base/controller.py

@@ -27,7 +27,7 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
 from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
 from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
 from mindsdb.integrations.utilities.handler_utils import get_api_key
-from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import construct_model_from_args
+from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import construct_model_from_args

 from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS
 from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider

@@ -37,21 +37,35 @@ from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor impor
 from mindsdb.interfaces.model.functions import PredictorRecordNotFound
 from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
 from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
+from mindsdb.utilities.config import config
 from mindsdb.utilities.context import context as ctx

 from mindsdb.api.executor.command_executor import ExecuteCommands
+from mindsdb.api.executor.utilities.sql import query_df
 from mindsdb.utilities import log
-from mindsdb.integrations.utilities.rag.rerankers.
+from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMReranker

 logger = log.getLogger(__name__)

 KB_TO_VECTORDB_COLUMNS = {
-    'id': '
+    'id': 'original_doc_id',
     'chunk_id': 'id',
     'chunk_content': 'content'
 }


+def get_model_params(model_params: dict, default_config_key: str):
+    """
+    Get model parameters by combining default config with user provided parameters.
+    """
+    combined_model_params = copy.deepcopy(config.get(default_config_key, {}))
+
+    if model_params:
+        combined_model_params.update(model_params)
+
+    return combined_model_params
+
+
 def get_embedding_model_from_params(embedding_model_params: dict):
     """
     Create embedding model from parameters.
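The new `get_model_params` helper simply layers user-supplied parameters over a configured default. A standalone sketch of the merge semantics (the config dict here is invented for illustration):

```python
import copy

# Stand-in for values from mindsdb.utilities.config (illustrative only).
_config = {"default_llm": {"provider": "openai", "model_name": "gpt-4o"}}

def get_model_params(model_params: dict, default_config_key: str) -> dict:
    """Mirror of the added helper: defaults come first, user params win."""
    combined = copy.deepcopy(_config.get(default_config_key, {}))
    if model_params:
        combined.update(model_params)
    return combined

print(get_model_params({"model_name": "gpt-4o-mini"}, "default_llm"))
# {'provider': 'openai', 'model_name': 'gpt-4o-mini'}
```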
@@ -89,7 +103,7 @@ def get_reranking_model_from_params(reranking_model_params: dict):
     params_copy["api_key"] = get_api_key(provider, params_copy, strict=False)
     params_copy['model'] = params_copy.pop('model_name', None)

-    return
+    return BaseLLMReranker(**params_copy)


 class KnowledgeBaseTable:

@@ -137,13 +151,8 @@ class KnowledgeBaseTable:
         query.from_table = Identifier(parts=[self._kb.vector_database_table])
         logger.debug(f"Set table name to: {self._kb.vector_database_table}")

-
-
-            if isinstance(target, Star):
-                requested_kb_columns = None
-                break
-            else:
-                requested_kb_columns.append(target.parts[-1].lower())
+        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
+        query_copy = copy.deepcopy(query)

         query.targets = [
             Identifier(TableField.ID.value),
@@ -207,15 +216,23 @@ class KnowledgeBaseTable:

         df = self.add_relevance(df, query_text, relevance_threshold)

-
-
-
+        if (
+            query.group_by is not None
+            or query.order_by is not None
+            or query.having is not None
+            or query.distinct is True
+            or len(query.targets) != 1
+            or not isinstance(query.targets[0], Star)
+        ):
+            query_copy.where = None
+            df = query_df(df, query_copy, session=self.session)
+
         return df

     def add_relevance(self, df, query_text, relevance_threshold=None):
         relevance_column = TableField.RELEVANCE.value

-        reranking_model_params = self._kb.params.get("reranking_model")
+        reranking_model_params = get_model_params(self._kb.params.get("reranking_model"), "default_llm")
         if reranking_model_params and query_text and len(df) > 0:
             # Use reranker for relevance score
             try:
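The condition added above decides when the fetched dataframe gets a second, DuckDB-backed pass via `query_df`. A standalone sketch of that predicate; the parser import paths are an assumption (MindsDB 25.x ships its SQL parser as `mindsdb_sql_parser`):

```python
# Anything beyond a plain `SELECT * ...` (DISTINCT, GROUP BY, ORDER BY,
# HAVING, or explicit column lists) triggers re-execution over the result
# dataframe with query_df / DuckDB.
from mindsdb_sql_parser import parse_sql
from mindsdb_sql_parser.ast import Star

query = parse_sql("SELECT chunk_content, count(*) FROM my_kb GROUP BY chunk_content")

needs_duckdb_pass = (
    query.group_by is not None
    or query.order_by is not None
    or query.having is not None
    or query.distinct is True
    or len(query.targets) != 1
    or not isinstance(query.targets[0], Star)
)
print(needs_duckdb_pass)  # True: GROUP BY and explicit columns both trigger it
```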
@@ -277,7 +294,7 @@ class KnowledgeBaseTable:
         columns = list(df.columns)
         # update id, get from metadata
         df[TableField.ID.value] = df[TableField.METADATA.value].apply(
-            lambda m: None if m is None else m.get('
+            lambda m: None if m is None else m.get('original_doc_id')
         )

         # id on first place

@@ -466,12 +483,9 @@ class KnowledgeBaseTable:
         # Use provided_id directly if it exists, otherwise generate one
         doc_id = self._generate_document_id(content_str, col, provided_id)

-        # Need provided ID to link chunks back to original source (e.g. database row).
-        row_id = provided_id if provided_id else idx
-
         metadata = {
             **base_metadata,
-            '
+            'original_row_index': str(idx),  # provide link to original row index
             'content_column': col,
         }

@@ -657,47 +671,34 @@ class KnowledgeBaseTable:
         if df.empty:
             return pd.DataFrame([], columns=[TableField.EMBEDDINGS.value])

-        # keep only content
-        df = df[[TableField.CONTENT.value]]
-
         model_id = self._kb.embedding_model_id
-        if model_id:
-            # get the input columns
-            model_rec = db.session.query(db.Predictor).filter_by(id=model_id).first()

-
-
+        # get the input columns
+        model_rec = db.session.query(db.Predictor).filter_by(id=model_id).first()

-
+        assert model_rec is not None, f"Model not found: {model_id}"
+        model_project = db.session.query(db.Project).filter_by(id=model_rec.project_id).first()

-
-            input_col = model_using.get('question_column')
-            if input_col is None:
-                input_col = model_using.get('input_column')
+        project_datanode = self.session.datahub.get(model_project.name)

-
-
+        model_using = model_rec.learn_args.get('using', {})
+        input_col = model_using.get('question_column')
+        if input_col is None:
+            input_col = model_using.get('input_column')

-
-
-                df=df,
-                params=self.model_params
-            )
+        if input_col is not None and input_col != TableField.CONTENT.value:
+            df = df.rename(columns={TableField.CONTENT.value: input_col})

-
-
-
-
-
-        elif self._kb.params.get('embedding_model'):
-            embedding_model = get_embedding_model_from_params(self._kb.params.get('embedding_model'))
-
-            df_texts = df.apply(row_to_document, axis=1)
-            embeddings = embedding_model.embed_documents(df_texts.tolist())
-            df_out = df.copy().assign(**{TableField.EMBEDDINGS.value: embeddings})
+        df_out = project_datanode.predict(
+            model_name=model_rec.name,
+            df=df,
+            params=self.model_params
+        )

-
-
+        target = model_rec.to_predict[0]
+        if target != TableField.EMBEDDINGS.value:
+            # adapt output for vectordb
+            df_out = df_out.rename(columns={target: TableField.EMBEDDINGS.value})

         df_out = df_out[[TableField.EMBEDDINGS.value]]

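The rewritten embedding path no longer embeds locally via `embed_documents`; it always routes the dataframe through the project datanode's `predict()`, with two renames bracketing that call. A pandas sketch of just the renames (column and model names here are made up):

```python
import pandas as pd

df = pd.DataFrame({"content": ["first chunk", "second chunk"]})

# Before predict(): rename the KB content column to the model's declared
# input column (question_column / input_column from its learn args).
input_col = "question"  # hypothetical value
df = df.rename(columns={"content": input_col})

# After predict(): rename the model's target column back to the vectordb one.
df_out = df.assign(my_target=[[0.1, 0.2], [0.3, 0.4]])  # stand-in for predict()
df_out = df_out.rename(columns={"my_target": "embeddings"})
df_out = df_out[["embeddings"]]
print(df_out)
```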
@@ -728,14 +729,15 @@ class KnowledgeBaseTable:
         """
         # Get embedding model from knowledge base
         embeddings_model = None
+        embedding_model_params = get_model_params(self._kb.params.get('embedding_model', {}), 'default_embedding_model')
         if self._kb.embedding_model:
             # Extract embedding model args from knowledge base table
             embedding_args = self._kb.embedding_model.learn_args.get('using', {})
             # Construct the embedding model directly
             embeddings_model = construct_model_from_args(embedding_args)
             logger.debug(f"Using knowledge base embedding model with args: {embedding_args}")
-        elif
-            embeddings_model = get_embedding_model_from_params(
+        elif embedding_model_params:
+            embeddings_model = get_embedding_model_from_params(embedding_model_params)
             logger.debug(f"Using knowledge base embedding model from params: {self._kb.params['embedding_model']}")
         else:
             embeddings_model = DEFAULT_EMBEDDINGS_MODEL_CLASS()
@@ -786,7 +788,7 @@ class KnowledgeBaseTable:
     def _generate_document_id(self, content: str, content_column: str, provided_id: str = None) -> str:
         """Generate a deterministic document ID using the utility function."""
         from mindsdb.interfaces.knowledge_base.utils import generate_document_id
-        return generate_document_id(content,
+        return generate_document_id(content=content, provided_id=provided_id)

     def _convert_metadata_value(self, value):
         """
@@ -869,35 +871,33 @@ class KnowledgeBaseController:
             return kb
         raise EntityExistsError("Knowledge base already exists", name)

-
-        reranking_model_params = params.get('reranking_model', None)
+        embedding_params = copy.deepcopy(config.get('default_embedding_model', {}))

+        model_name = None
+        model_project = project
         if embedding_model:
             model_name = embedding_model.parts[-1]
+            if len(embedding_model.parts) > 1:
+                model_project = self.session.database_controller.get_project(embedding_model.parts[-2])

-        elif
-
-
-
-
-
+        elif 'embedding_model' in params:
+            if isinstance(params['embedding_model'], str):
+                # it is model name
+                model_name = params['embedding_model']
+            else:
+                # it is params for model
+                embedding_params.update(params['embedding_model'])

-
-        model_name = self.
+        if model_name is None:
+            model_name = self._create_embedding_model(
                 project.name,
-                params=
+                params=embedding_params,
+                kb_name=name,
             )
-            params['
-
-        model_project = None
-        if embedding_model is not None and len(embedding_model.parts) > 1:
-            # model project is set
-            model_project = self.session.database_controller.get_project(embedding_model.parts[-2])
-        elif not embedding_model_params:
-            model_project = project
+            params['created_embedding_model'] = model_name

         embedding_model_id = None
-        if
+        if model_name is not None:
             model = self.session.model_controller.get_model(
                 name=model_name,
                 project_name=model_project.name
@@ -905,6 +905,7 @@ class KnowledgeBaseController:
         model_record = db.Predictor.query.get(model['id'])
         embedding_model_id = model_record.id

+        reranking_model_params = get_model_params(params.get('reranking_model', {}), 'default_llm')
         if reranking_model_params:
             # Get reranking model from params.
             # This is called here to check validaity of the parameters.
@@ -989,38 +990,54 @@ class KnowledgeBaseController:
         self.session.integration_controller.add(vector_store_name, engine, connection_args)
         return vector_store_name

-    def
+    def _create_embedding_model(self, project_name, engine="openai", params: dict = None, kb_name=''):
         """create a default embedding model for knowledge base, if not specified"""
-        model_name = "
+        model_name = f"kb_embedding_{kb_name}"

-        #
+        # drop if exists - parameters can be different
         try:
             model = self.session.model_controller.get_model(model_name, project_name=project_name)
             if model is not None:
-
+                self.session.model_controller.delete_model(model_name, project_name)
         except PredictorRecordNotFound:
             pass

-
-
-
-
-
-
+        if 'provider' in params:
+            engine = params.pop('provider').lower()
+
+        api_key = get_api_key(engine, params, strict=False) or params.pop('api_key')
+
+        if engine == 'azure_openai':
+            engine = 'openai'
+            params['provider'] = 'azure'
+
+        if engine == 'openai':
+            if 'question_column' not in params:
+                params['question_column'] = 'content'
+            if api_key:
+                params[f"{engine}_api_key"] = api_key
+            if 'base_url' in params:
+                params['api_base'] = params.pop('base_url')
+
+        params['engine'] = engine
+        params['join_learn_process'] = True
+        params['mode'] = 'embedding'

         # Include API key if provided.
-        using_args.update({k: v for k, v in params.items() if 'api_key' in k})
         statement = CreatePredictor(
             name=Identifier(parts=[project_name, model_name]),
-            using=
+            using=params,
             targets=[
                 Identifier(parts=[TableField.EMBEDDINGS.value])
             ]
         )

         command_executor = ExecuteCommands(self.session)
-        command_executor.answer_create_predictor(statement, project_name)
-
+        resp = command_executor.answer_create_predictor(statement, project_name)
+        # check model status
+        record = resp.data.records[0]
+        if record['STATUS'] == 'error':
+            raise ValueError('Embedding model error:' + record['ERROR'])
         return model_name

     def delete(self, name: str, project_name: int, if_exists: bool = False) -> None:
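`_create_embedding_model` now normalizes provider-specific parameters before issuing the CREATE MODEL statement. A standalone re-implementation of just that normalization, for illustration (it mirrors the added lines but simplifies the API-key lookup, which in the real code also consults `get_api_key()`):

```python
def normalize_embedding_params(params: dict, engine: str = "openai") -> dict:
    """Illustrative mirror of the normalization logic in the diff."""
    params = dict(params)
    if 'provider' in params:
        engine = params.pop('provider').lower()
    api_key = params.pop('api_key', None)  # simplified; real code uses get_api_key()

    if engine == 'azure_openai':
        engine = 'openai'
        params['provider'] = 'azure'

    if engine == 'openai':
        params.setdefault('question_column', 'content')
        if api_key:
            params[f"{engine}_api_key"] = api_key
        if 'base_url' in params:
            params['api_base'] = params.pop('base_url')

    params.update(engine=engine, join_learn_process=True, mode='embedding')
    return params

print(normalize_embedding_params(
    {'provider': 'azure_openai', 'api_key': 'sk-xxx', 'base_url': 'https://example.azure.com'}
))
# azure_openai collapses to engine='openai' with provider='azure',
# base_url becomes api_base, and the key lands in openai_api_key.
```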
@@ -1054,9 +1071,9 @@ class KnowledgeBaseController:
             self.session.integration_controller.delete(kb.params['default_vector_storage'])
         except EntityNotExistsError:
             pass
-        if '
+        if 'created_embedding_model' in kb.params:
             try:
-                self.session.model_controller.delete_model(kb.params['
+                self.session.model_controller.delete_model(kb.params['created_embedding_model'], project_name)
             except EntityNotExistsError:
                 pass

mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py

@@ -25,6 +25,8 @@ from langchain_core.documents import Document as LangchainDocument

 logger = log.getLogger(__name__)

+_DEFAULT_CONTENT_COLUMN_NAME = "content"
+

 class DocumentPreprocessor:
     """Base class for document preprocessing"""

@@ -90,14 +92,18 @@ class DocumentPreprocessor:
         start_char: Optional[int] = None,
         end_char: Optional[int] = None,
         provided_id: str = None,
+        content_column: str = None,
     ) -> str:
         """Generate human-readable deterministic ID for a chunk
-        Format: <doc_id>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
+        Format: <doc_id>:<content_column>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
         """
         if provided_id is None:
             raise ValueError("Document ID must be provided for chunk ID generation")

-
+        if content_column is None:
+            raise ValueError("Content column must be provided for chunk ID generation")
+
+        chunk_id = f"{provided_id}:{content_column}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
         logger.debug(f"Generated chunk ID: {chunk_id}")
         return chunk_id

@@ -254,8 +260,15 @@ Please give a short succinct context to situate this chunk within the overall do
             if doc.metadata:
                 metadata.update(doc.metadata)

+            # Get content_column from metadata or use default
+            content_column = metadata.get('content_column')
+            if content_column is None:
+                # If content_column is not in metadata, use the default column name
+                content_column = _DEFAULT_CONTENT_COLUMN_NAME
+                logger.debug(f"No content_column found in metadata, using default: {_DEFAULT_CONTENT_COLUMN_NAME}")
+
             chunk_id = self._generate_chunk_id(
-                chunk_index=chunk_index, provided_id=doc.id
+                chunk_index=chunk_index, provided_id=doc.id, content_column=content_column
             )
             processed_chunks.append(
                 ProcessedChunk(

@@ -324,13 +337,23 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
             metadata["start_char"] = start_char
             metadata["end_char"] = end_char

-            #
+            # Get content_column from metadata or use default
+            content_column = None
+            if doc.metadata:
+                content_column = doc.metadata.get('content_column')
+
+            if content_column is None:
+                # If content_column is not in metadata, use the default column name
+                content_column = _DEFAULT_CONTENT_COLUMN_NAME
+                logger.debug(f"No content_column found in metadata, using default: {_DEFAULT_CONTENT_COLUMN_NAME}")
+
             chunk_id = self._generate_chunk_id(
                 chunk_index=i,
                 total_chunks=total_chunks,
                 start_char=start_char,
                 end_char=end_char,
-                provided_id=doc.id
+                provided_id=doc.id,
+                content_column=content_column
             )

             processed_chunks.append(
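The chunk ID format gains a content-column segment, so chunks from different columns of the same row no longer collide. A quick worked example of the new format string (all values invented):

```python
# New format: <doc_id>:<content_column>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
provided_id, content_column = "a1b2c3d4e5f67890", "content"  # example values
chunk_index, total_chunks, start_char, end_char = 0, 3, 0, 512

chunk_id = f"{provided_id}:{content_column}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
print(chunk_id)  # a1b2c3d4e5f67890:content:1of3:0to512
```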
mindsdb/interfaces/knowledge_base/utils.py

@@ -2,27 +2,22 @@
 import hashlib


-def generate_document_id(content: str, content_column: str, provided_id: str = None) -> str:
+def generate_document_id(content: str, content_column: str = None, provided_id: str = None) -> str:
     """
-    Generate a deterministic document ID from content
-    If provided_id exists,
-    For generated IDs, uses a short hash of just the content
-    same content gets same base ID across different columns.
+    Generate a deterministic document ID from content.
+    If provided_id exists, returns it directly.
+    For generated IDs, uses a short hash of just the content.

     Args:
         content: The content string
-        content_column: Name of the content column
+        content_column: Name of the content column (not used in ID generation, kept for backward compatibility)
         provided_id: Optional user-provided ID
     Returns:
-        Deterministic document ID
-        where base_id is either the provided_id or a 16-char hash of content
+        Deterministic document ID (either provided_id or a 16-char hash of content)
     """
     if provided_id is not None:
-
-    else:
-        # Generate a shorter 16-character hash based only on content
-        hash_obj = hashlib.md5(content.encode())
-        base_id = hash_obj.hexdigest()[:16]
+        return provided_id

-    #
-
+    # Generate a shorter 16-character hash based only on content
+    hash_obj = hashlib.md5(content.encode())
+    return hash_obj.hexdigest()[:16]
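The simplified helper is small enough to restate whole. Since IDs no longer incorporate the content column, identical content now maps to the same ID regardless of which column it came from:

```python
import hashlib

def generate_document_id(content: str, content_column: str = None, provided_id: str = None) -> str:
    """Mirror of the simplified utility: provided_id wins, else a 16-char content hash."""
    if provided_id is not None:
        return provided_id
    return hashlib.md5(content.encode()).hexdigest()[:16]

print(generate_document_id("hello world"))                  # '5eb63bbbe01eeed0'
print(generate_document_id("hello world", "other_column"))  # same ID: the column no longer matters
print(generate_document_id("x", provided_id="row-42"))      # 'row-42'
```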
mindsdb/interfaces/database/projects.py

@@ -1,4 +1,3 @@
-import sys
 import copy
 import datetime as dt
 from copy import deepcopy

@@ -28,7 +27,6 @@ from mindsdb.utilities import log

 logger = log.getLogger(__name__)

-IS_PY36 = sys.version_info[1] <= 6

 default_project = config.get('default_project')
