MindsDB 25.4.5.0__py3-none-any.whl → 25.5.3.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of MindsDB has been flagged.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +107 -125
- mindsdb/api/executor/command_executor.py +2 -1
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +8 -0
- mindsdb/api/executor/datahub/datanodes/system_tables.py +10 -13
- mindsdb/api/executor/planner/query_planner.py +4 -1
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +2 -1
- mindsdb/api/http/initialize.py +20 -3
- mindsdb/api/http/namespaces/analysis.py +14 -1
- mindsdb/api/http/namespaces/tree.py +1 -1
- mindsdb/api/http/start.py +7 -2
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +4 -8
- mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -4
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_formats.py +2 -2
- mindsdb/integrations/handlers/bigquery_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/gmail_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/google_analytics_handler/requirements.txt +2 -1
- mindsdb/integrations/handlers/google_books_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/google_calendar_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/google_content_shopping_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/google_fit_handler/requirements.txt +2 -0
- mindsdb/integrations/handlers/google_search_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/jira_handler/jira_handler.archived.py +75 -0
- mindsdb/integrations/handlers/jira_handler/jira_handler.py +113 -38
- mindsdb/integrations/handlers/jira_handler/jira_tables.py +229 -0
- mindsdb/integrations/handlers/jira_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/lightfm_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/lightwood_handler/lightwood_handler.py +0 -2
- mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
- mindsdb/integrations/handlers/lindorm_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/ms_one_drive_handler/requirements.txt +2 -0
- mindsdb/integrations/handlers/ms_teams_handler/requirements.txt +3 -1
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +5 -4
- mindsdb/integrations/handlers/snowflake_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/vertex_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/youtube_handler/requirements.txt +1 -0
- mindsdb/integrations/utilities/files/file_reader.py +5 -2
- mindsdb/interfaces/agents/constants.py +14 -2
- mindsdb/interfaces/agents/langchain_agent.py +2 -4
- mindsdb/interfaces/database/projects.py +1 -7
- mindsdb/interfaces/functions/controller.py +11 -14
- mindsdb/interfaces/functions/to_markdown.py +9 -124
- mindsdb/interfaces/knowledge_base/controller.py +22 -19
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +28 -5
- mindsdb/interfaces/knowledge_base/utils.py +10 -15
- mindsdb/interfaces/model/model_controller.py +0 -2
- mindsdb/interfaces/skills/sql_agent.py +33 -11
- mindsdb/migrations/migrate.py +0 -2
- mindsdb/utilities/config.py +3 -2
- mindsdb/utilities/context.py +1 -1
- mindsdb/utilities/functions.py +0 -36
- mindsdb/utilities/langfuse.py +19 -10
- mindsdb/utilities/otel/__init__.py +9 -193
- mindsdb/utilities/otel/metric_handlers/__init__.py +5 -1
- mindsdb/utilities/otel/prepare.py +198 -0
- mindsdb/utilities/sql.py +83 -0
- {mindsdb-25.4.5.0.dist-info → mindsdb-25.5.3.0.dist-info}/METADATA +663 -596
- {mindsdb-25.4.5.0.dist-info → mindsdb-25.5.3.0.dist-info}/RECORD +62 -57
- {mindsdb-25.4.5.0.dist-info → mindsdb-25.5.3.0.dist-info}/WHEEL +1 -1
- mindsdb/api/mysql/mysql_proxy/classes/sql_statement_parser.py +0 -151
- {mindsdb-25.4.5.0.dist-info → mindsdb-25.5.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.4.5.0.dist-info → mindsdb-25.5.3.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/database/projects.py
CHANGED

@@ -4,7 +4,6 @@ from typing import List, Optional
 from collections import OrderedDict
 
 import sqlalchemy as sa
-from sqlalchemy.orm.attributes import flag_modified
 import numpy as np
 
 from mindsdb_sql_parser.ast.base import ASTNode

@@ -457,7 +456,7 @@ class ProjectController:
         project.create(name=name)
         return project
 
-    def update(self, id: Optional[int] = None, name: Optional[str] = None, new_name: str = None
+    def update(self, id: Optional[int] = None, name: Optional[str] = None, new_name: str = None) -> Project:
         if id is not None and name is not None:
             raise ValueError("Both 'id' and 'name' can't be provided at the same time")

@@ -470,10 +469,5 @@ class ProjectController:
         project.name = new_name
         project.record.name = new_name
 
-        if new_metadata is not None:
-            project.metadata = new_metadata
-            project.record.metadata_ = new_metadata
-            flag_modified(project.record, 'metadata_')
-
         db.session.commit()
         return project
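The update() method above now only renames a project; the metadata branch (and its flag_modified bookkeeping) is gone. A minimal usage sketch, assuming ProjectController can be constructed without arguments:

    from mindsdb.interfaces.database.projects import ProjectController

    controller = ProjectController()
    # Renaming is the only remaining mutation; metadata updates are no longer supported here.
    project = controller.update(name='my_project', new_name='renamed_project')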
mindsdb/interfaces/functions/controller.py
CHANGED

@@ -1,7 +1,7 @@
 import os
+import copy
 
 from duckdb.typing import BIGINT, DOUBLE, VARCHAR, BLOB, BOOLEAN
-from mindsdb.interfaces.functions.to_markdown import ToMarkdown
 from mindsdb.interfaces.storage.model_fs import HandlerStorage
 from mindsdb.utilities.config import config
 

@@ -159,31 +159,28 @@ class FunctionController(BYOMFunctionsController):
         return meta
 
     def to_markdown_call_function(self, node):
+        # load on-demand because lib is heavy
+        from mindsdb.interfaces.functions.to_markdown import ToMarkdown
         name = node.op.lower()
 
         if name in self.callbacks:
             return self.callbacks[name]
 
-        def callback(file_path_or_url
+        def callback(file_path_or_url):
             chat_model_params = self._parse_chat_model_params('TO_MARKDOWN_FUNCTION_')
 
-
-
-
-
-                llm = create_chat_model(chat_model_params)
-                llm_client = llm.root_client
-                llm_model = llm.model_name
-            except Exception:
-                pass
+            params_copy = copy.deepcopy(chat_model_params)
+            params_copy['model'] = params_copy.pop('model_name')
+            params_copy.pop('api_keys')
+            params_copy.pop('provider')
 
-            to_markdown = ToMarkdown(
-            return to_markdown.call(file_path_or_url)
+            to_markdown = ToMarkdown()
+            return to_markdown.call(file_path_or_url, **params_copy)
 
         meta = {
             'name': name,
             'callback': callback,
-            'input_types': ['str'
+            'input_types': ['str'],
             'output_type': 'str'
         }
         self.callbacks[name] = meta
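Two patterns in this hunk are worth isolating: the heavy to_markdown dependency is now imported inside the function body, and the chat-model params are deep-copied and remapped before being forwarded. A standalone sketch of the remapping, with a hypothetical input dict (the real contents of chat_model_params come from _parse_chat_model_params):

    import copy

    def remap_chat_model_params(chat_model_params: dict) -> dict:
        # Deep-copy so the caller's dict is left untouched.
        params = copy.deepcopy(chat_model_params)
        # The downstream call expects 'model' rather than 'model_name'.
        params['model'] = params.pop('model_name')
        # Drop keys the downstream call does not understand.
        params.pop('api_keys')
        params.pop('provider')
        return params

    print(remap_chat_model_params({'model_name': 'gpt-4o', 'api_keys': {}, 'provider': 'openai'}))
    # {'model': 'gpt-4o'}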
mindsdb/interfaces/functions/to_markdown.py
CHANGED

@@ -1,13 +1,10 @@
-import base64
 from io import BytesIO
 import os
 from typing import Union
 from urllib.parse import urlparse
 
-import
-from markitdown import MarkItDown
+from aipdf import ocr
 import mimetypes
-from openai import OpenAI
 import requests
 
 

@@ -15,41 +12,22 @@ class ToMarkdown:
     """
     Extracts the content of documents of various formats in markdown format.
     """
-    def __init__(self
+    def __init__(self):
         """
         Initializes the ToMarkdown class.
         """
-        # If use_llm is True, llm_client and llm_model must be provided.
-        if use_llm and (llm_client is None or llm_model is None):
-            raise ValueError('LLM client and model must be provided when use_llm is True.')
 
-
-        if not use_llm:
-            llm_client = None
-            llm_model = None
-
-        # Only OpenAI is supported for now.
-        # TODO: Add support for other LLMs.
-        if llm_client is not None and not isinstance(llm_client, OpenAI):
-            raise ValueError('Only OpenAI models are supported at the moment.')
-
-        self.use_llm = use_llm
-        self.llm_client = llm_client
-        self.llm_model = llm_model
-
-    def call(self, file_path_or_url: str) -> str:
+    def call(self, file_path_or_url: str, **kwargs) -> str:
         """
         Converts a file to markdown.
         """
         file_extension = self._get_file_extension(file_path_or_url)
-
+        file_content = self._get_file_content(file_path_or_url)
 
         if file_extension == '.pdf':
-            return self._pdf_to_markdown(
-        elif file_extension in ['.jpg', '.jpeg', '.png', '.gif']:
-            return self._image_to_markdown(file)
+            return self._pdf_to_markdown(file_content, **kwargs)
         else:
-
+            raise ValueError(f"Unsupported file type: {file_extension}.")
 
     def _get_file_content(self, file_path_or_url: str) -> str:
         """

@@ -90,105 +68,12 @@ class ToMarkdown:
         else:
             return os.path.splitext(file_path_or_url)[1]
 
-    def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes]) -> str:
+    def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes], **kwargs) -> str:
         """
         Converts a PDF file to markdown.
         """
-        if self.llm_client is None:
-            return self._pdf_to_markdown_no_llm(file_content)
-        else:
-            return self._pdf_to_markdown_llm(file_content)
-
-    def _pdf_to_markdown_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts a PDF file to markdown using LLM.
-        The LLM is used mainly for the purpose of generating descriptions of any images in the PDF.
-        """
         if isinstance(file_content, requests.Response):
             file_content = BytesIO(file_content.content)
 
-
-
-        markdown_content = []
-        for page_num in range(len(document)):
-            page = document.load_page(page_num)
-
-            # Get text blocks with coordinates.
-            page_content = []
-            blocks = page.get_text("blocks")
-            for block in blocks:
-                x0, y0, x1, y1, text, _, _ = block
-                if text.strip():  # Skip empty or whitespace blocks.
-                    page_content.append((y0, text.strip()))
-
-            # Extract images from the page.
-            image_list = page.get_images(full=True)
-            for img_index, img in enumerate(image_list):
-                xref = img[0]
-                base_image = document.extract_image(xref)
-                image_bytes = base_image["image"]
-
-                # Use actual image y-coordinate if available.
-                y0 = float(base_image.get("y", 0))
-                image_description = self._generate_image_description(image_bytes)
-                page_content.append((y0, f""))
-
-            # Sort the content by y0 coordinate
-            page_content.sort(key=lambda x: x[0])
-
-            # Add sorted content to the markdown
-            for _, text in page_content:
-                markdown_content.append(text)
-            markdown_content.append("\n")
-
-        document.close()
-
-        return "\n".join(markdown_content)
-
-    def _generate_image_description(self, image_bytes: bytes) -> str:
-        """
-        Generates a description of the image using LLM.
-        """
-        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-
-        response = self.llm_client.chat.completions.create(
-            model=self.llm_model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "Describe this image"},
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
-                    ],
-                }
-            ],
-        )
-        description = response.choices[0].message.content
-        return description
-
-    def _pdf_to_markdown_no_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts a PDF file to markdown without using LLM.
-        """
-        md = MarkItDown(enable_plugins=True)
-        result = md.convert(file_content)
-        return result.markdown
-
-    def _image_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts images to markdown.
-        """
-        if not self.use_llm or self.llm_client is None:
-            raise ValueError('LLM client must be enabled to convert images to markdown.')
-
-        md = MarkItDown(llm_client=self.llm_client, llm_model=self.llm_model, enable_plugins=True)
-        result = md.convert(file_content)
-        return result.markdown
-
-    def _other_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts other file formats to markdown.
-        """
-        md = MarkItDown(enable_plugins=True)
-        result = md.convert(file_content)
-        return result.markdown
+        markdown_pages = ocr(file_content, **kwargs)
+        return "\n\n---\n\n".join(markdown_pages)
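The PDF path is now a thin wrapper over aipdf. A minimal sketch of the new flow, assuming aipdf.ocr accepts a binary file object plus keyword arguments and returns one markdown string per page, as the usage above implies:

    from io import BytesIO

    def pdf_bytes_to_markdown(pdf_bytes: bytes, **ocr_kwargs) -> str:
        from aipdf import ocr  # heavy dependency; imported lazily, as in the diff
        # OCR each page, then join pages with a horizontal rule,
        # mirroring ToMarkdown._pdf_to_markdown.
        markdown_pages = ocr(BytesIO(pdf_bytes), **ocr_kwargs)
        return "\n\n---\n\n".join(markdown_pages)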
mindsdb/interfaces/knowledge_base/controller.py
CHANGED

@@ -41,13 +41,14 @@ from mindsdb.utilities.config import config
 from mindsdb.utilities.context import context as ctx
 
 from mindsdb.api.executor.command_executor import ExecuteCommands
+from mindsdb.api.executor.utilities.sql import query_df
 from mindsdb.utilities import log
 from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMReranker
 
 logger = log.getLogger(__name__)
 
 KB_TO_VECTORDB_COLUMNS = {
-    'id': '
+    'id': 'original_doc_id',
     'chunk_id': 'id',
     'chunk_content': 'content'
 }

@@ -150,13 +151,8 @@ class KnowledgeBaseTable:
         query.from_table = Identifier(parts=[self._kb.vector_database_table])
         logger.debug(f"Set table name to: {self._kb.vector_database_table}")
 
-
-
-            if isinstance(target, Star):
-                requested_kb_columns = None
-                break
-            else:
-                requested_kb_columns.append(target.parts[-1].lower())
+        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
+        query_copy = copy.deepcopy(query)
 
         query.targets = [
             Identifier(TableField.ID.value),

@@ -220,9 +216,17 @@ class KnowledgeBaseTable:
 
         df = self.add_relevance(df, query_text, relevance_threshold)
 
-
-
-
+        if (
+            query.group_by is not None
+            or query.order_by is not None
+            or query.having is not None
+            or query.distinct is True
+            or len(query.targets) != 1
+            or not isinstance(query.targets[0], Star)
+        ):
+            query_copy.where = None
+            df = query_df(df, query_copy, session=self.session)
+
         return df
 
     def add_relevance(self, df, query_text, relevance_threshold=None):

@@ -290,7 +294,7 @@ class KnowledgeBaseTable:
         columns = list(df.columns)
         # update id, get from metadata
         df[TableField.ID.value] = df[TableField.METADATA.value].apply(
-            lambda m: None if m is None else m.get('
+            lambda m: None if m is None else m.get('original_doc_id')
         )
 
         # id on first place

@@ -479,12 +483,9 @@ class KnowledgeBaseTable:
             # Use provided_id directly if it exists, otherwise generate one
             doc_id = self._generate_document_id(content_str, col, provided_id)
 
-            # Need provided ID to link chunks back to original source (e.g. database row).
-            row_id = provided_id if provided_id else idx
-
             metadata = {
                 **base_metadata,
-                '
+                'original_row_index': str(idx),  # provide link to original row index
                 'content_column': col,
             }
 

@@ -787,7 +788,7 @@ class KnowledgeBaseTable:
     def _generate_document_id(self, content: str, content_column: str, provided_id: str = None) -> str:
         """Generate a deterministic document ID using the utility function."""
         from mindsdb.interfaces.knowledge_base.utils import generate_document_id
-        return generate_document_id(content,
+        return generate_document_id(content=content, provided_id=provided_id)
 
     def _convert_metadata_value(self, value):
         """

@@ -1004,6 +1005,8 @@ class KnowledgeBaseController:
         if 'provider' in params:
             engine = params.pop('provider').lower()
 
+        api_key = get_api_key(engine, params, strict=False) or params.pop('api_key')
+
         if engine == 'azure_openai':
             engine = 'openai'
             params['provider'] = 'azure'

@@ -1011,8 +1014,8 @@
         if engine == 'openai':
             if 'question_column' not in params:
                 params['question_column'] = 'content'
-            if
-            params[f"{engine}_api_key"] =
+            if api_key:
+                params[f"{engine}_api_key"] = api_key
             if 'base_url' in params:
                 params['api_base'] = params.pop('base_url')
 
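The post-processing added above reduces to one decision: the vector store evaluates only the WHERE clause, so any other SELECT feature forces a second pass over the fetched DataFrame via query_df (DuckDB). A sketch of that predicate using mindsdb_sql_parser nodes, matching the condition in the diff:

    from mindsdb_sql_parser import parse_sql
    from mindsdb_sql_parser.ast import Star

    def needs_dataframe_pass(query) -> bool:
        # Anything beyond `SELECT * ... WHERE ...` is re-executed over the
        # result frame (the diff strips WHERE from the copy first).
        return (
            query.group_by is not None
            or query.order_by is not None
            or query.having is not None
            or query.distinct is True
            or len(query.targets) != 1
            or not isinstance(query.targets[0], Star)
        )

    print(needs_dataframe_pass(parse_sql('SELECT * FROM kb WHERE a = 1')))  # False
    print(needs_dataframe_pass(parse_sql('SELECT DISTINCT doc FROM kb')))   # True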
mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py
CHANGED

@@ -25,6 +25,8 @@ from langchain_core.documents import Document as LangchainDocument
 
 logger = log.getLogger(__name__)
 
+_DEFAULT_CONTENT_COLUMN_NAME = "content"
+
 
 class DocumentPreprocessor:
     """Base class for document preprocessing"""

@@ -90,14 +92,18 @@ class DocumentPreprocessor:
         start_char: Optional[int] = None,
         end_char: Optional[int] = None,
         provided_id: str = None,
+        content_column: str = None,
     ) -> str:
         """Generate human-readable deterministic ID for a chunk
-        Format: <doc_id>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
+        Format: <doc_id>:<content_column>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
         """
         if provided_id is None:
             raise ValueError("Document ID must be provided for chunk ID generation")
 
-
+        if content_column is None:
+            raise ValueError("Content column must be provided for chunk ID generation")
+
+        chunk_id = f"{provided_id}:{content_column}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
         logger.debug(f"Generated chunk ID: {chunk_id}")
         return chunk_id
 

@@ -254,8 +260,15 @@ Please give a short succinct context to situate this chunk within the overall do
         if doc.metadata:
             metadata.update(doc.metadata)
 
+        # Get content_column from metadata or use default
+        content_column = metadata.get('content_column')
+        if content_column is None:
+            # If content_column is not in metadata, use the default column name
+            content_column = _DEFAULT_CONTENT_COLUMN_NAME
+            logger.debug(f"No content_column found in metadata, using default: {_DEFAULT_CONTENT_COLUMN_NAME}")
+
         chunk_id = self._generate_chunk_id(
-            chunk_index=chunk_index, provided_id=doc.id
+            chunk_index=chunk_index, provided_id=doc.id, content_column=content_column
        )
         processed_chunks.append(
             ProcessedChunk(

@@ -324,13 +337,23 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
             metadata["start_char"] = start_char
             metadata["end_char"] = end_char
 
-            #
+            # Get content_column from metadata or use default
+            content_column = None
+            if doc.metadata:
+                content_column = doc.metadata.get('content_column')
+
+            if content_column is None:
+                # If content_column is not in metadata, use the default column name
+                content_column = _DEFAULT_CONTENT_COLUMN_NAME
+                logger.debug(f"No content_column found in metadata, using default: {_DEFAULT_CONTENT_COLUMN_NAME}")
+
             chunk_id = self._generate_chunk_id(
                 chunk_index=i,
                 total_chunks=total_chunks,
                 start_char=start_char,
                 end_char=end_char,
-                provided_id=doc.id
+                provided_id=doc.id,
+                content_column=content_column
             )
 
             processed_chunks.append(
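For reference, a worked example of the new chunk ID format from _generate_chunk_id, with hypothetical values:

    doc_id = 'a1b2c3d4e5f60718'  # hypothetical 16-char document ID
    content_column = 'content'
    chunk_index, total_chunks = 0, 3
    start_char, end_char = 0, 500

    chunk_id = f"{doc_id}:{content_column}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
    print(chunk_id)  # a1b2c3d4e5f60718:content:1of3:0to500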
mindsdb/interfaces/knowledge_base/utils.py
CHANGED

@@ -2,27 +2,22 @@
 import hashlib
 
 
-def generate_document_id(content: str, content_column: str, provided_id: str = None) -> str:
+def generate_document_id(content: str, content_column: str = None, provided_id: str = None) -> str:
     """
-    Generate a deterministic document ID from content
-    If provided_id exists,
-    For generated IDs, uses a short hash of just the content
-    same content gets same base ID across different columns.
+    Generate a deterministic document ID from content.
+    If provided_id exists, returns it directly.
+    For generated IDs, uses a short hash of just the content.
 
     Args:
         content: The content string
-        content_column: Name of the content column
+        content_column: Name of the content column (not used in ID generation, kept for backward compatibility)
         provided_id: Optional user-provided ID
     Returns:
-        Deterministic document ID
-        where base_id is either the provided_id or a 16-char hash of content
+        Deterministic document ID (either provided_id or a 16-char hash of content)
     """
     if provided_id is not None:
-
-    else:
-        # Generate a shorter 16-character hash based only on content
-        hash_obj = hashlib.md5(content.encode())
-        base_id = hash_obj.hexdigest()[:16]
+        return provided_id
 
-    #
-
+    # Generate a shorter 16-character hash based only on content
+    hash_obj = hashlib.md5(content.encode())
+    return hash_obj.hexdigest()[:16]
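The simplified function is small enough to exercise directly; content_column no longer affects the result, so identical content hashes to the same ID across columns:

    import hashlib

    def generate_document_id(content, content_column=None, provided_id=None):
        # Mirrors the implementation above.
        if provided_id is not None:
            return provided_id
        return hashlib.md5(content.encode()).hexdigest()[:16]

    print(generate_document_id('hello world'))                        # 5eb63bbbe01eeed0
    print(generate_document_id('hello world', content_column='col'))  # 5eb63bbbe01eeed0 (same)
    print(generate_document_id('hello world', provided_id='row-42'))  # row-42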
mindsdb/interfaces/model/model_controller.py
CHANGED

@@ -1,4 +1,3 @@
-import sys
 import copy
 import datetime as dt
 from copy import deepcopy

@@ -28,7 +27,6 @@ from mindsdb.utilities import log
 
 logger = log.getLogger(__name__)
 
-IS_PY36 = sys.version_info[1] <= 6
 
 default_project = config.get('default_project')
 
mindsdb/interfaces/skills/sql_agent.py
CHANGED

@@ -1,4 +1,3 @@
-
 import re
 import csv
 import inspect

@@ -13,6 +12,7 @@ from mindsdb.utilities import log
 from mindsdb.utilities.context import context as ctx
 from mindsdb.integrations.utilities.query_traversal import query_traversal
 from mindsdb.integrations.libs.response import INF_SCHEMA_COLUMNS_NAMES
+from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import MYSQL_DATA_TYPE
 
 logger = log.getLogger(__name__)
 

@@ -253,7 +253,7 @@ class SQLAgent:
         for table in all_tables:
             key = f"{ctx.company_id}_{table}_info"
             table_info = self._cache.get(key) if self._cache else None
-            if table_info is None:
+            if True or table_info is None:
                 table_info = self._get_single_table_info(table)
                 if self._cache:
                     self._cache.set(key, table_info)

@@ -276,19 +276,41 @@ class SQLAgent:
         dn = self._command_executor.session.datahub.get(integration)
 
         fields, dtypes = [], []
-
-
-
-
-
-
-
+        try:
+            df = dn.get_table_columns_df(table_name, schema_name)
+            if not isinstance(df, pd.DataFrame) or df.empty:
+                logger.warning(f"Received empty or invalid DataFrame for table columns of {table_str}")
+                return f"Table named `{table_str}`:\n [No column information available]"
+
+            fields = df[INF_SCHEMA_COLUMNS_NAMES.COLUMN_NAME].to_list()
+            dtypes = [
+                mysql_data_type.value if isinstance(mysql_data_type, MYSQL_DATA_TYPE) else (data_type or 'UNKNOWN')
+                for mysql_data_type, data_type
+                in zip(
+                    df[INF_SCHEMA_COLUMNS_NAMES.MYSQL_DATA_TYPE],
+                    df[INF_SCHEMA_COLUMNS_NAMES.DATA_TYPE]
+                )
+            ]
+        except Exception as e:
+            logger.error(f"Failed processing column info for {table_str}: {e}", exc_info=True)
+            raise ValueError(f"Failed to process column info for {table_str}") from e
+
+        if not fields:
+            logger.error(f"Could not extract column fields for {table_str}.")
+            return f"Table named `{table_str}`:\n [Could not extract column information]"
+
+        try:
+            sample_rows_info = self._get_sample_rows(table_str, fields)
+        except Exception as e:
+            logger.warning(f"Could not get sample rows for {table_str}: {e}")
+            sample_rows_info = "\n\t [error] Couldn't retrieve sample rows!"
 
         info = f'Table named `{table_str}`:\n'
         info += f"\nSample with first {self._sample_rows_in_table_info} rows from table {table_str} in CSV format (dialect is 'excel'):\n"
-        info +=
+        info += sample_rows_info + "\n"
         info += '\nColumn data types: ' + ",\t".join(
-            [f'\n`{field}` : `{dtype}`' for field, dtype in zip(fields, dtypes)]
+            [f'\n`{field}` : `{dtype}`' for field, dtype in zip(fields, dtypes)]
+        ) + '\n'
         return info
 
     def _get_sample_rows(self, table: str, fields: List[str]) -> str:
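The dtype handling added to _get_single_table_info reduces to a small fallback chain: prefer the normalized MYSQL_DATA_TYPE enum value, else the raw DATA_TYPE string, else 'UNKNOWN'. A standalone sketch (the enum below is a stand-in defined only for illustration):

    from enum import Enum

    class MYSQL_DATA_TYPE(Enum):  # stand-in for mindsdb's enum
        VARCHAR = 'varchar'

    def resolve_dtype(mysql_data_type, data_type) -> str:
        if isinstance(mysql_data_type, MYSQL_DATA_TYPE):
            return mysql_data_type.value
        return data_type or 'UNKNOWN'

    print(resolve_dtype(MYSQL_DATA_TYPE.VARCHAR, None))  # varchar
    print(resolve_dtype(None, 'text'))                   # text
    print(resolve_dtype(None, None))                     # UNKNOWN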
mindsdb/migrations/migrate.py
CHANGED
mindsdb/utilities/config.py
CHANGED

@@ -143,7 +143,8 @@ class Config:
             'auth': {
                 'http_auth_enabled': False,
                 "http_permanent_session_lifetime": datetime.timedelta(days=31),
-                "username": "mindsdb"
+                "username": "mindsdb",
+                "password": ""
             },
             "logging": {
                 "handlers": {

@@ -459,7 +460,7 @@ class Config:
         """
         updated = self.fetch_auto_config()
         if updated:
-            self.
+            self.merge_configs()
 
     def merge_configs(self) -> None:
         """Merge multiple configs to one.
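With the added key, the auth defaults now ship an explicit empty password alongside the username, so a lookup like config['auth']['password'] succeeds without a fallback. A sketch of the resulting default block (not the full Config):

    import datetime

    auth_defaults = {
        'http_auth_enabled': False,
        'http_permanent_session_lifetime': datetime.timedelta(days=31),
        'username': 'mindsdb',
        'password': '',
    }
    assert auth_defaults['password'] == ''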
mindsdb/utilities/context.py
CHANGED

@@ -54,7 +54,7 @@ class Context:
     def load(self, storage: dict) -> None:
         self._storage.set(storage)
 
-    def
+    def get_metadata(self, **kwargs) -> dict:
         return {
             'user_id': self.user_id or "",
             'company_id': self.company_id or "",
mindsdb/utilities/functions.py
CHANGED

@@ -7,7 +7,6 @@ import textwrap
 from functools import wraps
 from collections.abc import Callable
 
-import requests
 from cryptography.fernet import Fernet
 from mindsdb_sql_parser.ast import Identifier
 

@@ -72,41 +71,6 @@ def mark_process(name: str, custom_mark: str = None) -> Callable:
     return mark_process_wrapper
 
 
-def get_versions_where_predictors_become_obsolete():
-    """ Get list of MindsDB versions in which predictors should be retrained
-    Returns:
-        list of str or False
-    """
-    versions_for_updating_predictors = []
-    try:
-        try:
-            res = requests.get(
-                'https://mindsdb-cloud-public-service-files.s3.us-east-2.amazonaws.com/version_for_updating_predictors.txt',
-                timeout=0.5
-            )
-        except (ConnectionError, requests.exceptions.ConnectionError) as e:
-            logger.error(f'Is no connection. {e}')
-            raise
-        except Exception as e:
-            logger.error(f'Is something wrong with getting version_for_updating_predictors.txt: {e}')
-            raise
-
-        if res.status_code != 200:
-            logger.error(f'Cant get version_for_updating_predictors.txt: returned status code = {res.status_code}')
-            raise
-
-        try:
-            versions_for_updating_predictors = res.text.replace(' \t\r', '').split('\n')
-        except Exception as e:
-            logger.error(f'Cant decode version_for_updating_predictors.txt: {e}')
-            raise
-    except Exception:
-        return False, versions_for_updating_predictors
-
-    versions_for_updating_predictors = [x for x in versions_for_updating_predictors if len(x) > 0]
-    return True, versions_for_updating_predictors
-
-
 def init_lexer_parsers():
     from mindsdb_sql_parser.lexer import MindsDBLexer
     from mindsdb_sql_parser.parser import MindsDBParser