MindsDB 25.4.5.0__py3-none-any.whl → 25.5.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic.

Files changed (63)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +107 -125
  3. mindsdb/api/executor/command_executor.py +2 -1
  4. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +8 -0
  5. mindsdb/api/executor/datahub/datanodes/system_tables.py +10 -13
  6. mindsdb/api/executor/planner/query_planner.py +4 -1
  7. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +2 -1
  8. mindsdb/api/http/initialize.py +20 -3
  9. mindsdb/api/http/namespaces/analysis.py +14 -1
  10. mindsdb/api/http/namespaces/tree.py +1 -1
  11. mindsdb/api/http/start.py +7 -2
  12. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +4 -8
  13. mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -4
  14. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_formats.py +2 -2
  15. mindsdb/integrations/handlers/bigquery_handler/requirements.txt +1 -0
  16. mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -0
  17. mindsdb/integrations/handlers/gmail_handler/requirements.txt +1 -0
  18. mindsdb/integrations/handlers/google_analytics_handler/requirements.txt +2 -1
  19. mindsdb/integrations/handlers/google_books_handler/requirements.txt +1 -1
  20. mindsdb/integrations/handlers/google_calendar_handler/requirements.txt +1 -0
  21. mindsdb/integrations/handlers/google_content_shopping_handler/requirements.txt +1 -1
  22. mindsdb/integrations/handlers/google_fit_handler/requirements.txt +2 -0
  23. mindsdb/integrations/handlers/google_search_handler/requirements.txt +1 -1
  24. mindsdb/integrations/handlers/jira_handler/jira_handler.archived.py +75 -0
  25. mindsdb/integrations/handlers/jira_handler/jira_handler.py +113 -38
  26. mindsdb/integrations/handlers/jira_handler/jira_tables.py +229 -0
  27. mindsdb/integrations/handlers/jira_handler/requirements.txt +1 -0
  28. mindsdb/integrations/handlers/lightfm_handler/requirements.txt +1 -0
  29. mindsdb/integrations/handlers/lightwood_handler/lightwood_handler.py +0 -2
  30. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
  31. mindsdb/integrations/handlers/lindorm_handler/requirements.txt +1 -0
  32. mindsdb/integrations/handlers/ms_one_drive_handler/requirements.txt +2 -0
  33. mindsdb/integrations/handlers/ms_teams_handler/requirements.txt +3 -1
  34. mindsdb/integrations/handlers/openai_handler/openai_handler.py +5 -4
  35. mindsdb/integrations/handlers/snowflake_handler/requirements.txt +1 -1
  36. mindsdb/integrations/handlers/vertex_handler/requirements.txt +1 -0
  37. mindsdb/integrations/handlers/youtube_handler/requirements.txt +1 -0
  38. mindsdb/integrations/utilities/files/file_reader.py +5 -2
  39. mindsdb/interfaces/agents/constants.py +14 -2
  40. mindsdb/interfaces/agents/langchain_agent.py +2 -4
  41. mindsdb/interfaces/database/projects.py +1 -7
  42. mindsdb/interfaces/functions/controller.py +11 -14
  43. mindsdb/interfaces/functions/to_markdown.py +9 -124
  44. mindsdb/interfaces/knowledge_base/controller.py +22 -19
  45. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +28 -5
  46. mindsdb/interfaces/knowledge_base/utils.py +10 -15
  47. mindsdb/interfaces/model/model_controller.py +0 -2
  48. mindsdb/interfaces/skills/sql_agent.py +33 -11
  49. mindsdb/migrations/migrate.py +0 -2
  50. mindsdb/utilities/config.py +3 -2
  51. mindsdb/utilities/context.py +1 -1
  52. mindsdb/utilities/functions.py +0 -36
  53. mindsdb/utilities/langfuse.py +19 -10
  54. mindsdb/utilities/otel/__init__.py +9 -193
  55. mindsdb/utilities/otel/metric_handlers/__init__.py +5 -1
  56. mindsdb/utilities/otel/prepare.py +198 -0
  57. mindsdb/utilities/sql.py +83 -0
  58. {mindsdb-25.4.5.0.dist-info → mindsdb-25.5.3.0.dist-info}/METADATA +663 -596
  59. {mindsdb-25.4.5.0.dist-info → mindsdb-25.5.3.0.dist-info}/RECORD +62 -57
  60. {mindsdb-25.4.5.0.dist-info → mindsdb-25.5.3.0.dist-info}/WHEEL +1 -1
  61. mindsdb/api/mysql/mysql_proxy/classes/sql_statement_parser.py +0 -151
  62. {mindsdb-25.4.5.0.dist-info → mindsdb-25.5.3.0.dist-info}/licenses/LICENSE +0 -0
  63. {mindsdb-25.4.5.0.dist-info → mindsdb-25.5.3.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/database/projects.py
@@ -4,7 +4,6 @@ from typing import List, Optional
 from collections import OrderedDict
 
 import sqlalchemy as sa
-from sqlalchemy.orm.attributes import flag_modified
 import numpy as np
 
 from mindsdb_sql_parser.ast.base import ASTNode
@@ -457,7 +456,7 @@ class ProjectController:
         project.create(name=name)
         return project
 
-    def update(self, id: Optional[int] = None, name: Optional[str] = None, new_name: str = None, new_metadata: dict = None) -> Project:
+    def update(self, id: Optional[int] = None, name: Optional[str] = None, new_name: str = None) -> Project:
         if id is not None and name is not None:
             raise ValueError("Both 'id' and 'name' can't be provided at the same time")
 
@@ -470,10 +469,5 @@ class ProjectController:
             project.name = new_name
             project.record.name = new_name
 
-        if new_metadata is not None:
-            project.metadata = new_metadata
-            project.record.metadata_ = new_metadata
-            flag_modified(project.record, 'metadata_')
-
         db.session.commit()
         return project
mindsdb/interfaces/functions/controller.py
@@ -1,7 +1,7 @@
 import os
+import copy
 
 from duckdb.typing import BIGINT, DOUBLE, VARCHAR, BLOB, BOOLEAN
-from mindsdb.interfaces.functions.to_markdown import ToMarkdown
 from mindsdb.interfaces.storage.model_fs import HandlerStorage
 from mindsdb.utilities.config import config
 
@@ -159,31 +159,28 @@ class FunctionController(BYOMFunctionsController):
         return meta
 
     def to_markdown_call_function(self, node):
+        # load on-demand because lib is heavy
+        from mindsdb.interfaces.functions.to_markdown import ToMarkdown
        name = node.op.lower()
 
        if name in self.callbacks:
            return self.callbacks[name]
 
-        def callback(file_path_or_url, use_llm):
+        def callback(file_path_or_url):
            chat_model_params = self._parse_chat_model_params('TO_MARKDOWN_FUNCTION_')
 
-            llm_client = None
-            llm_model = None
-            try:
-                from mindsdb.interfaces.agents.langchain_agent import create_chat_model
-                llm = create_chat_model(chat_model_params)
-                llm_client = llm.root_client
-                llm_model = llm.model_name
-            except Exception:
-                pass
+            params_copy = copy.deepcopy(chat_model_params)
+            params_copy['model'] = params_copy.pop('model_name')
+            params_copy.pop('api_keys')
+            params_copy.pop('provider')
 
-            to_markdown = ToMarkdown(use_llm, llm_client, llm_model)
-            return to_markdown.call(file_path_or_url)
+            to_markdown = ToMarkdown()
+            return to_markdown.call(file_path_or_url, **params_copy)
 
        meta = {
            'name': name,
            'callback': callback,
-            'input_types': ['str', 'bool'],
+            'input_types': ['str'],
            'output_type': 'str'
        }
        self.callbacks[name] = meta
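
Note: the SQL-facing signature narrows from ['str', 'bool'] to ['str'], so the former use_llm flag is gone. A minimal sketch of the parameter reshaping the new callback performs; the dictionary below is hypothetical, inferred from the pops above rather than stated anywhere in the diff:

    import copy

    # Hypothetical output of _parse_chat_model_params('TO_MARKDOWN_FUNCTION_')
    chat_model_params = {
        'model_name': 'gpt-4o',   # renamed to 'model' below
        'provider': 'openai',     # dropped before the call
        'api_keys': {},           # dropped before the call
        'api_key': 'sk-...',      # remaining keys are forwarded unchanged
    }

    params_copy = copy.deepcopy(chat_model_params)
    params_copy['model'] = params_copy.pop('model_name')
    params_copy.pop('api_keys')
    params_copy.pop('provider')
    assert params_copy == {'model': 'gpt-4o', 'api_key': 'sk-...'}
    # params_copy is then splatted into ToMarkdown().call(file_path_or_url, **params_copy)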
mindsdb/interfaces/functions/to_markdown.py
@@ -1,13 +1,10 @@
-import base64
 from io import BytesIO
 import os
 from typing import Union
 from urllib.parse import urlparse
 
-import fitz  # PyMuPDF
-from markitdown import MarkItDown
+from aipdf import ocr
 import mimetypes
-from openai import OpenAI
 import requests
 
 
@@ -15,41 +12,22 @@ class ToMarkdown:
     """
     Extracts the content of documents of various formats in markdown format.
     """
-    def __init__(self, use_llm: bool, llm_client: OpenAI = None, llm_model: str = None):
+    def __init__(self):
         """
         Initializes the ToMarkdown class.
         """
-        # If use_llm is True, llm_client and llm_model must be provided.
-        if use_llm and (llm_client is None or llm_model is None):
-            raise ValueError('LLM client and model must be provided when use_llm is True.')
 
-        # If use_llm is False, set llm_client and llm_model to None even if they are provided.
-        if not use_llm:
-            llm_client = None
-            llm_model = None
-
-        # Only OpenAI is supported for now.
-        # TODO: Add support for other LLMs.
-        if llm_client is not None and not isinstance(llm_client, OpenAI):
-            raise ValueError('Only OpenAI models are supported at the moment.')
-
-        self.use_llm = use_llm
-        self.llm_client = llm_client
-        self.llm_model = llm_model
-
-    def call(self, file_path_or_url: str) -> str:
+    def call(self, file_path_or_url: str, **kwargs) -> str:
         """
         Converts a file to markdown.
         """
         file_extension = self._get_file_extension(file_path_or_url)
-        file = self._get_file_content(file_path_or_url)
+        file_content = self._get_file_content(file_path_or_url)
 
         if file_extension == '.pdf':
-            return self._pdf_to_markdown(file)
-        elif file_extension in ['.jpg', '.jpeg', '.png', '.gif']:
-            return self._image_to_markdown(file)
+            return self._pdf_to_markdown(file_content, **kwargs)
         else:
-            return self._other_to_markdown(file)
+            raise ValueError(f"Unsupported file type: {file_extension}.")
 
     def _get_file_content(self, file_path_or_url: str) -> str:
         """
@@ -90,105 +68,12 @@ class ToMarkdown:
         else:
             return os.path.splitext(file_path_or_url)[1]
 
-    def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes]) -> str:
+    def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes], **kwargs) -> str:
         """
         Converts a PDF file to markdown.
         """
-        if self.llm_client is None:
-            return self._pdf_to_markdown_no_llm(file_content)
-        else:
-            return self._pdf_to_markdown_llm(file_content)
-
-    def _pdf_to_markdown_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts a PDF file to markdown using LLM.
-        The LLM is used mainly for the purpose of generating descriptions of any images in the PDF.
-        """
         if isinstance(file_content, requests.Response):
             file_content = BytesIO(file_content.content)
 
-        document = fitz.open(stream=file_content, filetype="pdf")
-
-        markdown_content = []
-        for page_num in range(len(document)):
-            page = document.load_page(page_num)
-
-            # Get text blocks with coordinates.
-            page_content = []
-            blocks = page.get_text("blocks")
-            for block in blocks:
-                x0, y0, x1, y1, text, _, _ = block
-                if text.strip():  # Skip empty or whitespace blocks.
-                    page_content.append((y0, text.strip()))
-
-            # Extract images from the page.
-            image_list = page.get_images(full=True)
-            for img_index, img in enumerate(image_list):
-                xref = img[0]
-                base_image = document.extract_image(xref)
-                image_bytes = base_image["image"]
-
-                # Use actual image y-coordinate if available.
-                y0 = float(base_image.get("y", 0))
-                image_description = self._generate_image_description(image_bytes)
-                page_content.append((y0, f"![{image_description}](image_{page_num + 1}_{img_index + 1}.png)"))
-
-            # Sort the content by y0 coordinate
-            page_content.sort(key=lambda x: x[0])
-
-            # Add sorted content to the markdown
-            for _, text in page_content:
-                markdown_content.append(text)
-            markdown_content.append("\n")
-
-        document.close()
-
-        return "\n".join(markdown_content)
-
-    def _generate_image_description(self, image_bytes: bytes) -> str:
-        """
-        Generates a description of the image using LLM.
-        """
-        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-
-        response = self.llm_client.chat.completions.create(
-            model=self.llm_model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "Describe this image"},
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
-                    ],
-                }
-            ],
-        )
-        description = response.choices[0].message.content
-        return description
-
-    def _pdf_to_markdown_no_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts a PDF file to markdown without using LLM.
-        """
-        md = MarkItDown(enable_plugins=True)
-        result = md.convert(file_content)
-        return result.markdown
-
-    def _image_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts images to markdown.
-        """
-        if not self.use_llm or self.llm_client is None:
-            raise ValueError('LLM client must be enabled to convert images to markdown.')
-
-        md = MarkItDown(llm_client=self.llm_client, llm_model=self.llm_model, enable_plugins=True)
-        result = md.convert(file_content)
-        return result.markdown
-
-    def _other_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
-        """
-        Converts other file formats to markdown.
-        """
-        md = MarkItDown(enable_plugins=True)
-        result = md.convert(file_content)
-        return result.markdown
+        markdown_pages = ocr(file_content, **kwargs)
+        return "\n\n---\n\n".join(markdown_pages)
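
Note: the class is now PDF-only (images and other formats raise ValueError) and extraction is delegated to aipdf's ocr. A minimal usage sketch; the path and keyword arguments are illustrative only and assume ocr accepts them, since kwargs are forwarded verbatim:

    from mindsdb.interfaces.functions.to_markdown import ToMarkdown

    md = ToMarkdown()
    # Hypothetical arguments; kwargs pass straight through to aipdf.ocr.
    markdown = md.call('/tmp/report.pdf', model='gpt-4o', api_key='sk-...')
    # ocr returns one markdown string per page; pages are joined with a
    # horizontal-rule separator: "\n\n---\n\n".
    print(markdown)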
mindsdb/interfaces/knowledge_base/controller.py
@@ -41,13 +41,14 @@ from mindsdb.utilities.config import config
 from mindsdb.utilities.context import context as ctx
 
 from mindsdb.api.executor.command_executor import ExecuteCommands
+from mindsdb.api.executor.utilities.sql import query_df
 from mindsdb.utilities import log
 from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMReranker
 
 logger = log.getLogger(__name__)
 
 KB_TO_VECTORDB_COLUMNS = {
-    'id': 'original_row_id',
+    'id': 'original_doc_id',
     'chunk_id': 'id',
     'chunk_content': 'content'
 }
@@ -150,13 +151,8 @@ class KnowledgeBaseTable:
         query.from_table = Identifier(parts=[self._kb.vector_database_table])
         logger.debug(f"Set table name to: {self._kb.vector_database_table}")
 
-        requested_kb_columns = []
-        for target in query.targets:
-            if isinstance(target, Star):
-                requested_kb_columns = None
-                break
-            else:
-                requested_kb_columns.append(target.parts[-1].lower())
+        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
+        query_copy = copy.deepcopy(query)
 
         query.targets = [
             Identifier(TableField.ID.value),
@@ -220,9 +216,17 @@ class KnowledgeBaseTable:
 
         df = self.add_relevance(df, query_text, relevance_threshold)
 
-        # filter by targets
-        if requested_kb_columns is not None:
-            df = df[requested_kb_columns]
+        if (
+            query.group_by is not None
+            or query.order_by is not None
+            or query.having is not None
+            or query.distinct is True
+            or len(query.targets) != 1
+            or not isinstance(query.targets[0], Star)
+        ):
+            query_copy.where = None
+            df = query_df(df, query_copy, session=self.session)
+
         return df
 
     def add_relevance(self, df, query_text, relevance_threshold=None):
@@ -290,7 +294,7 @@ class KnowledgeBaseTable:
         columns = list(df.columns)
         # update id, get from metadata
         df[TableField.ID.value] = df[TableField.METADATA.value].apply(
-            lambda m: None if m is None else m.get('original_row_id')
+            lambda m: None if m is None else m.get('original_doc_id')
         )
 
         # id on first place
@@ -479,12 +483,9 @@ class KnowledgeBaseTable:
                 # Use provided_id directly if it exists, otherwise generate one
                 doc_id = self._generate_document_id(content_str, col, provided_id)
 
-                # Need provided ID to link chunks back to original source (e.g. database row).
-                row_id = provided_id if provided_id else idx
-
                 metadata = {
                     **base_metadata,
-                    'original_row_id': str(row_id),
+                    'original_row_index': str(idx),  # provide link to original row index
                     'content_column': col,
                 }
 
@@ -787,7 +788,7 @@ class KnowledgeBaseTable:
     def _generate_document_id(self, content: str, content_column: str, provided_id: str = None) -> str:
         """Generate a deterministic document ID using the utility function."""
         from mindsdb.interfaces.knowledge_base.utils import generate_document_id
-        return generate_document_id(content, content_column, provided_id)
+        return generate_document_id(content=content, provided_id=provided_id)
 
     def _convert_metadata_value(self, value):
         """
@@ -1004,6 +1005,8 @@ class KnowledgeBaseController:
         if 'provider' in params:
             engine = params.pop('provider').lower()
 
+        api_key = get_api_key(engine, params, strict=False) or params.pop('api_key')
+
         if engine == 'azure_openai':
             engine = 'openai'
             params['provider'] = 'azure'
@@ -1011,8 +1014,8 @@ class KnowledgeBaseController:
         if engine == 'openai':
             if 'question_column' not in params:
                 params['question_column'] = 'content'
-            if 'api_key' in params:
-                params[f"{engine}_api_key"] = params.pop('api_key')
+            if api_key:
+                params[f"{engine}_api_key"] = api_key
             if 'base_url' in params:
                 params['api_base'] = params.pop('base_url')
 
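Note on the new key-resolution line, which follows from plain Python semantics rather than anything stated in the diff: `or` falls through on a falsy left operand, and dict.pop without a default raises, so if get_api_key returns None and no api_key param was supplied, creation fails with a KeyError instead of a validation error:

    params = {}  # hypothetical: no api_key configured anywhere
    api_key = None or params.pop('api_key')  # raises KeyError: 'api_key'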
mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py
@@ -25,6 +25,8 @@ from langchain_core.documents import Document as LangchainDocument
 
 logger = log.getLogger(__name__)
 
+_DEFAULT_CONTENT_COLUMN_NAME = "content"
+
 
 class DocumentPreprocessor:
     """Base class for document preprocessing"""
@@ -90,14 +92,18 @@ class DocumentPreprocessor:
         start_char: Optional[int] = None,
         end_char: Optional[int] = None,
         provided_id: str = None,
+        content_column: str = None,
     ) -> str:
         """Generate human-readable deterministic ID for a chunk
-        Format: <doc_id>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
+        Format: <doc_id>:<content_column>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
         """
         if provided_id is None:
             raise ValueError("Document ID must be provided for chunk ID generation")
 
-        chunk_id = f"{provided_id}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
+        if content_column is None:
+            raise ValueError("Content column must be provided for chunk ID generation")
+
+        chunk_id = f"{provided_id}:{content_column}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
         logger.debug(f"Generated chunk ID: {chunk_id}")
         return chunk_id
 
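A worked example of the new chunk ID format (all values hypothetical): the second of three chunks drawn from column body of document doc-42, spanning characters 512 to 1024, yields:

    # f"{provided_id}:{content_column}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
    self._generate_chunk_id(chunk_index=1, total_chunks=3, start_char=512,
                            end_char=1024, provided_id='doc-42', content_column='body')
    # -> 'doc-42:body:2of3:512to1024'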
@@ -254,8 +260,15 @@ Please give a short succinct context to situate this chunk within the overall do
         if doc.metadata:
             metadata.update(doc.metadata)
 
+        # Get content_column from metadata or use default
+        content_column = metadata.get('content_column')
+        if content_column is None:
+            # If content_column is not in metadata, use the default column name
+            content_column = _DEFAULT_CONTENT_COLUMN_NAME
+            logger.debug(f"No content_column found in metadata, using default: {_DEFAULT_CONTENT_COLUMN_NAME}")
+
         chunk_id = self._generate_chunk_id(
-            chunk_index=chunk_index, provided_id=doc.id
+            chunk_index=chunk_index, provided_id=doc.id, content_column=content_column
         )
         processed_chunks.append(
             ProcessedChunk(
@@ -324,13 +337,23 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
             metadata["start_char"] = start_char
             metadata["end_char"] = end_char
 
-            # Generate chunk ID with total chunks
+            # Get content_column from metadata or use default
+            content_column = None
+            if doc.metadata:
+                content_column = doc.metadata.get('content_column')
+
+            if content_column is None:
+                # If content_column is not in metadata, use the default column name
+                content_column = _DEFAULT_CONTENT_COLUMN_NAME
+                logger.debug(f"No content_column found in metadata, using default: {_DEFAULT_CONTENT_COLUMN_NAME}")
+
             chunk_id = self._generate_chunk_id(
                 chunk_index=i,
                 total_chunks=total_chunks,
                 start_char=start_char,
                 end_char=end_char,
-                provided_id=doc.id
+                provided_id=doc.id,
+                content_column=content_column
             )
 
             processed_chunks.append(
mindsdb/interfaces/knowledge_base/utils.py
@@ -2,27 +2,22 @@
 import hashlib
 
 
-def generate_document_id(content: str, content_column: str, provided_id: str = None) -> str:
+def generate_document_id(content: str, content_column: str = None, provided_id: str = None) -> str:
     """
-    Generate a deterministic document ID from content and column name.
-    If provided_id exists, combines it with content_column.
-    For generated IDs, uses a short hash of just the content to ensure
-    same content gets same base ID across different columns.
+    Generate a deterministic document ID from content.
+    If provided_id exists, returns it directly.
+    For generated IDs, uses a short hash of just the content.
 
     Args:
         content: The content string
-        content_column: Name of the content column
+        content_column: Name of the content column (not used in ID generation, kept for backward compatibility)
         provided_id: Optional user-provided ID
     Returns:
-        Deterministic document ID in format: <base_id>_<column>
-        where base_id is either the provided_id or a 16-char hash of content
+        Deterministic document ID (either provided_id or a 16-char hash of content)
     """
     if provided_id is not None:
-        base_id = provided_id
-    else:
-        # Generate a shorter 16-character hash based only on content
-        hash_obj = hashlib.md5(content.encode())
-        base_id = hash_obj.hexdigest()[:16]
+        return provided_id
 
-    # Append column name to maintain uniqueness across columns
-    return f"{base_id}_{content_column}"
+    # Generate a shorter 16-character hash based only on content
+    hash_obj = hashlib.md5(content.encode())
+    return hash_obj.hexdigest()[:16]
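
A behavior sketch of the new scheme; the hash shown is the real md5 of the example string, and the old _<column> suffix is gone, so identical content in different columns now maps to the same document ID:

    generate_document_id('hello world', provided_id='doc-42')  # -> 'doc-42'
    generate_document_id('hello world')  # -> '5eb63bbbe01eeed0' (16-char md5 prefix)
    # The previous version would have returned 'doc-42_content' / '5eb63bbbe01eeed0_content'
    # for content_column='content'.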
mindsdb/interfaces/model/model_controller.py
@@ -1,4 +1,3 @@
-import sys
 import copy
 import datetime as dt
 from copy import deepcopy
@@ -28,7 +27,6 @@ from mindsdb.utilities import log
 
 logger = log.getLogger(__name__)
 
-IS_PY36 = sys.version_info[1] <= 6
 
 default_project = config.get('default_project')
 
mindsdb/interfaces/skills/sql_agent.py
@@ -1,4 +1,3 @@
-
 import re
 import csv
 import inspect
@@ -13,6 +12,7 @@ from mindsdb.utilities import log
 from mindsdb.utilities.context import context as ctx
 from mindsdb.integrations.utilities.query_traversal import query_traversal
 from mindsdb.integrations.libs.response import INF_SCHEMA_COLUMNS_NAMES
+from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import MYSQL_DATA_TYPE
 
 logger = log.getLogger(__name__)
 
@@ -253,7 +253,7 @@ class SQLAgent:
         for table in all_tables:
             key = f"{ctx.company_id}_{table}_info"
             table_info = self._cache.get(key) if self._cache else None
-            if table_info is None:
+            if True or table_info is None:
                 table_info = self._get_single_table_info(table)
                 if self._cache:
                     self._cache.set(key, table_info)
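
Worth flagging in a release marked potentially problematic: `True or table_info is None` is always true, so _get_single_table_info now runs on every call and the cache entry is written but never read back. This reads like a debugging leftover:

    # `or` short-circuits: a True left operand decides the result,
    # so the branch is taken regardless of the cached value.
    table_info = 'cached value'
    assert (True or table_info is None) is True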
@@ -276,19 +276,41 @@
         dn = self._command_executor.session.datahub.get(integration)
 
         fields, dtypes = [], []
-        for df in dn.get_table_columns_df(table_name, schema_name):
-            df_records = df.to_dict(orient='records')
-            fields.append(df_records[INF_SCHEMA_COLUMNS_NAMES.COLUMN_NAME])
-            if df_records[INF_SCHEMA_COLUMNS_NAMES.MYSQL_DATA_TYPE] is not None:
-                dtypes.append(df_records[INF_SCHEMA_COLUMNS_NAMES.MYSQL_DATA_TYPE].value)
-            else:
-                dtypes.append(df_records[INF_SCHEMA_COLUMNS_NAMES.DATA_TYPE])
+        try:
+            df = dn.get_table_columns_df(table_name, schema_name)
+            if not isinstance(df, pd.DataFrame) or df.empty:
+                logger.warning(f"Received empty or invalid DataFrame for table columns of {table_str}")
+                return f"Table named `{table_str}`:\n [No column information available]"
+
+            fields = df[INF_SCHEMA_COLUMNS_NAMES.COLUMN_NAME].to_list()
+            dtypes = [
+                mysql_data_type.value if isinstance(mysql_data_type, MYSQL_DATA_TYPE) else (data_type or 'UNKNOWN')
+                for mysql_data_type, data_type
+                in zip(
+                    df[INF_SCHEMA_COLUMNS_NAMES.MYSQL_DATA_TYPE],
+                    df[INF_SCHEMA_COLUMNS_NAMES.DATA_TYPE]
+                )
+            ]
+        except Exception as e:
+            logger.error(f"Failed processing column info for {table_str}: {e}", exc_info=True)
+            raise ValueError(f"Failed to process column info for {table_str}") from e
+
+        if not fields:
+            logger.error(f"Could not extract column fields for {table_str}.")
+            return f"Table named `{table_str}`:\n [Could not extract column information]"
+
+        try:
+            sample_rows_info = self._get_sample_rows(table_str, fields)
+        except Exception as e:
+            logger.warning(f"Could not get sample rows for {table_str}: {e}")
+            sample_rows_info = "\n\t [error] Couldn't retrieve sample rows!"
 
         info = f'Table named `{table_str}`:\n'
         info += f"\nSample with first {self._sample_rows_in_table_info} rows from table {table_str} in CSV format (dialect is 'excel'):\n"
-        info += self._get_sample_rows(table_str, fields) + "\n"
+        info += sample_rows_info + "\n"
         info += '\nColumn data types: ' + ",\t".join(
-            [f'\n`{field}` : `{dtype}`' for field, dtype in zip(fields, dtypes)]) + '\n'  # noqa
+            [f'\n`{field}` : `{dtype}`' for field, dtype in zip(fields, dtypes)]
+        ) + '\n'
         return info
 
     def _get_sample_rows(self, table: str, fields: List[str]) -> str:
mindsdb/migrations/migrate.py
@@ -47,7 +47,5 @@ def migrate_to_head():
 if __name__ == "__main__":
     # have to import this because
     # all env initialization happens here
-    from mindsdb.utilities.config import Config as MDBConfig
-    MDBConfig()
     db.init()
     migrate_to_head()
mindsdb/utilities/config.py
@@ -143,7 +143,8 @@ class Config:
             'auth': {
                 'http_auth_enabled': False,
                 "http_permanent_session_lifetime": datetime.timedelta(days=31),
-                "username": "mindsdb"
+                "username": "mindsdb",
+                "password": ""
             },
             "logging": {
                 "handlers": {
@@ -459,7 +460,7 @@ class Config:
         """
         updated = self.fetch_auto_config()
         if updated:
-            self.init_config()
+            self.merge_configs()
 
     def merge_configs(self) -> None:
         """Merge multiple configs to one.
mindsdb/utilities/context.py
@@ -54,7 +54,7 @@ class Context:
     def load(self, storage: dict) -> None:
         self._storage.set(storage)
 
-    def metadata(self, **kwargs) -> dict:
+    def get_metadata(self, **kwargs) -> dict:
         return {
             'user_id': self.user_id or "",
             'company_id': self.company_id or "",
mindsdb/utilities/functions.py
@@ -7,7 +7,6 @@ import textwrap
 from functools import wraps
 from collections.abc import Callable
 
-import requests
 from cryptography.fernet import Fernet
 from mindsdb_sql_parser.ast import Identifier
 
@@ -72,41 +71,6 @@ def mark_process(name: str, custom_mark: str = None) -> Callable:
     return mark_process_wrapper
 
 
-def get_versions_where_predictors_become_obsolete():
-    """ Get list of MindsDB versions in which predictors should be retrained
-    Returns:
-        list of str or False
-    """
-    versions_for_updating_predictors = []
-    try:
-        try:
-            res = requests.get(
-                'https://mindsdb-cloud-public-service-files.s3.us-east-2.amazonaws.com/version_for_updating_predictors.txt',
-                timeout=0.5
-            )
-        except (ConnectionError, requests.exceptions.ConnectionError) as e:
-            logger.error(f'Is no connection. {e}')
-            raise
-        except Exception as e:
-            logger.error(f'Is something wrong with getting version_for_updating_predictors.txt: {e}')
-            raise
-
-        if res.status_code != 200:
-            logger.error(f'Cant get version_for_updating_predictors.txt: returned status code = {res.status_code}')
-            raise
-
-        try:
-            versions_for_updating_predictors = res.text.replace(' \t\r', '').split('\n')
-        except Exception as e:
-            logger.error(f'Cant decode version_for_updating_predictors.txt: {e}')
-            raise
-    except Exception:
-        return False, versions_for_updating_predictors
-
-    versions_for_updating_predictors = [x for x in versions_for_updating_predictors if len(x) > 0]
-    return True, versions_for_updating_predictors
-
-
 def init_lexer_parsers():
     from mindsdb_sql_parser.lexer import MindsDBLexer
     from mindsdb_sql_parser.parser import MindsDBParser