MindsDB 25.4.4.0 (py3-none-any.whl) → 25.5.3.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (86)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +107 -125
  3. mindsdb/api/executor/command_executor.py +14 -3
  4. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +8 -0
  5. mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +2 -1
  6. mindsdb/api/executor/datahub/datanodes/system_tables.py +10 -13
  7. mindsdb/api/executor/planner/query_plan.py +1 -0
  8. mindsdb/api/executor/planner/query_planner.py +9 -1
  9. mindsdb/api/executor/sql_query/sql_query.py +24 -8
  10. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +21 -3
  11. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +3 -1
  12. mindsdb/api/http/initialize.py +20 -3
  13. mindsdb/api/http/namespaces/analysis.py +14 -1
  14. mindsdb/api/http/namespaces/config.py +19 -11
  15. mindsdb/api/http/namespaces/tree.py +1 -1
  16. mindsdb/api/http/start.py +7 -2
  17. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +4 -8
  18. mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -4
  19. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_formats.py +2 -2
  20. mindsdb/integrations/handlers/bigquery_handler/requirements.txt +1 -0
  21. mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -0
  22. mindsdb/integrations/handlers/gmail_handler/requirements.txt +1 -0
  23. mindsdb/integrations/handlers/google_analytics_handler/requirements.txt +2 -1
  24. mindsdb/integrations/handlers/google_books_handler/requirements.txt +1 -1
  25. mindsdb/integrations/handlers/google_calendar_handler/requirements.txt +1 -0
  26. mindsdb/integrations/handlers/google_content_shopping_handler/requirements.txt +1 -1
  27. mindsdb/integrations/handlers/google_fit_handler/requirements.txt +2 -0
  28. mindsdb/integrations/handlers/google_search_handler/requirements.txt +1 -1
  29. mindsdb/integrations/handlers/jira_handler/jira_handler.archived.py +75 -0
  30. mindsdb/integrations/handlers/jira_handler/jira_handler.py +113 -38
  31. mindsdb/integrations/handlers/jira_handler/jira_tables.py +229 -0
  32. mindsdb/integrations/handlers/jira_handler/requirements.txt +1 -0
  33. mindsdb/integrations/handlers/lightfm_handler/requirements.txt +1 -0
  34. mindsdb/integrations/handlers/lightwood_handler/lightwood_handler.py +0 -2
  35. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
  36. mindsdb/integrations/handlers/lindorm_handler/requirements.txt +1 -0
  37. mindsdb/integrations/handlers/ms_one_drive_handler/requirements.txt +2 -0
  38. mindsdb/integrations/handlers/ms_teams_handler/requirements.txt +3 -1
  39. mindsdb/integrations/handlers/openai_handler/helpers.py +3 -5
  40. mindsdb/integrations/handlers/openai_handler/openai_handler.py +25 -12
  41. mindsdb/integrations/handlers/snowflake_handler/requirements.txt +1 -1
  42. mindsdb/integrations/handlers/togetherai_handler/__about__.py +9 -0
  43. mindsdb/integrations/handlers/togetherai_handler/__init__.py +20 -0
  44. mindsdb/integrations/handlers/togetherai_handler/creation_args.py +14 -0
  45. mindsdb/integrations/handlers/togetherai_handler/icon.svg +15 -0
  46. mindsdb/integrations/handlers/togetherai_handler/model_using_args.py +5 -0
  47. mindsdb/integrations/handlers/togetherai_handler/requirements.txt +2 -0
  48. mindsdb/integrations/handlers/togetherai_handler/settings.py +33 -0
  49. mindsdb/integrations/handlers/togetherai_handler/togetherai_handler.py +234 -0
  50. mindsdb/integrations/handlers/vertex_handler/requirements.txt +1 -0
  51. mindsdb/integrations/handlers/youtube_handler/requirements.txt +1 -0
  52. mindsdb/integrations/utilities/files/file_reader.py +5 -2
  53. mindsdb/integrations/utilities/handler_utils.py +4 -0
  54. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +360 -0
  55. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +6 -346
  56. mindsdb/interfaces/agents/constants.py +14 -2
  57. mindsdb/interfaces/agents/langchain_agent.py +2 -4
  58. mindsdb/interfaces/database/projects.py +1 -7
  59. mindsdb/interfaces/functions/controller.py +14 -16
  60. mindsdb/interfaces/functions/to_markdown.py +9 -124
  61. mindsdb/interfaces/knowledge_base/controller.py +109 -92
  62. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +28 -5
  63. mindsdb/interfaces/knowledge_base/utils.py +10 -15
  64. mindsdb/interfaces/model/model_controller.py +0 -2
  65. mindsdb/interfaces/query_context/context_controller.py +55 -15
  66. mindsdb/interfaces/query_context/query_task.py +19 -0
  67. mindsdb/interfaces/skills/sql_agent.py +33 -11
  68. mindsdb/interfaces/storage/db.py +2 -2
  69. mindsdb/interfaces/tasks/task_monitor.py +5 -1
  70. mindsdb/interfaces/tasks/task_thread.py +6 -0
  71. mindsdb/migrations/migrate.py +0 -2
  72. mindsdb/migrations/versions/2025-04-22_53502b6d63bf_query_database.py +27 -0
  73. mindsdb/utilities/config.py +15 -3
  74. mindsdb/utilities/context.py +2 -1
  75. mindsdb/utilities/functions.py +0 -36
  76. mindsdb/utilities/langfuse.py +19 -10
  77. mindsdb/utilities/otel/__init__.py +9 -193
  78. mindsdb/utilities/otel/metric_handlers/__init__.py +5 -1
  79. mindsdb/utilities/otel/prepare.py +198 -0
  80. mindsdb/utilities/sql.py +83 -0
  81. {mindsdb-25.4.4.0.dist-info → mindsdb-25.5.3.0.dist-info}/METADATA +662 -592
  82. {mindsdb-25.4.4.0.dist-info → mindsdb-25.5.3.0.dist-info}/RECORD +85 -69
  83. {mindsdb-25.4.4.0.dist-info → mindsdb-25.5.3.0.dist-info}/WHEEL +1 -1
  84. mindsdb/api/mysql/mysql_proxy/classes/sql_statement_parser.py +0 -151
  85. {mindsdb-25.4.4.0.dist-info → mindsdb-25.5.3.0.dist-info}/licenses/LICENSE +0 -0
  86. {mindsdb-25.4.4.0.dist-info → mindsdb-25.5.3.0.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,10 @@
1
- import base64
2
1
  from io import BytesIO
3
2
  import os
4
3
  from typing import Union
5
4
  from urllib.parse import urlparse
6
5
 
7
- import fitz # PyMuPDF
8
- from markitdown import MarkItDown
6
+ from aipdf import ocr
9
7
  import mimetypes
10
- from openai import OpenAI
11
8
  import requests
12
9
 
13
10
 
@@ -15,41 +12,22 @@ class ToMarkdown:
15
12
  """
16
13
  Extracts the content of documents of various formats in markdown format.
17
14
  """
18
- def __init__(self, use_llm: bool, llm_client: OpenAI = None, llm_model: str = None):
15
+ def __init__(self):
19
16
  """
20
17
  Initializes the ToMarkdown class.
21
18
  """
22
- # If use_llm is True, llm_client and llm_model must be provided.
23
- if use_llm and (llm_client is None or llm_model is None):
24
- raise ValueError('LLM client and model must be provided when use_llm is True.')
25
19
 
26
- # If use_llm is False, set llm_client and llm_model to None even if they are provided.
27
- if not use_llm:
28
- llm_client = None
29
- llm_model = None
30
-
31
- # Only OpenAI is supported for now.
32
- # TODO: Add support for other LLMs.
33
- if llm_client is not None and not isinstance(llm_client, OpenAI):
34
- raise ValueError('Only OpenAI models are supported at the moment.')
35
-
36
- self.use_llm = use_llm
37
- self.llm_client = llm_client
38
- self.llm_model = llm_model
39
-
40
- def call(self, file_path_or_url: str) -> str:
20
+ def call(self, file_path_or_url: str, **kwargs) -> str:
41
21
  """
42
22
  Converts a file to markdown.
43
23
  """
44
24
  file_extension = self._get_file_extension(file_path_or_url)
45
- file = self._get_file_content(file_path_or_url)
25
+ file_content = self._get_file_content(file_path_or_url)
46
26
 
47
27
  if file_extension == '.pdf':
48
- return self._pdf_to_markdown(file)
49
- elif file_extension in ['.jpg', '.jpeg', '.png', '.gif']:
50
- return self._image_to_markdown(file)
28
+ return self._pdf_to_markdown(file_content, **kwargs)
51
29
  else:
52
- return self._other_to_markdown(file)
30
+ raise ValueError(f"Unsupported file type: {file_extension}.")
53
31
 
54
32
  def _get_file_content(self, file_path_or_url: str) -> str:
55
33
  """
@@ -90,105 +68,12 @@ class ToMarkdown:
90
68
  else:
91
69
  return os.path.splitext(file_path_or_url)[1]
92
70
 
93
- def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes]) -> str:
71
+ def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes], **kwargs) -> str:
94
72
  """
95
73
  Converts a PDF file to markdown.
96
74
  """
97
- if self.llm_client is None:
98
- return self._pdf_to_markdown_no_llm(file_content)
99
- else:
100
- return self._pdf_to_markdown_llm(file_content)
101
-
102
- def _pdf_to_markdown_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
103
- """
104
- Converts a PDF file to markdown using LLM.
105
- The LLM is used mainly for the purpose of generating descriptions of any images in the PDF.
106
- """
107
75
  if isinstance(file_content, requests.Response):
108
76
  file_content = BytesIO(file_content.content)
109
77
 
110
- document = fitz.open(stream=file_content, filetype="pdf")
111
-
112
- markdown_content = []
113
- for page_num in range(len(document)):
114
- page = document.load_page(page_num)
115
-
116
- # Get text blocks with coordinates.
117
- page_content = []
118
- blocks = page.get_text("blocks")
119
- for block in blocks:
120
- x0, y0, x1, y1, text, _, _ = block
121
- if text.strip(): # Skip empty or whitespace blocks.
122
- page_content.append((y0, text.strip()))
123
-
124
- # Extract images from the page.
125
- image_list = page.get_images(full=True)
126
- for img_index, img in enumerate(image_list):
127
- xref = img[0]
128
- base_image = document.extract_image(xref)
129
- image_bytes = base_image["image"]
130
-
131
- # Use actual image y-coordinate if available.
132
- y0 = float(base_image.get("y", 0))
133
- image_description = self._generate_image_description(image_bytes)
134
- page_content.append((y0, f"![{image_description}](image_{page_num + 1}_{img_index + 1}.png)"))
135
-
136
- # Sort the content by y0 coordinate
137
- page_content.sort(key=lambda x: x[0])
138
-
139
- # Add sorted content to the markdown
140
- for _, text in page_content:
141
- markdown_content.append(text)
142
- markdown_content.append("\n")
143
-
144
- document.close()
145
-
146
- return "\n".join(markdown_content)
147
-
148
- def _generate_image_description(self, image_bytes: bytes) -> str:
149
- """
150
- Generates a description of the image using LLM.
151
- """
152
- image_base64 = base64.b64encode(image_bytes).decode("utf-8")
153
-
154
- response = self.llm_client.chat.completions.create(
155
- model=self.llm_model,
156
- messages=[
157
- {
158
- "role": "user",
159
- "content": [
160
- {"type": "text", "text": "Describe this image"},
161
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
162
- ],
163
- }
164
- ],
165
- )
166
- description = response.choices[0].message.content
167
- return description
168
-
169
- def _pdf_to_markdown_no_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
170
- """
171
- Converts a PDF file to markdown without using LLM.
172
- """
173
- md = MarkItDown(enable_plugins=True)
174
- result = md.convert(file_content)
175
- return result.markdown
176
-
177
- def _image_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
178
- """
179
- Converts images to markdown.
180
- """
181
- if not self.use_llm or self.llm_client is None:
182
- raise ValueError('LLM client must be enabled to convert images to markdown.')
183
-
184
- md = MarkItDown(llm_client=self.llm_client, llm_model=self.llm_model, enable_plugins=True)
185
- result = md.convert(file_content)
186
- return result.markdown
187
-
188
- def _other_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
189
- """
190
- Converts other file formats to markdown.
191
- """
192
- md = MarkItDown(enable_plugins=True)
193
- result = md.convert(file_content)
194
- return result.markdown
78
+ markdown_pages = ocr(file_content, **kwargs)
79
+ return "\n\n---\n\n".join(markdown_pages)
@@ -27,7 +27,7 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
27
27
  from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
28
28
  from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
29
29
  from mindsdb.integrations.utilities.handler_utils import get_api_key
30
- from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import construct_model_from_args, row_to_document
30
+ from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import construct_model_from_args
31
31
 
32
32
  from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS
33
33
  from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
@@ -37,21 +37,35 @@ from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor impor
37
37
  from mindsdb.interfaces.model.functions import PredictorRecordNotFound
38
38
  from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
39
39
  from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
40
+ from mindsdb.utilities.config import config
40
41
  from mindsdb.utilities.context import context as ctx
41
42
 
42
43
  from mindsdb.api.executor.command_executor import ExecuteCommands
44
+ from mindsdb.api.executor.utilities.sql import query_df
43
45
  from mindsdb.utilities import log
44
- from mindsdb.integrations.utilities.rag.rerankers.reranker_compressor import LLMReranker
46
+ from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMReranker
45
47
 
46
48
  logger = log.getLogger(__name__)
47
49
 
48
50
  KB_TO_VECTORDB_COLUMNS = {
49
- 'id': 'original_row_id',
51
+ 'id': 'original_doc_id',
50
52
  'chunk_id': 'id',
51
53
  'chunk_content': 'content'
52
54
  }
53
55
 
54
56
 
57
+ def get_model_params(model_params: dict, default_config_key: str):
58
+ """
59
+ Get model parameters by combining default config with user provided parameters.
60
+ """
61
+ combined_model_params = copy.deepcopy(config.get(default_config_key, {}))
62
+
63
+ if model_params:
64
+ combined_model_params.update(model_params)
65
+
66
+ return combined_model_params
67
+
68
+
55
69
  def get_embedding_model_from_params(embedding_model_params: dict):
56
70
  """
57
71
  Create embedding model from parameters.
@@ -89,7 +103,7 @@ def get_reranking_model_from_params(reranking_model_params: dict):
89
103
  params_copy["api_key"] = get_api_key(provider, params_copy, strict=False)
90
104
  params_copy['model'] = params_copy.pop('model_name', None)
91
105
 
92
- return LLMReranker(**params_copy)
106
+ return BaseLLMReranker(**params_copy)
93
107
 
94
108
 
95
109
  class KnowledgeBaseTable:
@@ -137,13 +151,8 @@ class KnowledgeBaseTable:
137
151
  query.from_table = Identifier(parts=[self._kb.vector_database_table])
138
152
  logger.debug(f"Set table name to: {self._kb.vector_database_table}")
139
153
 
140
- requested_kb_columns = []
141
- for target in query.targets:
142
- if isinstance(target, Star):
143
- requested_kb_columns = None
144
- break
145
- else:
146
- requested_kb_columns.append(target.parts[-1].lower())
154
+ # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
155
+ query_copy = copy.deepcopy(query)
147
156
 
148
157
  query.targets = [
149
158
  Identifier(TableField.ID.value),
@@ -207,15 +216,23 @@ class KnowledgeBaseTable:
207
216
 
208
217
  df = self.add_relevance(df, query_text, relevance_threshold)
209
218
 
210
- # filter by targets
211
- if requested_kb_columns is not None:
212
- df = df[requested_kb_columns]
219
+ if (
220
+ query.group_by is not None
221
+ or query.order_by is not None
222
+ or query.having is not None
223
+ or query.distinct is True
224
+ or len(query.targets) != 1
225
+ or not isinstance(query.targets[0], Star)
226
+ ):
227
+ query_copy.where = None
228
+ df = query_df(df, query_copy, session=self.session)
229
+
213
230
  return df
214
231
 
215
232
  def add_relevance(self, df, query_text, relevance_threshold=None):
216
233
  relevance_column = TableField.RELEVANCE.value
217
234
 
218
- reranking_model_params = self._kb.params.get("reranking_model")
235
+ reranking_model_params = get_model_params(self._kb.params.get("reranking_model"), "default_llm")
219
236
  if reranking_model_params and query_text and len(df) > 0:
220
237
  # Use reranker for relevance score
221
238
  try:
@@ -277,7 +294,7 @@ class KnowledgeBaseTable:
277
294
  columns = list(df.columns)
278
295
  # update id, get from metadata
279
296
  df[TableField.ID.value] = df[TableField.METADATA.value].apply(
280
- lambda m: None if m is None else m.get('original_row_id')
297
+ lambda m: None if m is None else m.get('original_doc_id')
281
298
  )
282
299
 
283
300
  # id on first place
@@ -466,12 +483,9 @@ class KnowledgeBaseTable:
466
483
  # Use provided_id directly if it exists, otherwise generate one
467
484
  doc_id = self._generate_document_id(content_str, col, provided_id)
468
485
 
469
- # Need provided ID to link chunks back to original source (e.g. database row).
470
- row_id = provided_id if provided_id else idx
471
-
472
486
  metadata = {
473
487
  **base_metadata,
474
- 'original_row_id': str(row_id),
488
+ 'original_row_index': str(idx), # provide link to original row index
475
489
  'content_column': col,
476
490
  }
477
491
 
@@ -657,47 +671,34 @@ class KnowledgeBaseTable:
657
671
  if df.empty:
658
672
  return pd.DataFrame([], columns=[TableField.EMBEDDINGS.value])
659
673
 
660
- # keep only content
661
- df = df[[TableField.CONTENT.value]]
662
-
663
674
  model_id = self._kb.embedding_model_id
664
- if model_id:
665
- # get the input columns
666
- model_rec = db.session.query(db.Predictor).filter_by(id=model_id).first()
667
675
 
668
- assert model_rec is not None, f"Model not found: {model_id}"
669
- model_project = db.session.query(db.Project).filter_by(id=model_rec.project_id).first()
676
+ # get the input columns
677
+ model_rec = db.session.query(db.Predictor).filter_by(id=model_id).first()
670
678
 
671
- project_datanode = self.session.datahub.get(model_project.name)
679
+ assert model_rec is not None, f"Model not found: {model_id}"
680
+ model_project = db.session.query(db.Project).filter_by(id=model_rec.project_id).first()
672
681
 
673
- model_using = model_rec.learn_args.get('using', {})
674
- input_col = model_using.get('question_column')
675
- if input_col is None:
676
- input_col = model_using.get('input_column')
682
+ project_datanode = self.session.datahub.get(model_project.name)
677
683
 
678
- if input_col is not None and input_col != TableField.CONTENT.value:
679
- df = df.rename(columns={TableField.CONTENT.value: input_col})
684
+ model_using = model_rec.learn_args.get('using', {})
685
+ input_col = model_using.get('question_column')
686
+ if input_col is None:
687
+ input_col = model_using.get('input_column')
680
688
 
681
- df_out = project_datanode.predict(
682
- model_name=model_rec.name,
683
- df=df,
684
- params=self.model_params
685
- )
689
+ if input_col is not None and input_col != TableField.CONTENT.value:
690
+ df = df.rename(columns={TableField.CONTENT.value: input_col})
686
691
 
687
- target = model_rec.to_predict[0]
688
- if target != TableField.EMBEDDINGS.value:
689
- # adapt output for vectordb
690
- df_out = df_out.rename(columns={target: TableField.EMBEDDINGS.value})
691
-
692
- elif self._kb.params.get('embedding_model'):
693
- embedding_model = get_embedding_model_from_params(self._kb.params.get('embedding_model'))
694
-
695
- df_texts = df.apply(row_to_document, axis=1)
696
- embeddings = embedding_model.embed_documents(df_texts.tolist())
697
- df_out = df.copy().assign(**{TableField.EMBEDDINGS.value: embeddings})
692
+ df_out = project_datanode.predict(
693
+ model_name=model_rec.name,
694
+ df=df,
695
+ params=self.model_params
696
+ )
698
697
 
699
- else:
700
- raise ValueError("No embedding model found for the knowledge base.")
698
+ target = model_rec.to_predict[0]
699
+ if target != TableField.EMBEDDINGS.value:
700
+ # adapt output for vectordb
701
+ df_out = df_out.rename(columns={target: TableField.EMBEDDINGS.value})
701
702
 
702
703
  df_out = df_out[[TableField.EMBEDDINGS.value]]
703
704
 
@@ -728,14 +729,15 @@ class KnowledgeBaseTable:
728
729
  """
729
730
  # Get embedding model from knowledge base
730
731
  embeddings_model = None
732
+ embedding_model_params = get_model_params(self._kb.params.get('embedding_model', {}), 'default_embedding_model')
731
733
  if self._kb.embedding_model:
732
734
  # Extract embedding model args from knowledge base table
733
735
  embedding_args = self._kb.embedding_model.learn_args.get('using', {})
734
736
  # Construct the embedding model directly
735
737
  embeddings_model = construct_model_from_args(embedding_args)
736
738
  logger.debug(f"Using knowledge base embedding model with args: {embedding_args}")
737
- elif self._kb.params.get('embedding_model'):
738
- embeddings_model = get_embedding_model_from_params(self._kb.params['embedding_model'])
739
+ elif embedding_model_params:
740
+ embeddings_model = get_embedding_model_from_params(embedding_model_params)
739
741
  logger.debug(f"Using knowledge base embedding model from params: {self._kb.params['embedding_model']}")
740
742
  else:
741
743
  embeddings_model = DEFAULT_EMBEDDINGS_MODEL_CLASS()
@@ -786,7 +788,7 @@ class KnowledgeBaseTable:
786
788
  def _generate_document_id(self, content: str, content_column: str, provided_id: str = None) -> str:
787
789
  """Generate a deterministic document ID using the utility function."""
788
790
  from mindsdb.interfaces.knowledge_base.utils import generate_document_id
789
- return generate_document_id(content, content_column, provided_id)
791
+ return generate_document_id(content=content, provided_id=provided_id)
790
792
 
791
793
  def _convert_metadata_value(self, value):
792
794
  """
@@ -869,35 +871,33 @@ class KnowledgeBaseController:
869
871
  return kb
870
872
  raise EntityExistsError("Knowledge base already exists", name)
871
873
 
872
- embedding_model_params = params.get('embedding_model', None)
873
- reranking_model_params = params.get('reranking_model', None)
874
+ embedding_params = copy.deepcopy(config.get('default_embedding_model', {}))
874
875
 
876
+ model_name = None
877
+ model_project = project
875
878
  if embedding_model:
876
879
  model_name = embedding_model.parts[-1]
880
+ if len(embedding_model.parts) > 1:
881
+ model_project = self.session.database_controller.get_project(embedding_model.parts[-2])
877
882
 
878
- elif embedding_model_params:
879
- # Get embedding model from params.
880
- # This is called here to check validaity of the parameters.
881
- get_embedding_model_from_params(
882
- embedding_model_params
883
- )
883
+ elif 'embedding_model' in params:
884
+ if isinstance(params['embedding_model'], str):
885
+ # it is model name
886
+ model_name = params['embedding_model']
887
+ else:
888
+ # it is params for model
889
+ embedding_params.update(params['embedding_model'])
884
890
 
885
- else:
886
- model_name = self._get_default_embedding_model(
891
+ if model_name is None:
892
+ model_name = self._create_embedding_model(
887
893
  project.name,
888
- params=params
894
+ params=embedding_params,
895
+ kb_name=name,
889
896
  )
890
- params['default_embedding_model'] = model_name
891
-
892
- model_project = None
893
- if embedding_model is not None and len(embedding_model.parts) > 1:
894
- # model project is set
895
- model_project = self.session.database_controller.get_project(embedding_model.parts[-2])
896
- elif not embedding_model_params:
897
- model_project = project
897
+ params['created_embedding_model'] = model_name
898
898
 
899
899
  embedding_model_id = None
900
- if model_project:
900
+ if model_name is not None:
901
901
  model = self.session.model_controller.get_model(
902
902
  name=model_name,
903
903
  project_name=model_project.name
@@ -905,6 +905,7 @@ class KnowledgeBaseController:
905
905
  model_record = db.Predictor.query.get(model['id'])
906
906
  embedding_model_id = model_record.id
907
907
 
908
+ reranking_model_params = get_model_params(params.get('reranking_model', {}), 'default_llm')
908
909
  if reranking_model_params:
909
910
  # Get reranking model from params.
910
911
  # This is called here to check validaity of the parameters.
@@ -989,38 +990,54 @@ class KnowledgeBaseController:
989
990
  self.session.integration_controller.add(vector_store_name, engine, connection_args)
990
991
  return vector_store_name
991
992
 
992
- def _get_default_embedding_model(self, project_name, engine="langchain_embedding", params: dict = None):
993
+ def _create_embedding_model(self, project_name, engine="openai", params: dict = None, kb_name=''):
993
994
  """create a default embedding model for knowledge base, if not specified"""
994
- model_name = "kb_default_embedding_model"
995
+ model_name = f"kb_embedding_{kb_name}"
995
996
 
996
- # check exists
997
+ # drop if exists - parameters can be different
997
998
  try:
998
999
  model = self.session.model_controller.get_model(model_name, project_name=project_name)
999
1000
  if model is not None:
1000
- return model_name
1001
+ self.session.model_controller.delete_model(model_name, project_name)
1001
1002
  except PredictorRecordNotFound:
1002
1003
  pass
1003
1004
 
1004
- using_args = {
1005
- 'engine': engine
1006
- }
1007
- if engine == 'langchain_embedding':
1008
- # Use default embeddings.
1009
- using_args['class'] = 'openai'
1005
+ if 'provider' in params:
1006
+ engine = params.pop('provider').lower()
1007
+
1008
+ api_key = get_api_key(engine, params, strict=False) or params.pop('api_key')
1009
+
1010
+ if engine == 'azure_openai':
1011
+ engine = 'openai'
1012
+ params['provider'] = 'azure'
1013
+
1014
+ if engine == 'openai':
1015
+ if 'question_column' not in params:
1016
+ params['question_column'] = 'content'
1017
+ if api_key:
1018
+ params[f"{engine}_api_key"] = api_key
1019
+ if 'base_url' in params:
1020
+ params['api_base'] = params.pop('base_url')
1021
+
1022
+ params['engine'] = engine
1023
+ params['join_learn_process'] = True
1024
+ params['mode'] = 'embedding'
1010
1025
 
1011
1026
  # Include API key if provided.
1012
- using_args.update({k: v for k, v in params.items() if 'api_key' in k})
1013
1027
  statement = CreatePredictor(
1014
1028
  name=Identifier(parts=[project_name, model_name]),
1015
- using=using_args,
1029
+ using=params,
1016
1030
  targets=[
1017
1031
  Identifier(parts=[TableField.EMBEDDINGS.value])
1018
1032
  ]
1019
1033
  )
1020
1034
 
1021
1035
  command_executor = ExecuteCommands(self.session)
1022
- command_executor.answer_create_predictor(statement, project_name)
1023
-
1036
+ resp = command_executor.answer_create_predictor(statement, project_name)
1037
+ # check model status
1038
+ record = resp.data.records[0]
1039
+ if record['STATUS'] == 'error':
1040
+ raise ValueError('Embedding model error:' + record['ERROR'])
1024
1041
  return model_name
1025
1042
 
1026
1043
  def delete(self, name: str, project_name: int, if_exists: bool = False) -> None:
@@ -1054,9 +1071,9 @@ class KnowledgeBaseController:
1054
1071
  self.session.integration_controller.delete(kb.params['default_vector_storage'])
1055
1072
  except EntityNotExistsError:
1056
1073
  pass
1057
- if 'default_embedding_model' in kb.params:
1074
+ if 'created_embedding_model' in kb.params:
1058
1075
  try:
1059
- self.session.model_controller.delete_model(kb.params['default_embedding_model'], project_name)
1076
+ self.session.model_controller.delete_model(kb.params['created_embedding_model'], project_name)
1060
1077
  except EntityNotExistsError:
1061
1078
  pass
1062
1079
 
@@ -25,6 +25,8 @@ from langchain_core.documents import Document as LangchainDocument
25
25
 
26
26
  logger = log.getLogger(__name__)
27
27
 
28
+ _DEFAULT_CONTENT_COLUMN_NAME = "content"
29
+
28
30
 
29
31
  class DocumentPreprocessor:
30
32
  """Base class for document preprocessing"""
@@ -90,14 +92,18 @@ class DocumentPreprocessor:
90
92
  start_char: Optional[int] = None,
91
93
  end_char: Optional[int] = None,
92
94
  provided_id: str = None,
95
+ content_column: str = None,
93
96
  ) -> str:
94
97
  """Generate human-readable deterministic ID for a chunk
95
- Format: <doc_id>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
98
+ Format: <doc_id>:<content_column>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
96
99
  """
97
100
  if provided_id is None:
98
101
  raise ValueError("Document ID must be provided for chunk ID generation")
99
102
 
100
- chunk_id = f"{provided_id}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
103
+ if content_column is None:
104
+ raise ValueError("Content column must be provided for chunk ID generation")
105
+
106
+ chunk_id = f"{provided_id}:{content_column}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
101
107
  logger.debug(f"Generated chunk ID: {chunk_id}")
102
108
  return chunk_id
103
109
 
@@ -254,8 +260,15 @@ Please give a short succinct context to situate this chunk within the overall do
254
260
  if doc.metadata:
255
261
  metadata.update(doc.metadata)
256
262
 
263
+ # Get content_column from metadata or use default
264
+ content_column = metadata.get('content_column')
265
+ if content_column is None:
266
+ # If content_column is not in metadata, use the default column name
267
+ content_column = _DEFAULT_CONTENT_COLUMN_NAME
268
+ logger.debug(f"No content_column found in metadata, using default: {_DEFAULT_CONTENT_COLUMN_NAME}")
269
+
257
270
  chunk_id = self._generate_chunk_id(
258
- chunk_index=chunk_index, provided_id=doc.id
271
+ chunk_index=chunk_index, provided_id=doc.id, content_column=content_column
259
272
  )
260
273
  processed_chunks.append(
261
274
  ProcessedChunk(
@@ -324,13 +337,23 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
324
337
  metadata["start_char"] = start_char
325
338
  metadata["end_char"] = end_char
326
339
 
327
- # Generate chunk ID with total chunks
340
+ # Get content_column from metadata or use default
341
+ content_column = None
342
+ if doc.metadata:
343
+ content_column = doc.metadata.get('content_column')
344
+
345
+ if content_column is None:
346
+ # If content_column is not in metadata, use the default column name
347
+ content_column = _DEFAULT_CONTENT_COLUMN_NAME
348
+ logger.debug(f"No content_column found in metadata, using default: {_DEFAULT_CONTENT_COLUMN_NAME}")
349
+
328
350
  chunk_id = self._generate_chunk_id(
329
351
  chunk_index=i,
330
352
  total_chunks=total_chunks,
331
353
  start_char=start_char,
332
354
  end_char=end_char,
333
- provided_id=doc.id
355
+ provided_id=doc.id,
356
+ content_column=content_column
334
357
  )
335
358
 
336
359
  processed_chunks.append(
@@ -2,27 +2,22 @@
2
2
  import hashlib
3
3
 
4
4
 
5
- def generate_document_id(content: str, content_column: str, provided_id: str = None) -> str:
5
+ def generate_document_id(content: str, content_column: str = None, provided_id: str = None) -> str:
6
6
  """
7
- Generate a deterministic document ID from content and column name.
8
- If provided_id exists, combines it with content_column.
9
- For generated IDs, uses a short hash of just the content to ensure
10
- same content gets same base ID across different columns.
7
+ Generate a deterministic document ID from content.
8
+ If provided_id exists, returns it directly.
9
+ For generated IDs, uses a short hash of just the content.
11
10
 
12
11
  Args:
13
12
  content: The content string
14
- content_column: Name of the content column
13
+ content_column: Name of the content column (not used in ID generation, kept for backward compatibility)
15
14
  provided_id: Optional user-provided ID
16
15
  Returns:
17
- Deterministic document ID in format: <base_id>_<column>
18
- where base_id is either the provided_id or a 16-char hash of content
16
+ Deterministic document ID (either provided_id or a 16-char hash of content)
19
17
  """
20
18
  if provided_id is not None:
21
- base_id = provided_id
22
- else:
23
- # Generate a shorter 16-character hash based only on content
24
- hash_obj = hashlib.md5(content.encode())
25
- base_id = hash_obj.hexdigest()[:16]
19
+ return provided_id
26
20
 
27
- # Append column name to maintain uniqueness across columns
28
- return f"{base_id}_{content_column}"
21
+ # Generate a shorter 16-character hash based only on content
22
+ hash_obj = hashlib.md5(content.encode())
23
+ return hash_obj.hexdigest()[:16]
@@ -1,4 +1,3 @@
1
- import sys
2
1
  import copy
3
2
  import datetime as dt
4
3
  from copy import deepcopy
@@ -28,7 +27,6 @@ from mindsdb.utilities import log
28
27
 
29
28
  logger = log.getLogger(__name__)
30
29
 
31
- IS_PY36 = sys.version_info[1] <= 6
32
30
 
33
31
  default_project = config.get('default_project')
34
32