MindsDB 25.1.2.1__py3-none-any.whl → 25.1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (95)
  1. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/METADATA +246 -255
  2. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/RECORD +94 -83
  3. mindsdb/__about__.py +1 -1
  4. mindsdb/__main__.py +5 -3
  5. mindsdb/api/executor/__init__.py +0 -1
  6. mindsdb/api/executor/command_executor.py +2 -1
  7. mindsdb/api/executor/data_types/answer.py +1 -1
  8. mindsdb/api/executor/datahub/datanodes/datanode.py +1 -1
  9. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  10. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +8 -3
  11. mindsdb/api/executor/datahub/datanodes/project_datanode.py +9 -26
  12. mindsdb/api/executor/sql_query/__init__.py +1 -0
  13. mindsdb/api/executor/sql_query/result_set.py +36 -21
  14. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +1 -1
  15. mindsdb/api/executor/sql_query/steps/join_step.py +4 -4
  16. mindsdb/api/executor/sql_query/steps/map_reduce_step.py +6 -39
  17. mindsdb/api/executor/utilities/sql.py +2 -10
  18. mindsdb/api/http/namespaces/agents.py +3 -1
  19. mindsdb/api/http/namespaces/knowledge_bases.py +3 -3
  20. mindsdb/api/http/namespaces/sql.py +3 -1
  21. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +2 -1
  22. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +7 -0
  23. mindsdb/api/postgres/postgres_proxy/executor/executor.py +2 -1
  24. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -2
  25. mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -1
  26. mindsdb/integrations/handlers/databricks_handler/requirements.txt +1 -1
  27. mindsdb/integrations/handlers/file_handler/file_handler.py +1 -1
  28. mindsdb/integrations/handlers/file_handler/requirements.txt +0 -4
  29. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +17 -1
  30. mindsdb/integrations/handlers/jira_handler/jira_handler.py +15 -1
  31. mindsdb/integrations/handlers/jira_handler/jira_table.py +52 -31
  32. mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py +82 -0
  33. mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +8 -1
  34. mindsdb/integrations/handlers/langchain_handler/requirements.txt +1 -1
  35. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py +1 -1
  36. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +8 -0
  37. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +50 -16
  38. mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py +123 -72
  39. mindsdb/integrations/handlers/pinecone_handler/requirements.txt +1 -1
  40. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +12 -6
  41. mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +5 -3
  42. mindsdb/integrations/handlers/slack_handler/slack_handler.py +13 -2
  43. mindsdb/integrations/handlers/slack_handler/slack_tables.py +21 -1
  44. mindsdb/integrations/handlers/web_handler/requirements.txt +0 -1
  45. mindsdb/integrations/libs/ml_handler_process/learn_process.py +2 -2
  46. mindsdb/integrations/utilities/files/__init__.py +0 -0
  47. mindsdb/integrations/utilities/files/file_reader.py +258 -0
  48. mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +2 -1
  49. mindsdb/integrations/utilities/handlers/auth_utilities/microsoft/ms_graph_api_auth_utilities.py +8 -3
  50. mindsdb/integrations/utilities/rag/chains/map_reduce_summarizer_chain.py +5 -9
  51. mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py +76 -27
  52. mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py +18 -1
  53. mindsdb/integrations/utilities/rag/pipelines/rag.py +74 -21
  54. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +166 -108
  55. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +108 -78
  56. mindsdb/integrations/utilities/rag/settings.py +37 -16
  57. mindsdb/integrations/utilities/sql_utils.py +1 -1
  58. mindsdb/interfaces/agents/agents_controller.py +18 -8
  59. mindsdb/interfaces/agents/constants.py +1 -0
  60. mindsdb/interfaces/agents/langchain_agent.py +124 -157
  61. mindsdb/interfaces/agents/langfuse_callback_handler.py +4 -37
  62. mindsdb/interfaces/agents/mindsdb_database_agent.py +21 -13
  63. mindsdb/interfaces/chatbot/chatbot_controller.py +7 -11
  64. mindsdb/interfaces/chatbot/chatbot_task.py +16 -5
  65. mindsdb/interfaces/chatbot/memory.py +58 -13
  66. mindsdb/interfaces/database/integrations.py +5 -1
  67. mindsdb/interfaces/database/projects.py +55 -16
  68. mindsdb/interfaces/database/views.py +12 -25
  69. mindsdb/interfaces/knowledge_base/controller.py +39 -15
  70. mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +7 -26
  71. mindsdb/interfaces/model/functions.py +15 -4
  72. mindsdb/interfaces/model/model_controller.py +4 -7
  73. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +51 -40
  74. mindsdb/interfaces/skills/retrieval_tool.py +10 -3
  75. mindsdb/interfaces/skills/skill_tool.py +97 -54
  76. mindsdb/interfaces/skills/skills_controller.py +7 -3
  77. mindsdb/interfaces/skills/sql_agent.py +127 -41
  78. mindsdb/interfaces/storage/db.py +1 -1
  79. mindsdb/migrations/versions/2025-01-15_c06c35f7e8e1_project_company.py +88 -0
  80. mindsdb/utilities/cache.py +7 -4
  81. mindsdb/utilities/context.py +11 -1
  82. mindsdb/utilities/langfuse.py +279 -0
  83. mindsdb/utilities/log.py +20 -2
  84. mindsdb/utilities/otel/__init__.py +206 -0
  85. mindsdb/utilities/otel/logger.py +25 -0
  86. mindsdb/utilities/otel/meter.py +19 -0
  87. mindsdb/utilities/otel/metric_handlers/__init__.py +25 -0
  88. mindsdb/utilities/otel/tracer.py +16 -0
  89. mindsdb/utilities/partitioning.py +52 -0
  90. mindsdb/utilities/render/sqlalchemy_render.py +7 -1
  91. mindsdb/utilities/utils.py +34 -0
  92. mindsdb/utilities/otel.py +0 -72
  93. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/LICENSE +0 -0
  94. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/WHEEL +0 -0
  95. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,9 @@
1
1
 
2
+ from typing import Union
3
+
2
4
  from mindsdb_sql_parser.ast import Identifier, Select, BinaryOperation, Constant, OrderBy
3
5
 
4
6
  from mindsdb.interfaces.storage import db
5
-
6
-
7
7
  from .types import ChatBotMessage
8
8
 
9
9
 
@@ -60,7 +60,7 @@ class BaseMemory:
60
60
 
61
61
  # If the chat_id is a tuple, convert it to a string when storing the message in the database.
62
62
  self._add_to_history(
63
- str(chat_id) if isinstance(chat_id, tuple) else chat_id,
63
+ chat_id,
64
64
  chat_message,
65
65
  table_name=table_name
66
66
  )
@@ -74,7 +74,7 @@ class BaseMemory:
74
74
 
75
75
  else:
76
76
  history = self._get_chat_history(
77
- str(chat_id) if isinstance(chat_id, tuple) else chat_id,
77
+ chat_id,
78
78
  table_name
79
79
  )
80
80
  self._cache[key] = history
@@ -108,18 +108,44 @@ class HandlerMemory(BaseMemory):
108
108
  time_col = t_params['time_col']
109
109
  chat_id_cols = t_params['chat_id_col'] if isinstance(t_params['chat_id_col'], list) else [t_params['chat_id_col']]
110
110
 
111
- ast_query = Select(
112
- targets=[Identifier(text_col),
113
- Identifier(username_col),
114
- Identifier(time_col)],
115
- from_table=Identifier(t_params['name']),
116
- where=[BinaryOperation(
111
+ chat_id = chat_id if isinstance(chat_id, tuple) else (chat_id,)
112
+ # Add a WHERE clause for each chat_id column.
113
+ where_conditions = [
114
+ BinaryOperation(
117
115
  op='=',
118
116
  args=[
119
117
  Identifier(chat_id_col),
120
118
  Constant(chat_id[idx])
121
119
  ]
122
- ) for idx, chat_id_col in enumerate(chat_id_cols)],
120
+ ) for idx, chat_id_col in enumerate(chat_id_cols)
121
+ ]
122
+ # Add a WHERE clause to ignore holding messages from the bot.
123
+ from .chatbot_task import HOLDING_MESSAGE
124
+
125
+ where_conditions.append(
126
+ BinaryOperation(
127
+ op='!=',
128
+ args=[
129
+ Identifier(text_col),
130
+ Constant(HOLDING_MESSAGE)
131
+ ]
132
+ )
133
+ )
134
+
135
+ # Convert the WHERE conditions to a BinaryOperation object.
136
+ where_conditions_binary_operation = None
137
+ for condition in where_conditions:
138
+ if where_conditions_binary_operation is None:
139
+ where_conditions_binary_operation = condition
140
+ else:
141
+ where_conditions_binary_operation = BinaryOperation('and', args=[where_conditions_binary_operation, condition])
142
+
143
+ ast_query = Select(
144
+ targets=[Identifier(text_col),
145
+ Identifier(username_col),
146
+ Identifier(time_col)],
147
+ from_table=Identifier(t_params['name']),
148
+ where=where_conditions_binary_operation,
123
149
  order_by=[OrderBy(Identifier(time_col))],
124
150
  limit=Constant(self.MAX_DEPTH),
125
151
  )
@@ -151,9 +177,28 @@ class DBMemory(BaseMemory):
151
177
  uses mindsdb database to store messages
152
178
  '''
153
179
 
180
+ def _generate_chat_id_for_db(self, chat_id: Union[str, tuple], table_name: str = None) -> str:
181
+ """
182
+ Generate an ID for the chat to store in the database.
183
+ The ID is a string that includes the components of the chat ID and the table name (if provided) separated by underscores.
184
+
185
+ Args:
186
+ chat_id (str | tuple): The ID of the chat.
187
+ table_name (str): The name of the table the chat belongs to.
188
+ """
189
+ if isinstance(chat_id, tuple):
190
+ char_id_str = "_".join(str(val) for val in chat_id)
191
+ else:
192
+ char_id_str = str(chat_id)
193
+
194
+ if table_name:
195
+ chat_id_str = f"{table_name}_{char_id_str}"
196
+
197
+ return chat_id_str
198
+
154
199
  def _add_to_history(self, chat_id, message, table_name=None):
155
200
  chat_bot_id = self.chat_task.bot_id
156
- destination = str((chat_id, table_name)) if table_name else chat_id
201
+ destination = self._generate_chat_id_for_db(chat_id, table_name)
157
202
 
158
203
  message = db.ChatBotsHistory(
159
204
  chat_bot_id=chat_bot_id,
@@ -167,7 +212,7 @@ class DBMemory(BaseMemory):
167
212
 
168
213
  def _get_chat_history(self, chat_id, table_name=None):
169
214
  chat_bot_id = self.chat_task.bot_id
170
- destination = str((chat_id, table_name)) if table_name else chat_id
215
+ destination = self._generate_chat_id_for_db(chat_id, table_name)
171
216
 
172
217
  query = db.ChatBotsHistory.query\
173
218
  .filter(
@@ -215,6 +215,8 @@ class IntegrationController:
215
215
  def modify(self, name, data):
216
216
  self.handlers_cache.delete(name)
217
217
  integration_record = self._get_integration_record(name)
218
+ if isinstance(integration_record.data, dict) and integration_record.data.get('is_demo') is True:
219
+ raise ValueError("It is forbidden to change properties of the demo object")
218
220
  old_data = deepcopy(integration_record.data)
219
221
  for k in old_data:
220
222
  if k not in data:
@@ -234,9 +236,11 @@ class IntegrationController:
234
236
  handler = self.handler_modules[name]
235
237
 
236
238
  if getattr(handler, 'permanent', False) is True:
237
- raise Exception('Unable to drop: is permanent integration')
239
+ raise Exception('Unable to drop permanent integration')
238
240
 
239
241
  integration_record = self._get_integration_record(name)
242
+ if isinstance(integration_record.data, dict) and integration_record.data.get('is_demo') is True:
243
+ raise Exception('Unable to drop demo object')
240
244
 
241
245
  # if this is ml engine
242
246
  engine_models = get_model_records(ml_handler_name=name, deleted_at=None)
@@ -7,6 +7,7 @@ import sqlalchemy as sa
7
7
  import numpy as np
8
8
 
9
9
  from mindsdb_sql_parser.ast.base import ASTNode
10
+ from mindsdb_sql_parser.ast import Select, Star, Constant, Identifier
10
11
  from mindsdb_sql_parser import parse_sql
11
12
 
12
13
  from mindsdb.interfaces.storage import db
@@ -16,6 +17,9 @@ from mindsdb.interfaces.database.views import ViewController
16
17
  from mindsdb.utilities.context import context as ctx
17
18
  from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
18
19
  import mindsdb.utilities.profiler as profiler
20
+ from mindsdb.api.executor.sql_query import SQLQuery
21
+ from mindsdb.api.executor.utilities.sql import query_df
22
+ from mindsdb.interfaces.query_context.context_controller import query_context_controller
19
23
 
20
24
 
21
25
  class Project:
@@ -24,19 +28,14 @@ class Project:
24
28
  p = Project()
25
29
  p.record = db_record
26
30
  p.name = db_record.name
27
- p.company_id = db_record.company_id
31
+ p.company_id = ctx.company_id
28
32
  p.id = db_record.id
29
33
  return p
30
34
 
31
35
  def create(self, name: str):
32
36
  name = name.lower()
33
- existing_record = db.Project.query.filter(
34
- (sa.func.lower(db.Project.name) == name)
35
- & (db.Project.company_id == ctx.company_id)
36
- & (db.Project.deleted_at == sa.null())
37
- ).first()
38
- if existing_record is not None:
39
- raise EntityExistsError('Project already exists', name)
37
+
38
+ company_id = ctx.company_id if ctx.company_id is not None else 0
40
39
 
41
40
  existing_record = db.Integration.query.filter(
42
41
  sa.func.lower(db.Integration.name) == name,
@@ -45,23 +44,28 @@ class Project:
45
44
  if existing_record is not None:
46
45
  raise EntityExistsError('Database exists with this name ', name)
47
46
 
47
+ existing_record = db.Project.query.filter(
48
+ (sa.func.lower(db.Project.name) == name)
49
+ & (db.Project.company_id == company_id)
50
+ & (db.Project.deleted_at == sa.null())
51
+ ).first()
52
+ if existing_record is not None:
53
+ raise EntityExistsError('Project already exists', name)
54
+
48
55
  record = db.Project(
49
56
  name=name,
50
- company_id=ctx.company_id
57
+ company_id=company_id
51
58
  )
52
59
 
53
60
  self.record = record
54
61
  self.name = name
55
- self.company_id = ctx.company_id
62
+ self.company_id = company_id
56
63
 
57
64
  db.session.add(record)
58
65
  db.session.commit()
59
66
 
60
67
  self.id = record.id
61
68
 
62
- def save(self):
63
- db.session.commit()
64
-
65
69
  def delete(self):
66
70
  tables = self.get_tables()
67
71
  tables = [key for key, val in tables.items() if val['type'] != 'table']
@@ -111,7 +115,7 @@ class Project:
111
115
  project_name=self.name
112
116
  )
113
117
 
114
- def query_view(self, query: ASTNode) -> ASTNode:
118
+ def get_view_meta(self, query: ASTNode) -> ASTNode:
115
119
  view_name = query.from_table.parts[-1]
116
120
  view_meta = ViewController().get(
117
121
  name=view_name,
@@ -120,6 +124,30 @@ class Project:
120
124
  view_meta['query_ast'] = parse_sql(view_meta['query'])
121
125
  return view_meta
122
126
 
127
def query_view(self, query, session):
    """Execute the view referenced by ``query`` and run ``query`` over its rows.

    The view's stored query is executed through ``SQLQuery`` while a 'view'
    query context is set; the context is released whether or not execution
    raises. Columns with duplicated names are dropped from the fetched
    dataframe before ``query`` is applied to it with ``query_df``.

    Args:
        query: AST of the query that selects from the view.
        session: session object passed to ``SQLQuery`` and ``query_df``.

    Returns:
        The result of ``query_df`` for the de-duplicated view dataframe.

    Raises:
        Exception: if the view query reports ``success`` as False.
    """
    meta = self.get_view_meta(query)

    query_context_controller.set_context('view', meta['id'])
    try:
        executor = SQLQuery(meta['query_ast'], session=session)
        outcome = executor.fetch(view='dataframe')
    finally:
        # always pair set_context with release_context
        query_context_controller.release_context('view', meta['id'])

    if outcome['success'] is False:
        raise Exception(f"Cant execute view query: {meta['query_ast']}")

    # remove duplicated columns
    frame = outcome['result']
    unique_columns = ~frame.columns.duplicated()
    frame = frame.loc[:, unique_columns]

    return query_df(frame, query, session=session)
150
+
123
151
  @staticmethod
124
152
  def _get_model_data(predictor_record, integraion_record, with_secrets: bool = True):
125
153
  from mindsdb.interfaces.database.integrations import integration_controller
@@ -341,6 +369,15 @@ class Project:
341
369
  columns = predictor_record.to_predict
342
370
  if not isinstance(columns, list):
343
371
  columns = [columns]
372
+ return columns
373
+ if self.get_view(table_name):
374
+ query = Select(targets=[Star()], from_table=Identifier(table_name), limit=Constant(1))
375
+
376
+ from mindsdb.api.executor.controllers.session_controller import SessionController
377
+ session = SessionController()
378
+ session.database = self.name
379
+ df = self.query_view(query, session)
380
+ return df.columns
344
381
  else:
345
382
  # is it agent?
346
383
  agent = db.Agents.query.filter_by(
@@ -360,8 +397,9 @@ class ProjectController:
360
397
  pass
361
398
 
362
399
  def get_list(self) -> List[Project]:
400
+ company_id = ctx.company_id if ctx.company_id is not None else 0
363
401
  records = db.Project.query.filter(
364
- (db.Project.company_id == ctx.company_id)
402
+ (db.Project.company_id == company_id)
365
403
  & (db.Project.deleted_at == sa.null())
366
404
  ).order_by(db.Project.name)
367
405
 
@@ -371,7 +409,8 @@ class ProjectController:
371
409
  if id is not None and name is not None:
372
410
  raise ValueError("Both 'id' and 'name' is None")
373
411
 
374
- q = db.Project.query.filter_by(company_id=ctx.company_id)
412
+ company_id = ctx.company_id if ctx.company_id is not None else 0
413
+ q = db.Project.query.filter_by(company_id=company_id)
375
414
 
376
415
  if id is not None:
377
416
  q = q.filter_by(id=id)
@@ -3,6 +3,7 @@ from mindsdb.interfaces.storage import db
3
3
  from mindsdb.interfaces.query_context.context_controller import query_context_controller
4
4
  from mindsdb.utilities.context import context as ctx
5
5
  from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
6
+ from mindsdb.interfaces.model.functions import get_project_record, get_project_records
6
7
 
7
8
 
8
9
  class ViewController:
@@ -39,11 +40,8 @@ class ViewController:
39
40
 
40
41
  def update(self, name, query, project_name):
41
42
  name = name.lower()
42
- project_record = db.session.query(db.Project).filter_by(
43
- name=project_name,
44
- company_id=ctx.company_id,
45
- deleted_at=None
46
- ).first()
43
+ project_record = get_project_record(project_name)
44
+
47
45
  rec = db.session.query(db.View).filter(
48
46
  func.lower(db.View.name) == name,
49
47
  db.View.company_id == ctx.company_id,
@@ -56,11 +54,8 @@ class ViewController:
56
54
 
57
55
  def delete(self, name, project_name):
58
56
  name = name.lower()
59
- project_record = db.session.query(db.Project).filter_by(
60
- name=project_name,
61
- company_id=ctx.company_id,
62
- deleted_at=None
63
- ).first()
57
+ project_record = get_project_record(project_name)
58
+
64
59
  rec = db.session.query(db.View).filter(
65
60
  func.lower(db.View.name) == name,
66
61
  db.View.company_id == ctx.company_id,
@@ -74,17 +69,12 @@ class ViewController:
74
69
  query_context_controller.drop_query_context('view', rec.id)
75
70
 
76
71
  def list(self, project_name):
77
- query = db.session.query(db.Project).filter_by(
78
- company_id=ctx.company_id,
79
- deleted_at=None
80
- )
81
- if project_name is not None:
82
- query = query.filter_by(name=project_name)
83
72
 
84
- project_names = {
85
- i.id: i.name
86
- for i in query
87
- }
73
+ project_names = {}
74
+ for project in get_project_records():
75
+ if project_name is not None and project.name != project_name:
76
+ continue
77
+ project_names[project.id] = project.name
88
78
 
89
79
  query = db.session.query(db.View).filter(
90
80
  db.View.company_id == ctx.company_id,
@@ -112,11 +102,8 @@ class ViewController:
112
102
  }
113
103
 
114
104
  def get(self, id=None, name=None, project_name=None):
115
- project_record = db.session.query(db.Project).filter_by(
116
- name=project_name,
117
- company_id=ctx.company_id,
118
- deleted_at=None
119
- ).first()
105
+ project_record = get_project_record(project_name)
106
+
120
107
  if id is not None:
121
108
  records = db.session.query(db.View).filter_by(
122
109
  id=id,
@@ -52,6 +52,7 @@ class KnowledgeBaseTable:
52
52
  self.session = session
53
53
  self.document_preprocessor = None
54
54
  self.document_loader = None
55
+ self.model_params = None
55
56
 
56
57
  def configure_preprocessing(self, config: Optional[dict] = None):
57
58
  """Configure preprocessing for the knowledge base table"""
@@ -488,6 +489,7 @@ class KnowledgeBaseTable:
488
489
  df_out = project_datanode.predict(
489
490
  model_name=model_rec.name,
490
491
  df=df,
492
+ params=self.model_params
491
493
  )
492
494
 
493
495
  target = model_rec.to_predict[0]
@@ -642,11 +644,13 @@ class KnowledgeBaseController:
642
644
  storage: Identifier,
643
645
  params: dict,
644
646
  preprocessing_config: Optional[dict] = None,
645
- if_not_exists: bool = False,
647
+ if_not_exists: bool = False
646
648
  ) -> db.KnowledgeBase:
647
649
  """
648
650
  Add a new knowledge base to the database
649
651
  :param preprocessing_config: Optional preprocessing configuration to validate and store
652
+ :param is_sparse: Whether to use sparse vectors for embeddings
653
+ :param vector_size: Optional size specification for vectors, required when is_sparse=True
650
654
  """
651
655
  # Validate preprocessing config first if provided
652
656
  if preprocessing_config is not None:
@@ -654,6 +658,12 @@ class KnowledgeBaseController:
654
658
  params = params or {}
655
659
  params['preprocessing'] = preprocessing_config
656
660
 
661
+ # Check if vector_size is provided when using sparse vectors
662
+ is_sparse = params.get('is_sparse')
663
+ vector_size = params.get('vector_size')
664
+ if is_sparse and vector_size is None:
665
+ raise ValueError("vector_size is required when is_sparse=True")
666
+
657
667
  # get project id
658
668
  project = self.session.database_controller.get_project(project_name)
659
669
  project_id = project.id
@@ -693,7 +703,16 @@ class KnowledgeBaseController:
693
703
  cloud_pg_vector = os.environ.get('KB_PGVECTOR_URL')
694
704
  if cloud_pg_vector:
695
705
  vector_table_name = name
696
- vector_db_name = self._create_persistent_pgvector()
706
+ # Add sparse vector support for pgvector
707
+ vector_db_params = {}
708
+ # Check both explicit parameter and model configuration
709
+ is_sparse = is_sparse or model_record.learn_args.get('using', {}).get('sparse')
710
+ if is_sparse:
711
+ vector_db_params['is_sparse'] = True
712
+ if vector_size is not None:
713
+ vector_db_params['vector_size'] = vector_size
714
+ vector_db_name = self._create_persistent_pgvector(vector_db_params)
715
+
697
716
  else:
698
717
  # create chroma db with same name
699
718
  vector_table_name = "default_collection"
@@ -705,17 +724,20 @@ class KnowledgeBaseController:
705
724
  else:
706
725
  vector_db_name, vector_table_name = storage.parts
707
726
 
727
+ # create table in vectordb before creating KB
728
+ self.session.datahub.get(vector_db_name).integration_handler.create_table(
729
+ vector_table_name
730
+ )
708
731
  vector_database_id = self.session.integration_controller.get(vector_db_name)['id']
709
732
 
710
- # create table in vectordb
711
- if model_record.learn_args.get('using', {}).get('sparse') is not None:
712
- self.session.datahub.get(vector_db_name).integration_handler.create_table(
713
- vector_table_name, sparse=model_record.learn_args.get('using', {}).get('sparse')
714
- )
715
- else:
716
- self.session.datahub.get(vector_db_name).integration_handler.create_table(
717
- vector_table_name
718
- )
733
+ # Store sparse vector settings in params if specified
734
+ if is_sparse:
735
+ params = params or {}
736
+ params['vector_config'] = {
737
+ 'is_sparse': is_sparse
738
+ }
739
+ if vector_size is not None:
740
+ params['vector_config']['vector_size'] = vector_size
719
741
 
720
742
  kb = db.KnowledgeBase(
721
743
  name=name,
@@ -729,16 +751,15 @@ class KnowledgeBaseController:
729
751
  db.session.commit()
730
752
  return kb
731
753
 
732
- def _create_persistent_pgvector(self):
754
+ def _create_persistent_pgvector(self, params=None):
733
755
  """Create default vector database for knowledge base, if not specified"""
734
-
735
756
  vector_store_name = "kb_pgvector_store"
736
757
 
737
758
  # check if exists
738
759
  if self.session.integration_controller.get(vector_store_name):
739
760
  return vector_store_name
740
761
 
741
- self.session.integration_controller.add(vector_store_name, 'pgvector', {})
762
+ self.session.integration_controller.add(vector_store_name, 'pgvector', params or {})
742
763
  return vector_store_name
743
764
 
744
765
  def _create_persistent_chroma(self, kb_name, engine="chromadb"):
@@ -840,16 +861,19 @@ class KnowledgeBaseController:
840
861
  )
841
862
  return kb
842
863
 
843
- def get_table(self, name: str, project_id: int) -> KnowledgeBaseTable:
864
+ def get_table(self, name: str, project_id: int, params: dict = None) -> KnowledgeBaseTable:
844
865
  """
845
866
  Returns kb table object with properly configured preprocessing
846
867
  :param name: table name
847
868
  :param project_id: project id
869
+ :param params: runtime parameters for KB. Keys: 'model' - parameters for embedding model
848
870
  :return: kb table object
849
871
  """
850
872
  kb = self.get(name, project_id)
851
873
  if kb is not None:
852
874
  table = KnowledgeBaseTable(kb, self.session)
875
+ if params:
876
+ table.model_params = params.get('model')
853
877
 
854
878
  # Always configure preprocessing - either from params or default
855
879
  if kb.params and 'preprocessing' in kb.params:
@@ -1,15 +1,13 @@
1
1
  import os
2
2
  from typing import List, Iterator
3
3
  from langchain_core.documents import Document as LangchainDocument
4
- from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
4
+ from langchain_text_splitters import MarkdownHeaderTextSplitter
5
5
  import pandas as pd
6
6
 
7
7
  from mindsdb.interfaces.file.file_controller import FileController
8
8
  from mindsdb.integrations.utilities.rag.loaders.file_loader import FileLoader
9
9
  from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
10
10
  FileSplitter,
11
- DEFAULT_CHUNK_SIZE,
12
- DEFAULT_CHUNK_OVERLAP
13
11
  )
14
12
  from mindsdb.integrations.handlers.web_handler.urlcrawl_helpers import get_all_websites
15
13
  from mindsdb.interfaces.knowledge_base.preprocessing.models import Document
@@ -45,12 +43,6 @@ class DocumentLoader:
45
43
  self.file_loader_class = file_loader_class
46
44
  self.mysql_proxy = mysql_proxy
47
45
 
48
- # Initialize text splitter for query results with default settings
49
- self.query_splitter = RecursiveCharacterTextSplitter(
50
- chunk_size=DEFAULT_CHUNK_SIZE,
51
- chunk_overlap=DEFAULT_CHUNK_OVERLAP
52
- )
53
-
54
46
  def load_files(self, file_names: List[str]) -> Iterator[Document]:
55
47
  """Load and split documents from files"""
56
48
  for file_name in file_names:
@@ -143,8 +135,9 @@ class DocumentLoader:
143
135
 
144
136
  # Process each row into a Document
145
137
  for _, row in df.iterrows():
146
- # Extract content and metadata
138
+ # Extract id, content and metadata
147
139
  content = str(row.get('content', ''))
140
+ id = row.get('id', None)
148
141
 
149
142
  # Convert remaining columns to metadata
150
143
  metadata = {
@@ -156,21 +149,9 @@ class DocumentLoader:
156
149
 
157
150
  # Split content using recursive splitter
158
151
  if content:
159
- doc = LangchainDocument(
160
- page_content=content,
152
+
153
+ yield Document(
154
+ id=id,
155
+ content=content,
161
156
  metadata=metadata
162
157
  )
163
- # Use FileSplitter with default recursive splitter
164
- split_docs = self.file_splitter.split_documents(
165
- [doc],
166
- default_failover=True
167
- )
168
-
169
- for split_doc in split_docs:
170
- metadata = doc.metadata.copy()
171
- metadata.update(split_doc.metadata or {})
172
-
173
- yield Document(
174
- content=split_doc.page_content,
175
- metadata=metadata
176
- )
@@ -1,4 +1,4 @@
1
- from typing import Optional
1
+ from typing import Optional, List
2
2
 
3
3
  from sqlalchemy import null, func
4
4
 
@@ -41,9 +41,7 @@ def get_integration_record(name: str) -> db.Integration:
41
41
 
42
42
  @profiler.profile()
43
43
  def get_project_record(name: str) -> db.Project:
44
- company_id = ctx.company_id
45
- if company_id is None:
46
- company_id = null()
44
+ company_id = ctx.company_id if ctx.company_id is not None else 0
47
45
 
48
46
  project_record = (
49
47
  db.session.query(db.Project)
@@ -56,6 +54,19 @@ def get_project_record(name: str) -> db.Project:
56
54
  return project_record
57
55
 
58
56
 
57
@profiler.profile()
def get_project_records() -> List[db.Project]:
    """Return all non-deleted project records of the current company.

    The company id is read from the context; when it is None, ``0`` is used
    in the filter instead.
    """
    company_id = ctx.company_id
    if company_id is None:
        company_id = 0

    project_query = db.session.query(db.Project)
    belongs_to_company = db.Project.company_id == company_id
    not_deleted = db.Project.deleted_at == null()

    return project_query.filter(belongs_to_company & not_deleted).all()
68
+
69
+
59
70
  @profiler.profile()
60
71
  def get_predictor_integration(record: db.Predictor) -> db.Integration:
61
72
  integration_record = (
@@ -7,14 +7,15 @@ from multiprocessing.pool import ThreadPool
7
7
  import pandas as pd
8
8
  from dateutil.parser import parse as parse_datetime
9
9
 
10
- from sqlalchemy import func, null
10
+ from sqlalchemy import func
11
11
  import numpy as np
12
12
 
13
13
  import mindsdb.interfaces.storage.db as db
14
14
  from mindsdb.utilities.config import Config
15
15
  from mindsdb.interfaces.model.functions import (
16
16
  get_model_record,
17
- get_model_records
17
+ get_model_records,
18
+ get_project_record
18
19
  )
19
20
  from mindsdb.interfaces.storage.json import get_json_storage
20
21
  from mindsdb.interfaces.storage.model_fs import ModelStorage
@@ -151,11 +152,7 @@ class ModelController():
151
152
  def delete_model(self, model_name: str, project_name: str = 'mindsdb', version=None):
152
153
  from mindsdb.interfaces.database.database import DatabaseController
153
154
 
154
- project_record = db.Project.query.filter(
155
- (func.lower(db.Project.name) == func.lower(project_name))
156
- & (db.Project.company_id == ctx.company_id)
157
- & (db.Project.deleted_at == null())
158
- ).first()
155
+ project_record = get_project_record(func.lower(project_name))
159
156
  if project_record is None:
160
157
  raise Exception(f"Project '{project_name}' does not exists")
161
158