MindsDB 25.7.3.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. See the registry's advisory page for more details.

Files changed (102)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +11 -1
  3. mindsdb/api/a2a/common/server/server.py +16 -6
  4. mindsdb/api/executor/command_executor.py +215 -150
  5. mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
  6. mindsdb/api/executor/planner/plan_join.py +3 -0
  7. mindsdb/api/executor/planner/plan_join_ts.py +117 -100
  8. mindsdb/api/executor/planner/query_planner.py +1 -0
  9. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
  10. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +21 -24
  11. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +9 -3
  12. mindsdb/api/executor/sql_query/steps/subselect_step.py +11 -8
  13. mindsdb/api/executor/utilities/mysql_to_duckdb_functions.py +264 -0
  14. mindsdb/api/executor/utilities/sql.py +30 -0
  15. mindsdb/api/http/initialize.py +18 -44
  16. mindsdb/api/http/namespaces/agents.py +23 -20
  17. mindsdb/api/http/namespaces/chatbots.py +83 -120
  18. mindsdb/api/http/namespaces/file.py +1 -1
  19. mindsdb/api/http/namespaces/jobs.py +38 -60
  20. mindsdb/api/http/namespaces/tree.py +69 -61
  21. mindsdb/api/http/namespaces/views.py +56 -72
  22. mindsdb/api/mcp/start.py +2 -0
  23. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
  24. mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
  25. mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
  26. mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
  27. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
  28. mindsdb/integrations/handlers/db2_handler/db2_handler.py +19 -23
  29. mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
  30. mindsdb/integrations/handlers/gong_handler/__about__.py +2 -0
  31. mindsdb/integrations/handlers/gong_handler/__init__.py +30 -0
  32. mindsdb/integrations/handlers/gong_handler/connection_args.py +37 -0
  33. mindsdb/integrations/handlers/gong_handler/gong_handler.py +164 -0
  34. mindsdb/integrations/handlers/gong_handler/gong_tables.py +508 -0
  35. mindsdb/integrations/handlers/gong_handler/icon.svg +25 -0
  36. mindsdb/integrations/handlers/gong_handler/test_gong_handler.py +125 -0
  37. mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
  38. mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
  39. mindsdb/integrations/handlers/huggingface_handler/__init__.py +8 -12
  40. mindsdb/integrations/handlers/huggingface_handler/finetune.py +203 -223
  41. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +360 -383
  42. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -7
  43. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -7
  44. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  45. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -77
  46. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
  47. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +5 -2
  48. mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
  49. mindsdb/integrations/handlers/openai_handler/constants.py +11 -30
  50. mindsdb/integrations/handlers/openai_handler/helpers.py +27 -34
  51. mindsdb/integrations/handlers/openai_handler/openai_handler.py +14 -12
  52. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
  53. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
  54. mindsdb/integrations/handlers/salesforce_handler/constants.py +215 -0
  55. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +141 -80
  56. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +0 -1
  57. mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
  58. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
  59. mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
  60. mindsdb/integrations/libs/llm/config.py +0 -14
  61. mindsdb/integrations/libs/llm/utils.py +0 -15
  62. mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
  63. mindsdb/integrations/utilities/files/file_reader.py +5 -19
  64. mindsdb/integrations/utilities/handler_utils.py +32 -12
  65. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +1 -1
  66. mindsdb/interfaces/agents/agents_controller.py +246 -149
  67. mindsdb/interfaces/agents/constants.py +0 -1
  68. mindsdb/interfaces/agents/langchain_agent.py +11 -6
  69. mindsdb/interfaces/data_catalog/data_catalog_loader.py +4 -4
  70. mindsdb/interfaces/database/database.py +38 -13
  71. mindsdb/interfaces/database/integrations.py +20 -5
  72. mindsdb/interfaces/database/projects.py +174 -23
  73. mindsdb/interfaces/database/views.py +86 -60
  74. mindsdb/interfaces/jobs/jobs_controller.py +103 -110
  75. mindsdb/interfaces/knowledge_base/controller.py +33 -6
  76. mindsdb/interfaces/knowledge_base/evaluate.py +2 -1
  77. mindsdb/interfaces/knowledge_base/executor.py +24 -0
  78. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +6 -10
  79. mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py +73 -0
  80. mindsdb/interfaces/query_context/context_controller.py +111 -145
  81. mindsdb/interfaces/skills/skills_controller.py +18 -6
  82. mindsdb/interfaces/storage/db.py +40 -6
  83. mindsdb/interfaces/variables/variables_controller.py +8 -15
  84. mindsdb/utilities/config.py +5 -3
  85. mindsdb/utilities/fs.py +54 -17
  86. mindsdb/utilities/functions.py +72 -60
  87. mindsdb/utilities/log.py +38 -6
  88. mindsdb/utilities/ps.py +7 -7
  89. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/METADATA +282 -268
  90. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/RECORD +94 -92
  91. mindsdb/integrations/handlers/anyscale_endpoints_handler/__about__.py +0 -9
  92. mindsdb/integrations/handlers/anyscale_endpoints_handler/__init__.py +0 -20
  93. mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py +0 -290
  94. mindsdb/integrations/handlers/anyscale_endpoints_handler/creation_args.py +0 -14
  95. mindsdb/integrations/handlers/anyscale_endpoints_handler/icon.svg +0 -4
  96. mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -2
  97. mindsdb/integrations/handlers/anyscale_endpoints_handler/settings.py +0 -51
  98. mindsdb/integrations/handlers/anyscale_endpoints_handler/tests/test_anyscale_endpoints_handler.py +0 -212
  99. /mindsdb/integrations/handlers/{anyscale_endpoints_handler/tests/__init__.py → gong_handler/requirements.txt} +0 -0
  100. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/WHEEL +0 -0
  101. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/licenses/LICENSE +0 -0
  102. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/top_level.txt +0 -0
@@ -21,29 +21,29 @@ from mindsdb.utilities import log
21
21
 
22
22
  logger = log.getLogger(__name__)
23
23
 
24
- default_project = config.get('default_project')
24
+ default_project = config.get("default_project")
25
25
 
26
26
 
27
27
  def split_sql(sql):
28
28
  # split sql by ';' ignoring delimiter in quotes
29
- pattern = re.compile(r'''((?:[^;"']|"[^"]*"|'[^']*')+)''')
29
+ pattern = re.compile(r"""((?:[^;"']|"[^"]*"|'[^']*')+)""")
30
30
  return pattern.split(sql)[1::2]
31
31
 
32
32
 
33
33
  def calc_next_date(schedule_str, base_date: dt.datetime):
34
34
  schedule_str = schedule_str.lower().strip()
35
35
 
36
- repeat_prefix = 'every '
36
+ repeat_prefix = "every "
37
37
  if schedule_str.startswith(repeat_prefix):
38
- repeat_str = schedule_str[len(repeat_prefix):]
38
+ repeat_str = schedule_str[len(repeat_prefix) :]
39
39
  else:
40
40
  # TODO cron format
41
- raise NotImplementedError(f'Schedule: {schedule_str}')
41
+ raise NotImplementedError(f"Schedule: {schedule_str}")
42
42
 
43
43
  items = repeat_str.split()
44
44
 
45
45
  if len(items) == 1:
46
- value = '1'
46
+ value = "1"
47
47
  period = items[0]
48
48
  elif len(items) == 2:
49
49
  value, period = items
@@ -53,15 +53,15 @@ def calc_next_date(schedule_str, base_date: dt.datetime):
53
53
  if not value.isdigit():
54
54
  raise Exception(f"Number expected: {value}")
55
55
  value = int(value)
56
- if period in ('minute', 'minutes', 'min'):
56
+ if period in ("minute", "minutes", "min"):
57
57
  delta = dt.timedelta(minutes=value)
58
- elif period in ('hour', 'hours'):
58
+ elif period in ("hour", "hours"):
59
59
  delta = dt.timedelta(hours=value)
60
- elif period in ('day', 'days'):
60
+ elif period in ("day", "days"):
61
61
  delta = dt.timedelta(days=value)
62
- elif period in ('week', 'weeks'):
62
+ elif period in ("week", "weeks"):
63
63
  delta = dt.timedelta(days=value * 7) # 1 week = 7 days
64
- elif period in ('month', 'months'):
64
+ elif period in ("month", "months"):
65
65
  delta = relativedelta(months=value)
66
66
  else:
67
67
  raise Exception(f"Unknown period: {period}")
@@ -85,10 +85,10 @@ def parse_job_date(date_str: str) -> dt.datetime:
85
85
  :return:
86
86
  """
87
87
 
88
- if date_str.upper() == 'NOW':
88
+ if date_str.upper() == "NOW":
89
89
  return dt.datetime.now()
90
90
 
91
- date_formats = ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d']
91
+ date_formats = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]
92
92
  date = None
93
93
  for date_format in date_formats:
94
94
  try:
@@ -128,39 +128,41 @@ class JobsController:
128
128
  at the moment supports: 'every <number> <dimension>' or 'every <dimension>'
129
129
  :return: name of created job
130
130
  """
131
+ if not name.islower():
132
+ raise ValueError(f"The name must be in lower case: {name}")
131
133
 
132
134
  project_controller = ProjectController()
133
135
  project = project_controller.get(name=project_name)
134
136
 
135
137
  # check if exists
136
138
  if self.get(name, project_name) is not None:
137
- raise EntityExistsError('Job already exists', name)
139
+ raise EntityExistsError("Job already exists", name)
138
140
 
139
141
  if start_at is None:
140
142
  start_at = dt.datetime.now()
141
143
 
142
144
  if end_at is not None and end_at < start_at:
143
- raise Exception(f'Wrong end date {start_at} > {end_at}')
145
+ raise Exception(f"Wrong end date {start_at} > {end_at}")
144
146
 
145
147
  # check sql = try to parse it
146
148
  for sql in split_sql(query):
147
149
  try:
148
150
  # replace template variables with null
149
- sql = re.sub(r'\{\{[\w\d]+}}', "", sql)
151
+ sql = re.sub(r"\{\{[\w\d]+}}", "", sql)
150
152
 
151
153
  parse_sql(sql)
152
154
  except ParsingException as e:
153
- raise ParsingException(f'Unable to parse: {sql}: {e}')
155
+ raise ParsingException(f"Unable to parse: {sql}: {e}")
154
156
 
155
157
  if if_query is not None:
156
158
  for sql in split_sql(if_query):
157
159
  try:
158
160
  # replace template variables with null
159
- sql = re.sub(r'\{\{[\w\d]+}}', "", sql)
161
+ sql = re.sub(r"\{\{[\w\d]+}}", "", sql)
160
162
 
161
163
  parse_sql(sql)
162
164
  except ParsingException as e:
163
- raise ParsingException(f'Unable to parse: {sql}: {e}')
165
+ raise ParsingException(f"Unable to parse: {sql}: {e}")
164
166
 
165
167
  # plan next run
166
168
  next_run_at = start_at
@@ -185,7 +187,7 @@ class JobsController:
185
187
  start_at=start_at,
186
188
  end_at=end_at,
187
189
  next_run_at=next_run_at,
188
- schedule_str=schedule_str
190
+ schedule_str=schedule_str,
189
191
  )
190
192
  db.session.add(record)
191
193
  db.session.commit()
@@ -219,10 +221,11 @@ class JobsController:
219
221
 
220
222
  schedule_str = None
221
223
  if query.repeat_str is not None:
222
- schedule_str = 'every ' + query.repeat_str
224
+ schedule_str = "every " + query.repeat_str
223
225
 
224
226
  return self.add(
225
- name, project_name,
227
+ name,
228
+ project_name,
226
229
  query=query_str,
227
230
  start_at=start_at,
228
231
  end_at=end_at,
@@ -231,36 +234,30 @@ class JobsController:
231
234
  )
232
235
 
233
236
  def delete(self, name, project_name):
234
-
235
237
  project_controller = ProjectController()
236
238
  project = project_controller.get(name=project_name)
237
239
 
238
240
  # check if exists
239
- record = db.session.query(db.Jobs).filter_by(
240
- company_id=ctx.company_id,
241
- name=name,
242
- project_id=project.id,
243
- deleted_at=sa.null()
244
- ).first()
241
+ record = (
242
+ db.session.query(db.Jobs)
243
+ .filter_by(company_id=ctx.company_id, name=name, project_id=project.id, deleted_at=sa.null())
244
+ .first()
245
+ )
245
246
  if record is None:
246
- raise EntityNotExistsError('Job does not exist', name)
247
+ raise EntityNotExistsError("Job does not exist", name)
247
248
 
248
249
  self._delete_record(record)
249
250
  db.session.commit()
250
251
 
251
252
  # delete context
252
- query_context_controller.drop_query_context('job', record.id)
253
- query_context_controller.drop_query_context('job-if', record.id)
253
+ query_context_controller.drop_query_context("job", record.id)
254
+ query_context_controller.drop_query_context("job-if", record.id)
254
255
 
255
256
  def _delete_record(self, record):
256
257
  record.deleted_at = dt.datetime.now()
257
258
 
258
259
  def get_list(self, project_name=None):
259
-
260
- query = db.session.query(db.Jobs).filter_by(
261
- company_id=ctx.company_id,
262
- deleted_at=sa.null()
263
- )
260
+ query = db.session.query(db.Jobs).filter_by(company_id=ctx.company_id, deleted_at=sa.null())
264
261
 
265
262
  project_controller = ProjectController()
266
263
  if project_name is not None:
@@ -268,23 +265,22 @@ class JobsController:
268
265
  query = query.filter_by(project_id=project.id)
269
266
 
270
267
  data = []
271
- project_names = {
272
- i.id: i.name
273
- for i in project_controller.get_list()
274
- }
268
+ project_names = {i.id: i.name for i in project_controller.get_list()}
275
269
  for record in query:
276
- data.append({
277
- 'id': record.id,
278
- 'name': record.name,
279
- 'project': project_names[record.project_id],
280
- 'start_at': record.start_at,
281
- 'end_at': record.end_at,
282
- 'next_run_at': record.next_run_at,
283
- 'schedule_str': record.schedule_str,
284
- 'query': record.query_str,
285
- 'if_query': record.if_query_str,
286
- 'variables': query_context_controller.get_context_vars('job', record.id)
287
- })
270
+ data.append(
271
+ {
272
+ "id": record.id,
273
+ "name": record.name,
274
+ "project": project_names[record.project_id],
275
+ "start_at": record.start_at,
276
+ "end_at": record.end_at,
277
+ "next_run_at": record.next_run_at,
278
+ "schedule_str": record.schedule_str,
279
+ "query": record.query_str,
280
+ "if_query": record.if_query_str,
281
+ "variables": query_context_controller.get_context_vars("job", record.id),
282
+ }
283
+ )
288
284
  return data
289
285
 
290
286
  def get(self, name: str, project_name: str) -> dict:
@@ -298,25 +294,24 @@ class JobsController:
298
294
  project_controller = ProjectController()
299
295
  project = project_controller.get(name=project_name)
300
296
 
301
- record = db.session.query(db.Jobs).filter_by(
302
- company_id=ctx.company_id,
303
- name=name,
304
- project_id=project.id,
305
- deleted_at=sa.null()
306
- ).first()
297
+ record = (
298
+ db.session.query(db.Jobs)
299
+ .filter_by(company_id=ctx.company_id, name=name, project_id=project.id, deleted_at=sa.null())
300
+ .first()
301
+ )
307
302
 
308
303
  if record is not None:
309
304
  return {
310
- 'id': record.id,
311
- 'name': record.name,
312
- 'project': project_name,
313
- 'start_at': record.start_at,
314
- 'end_at': record.end_at,
315
- 'next_run_at': record.next_run_at,
316
- 'schedule_str': record.schedule_str,
317
- 'query': record.query_str,
318
- 'if_query': record.if_query_str,
319
- 'variables': query_context_controller.get_context_vars('job', record.id)
305
+ "id": record.id,
306
+ "name": record.name,
307
+ "project": project_name,
308
+ "start_at": record.start_at,
309
+ "end_at": record.end_at,
310
+ "next_run_at": record.next_run_at,
311
+ "schedule_str": record.schedule_str,
312
+ "query": record.query_str,
313
+ "if_query": record.if_query_str,
314
+ "variables": query_context_controller.get_context_vars("job", record.id),
320
315
  }
321
316
 
322
317
  def get_history(self, name: str, project_name: str) -> List[dict]:
@@ -331,27 +326,33 @@ class JobsController:
331
326
 
332
327
  query = Select(
333
328
  targets=[Star()],
334
- from_table=Identifier('jobs_history'),
335
- where=BinaryOperation(op='and', args=[
336
- BinaryOperation(op='=', args=[Identifier('name'), Constant(name)]),
337
- BinaryOperation(op='=', args=[Identifier('project'), Constant(project_name)])
338
- ])
329
+ from_table=Identifier("jobs_history"),
330
+ where=BinaryOperation(
331
+ op="and",
332
+ args=[
333
+ BinaryOperation(op="=", args=[Identifier("name"), Constant(name)]),
334
+ BinaryOperation(op="=", args=[Identifier("project"), Constant(project_name)]),
335
+ ],
336
+ ),
339
337
  )
340
338
  response = logs_db_controller.query(query)
341
339
 
342
- names = [i['name'] for i in response.columns]
343
- return response.data_frame[names].to_dict(orient='records')
340
+ names = [i["name"] for i in response.columns]
341
+ return response.data_frame[names].to_dict(orient="records")
344
342
 
345
343
 
346
344
  class JobsExecutor:
347
-
348
345
  def get_next_tasks(self):
349
346
  # filter next_run < now
350
- query = db.session.query(db.Jobs).filter(
351
- db.Jobs.next_run_at < dt.datetime.now(),
352
- db.Jobs.deleted_at == sa.null(),
353
- db.Jobs.active == True, # noqa
354
- ).order_by(db.Jobs.next_run_at)
347
+ query = (
348
+ db.session.query(db.Jobs)
349
+ .filter(
350
+ db.Jobs.next_run_at < dt.datetime.now(),
351
+ db.Jobs.deleted_at == sa.null(),
352
+ db.Jobs.active == True, # noqa
353
+ )
354
+ .order_by(db.Jobs.next_run_at)
355
+ )
355
356
 
356
357
  return query.all()
357
358
 
@@ -389,12 +390,7 @@ class JobsExecutor:
389
390
  record = db.Jobs.query.get(record_id)
390
391
 
391
392
  try:
392
-
393
- history_record = db.JobsHistory(
394
- job_id=record.id,
395
- start_at=record.next_run_at,
396
- company_id=record.company_id
397
- )
393
+ history_record = db.JobsHistory(job_id=record.id, start_at=record.next_run_at, company_id=record.company_id)
398
394
 
399
395
  db.session.add(history_record)
400
396
  db.session.commit()
@@ -408,9 +404,7 @@ class JobsExecutor:
408
404
 
409
405
  # check if it is an old lock
410
406
  history_record = db.JobsHistory.query.filter_by(
411
- job_id=record.id,
412
- start_at=record.next_run_at,
413
- company_id=record.company_id
407
+ job_id=record.id, start_at=record.next_run_at, company_id=record.company_id
414
408
  ).first()
415
409
  if history_record.updated_at < dt.datetime.now() - dt.timedelta(seconds=30):
416
410
  db.session.delete(history_record)
@@ -419,13 +413,14 @@ class JobsExecutor:
419
413
  return None
420
414
 
421
415
  def __fill_variables(self, sql, record, history_record):
422
- if '{{PREVIOUS_START_DATETIME}}' in sql:
416
+ if "{{PREVIOUS_START_DATETIME}}" in sql:
423
417
  # get previous run date
424
- history_prev = db.session.query(db.JobsHistory.start_at) \
425
- .filter(db.JobsHistory.job_id == record.id,
426
- db.JobsHistory.id != history_record.id) \
427
- .order_by(db.JobsHistory.id.desc()) \
418
+ history_prev = (
419
+ db.session.query(db.JobsHistory.start_at)
420
+ .filter(db.JobsHistory.job_id == record.id, db.JobsHistory.id != history_record.id)
421
+ .order_by(db.JobsHistory.id.desc())
428
422
  .first()
423
+ )
429
424
  if history_prev is None:
430
425
  # start date of the job
431
426
  value = record.created_at
@@ -433,18 +428,17 @@ class JobsExecutor:
433
428
  # fix for twitter: created_at filter must be minimum of 10 seconds prior to the current time
434
429
  value = history_prev.start_at - dt.timedelta(seconds=60)
435
430
  value = value.strftime("%Y-%m-%d %H:%M:%S")
436
- sql = sql.replace('{{PREVIOUS_START_DATETIME}}', value)
431
+ sql = sql.replace("{{PREVIOUS_START_DATETIME}}", value)
437
432
 
438
- if '{{START_DATE}}' in sql:
433
+ if "{{START_DATE}}" in sql:
439
434
  value = history_record.start_at.strftime("%Y-%m-%d")
440
- sql = sql.replace('{{START_DATE}}', value)
441
- if '{{START_DATETIME}}' in sql:
435
+ sql = sql.replace("{{START_DATE}}", value)
436
+ if "{{START_DATETIME}}" in sql:
442
437
  value = history_record.start_at.strftime("%Y-%m-%d %H:%M:%S")
443
- sql = sql.replace('{{START_DATETIME}}', value)
438
+ sql = sql.replace("{{START_DATETIME}}", value)
444
439
  return sql
445
440
 
446
441
  def execute_task_local(self, record_id, history_id=None):
447
-
448
442
  record = db.Jobs.query.get(record_id)
449
443
 
450
444
  # set up environment
@@ -470,7 +464,7 @@ class JobsExecutor:
470
464
 
471
465
  project_controller = ProjectController()
472
466
  project = project_controller.get(record.project_id)
473
- executed_sql = ''
467
+ executed_sql = ""
474
468
 
475
469
  from mindsdb.api.executor.controllers.session_controller import SessionController
476
470
  from mindsdb.api.executor.command_executor import ExecuteCommands
@@ -480,8 +474,8 @@ class JobsExecutor:
480
474
  command_executor = ExecuteCommands(sql_session)
481
475
 
482
476
  # job with condition?
483
- query_context_controller.set_context('job-if', record.id)
484
- error = ''
477
+ query_context_controller.set_context("job-if", record.id)
478
+ error = ""
485
479
  to_execute_query = True
486
480
  if record.if_query_str is not None:
487
481
  data = None
@@ -491,7 +485,7 @@ class JobsExecutor:
491
485
  sql = self.__fill_variables(sql, record, history_record)
492
486
 
493
487
  query = parse_sql(sql)
494
- executed_sql += sql + '; '
488
+ executed_sql += sql + "; "
495
489
 
496
490
  ret = command_executor.execute_command(query)
497
491
  if ret.error_code is not None:
@@ -508,17 +502,16 @@ class JobsExecutor:
508
502
  if error or data is None or len(data) == 0:
509
503
  to_execute_query = False
510
504
 
511
- query_context_controller.release_context('job-if', record.id)
505
+ query_context_controller.release_context("job-if", record.id)
512
506
  if to_execute_query:
513
-
514
- query_context_controller.set_context('job', record.id)
507
+ query_context_controller.set_context("job", record.id)
515
508
  for sql in split_sql(record.query_str):
516
509
  try:
517
510
  # fill template variables
518
511
  sql = self.__fill_variables(sql, record, history_record)
519
512
 
520
513
  query = parse_sql(sql)
521
- executed_sql += sql + '; '
514
+ executed_sql += sql + "; "
522
515
 
523
516
  ret = command_executor.execute_command(query)
524
517
  if ret.error_code is not None:
@@ -60,6 +60,7 @@ class KnowledgeBaseInputParams(BaseModel):
60
60
  is_sparse: bool = False
61
61
  vector_size: int | None = None
62
62
  reranking_model: Dict[Text, Any] | None = None
63
+ preprocessing: Dict[Text, Any] | None = None
63
64
 
64
65
  class Config:
65
66
  extra = "forbid"
@@ -244,9 +245,9 @@ class KnowledgeBaseTable:
244
245
  keyword_search_cols_and_values = []
245
246
  query_text = None
246
247
  relevance_threshold = None
247
- reranking_enabled_flag = True
248
248
  hybrid_search_enabled_flag = False
249
249
  query_conditions = db_handler.extract_conditions(query.where)
250
+ hybrid_search_alpha = None # Default to None, meaning no alpha weighted blending
250
251
  if query_conditions is not None:
251
252
  for item in query_conditions:
252
253
  if item.column == "relevance" and item.op.value == FilterOperator.GREATER_THAN_OR_EQUAL.value:
@@ -261,10 +262,8 @@ class KnowledgeBaseTable:
261
262
  logger.error(error_msg)
262
263
  raise ValueError(error_msg)
263
264
  elif item.column == "reranking":
264
- reranking_enabled_flag = item.value
265
- # cast to boolean
266
- if isinstance(reranking_enabled_flag, str):
267
- reranking_enabled_flag = reranking_enabled_flag.lower() not in ("false")
265
+ if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
266
+ disable_reranking = True
268
267
  elif item.column == "hybrid_search":
269
268
  hybrid_search_enabled_flag = item.value
270
269
  # cast to boolean
@@ -272,6 +271,14 @@ class KnowledgeBaseTable:
272
271
  hybrid_search_enabled_flag = hybrid_search_enabled_flag.lower() not in ("false")
273
272
  if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
274
273
  disable_reranking = True
274
+ elif item.column == "hybrid_search_alpha":
275
+ # validate item.value is a float
276
+ if not isinstance(item.value, (float, int)):
277
+ raise ValueError(f"Invalid hybrid_search_alpha value: {item.value}. Must be a float or int.")
278
+ # validate hybrid search alpha is between 0 and 1
279
+ if not (0 <= item.value <= 1):
280
+ raise ValueError(f"Invalid hybrid_search_alpha value: {item.value}. Must be between 0 and 1.")
281
+ hybrid_search_alpha = item.value
275
282
  elif item.column == "relevance" and item.op.value != FilterOperator.GREATER_THAN_OR_EQUAL.value:
276
283
  raise ValueError(
277
284
  f"Invalid operator for relevance: {item.op.value}. Only GREATER_THAN_OR_EQUAL is allowed."
@@ -345,7 +352,15 @@ class KnowledgeBaseTable:
345
352
  f"Keyword search returned different columns: {df_keyword_select.columns} "
346
353
  f"than expected: {df.columns}"
347
354
  )
355
+ if hybrid_search_alpha:
356
+ df_keyword_select[TableField.DISTANCE.value] = (
357
+ hybrid_search_alpha * df_keyword_select[TableField.DISTANCE.value]
358
+ )
359
+ df[TableField.DISTANCE.value] = (1 - hybrid_search_alpha) * df[TableField.DISTANCE.value]
348
360
  df = pd.concat([df, df_keyword_select], ignore_index=True)
361
+ # sort by distance if distance column exists
362
+ if TableField.DISTANCE.value in df.columns:
363
+ df = df.sort_values(by=TableField.DISTANCE.value, ascending=True)
349
364
  # if chunk_id column exists remove duplicates based on chunk_id
350
365
  if "chunk_id" in df.columns:
351
366
  df = df.drop_duplicates(subset=["chunk_id"])
@@ -519,6 +534,9 @@ class KnowledgeBaseTable:
519
534
 
520
535
  query.update_columns[emb_col] = Constant(self._content_to_embeddings(content))
521
536
 
537
+ if "metadata" not in query.update_columns:
538
+ query.update_columns["metadata"] = Constant({})
539
+
522
540
  # TODO search content in where clause?
523
541
 
524
542
  # set table name
@@ -1010,6 +1028,9 @@ class KnowledgeBaseController:
1010
1028
  :param is_sparse: Whether to use sparse vectors for embeddings
1011
1029
  :param vector_size: Optional size specification for vectors, required when is_sparse=True
1012
1030
  """
1031
+ if not name.islower():
1032
+ raise ValueError(f"The name must be in lower case: {name}")
1033
+
1013
1034
  # fill variables
1014
1035
  params = variables_controller.fill_parameters(params)
1015
1036
 
@@ -1118,8 +1139,14 @@ class KnowledgeBaseController:
1118
1139
  else:
1119
1140
  vector_db_name, vector_table_name = storage.parts
1120
1141
 
1142
+ data_node = self.session.datahub.get(vector_db_name)
1143
+ if data_node:
1144
+ vector_store_handler = data_node.integration_handler
1145
+ else:
1146
+ raise ValueError(
1147
+ f"Unable to find database named {vector_db_name}, please make sure {vector_db_name} is defined"
1148
+ )
1121
1149
  # create table in vectordb before creating KB
1122
- vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
1123
1150
  vector_store_handler.create_table(vector_table_name)
1124
1151
  if keyword_search_enabled:
1125
1152
  vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)
@@ -118,7 +118,8 @@ class EvaluateBase:
118
118
 
119
119
  dn, table_name = self._get_dn_table(query.from_table)
120
120
  query.from_table = table_name
121
- query.limit = Constant(self.DEFAULT_SAMPLE_SIZE)
121
+ if query.limit is None:
122
+ query.limit = Constant(self.DEFAULT_SAMPLE_SIZE)
122
123
 
123
124
  response = dn.query(query=query, session=self.session)
124
125
  df = response.data_frame
@@ -217,6 +217,17 @@ class KnowledgeBaseQueryExecutor:
217
217
  f'Operator "{content_condition.op}" is not supported for condition: {content_condition}'
218
218
  )
219
219
 
220
+ @staticmethod
221
+ def to_include_content(content_condition: BinaryOperation) -> List[str]:
222
+ """
223
+ Handles positive conditions for content. Returns list of content values
224
+ """
225
+ if content_condition.op == "IN":
226
+ return [item.value for item in content_condition.args[1].items]
227
+
228
+ elif content_condition.op in ("=", "LIKE"):
229
+ return [content_condition.args[1].value]
230
+
220
231
  def to_excluded_ids(
221
232
  self, content_condition: BinaryOperation, other_conditions: List[BinaryOperation]
222
233
  ) -> Optional[List[str]]:
@@ -290,11 +301,17 @@ class KnowledgeBaseQueryExecutor:
290
301
  if len(content_filters) > 0:
291
302
  content_filters2 = []
292
303
  exclude_ids = set()
304
+ include_contents = set()
293
305
  # exclude content conditions
294
306
  for condition in content_filters:
295
307
  ids = self.to_excluded_ids(condition, other_filters)
296
308
  if ids is not None:
297
309
  exclude_ids.update(ids)
310
+ continue
311
+ contents = self.to_include_content(condition)
312
+ if contents is not None:
313
+ include_contents.update(contents)
314
+ continue
298
315
  else:
299
316
  # keep origin content filter
300
317
  content_filters2.append(condition)
@@ -305,6 +322,13 @@ class KnowledgeBaseQueryExecutor:
305
322
  condition = BinaryOperation(op="NOT IN", args=[Identifier(self.id_column), Tuple(values)])
306
323
  other_filters.append(condition)
307
324
  # execute content filters
325
+ if include_contents:
326
+ content = " AND ".join(include_contents)
327
+ result = self.execute_content_condition(
328
+ BinaryOperation(op="=", args=[Identifier(self.content_column), Constant(content)]),
329
+ other_filters,
330
+ )
331
+ results.append(result)
308
332
  for condition in content_filters2:
309
333
  result = self.execute_content_condition(condition, other_filters)
310
334
  results.append(result)
@@ -4,8 +4,7 @@ import asyncio
4
4
  from typing import List, Dict, Optional, Any
5
5
 
6
6
  import pandas as pd
7
- from langchain_text_splitters import RecursiveCharacterTextSplitter
8
- from langchain_core.documents import Document as LangchainDocument
7
+ from mindsdb.interfaces.knowledge_base.preprocessing.text_splitter import TextSplitter
9
8
 
10
9
  from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
11
10
  FileSplitter,
@@ -22,7 +21,6 @@ from mindsdb.interfaces.knowledge_base.preprocessing.models import (
22
21
  )
23
22
  from mindsdb.utilities import log
24
23
 
25
-
26
24
  logger = log.getLogger(__name__)
27
25
 
28
26
  _DEFAULT_CONTENT_COLUMN_NAME = "content"
@@ -49,11 +47,10 @@ class DocumentPreprocessor:
49
47
  if self.splitter is None:
50
48
  raise ValueError("Splitter not configured")
51
49
 
52
- # Convert to langchain Document for splitting
53
- langchain_doc = LangchainDocument(page_content=doc.content, metadata=doc.metadata or {})
50
+ metadata = doc.metadata or {}
54
51
  # Split and convert back to our Document type
55
- split_docs = self.splitter.split_documents([langchain_doc])
56
- return [Document(content=split_doc.page_content, metadata=split_doc.metadata) for split_doc in split_docs]
52
+ split_texts = self.splitter.split_text(doc.content)
53
+ return [Document(content=text, metadata=metadata) for text in split_texts]
57
54
 
58
55
  def _get_source(self) -> str:
59
56
  """Get the source identifier for this preprocessor"""
@@ -266,16 +263,15 @@ Please give a short succinct context to situate this chunk within the overall do
266
263
 
267
264
 
268
265
  class TextChunkingPreprocessor(DocumentPreprocessor):
269
- """Default text chunking preprocessor using RecursiveCharacterTextSplitter"""
266
+ """Default text chunking preprocessor using TextSplitter"""
270
267
 
271
268
  def __init__(self, config: Optional[TextChunkingConfig] = None):
272
269
  """Initialize with text chunking configuration"""
273
270
  super().__init__()
274
271
  self.config = config or TextChunkingConfig()
275
- self.splitter = RecursiveCharacterTextSplitter(
272
+ self.splitter = TextSplitter(
276
273
  chunk_size=self.config.chunk_size,
277
274
  chunk_overlap=self.config.chunk_overlap,
278
- length_function=self.config.length_function,
279
275
  separators=self.config.separators,
280
276
  )
281
277