MindsDB 25.7.3.0-py3-none-any.whl → 25.8.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +11 -1
- mindsdb/api/a2a/common/server/server.py +16 -6
- mindsdb/api/executor/command_executor.py +215 -150
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
- mindsdb/api/executor/planner/plan_join.py +3 -0
- mindsdb/api/executor/planner/plan_join_ts.py +117 -100
- mindsdb/api/executor/planner/query_planner.py +1 -0
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +21 -24
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +9 -3
- mindsdb/api/executor/sql_query/steps/subselect_step.py +11 -8
- mindsdb/api/executor/utilities/mysql_to_duckdb_functions.py +264 -0
- mindsdb/api/executor/utilities/sql.py +30 -0
- mindsdb/api/http/initialize.py +18 -44
- mindsdb/api/http/namespaces/agents.py +23 -20
- mindsdb/api/http/namespaces/chatbots.py +83 -120
- mindsdb/api/http/namespaces/file.py +1 -1
- mindsdb/api/http/namespaces/jobs.py +38 -60
- mindsdb/api/http/namespaces/tree.py +69 -61
- mindsdb/api/http/namespaces/views.py +56 -72
- mindsdb/api/mcp/start.py +2 -0
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
- mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
- mindsdb/integrations/handlers/db2_handler/db2_handler.py +19 -23
- mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/gong_handler/__about__.py +2 -0
- mindsdb/integrations/handlers/gong_handler/__init__.py +30 -0
- mindsdb/integrations/handlers/gong_handler/connection_args.py +37 -0
- mindsdb/integrations/handlers/gong_handler/gong_handler.py +164 -0
- mindsdb/integrations/handlers/gong_handler/gong_tables.py +508 -0
- mindsdb/integrations/handlers/gong_handler/icon.svg +25 -0
- mindsdb/integrations/handlers/gong_handler/test_gong_handler.py +125 -0
- mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
- mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +8 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +203 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +360 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -7
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -7
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -77
- mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +5 -2
- mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
- mindsdb/integrations/handlers/openai_handler/constants.py +11 -30
- mindsdb/integrations/handlers/openai_handler/helpers.py +27 -34
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +14 -12
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
- mindsdb/integrations/handlers/salesforce_handler/constants.py +215 -0
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +141 -80
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +0 -1
- mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
- mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
- mindsdb/integrations/libs/llm/config.py +0 -14
- mindsdb/integrations/libs/llm/utils.py +0 -15
- mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
- mindsdb/integrations/utilities/files/file_reader.py +5 -19
- mindsdb/integrations/utilities/handler_utils.py +32 -12
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +1 -1
- mindsdb/interfaces/agents/agents_controller.py +246 -149
- mindsdb/interfaces/agents/constants.py +0 -1
- mindsdb/interfaces/agents/langchain_agent.py +11 -6
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +4 -4
- mindsdb/interfaces/database/database.py +38 -13
- mindsdb/interfaces/database/integrations.py +20 -5
- mindsdb/interfaces/database/projects.py +174 -23
- mindsdb/interfaces/database/views.py +86 -60
- mindsdb/interfaces/jobs/jobs_controller.py +103 -110
- mindsdb/interfaces/knowledge_base/controller.py +33 -6
- mindsdb/interfaces/knowledge_base/evaluate.py +2 -1
- mindsdb/interfaces/knowledge_base/executor.py +24 -0
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +6 -10
- mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py +73 -0
- mindsdb/interfaces/query_context/context_controller.py +111 -145
- mindsdb/interfaces/skills/skills_controller.py +18 -6
- mindsdb/interfaces/storage/db.py +40 -6
- mindsdb/interfaces/variables/variables_controller.py +8 -15
- mindsdb/utilities/config.py +5 -3
- mindsdb/utilities/fs.py +54 -17
- mindsdb/utilities/functions.py +72 -60
- mindsdb/utilities/log.py +38 -6
- mindsdb/utilities/ps.py +7 -7
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/METADATA +282 -268
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/RECORD +94 -92
- mindsdb/integrations/handlers/anyscale_endpoints_handler/__about__.py +0 -9
- mindsdb/integrations/handlers/anyscale_endpoints_handler/__init__.py +0 -20
- mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py +0 -290
- mindsdb/integrations/handlers/anyscale_endpoints_handler/creation_args.py +0 -14
- mindsdb/integrations/handlers/anyscale_endpoints_handler/icon.svg +0 -4
- mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -2
- mindsdb/integrations/handlers/anyscale_endpoints_handler/settings.py +0 -51
- mindsdb/integrations/handlers/anyscale_endpoints_handler/tests/test_anyscale_endpoints_handler.py +0 -212
- /mindsdb/integrations/handlers/{anyscale_endpoints_handler/tests/__init__.py → gong_handler/requirements.txt} +0 -0
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/jobs/jobs_controller.py

@@ -21,29 +21,29 @@ from mindsdb.utilities import log
 
 logger = log.getLogger(__name__)
 
-default_project = config.get('default_project')
+default_project = config.get("default_project")
 
 
 def split_sql(sql):
     # split sql by ';' ignoring delimiter in quotes
-    pattern = re.compile(r'''((?:[^;"']|"[^"]*"|'[^']*')+)''')
+    pattern = re.compile(r"""((?:[^;"']|"[^"]*"|'[^']*')+)""")
     return pattern.split(sql)[1::2]
 
 
 def calc_next_date(schedule_str, base_date: dt.datetime):
     schedule_str = schedule_str.lower().strip()
 
-    repeat_prefix = 'every '
+    repeat_prefix = "every "
     if schedule_str.startswith(repeat_prefix):
-        repeat_str = schedule_str[len(repeat_prefix):]
+        repeat_str = schedule_str[len(repeat_prefix) :]
     else:
         # TODO cron format
-        raise NotImplementedError(f'Schedule: {schedule_str}')
+        raise NotImplementedError(f"Schedule: {schedule_str}")
 
     items = repeat_str.split()
 
     if len(items) == 1:
-        value = '1'
+        value = "1"
         period = items[0]
     elif len(items) == 2:
         value, period = items
@@ -53,15 +53,15 @@ def calc_next_date(schedule_str, base_date: dt.datetime):
     if not value.isdigit():
         raise Exception(f"Number expected: {value}")
     value = int(value)
-    if period in ('minute', 'minutes', 'min'):
+    if period in ("minute", "minutes", "min"):
         delta = dt.timedelta(minutes=value)
-    elif period in ('hour', 'hours'):
+    elif period in ("hour", "hours"):
         delta = dt.timedelta(hours=value)
-    elif period in ('day', 'days'):
+    elif period in ("day", "days"):
         delta = dt.timedelta(days=value)
-    elif period in ('week', 'weeks'):
+    elif period in ("week", "weeks"):
         delta = dt.timedelta(days=value * 7)  # 1 week = 7 days
-    elif period in ('month', 'months'):
+    elif period in ("month", "months"):
         delta = relativedelta(months=value)
     else:
         raise Exception(f"Unknown period: {period}")
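Aside: the period table above maps the "every <number> <dimension>" grammar straight onto timedelta/relativedelta arguments. A minimal standalone sketch of the same logic (names are illustrative, not MindsDB's API):

import datetime as dt
from dateutil.relativedelta import relativedelta

def next_delta(schedule_str: str):
    # "every <period>" implies a count of 1; "every <n> <period>" parses <n>
    items = schedule_str.lower().strip().removeprefix("every ").split()
    value, period = (int(items[0]), items[1]) if len(items) == 2 else (1, items[0])
    if period in ("minute", "minutes", "min"):
        return dt.timedelta(minutes=value)
    if period in ("hour", "hours"):
        return dt.timedelta(hours=value)
    if period in ("day", "days"):
        return dt.timedelta(days=value)
    if period in ("week", "weeks"):
        return dt.timedelta(days=value * 7)
    if period in ("month", "months"):
        return relativedelta(months=value)
    raise ValueError(f"Unknown period: {period}")

print(dt.datetime(2025, 8, 1) + next_delta("every 2 hours"))  # 2025-08-01 02:00:00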
@@ -85,10 +85,10 @@ def parse_job_date(date_str: str) -> dt.datetime:
     :return:
     """
 
-    if date_str.upper() == 'NOW':
+    if date_str.upper() == "NOW":
         return dt.datetime.now()
 
-    date_formats = ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d']
+    date_formats = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]
     date = None
     for date_format in date_formats:
        try:
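Aside: parse_job_date tries each format in order and falls through to the next on failure. A self-contained sketch of that behavior (function name is illustrative):

import datetime as dt

def parse_date(date_str: str) -> dt.datetime:
    # "NOW" short-circuits; otherwise try the supported formats in order
    if date_str.upper() == "NOW":
        return dt.datetime.now()
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
        try:
            return dt.datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unsupported date format: {date_str}")

print(parse_date("2025-08-15"))  # 2025-08-15 00:00:00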
@@ -128,39 +128,41 @@ class JobsController:
         at the moment supports: 'every <number> <dimension>' or 'every <dimension>'
         :return: name of created job
         """
+        if not name.islower():
+            raise ValueError(f"The name must be in lower case: {name}")
 
         project_controller = ProjectController()
         project = project_controller.get(name=project_name)
 
         # check if exists
         if self.get(name, project_name) is not None:
-            raise EntityExistsError('Job already exists', name)
+            raise EntityExistsError("Job already exists", name)
 
         if start_at is None:
             start_at = dt.datetime.now()
 
         if end_at is not None and end_at < start_at:
-            raise Exception(f'Wrong end date {start_at} > {end_at}')
+            raise Exception(f"Wrong end date {start_at} > {end_at}")
 
         # check sql = try to parse it
         for sql in split_sql(query):
             try:
                 # replace template variables with null
-                sql = re.sub(r'\{\{[\w\d]+}}', '', sql)
+                sql = re.sub(r"\{\{[\w\d]+}}", "", sql)
 
                 parse_sql(sql)
             except ParsingException as e:
-                raise ParsingException(f'Unable to parse: {sql}: {e}')
+                raise ParsingException(f"Unable to parse: {sql}: {e}")
 
         if if_query is not None:
             for sql in split_sql(if_query):
                 try:
                     # replace template variables with null
-                    sql = re.sub(r'\{\{[\w\d]+}}', '', sql)
+                    sql = re.sub(r"\{\{[\w\d]+}}", "", sql)
 
                     parse_sql(sql)
                 except ParsingException as e:
-                    raise ParsingException(f'Unable to parse: {sql}: {e}')
+                    raise ParsingException(f"Unable to parse: {sql}: {e}")
 
         # plan next run
         next_run_at = start_at
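Aside: validation only needs the SQL to be parseable, so template placeholders are blanked out first with the regex above; for example:

import re

sql = "SELECT * FROM t WHERE created_at > '{{PREVIOUS_START_DATETIME}}'"
# strip {{...}} placeholders so the parser sees plain SQL
print(re.sub(r"\{\{[\w\d]+}}", "", sql))
# SELECT * FROM t WHERE created_at > ''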
@@ -185,7 +187,7 @@ class JobsController:
             start_at=start_at,
             end_at=end_at,
             next_run_at=next_run_at,
-            schedule_str=schedule_str
+            schedule_str=schedule_str,
         )
         db.session.add(record)
         db.session.commit()
@@ -219,10 +221,11 @@ class JobsController:
 
         schedule_str = None
         if query.repeat_str is not None:
-            schedule_str = 'every ' + query.repeat_str
+            schedule_str = "every " + query.repeat_str
 
         return self.add(
-            name, project_name,
+            name,
+            project_name,
             query=query_str,
             start_at=start_at,
             end_at=end_at,
@@ -231,36 +234,30 @@ class JobsController:
         )
 
     def delete(self, name, project_name):
-
         project_controller = ProjectController()
         project = project_controller.get(name=project_name)
 
         # check if exists
-        record = db.session.query(db.Jobs).filter_by(
-            company_id=ctx.company_id,
-            name=name,
-            project_id=project.id,
-            deleted_at=sa.null()
-        ).first()
+        record = (
+            db.session.query(db.Jobs)
+            .filter_by(company_id=ctx.company_id, name=name, project_id=project.id, deleted_at=sa.null())
+            .first()
+        )
         if record is None:
-            raise EntityNotExistsError('Job does not exist', name)
+            raise EntityNotExistsError("Job does not exist", name)
 
         self._delete_record(record)
         db.session.commit()
 
         # delete context
-        query_context_controller.drop_query_context('job', record.id)
-        query_context_controller.drop_query_context('job-if', record.id)
+        query_context_controller.drop_query_context("job", record.id)
+        query_context_controller.drop_query_context("job-if", record.id)
 
     def _delete_record(self, record):
         record.deleted_at = dt.datetime.now()
 
     def get_list(self, project_name=None):
-
-        query = db.session.query(db.Jobs).filter_by(
-            company_id=ctx.company_id,
-            deleted_at=sa.null()
-        )
+        query = db.session.query(db.Jobs).filter_by(company_id=ctx.company_id, deleted_at=sa.null())
 
         project_controller = ProjectController()
         if project_name is not None:
@@ -268,23 +265,22 @@ class JobsController:
             query = query.filter_by(project_id=project.id)
 
         data = []
-        project_names = {
-            i.id: i.name
-            for i in project_controller.get_list()
-        }
+        project_names = {i.id: i.name for i in project_controller.get_list()}
         for record in query:
-            data.append({
-                'id': record.id,
-                'name': record.name,
-                'project': project_names[record.project_id],
-                'start_at': record.start_at,
-                'end_at': record.end_at,
-                'next_run_at': record.next_run_at,
-                'schedule_str': record.schedule_str,
-                'query': record.query_str,
-                'if_query': record.if_query_str,
-                'variables': query_context_controller.get_context_vars('job', record.id),
-            })
+            data.append(
+                {
+                    "id": record.id,
+                    "name": record.name,
+                    "project": project_names[record.project_id],
+                    "start_at": record.start_at,
+                    "end_at": record.end_at,
+                    "next_run_at": record.next_run_at,
+                    "schedule_str": record.schedule_str,
+                    "query": record.query_str,
+                    "if_query": record.if_query_str,
+                    "variables": query_context_controller.get_context_vars("job", record.id),
+                }
+            )
         return data
 
     def get(self, name: str, project_name: str) -> dict:
@@ -298,25 +294,24 @@ class JobsController:
         project_controller = ProjectController()
         project = project_controller.get(name=project_name)
 
-        record = db.session.query(db.Jobs).filter_by(
-            company_id=ctx.company_id,
-            name=name,
-            project_id=project.id,
-            deleted_at=sa.null()
-        ).first()
+        record = (
+            db.session.query(db.Jobs)
+            .filter_by(company_id=ctx.company_id, name=name, project_id=project.id, deleted_at=sa.null())
+            .first()
+        )
 
         if record is not None:
             return {
-                'id': record.id,
-                'name': record.name,
-                'project': project_name,
-                'start_at': record.start_at,
-                'end_at': record.end_at,
-                'next_run_at': record.next_run_at,
-                'schedule_str': record.schedule_str,
-                'query': record.query_str,
-                'if_query': record.if_query_str,
-                'variables': query_context_controller.get_context_vars('job', record.id),
+                "id": record.id,
+                "name": record.name,
+                "project": project_name,
+                "start_at": record.start_at,
+                "end_at": record.end_at,
+                "next_run_at": record.next_run_at,
+                "schedule_str": record.schedule_str,
+                "query": record.query_str,
+                "if_query": record.if_query_str,
+                "variables": query_context_controller.get_context_vars("job", record.id),
             }
 
     def get_history(self, name: str, project_name: str) -> List[dict]:
@@ -331,27 +326,33 @@ class JobsController:
 
         query = Select(
             targets=[Star()],
-            from_table=Identifier('jobs_history'),
-            where=BinaryOperation(op='and', args=[
-                BinaryOperation(op='=', args=[Identifier('name'), Constant(name)]),
-                BinaryOperation(op='=', args=[Identifier('project'), Constant(project_name)]),
-            ])
+            from_table=Identifier("jobs_history"),
+            where=BinaryOperation(
+                op="and",
+                args=[
+                    BinaryOperation(op="=", args=[Identifier("name"), Constant(name)]),
+                    BinaryOperation(op="=", args=[Identifier("project"), Constant(project_name)]),
+                ],
+            ),
         )
         response = logs_db_controller.query(query)
 
-        names = [i['name'] for i in response.columns]
-        return response.data_frame[names].to_dict(orient='records')
+        names = [i["name"] for i in response.columns]
+        return response.data_frame[names].to_dict(orient="records")
 
 
 class JobsExecutor:
-
     def get_next_tasks(self):
         # filter next_run < now
-        query = db.session.query(db.Jobs).filter(
-            db.Jobs.next_run_at < dt.datetime.now(),
-            db.Jobs.deleted_at == sa.null(),
-            db.Jobs.active == True  # noqa
-        ).order_by(db.Jobs.next_run_at)
+        query = (
+            db.session.query(db.Jobs)
+            .filter(
+                db.Jobs.next_run_at < dt.datetime.now(),
+                db.Jobs.deleted_at == sa.null(),
+                db.Jobs.active == True,  # noqa
+            )
+            .order_by(db.Jobs.next_run_at)
+        )
 
         return query.all()
 
@@ -389,12 +390,7 @@ class JobsExecutor:
         record = db.Jobs.query.get(record_id)
 
         try:
-
-            history_record = db.JobsHistory(
-                job_id=record.id,
-                start_at=record.next_run_at,
-                company_id=record.company_id
-            )
+            history_record = db.JobsHistory(job_id=record.id, start_at=record.next_run_at, company_id=record.company_id)
 
             db.session.add(history_record)
             db.session.commit()
@@ -408,9 +404,7 @@ class JobsExecutor:
 
             # check if it is an old lock
             history_record = db.JobsHistory.query.filter_by(
-                job_id=record.id,
-                start_at=record.next_run_at,
-                company_id=record.company_id
+                job_id=record.id, start_at=record.next_run_at, company_id=record.company_id
             ).first()
             if history_record.updated_at < dt.datetime.now() - dt.timedelta(seconds=30):
                 db.session.delete(history_record)
@@ -419,13 +413,14 @@ class JobsExecutor:
         return None
 
     def __fill_variables(self, sql, record, history_record):
-        if '{{PREVIOUS_START_DATETIME}}' in sql:
+        if "{{PREVIOUS_START_DATETIME}}" in sql:
             # get previous run date
-            history_prev = db.session.query(db.JobsHistory.start_at)\
-                .filter(db.JobsHistory.job_id == record.id,
-                        db.JobsHistory.id != history_record.id)\
-                .order_by(db.JobsHistory.id.desc())\
+            history_prev = (
+                db.session.query(db.JobsHistory.start_at)
+                .filter(db.JobsHistory.job_id == record.id, db.JobsHistory.id != history_record.id)
+                .order_by(db.JobsHistory.id.desc())
                 .first()
+            )
             if history_prev is None:
                 # start date of the job
                 value = record.created_at
@@ -433,18 +428,17 @@ class JobsExecutor:
                 # fix for twitter: created_at filter must be minimum of 10 seconds prior to the current time
                 value = history_prev.start_at - dt.timedelta(seconds=60)
             value = value.strftime("%Y-%m-%d %H:%M:%S")
-            sql = sql.replace('{{PREVIOUS_START_DATETIME}}', value)
+            sql = sql.replace("{{PREVIOUS_START_DATETIME}}", value)
 
-        if '{{START_DATE}}' in sql:
+        if "{{START_DATE}}" in sql:
             value = history_record.start_at.strftime("%Y-%m-%d")
-            sql = sql.replace('{{START_DATE}}', value)
-        if '{{START_DATETIME}}' in sql:
+            sql = sql.replace("{{START_DATE}}", value)
+        if "{{START_DATETIME}}" in sql:
             value = history_record.start_at.strftime("%Y-%m-%d %H:%M:%S")
-            sql = sql.replace('{{START_DATETIME}}', value)
+            sql = sql.replace("{{START_DATETIME}}", value)
         return sql
 
     def execute_task_local(self, record_id, history_id=None):
-
         record = db.Jobs.query.get(record_id)
 
         # set up environment
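Aside: at run time __fill_variables swaps those placeholders for timestamps taken from the job's history records (with the 60-second back-off noted above). A sketch of the resulting substitution, with hard-coded datetimes standing in for history records:

import datetime as dt

start = dt.datetime(2025, 8, 2, 12, 0, 0)
prev_start = dt.datetime(2025, 8, 1, 12, 0, 0) - dt.timedelta(seconds=60)

sql = "SELECT * FROM t WHERE d > '{{PREVIOUS_START_DATETIME}}' AND d <= '{{START_DATETIME}}'"
sql = sql.replace("{{PREVIOUS_START_DATETIME}}", prev_start.strftime("%Y-%m-%d %H:%M:%S"))
sql = sql.replace("{{START_DATETIME}}", start.strftime("%Y-%m-%d %H:%M:%S"))
print(sql)
# SELECT * FROM t WHERE d > '2025-08-01 11:59:00' AND d <= '2025-08-02 12:00:00'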
@@ -470,7 +464,7 @@ class JobsExecutor:
 
         project_controller = ProjectController()
         project = project_controller.get(record.project_id)
-        executed_sql = ''
+        executed_sql = ""
 
         from mindsdb.api.executor.controllers.session_controller import SessionController
         from mindsdb.api.executor.command_executor import ExecuteCommands
@@ -480,8 +474,8 @@ class JobsExecutor:
         command_executor = ExecuteCommands(sql_session)
 
         # job with condition?
-        query_context_controller.set_context('job-if', record.id)
-        error = ''
+        query_context_controller.set_context("job-if", record.id)
+        error = ""
         to_execute_query = True
         if record.if_query_str is not None:
             data = None
@@ -491,7 +485,7 @@ class JobsExecutor:
                 sql = self.__fill_variables(sql, record, history_record)
 
                 query = parse_sql(sql)
-                executed_sql += sql + '; '
+                executed_sql += sql + "; "
 
                 ret = command_executor.execute_command(query)
                 if ret.error_code is not None:
@@ -508,17 +502,16 @@ class JobsExecutor:
         if error or data is None or len(data) == 0:
             to_execute_query = False
 
-        query_context_controller.release_context('job-if', record.id)
+        query_context_controller.release_context("job-if", record.id)
         if to_execute_query:
-
-            query_context_controller.set_context('job', record.id)
+            query_context_controller.set_context("job", record.id)
             for sql in split_sql(record.query_str):
                 try:
                     # fill template variables
                     sql = self.__fill_variables(sql, record, history_record)
 
                     query = parse_sql(sql)
-                    executed_sql += sql + '; '
+                    executed_sql += sql + "; "
 
                     ret = command_executor.execute_command(query)
                     if ret.error_code is not None:
mindsdb/interfaces/knowledge_base/controller.py

@@ -60,6 +60,7 @@ class KnowledgeBaseInputParams(BaseModel):
     is_sparse: bool = False
     vector_size: int | None = None
     reranking_model: Dict[Text, Any] | None = None
+    preprocessing: Dict[Text, Any] | None = None
 
     class Config:
         extra = "forbid"
@@ -244,9 +245,9 @@ class KnowledgeBaseTable:
         keyword_search_cols_and_values = []
         query_text = None
         relevance_threshold = None
-        reranking_enabled_flag = True
         hybrid_search_enabled_flag = False
         query_conditions = db_handler.extract_conditions(query.where)
+        hybrid_search_alpha = None  # Default to None, meaning no alpha weighted blending
         if query_conditions is not None:
             for item in query_conditions:
                 if item.column == "relevance" and item.op.value == FilterOperator.GREATER_THAN_OR_EQUAL.value:
@@ -261,10 +262,8 @@ class KnowledgeBaseTable:
                     logger.error(error_msg)
                     raise ValueError(error_msg)
                 elif item.column == "reranking":
-                    reranking_enabled_flag = item.value
-                    # cast to boolean
-                    if isinstance(reranking_enabled_flag, str):
-                        reranking_enabled_flag = reranking_enabled_flag.lower() not in ("false")
+                    if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
+                        disable_reranking = True
                 elif item.column == "hybrid_search":
                     hybrid_search_enabled_flag = item.value
                     # cast to boolean
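Aside: the rewritten check treats only boolean False or the string "false" (any case) as disabling reranking; the removed version relied on `.lower() not in ("false")`, a substring test. A sketch of the new predicate:

def is_disabled(value) -> bool:
    # explicit boolean False or a case-insensitive "false" string disables reranking
    return value is False or (isinstance(value, str) and value.lower() == "false")

for v in (False, "FALSE", "false", True, "true", 0):
    print(repr(v), is_disabled(v))
# only False, "FALSE" and "false" disable; everything else leaves reranking on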
@@ -272,6 +271,14 @@ class KnowledgeBaseTable:
                         hybrid_search_enabled_flag = hybrid_search_enabled_flag.lower() not in ("false")
                     if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
                         disable_reranking = True
+                elif item.column == "hybrid_search_alpha":
+                    # validate item.value is a float
+                    if not isinstance(item.value, (float, int)):
+                        raise ValueError(f"Invalid hybrid_search_alpha value: {item.value}. Must be a float or int.")
+                    # validate hybrid search alpha is between 0 and 1
+                    if not (0 <= item.value <= 1):
+                        raise ValueError(f"Invalid hybrid_search_alpha value: {item.value}. Must be between 0 and 1.")
+                    hybrid_search_alpha = item.value
                 elif item.column == "relevance" and item.op.value != FilterOperator.GREATER_THAN_OR_EQUAL.value:
                     raise ValueError(
                         f"Invalid operator for relevance: {item.op.value}. Only GREATER_THAN_OR_EQUAL is allowed."
@@ -345,7 +352,15 @@ class KnowledgeBaseTable:
                     f"Keyword search returned different columns: {df_keyword_select.columns} "
                     f"than expected: {df.columns}"
                 )
+            if hybrid_search_alpha:
+                df_keyword_select[TableField.DISTANCE.value] = (
+                    hybrid_search_alpha * df_keyword_select[TableField.DISTANCE.value]
+                )
+                df[TableField.DISTANCE.value] = (1 - hybrid_search_alpha) * df[TableField.DISTANCE.value]
             df = pd.concat([df, df_keyword_select], ignore_index=True)
+            # sort by distance if distance column exists
+            if TableField.DISTANCE.value in df.columns:
+                df = df.sort_values(by=TableField.DISTANCE.value, ascending=True)
             # if chunk_id column exists remove duplicates based on chunk_id
             if "chunk_id" in df.columns:
                 df = df.drop_duplicates(subset=["chunk_id"])
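Aside: the alpha blend scales keyword-search distances by alpha and vector distances by (1 - alpha) before the two frames are concatenated, re-sorted, and de-duplicated. A pandas sketch with made-up distances:

import pandas as pd

alpha = 0.3
vector = pd.DataFrame({"chunk_id": [1, 2], "distance": [0.20, 0.50]})
keyword = pd.DataFrame({"chunk_id": [2, 3], "distance": [0.10, 0.40]})

# weight the two retrieval signals, then merge and keep the best per chunk
keyword["distance"] = alpha * keyword["distance"]
vector["distance"] = (1 - alpha) * vector["distance"]
merged = pd.concat([vector, keyword], ignore_index=True)
merged = merged.sort_values(by="distance").drop_duplicates(subset=["chunk_id"])
print(merged)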
@@ -519,6 +534,9 @@ class KnowledgeBaseTable:
 
         query.update_columns[emb_col] = Constant(self._content_to_embeddings(content))
 
+        if "metadata" not in query.update_columns:
+            query.update_columns["metadata"] = Constant({})
+
         # TODO search content in where clause?
 
         # set table name
@@ -1010,6 +1028,9 @@ class KnowledgeBaseController:
         :param is_sparse: Whether to use sparse vectors for embeddings
         :param vector_size: Optional size specification for vectors, required when is_sparse=True
         """
+        if not name.islower():
+            raise ValueError(f"The name must be in lower case: {name}")
+
         # fill variables
         params = variables_controller.fill_parameters(params)
 
@@ -1118,8 +1139,14 @@ class KnowledgeBaseController:
         else:
             vector_db_name, vector_table_name = storage.parts
 
+            data_node = self.session.datahub.get(vector_db_name)
+            if data_node:
+                vector_store_handler = data_node.integration_handler
+            else:
+                raise ValueError(
+                    f"Unable to find database named {vector_db_name}, please make sure {vector_db_name} is defined"
+                )
             # create table in vectordb before creating KB
-            vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
             vector_store_handler.create_table(vector_table_name)
             if keyword_search_enabled:
                 vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)
mindsdb/interfaces/knowledge_base/evaluate.py

@@ -118,7 +118,8 @@ class EvaluateBase:
 
         dn, table_name = self._get_dn_table(query.from_table)
         query.from_table = table_name
-        query.limit = Constant(self.DEFAULT_SAMPLE_SIZE)
+        if query.limit is None:
+            query.limit = Constant(self.DEFAULT_SAMPLE_SIZE)
 
         response = dn.query(query=query, session=self.session)
         df = response.data_frame
mindsdb/interfaces/knowledge_base/executor.py

@@ -217,6 +217,17 @@ class KnowledgeBaseQueryExecutor:
                 f'Operator "{content_condition.op}" is not supported for condition: {content_condition}'
             )
 
+    @staticmethod
+    def to_include_content(content_condition: BinaryOperation) -> List[str]:
+        """
+        Handles positive conditions for content. Returns list of content values
+        """
+        if content_condition.op == "IN":
+            return [item.value for item in content_condition.args[1].items]
+
+        elif content_condition.op in ("=", "LIKE"):
+            return [content_condition.args[1].value]
+
     def to_excluded_ids(
         self, content_condition: BinaryOperation, other_conditions: List[BinaryOperation]
     ) -> Optional[List[str]]:
@@ -290,11 +301,17 @@ class KnowledgeBaseQueryExecutor:
         if len(content_filters) > 0:
             content_filters2 = []
             exclude_ids = set()
+            include_contents = set()
             # exclude content conditions
             for condition in content_filters:
                 ids = self.to_excluded_ids(condition, other_filters)
                 if ids is not None:
                     exclude_ids.update(ids)
+                    continue
+                contents = self.to_include_content(condition)
+                if contents is not None:
+                    include_contents.update(contents)
+                    continue
                 else:
                     # keep origin content filter
                     content_filters2.append(condition)
@@ -305,6 +322,13 @@ class KnowledgeBaseQueryExecutor:
             condition = BinaryOperation(op="NOT IN", args=[Identifier(self.id_column), Tuple(values)])
             other_filters.append(condition)
         # execute content filters
+        if include_contents:
+            content = " AND ".join(include_contents)
+            result = self.execute_content_condition(
+                BinaryOperation(op="=", args=[Identifier(self.content_column), Constant(content)]),
+                other_filters,
+            )
+            results.append(result)
         for condition in content_filters2:
             result = self.execute_content_condition(condition, other_filters)
             results.append(result)
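Aside: positive content conditions ("=", "LIKE", "IN") are now collected and fused into a single "a AND b" search string instead of one vector query per condition. A standalone sketch of that extraction, using plain tuples in place of mindsdb_sql nodes:

def to_include_content(op, values):
    # mirrors the new static method: IN keeps all values, =/LIKE keep one
    if op == "IN":
        return list(values)
    if op in ("=", "LIKE"):
        return [values[0]]
    return None  # negative/unsupported ops fall through to other handling

include_contents = set()
for op, values in [("=", ["red apples"]), ("IN", ["green pears", "ripe bananas"])]:
    contents = to_include_content(op, values)
    if contents is not None:
        include_contents.update(contents)

print(" AND ".join(sorted(include_contents)))
# green pears AND red apples AND ripe bananas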
mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py

@@ -4,8 +4,7 @@ import asyncio
 from typing import List, Dict, Optional, Any
 
 import pandas as pd
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_core.documents import Document as LangchainDocument
+from mindsdb.interfaces.knowledge_base.preprocessing.text_splitter import TextSplitter
 
 from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
     FileSplitter,
@@ -22,7 +21,6 @@ from mindsdb.interfaces.knowledge_base.preprocessing.models import (
 )
 from mindsdb.utilities import log
 
-
 logger = log.getLogger(__name__)
 
 _DEFAULT_CONTENT_COLUMN_NAME = "content"
@@ -49,11 +47,10 @@ class DocumentPreprocessor:
         if self.splitter is None:
             raise ValueError("Splitter not configured")
 
-
-        langchain_doc = LangchainDocument(page_content=doc.content, metadata=doc.metadata or {})
+        metadata = doc.metadata or {}
         # Split and convert back to our Document type
-        split_docs = self.splitter.split_documents([langchain_doc])
-        return [Document(content=split_doc.page_content, metadata=split_doc.metadata) for split_doc in split_docs]
+        split_texts = self.splitter.split_text(doc.content)
+        return [Document(content=text, metadata=metadata) for text in split_texts]
 
     def _get_source(self) -> str:
         """Get the source identifier for this preprocessor"""
@@ -266,16 +263,15 @@ Please give a short succinct context to situate this chunk within the overall document
 
 
 class TextChunkingPreprocessor(DocumentPreprocessor):
-    """Default text chunking preprocessor using RecursiveCharacterTextSplitter"""
+    """Default text chunking preprocessor using TextSplitter"""
 
     def __init__(self, config: Optional[TextChunkingConfig] = None):
         """Initialize with text chunking configuration"""
         super().__init__()
         self.config = config or TextChunkingConfig()
-        self.splitter = RecursiveCharacterTextSplitter(
+        self.splitter = TextSplitter(
             chunk_size=self.config.chunk_size,
             chunk_overlap=self.config.chunk_overlap,
-            length_function=self.config.length_function,
             separators=self.config.separators,
         )
 
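Aside: the new in-tree TextSplitter (new file, +73 lines) takes over the subset of the old langchain splitter API that the preprocessor actually used: the chunk_size/chunk_overlap/separators constructor and split_text. A usage sketch under that assumption (sample text and sizes are illustrative):

from mindsdb.interfaces.knowledge_base.preprocessing.text_splitter import TextSplitter

splitter = TextSplitter(chunk_size=200, chunk_overlap=20, separators=["\n\n", "\n", " "])
chunks = splitter.split_text(
    "First paragraph.\n\nSecond paragraph, long enough that a real document "
    "would be broken into overlapping chunks for embedding."
)
for chunk in chunks:
    print(len(chunk), repr(chunk[:40]))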