mindsdb-25.7.2.0-py3-none-any.whl → mindsdb-25.7.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +1 -1
- mindsdb/api/a2a/common/server/server.py +16 -6
- mindsdb/api/executor/command_executor.py +213 -137
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +5 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
- mindsdb/api/executor/planner/plan_join.py +3 -0
- mindsdb/api/executor/planner/plan_join_ts.py +117 -100
- mindsdb/api/executor/planner/query_planner.py +1 -0
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
- mindsdb/api/http/initialize.py +16 -43
- mindsdb/api/http/namespaces/agents.py +24 -21
- mindsdb/api/http/namespaces/chatbots.py +83 -120
- mindsdb/api/http/namespaces/file.py +1 -1
- mindsdb/api/http/namespaces/jobs.py +38 -60
- mindsdb/api/http/namespaces/tree.py +69 -61
- mindsdb/api/mcp/start.py +2 -0
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
- mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
- mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
- mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -76
- mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +16 -3
- mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
- mindsdb/integrations/handlers/s3_handler/s3_handler.py +72 -70
- mindsdb/integrations/handlers/salesforce_handler/constants.py +208 -0
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +142 -81
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +12 -4
- mindsdb/integrations/handlers/slack_handler/slack_tables.py +141 -161
- mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
- mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
- mindsdb/integrations/handlers/youtube_handler/youtube_tables.py +183 -55
- mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
- mindsdb/integrations/utilities/handler_utils.py +32 -12
- mindsdb/interfaces/agents/agents_controller.py +169 -110
- mindsdb/interfaces/agents/langchain_agent.py +10 -3
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +22 -8
- mindsdb/interfaces/database/database.py +38 -13
- mindsdb/interfaces/database/integrations.py +20 -5
- mindsdb/interfaces/database/projects.py +63 -16
- mindsdb/interfaces/database/views.py +86 -60
- mindsdb/interfaces/jobs/jobs_controller.py +103 -110
- mindsdb/interfaces/knowledge_base/controller.py +33 -5
- mindsdb/interfaces/knowledge_base/evaluate.py +53 -9
- mindsdb/interfaces/knowledge_base/executor.py +24 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +3 -3
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +21 -13
- mindsdb/interfaces/query_context/context_controller.py +100 -133
- mindsdb/interfaces/skills/skills_controller.py +18 -6
- mindsdb/interfaces/storage/db.py +40 -6
- mindsdb/interfaces/variables/variables_controller.py +8 -15
- mindsdb/utilities/config.py +3 -3
- mindsdb/utilities/functions.py +72 -60
- mindsdb/utilities/log.py +38 -6
- mindsdb/utilities/ps.py +7 -7
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/METADATA +262 -263
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/RECORD +69 -68
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/jobs/jobs_controller.py
@@ -21,29 +21,29 @@ from mindsdb.utilities import log
 
 logger = log.getLogger(__name__)
 
-default_project = config.get('default_project')
+default_project = config.get("default_project")
 
 
 def split_sql(sql):
     # split sql by ';' ignoring delimiter in quotes
-    pattern = re.compile(r'''((?:[^;"']|"[^"]*"|'[^']*')+)''')
+    pattern = re.compile(r"""((?:[^;"']|"[^"]*"|'[^']*')+)""")
     return pattern.split(sql)[1::2]
 
 
 def calc_next_date(schedule_str, base_date: dt.datetime):
     schedule_str = schedule_str.lower().strip()
 
-    repeat_prefix = 'every '
+    repeat_prefix = "every "
     if schedule_str.startswith(repeat_prefix):
-        repeat_str = schedule_str[len(repeat_prefix):]
+        repeat_str = schedule_str[len(repeat_prefix) :]
     else:
         # TODO cron format
-        raise NotImplementedError(f'Schedule: {schedule_str}')
+        raise NotImplementedError(f"Schedule: {schedule_str}")
 
     items = repeat_str.split()
 
     if len(items) == 1:
-        value = '1'
+        value = "1"
         period = items[0]
     elif len(items) == 2:
         value, period = items
@@ -53,15 +53,15 @@ def calc_next_date(schedule_str, base_date: dt.datetime):
     if not value.isdigit():
         raise Exception(f"Number expected: {value}")
     value = int(value)
-    if period in ('minute', 'minutes', 'min'):
+    if period in ("minute", "minutes", "min"):
         delta = dt.timedelta(minutes=value)
-    elif period in ('hour', 'hours'):
+    elif period in ("hour", "hours"):
         delta = dt.timedelta(hours=value)
-    elif period in ('day', 'days'):
+    elif period in ("day", "days"):
         delta = dt.timedelta(days=value)
-    elif period in ('week', 'weeks'):
+    elif period in ("week", "weeks"):
         delta = dt.timedelta(days=value * 7)  # 1 week = 7 days
-    elif period in ('month', 'months'):
+    elif period in ("month", "months"):
         delta = relativedelta(months=value)
     else:
         raise Exception(f"Unknown period: {period}")
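For orientation, the reformatted calc_next_date above implements a small "every <number> <period>" grammar. A condensed, runnable sketch of that grammar and its deltas (the helper name is illustrative; the real method also validates the count and anchors the next run to the job's schedule):

    import datetime as dt
    from dateutil.relativedelta import relativedelta

    def next_run(schedule_str: str, base_date: dt.datetime) -> dt.datetime:
        # "every 2 hours" -> value="2", period="hours"; "every hour" -> value="1"
        items = schedule_str.lower().strip().removeprefix("every ").split()
        value, period = ("1", items[0]) if len(items) == 1 else items
        deltas = {
            ("minute", "minutes", "min"): lambda v: dt.timedelta(minutes=v),
            ("hour", "hours"): lambda v: dt.timedelta(hours=v),
            ("day", "days"): lambda v: dt.timedelta(days=v),
            ("week", "weeks"): lambda v: dt.timedelta(days=v * 7),
            ("month", "months"): lambda v: relativedelta(months=v),
        }
        for periods, make_delta in deltas.items():
            if period in periods:
                return base_date + make_delta(int(value))
        raise ValueError(f"Unknown period: {period}")

    print(next_run("every 2 hours", dt.datetime(2025, 7, 1)))  # 2025-07-01 02:00:00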
@@ -85,10 +85,10 @@ def parse_job_date(date_str: str) -> dt.datetime:
     :return:
     """
 
-    if date_str.upper() == 'NOW':
+    if date_str.upper() == "NOW":
         return dt.datetime.now()
 
-    date_formats = ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d']
+    date_formats = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]
     date = None
     for date_format in date_formats:
         try:
@@ -128,39 +128,41 @@ class JobsController:
         at the moment supports: 'every <number> <dimension>' or 'every <dimension>'
         :return: name of created job
         """
+        if not name.islower():
+            raise ValueError(f"The name must be in lower case: {name}")
 
         project_controller = ProjectController()
         project = project_controller.get(name=project_name)
 
         # check if exists
         if self.get(name, project_name) is not None:
-            raise EntityExistsError('Job already exists', name)
+            raise EntityExistsError("Job already exists", name)
 
         if start_at is None:
             start_at = dt.datetime.now()
 
         if end_at is not None and end_at < start_at:
-            raise Exception(f'Wrong end date {start_at} > {end_at}')
+            raise Exception(f"Wrong end date {start_at} > {end_at}")
 
         # check sql = try to parse it
         for sql in split_sql(query):
             try:
                 # replace template variables with null
-                sql = re.sub(r'\{\{[\w\d]+}}', '', sql)
+                sql = re.sub(r"\{\{[\w\d]+}}", "", sql)
 
                 parse_sql(sql)
             except ParsingException as e:
-                raise ParsingException(f'Unable to parse: {sql}: {e}')
+                raise ParsingException(f"Unable to parse: {sql}: {e}")
 
         if if_query is not None:
             for sql in split_sql(if_query):
                 try:
                     # replace template variables with null
-                    sql = re.sub(r'\{\{[\w\d]+}}', '', sql)
+                    sql = re.sub(r"\{\{[\w\d]+}}", "", sql)
 
                     parse_sql(sql)
                 except ParsingException as e:
-                    raise ParsingException(f'Unable to parse: {sql}: {e}')
+                    raise ParsingException(f"Unable to parse: {sql}: {e}")
 
         # plan next run
         next_run_at = start_at
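Note the validation trick in add() above: template placeholders would break the parser, so each statement is checked with every {{VAR}} token blanked out first. A small repro of that substitution (the sample query is illustrative):

    import re

    sql = "SELECT * FROM t WHERE created_at > '{{PREVIOUS_START_DATETIME}}'"
    # Same pattern the hunk applies before parse_sql():
    print(re.sub(r"\{\{[\w\d]+}}", "", sql))
    # SELECT * FROM t WHERE created_at > ''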
@@ -185,7 +187,7 @@ class JobsController:
             start_at=start_at,
             end_at=end_at,
             next_run_at=next_run_at,
-            schedule_str=schedule_str
+            schedule_str=schedule_str,
         )
         db.session.add(record)
         db.session.commit()
@@ -219,10 +221,11 @@ class JobsController:
 
         schedule_str = None
         if query.repeat_str is not None:
-            schedule_str = 'every ' + query.repeat_str
+            schedule_str = "every " + query.repeat_str
 
         return self.add(
-            name, project_name,
+            name,
+            project_name,
             query=query_str,
             start_at=start_at,
             end_at=end_at,
@@ -231,36 +234,30 @@ class JobsController:
         )
 
     def delete(self, name, project_name):
-
         project_controller = ProjectController()
         project = project_controller.get(name=project_name)
 
         # check if exists
-        record = db.session.query(db.Jobs).filter_by(
-            company_id=ctx.company_id,
-            name=name,
-            project_id=project.id,
-            deleted_at=sa.null()
-        ).first()
+        record = (
+            db.session.query(db.Jobs)
+            .filter_by(company_id=ctx.company_id, name=name, project_id=project.id, deleted_at=sa.null())
+            .first()
+        )
         if record is None:
-            raise EntityNotExistsError('Job does not exist', name)
+            raise EntityNotExistsError("Job does not exist", name)
 
         self._delete_record(record)
         db.session.commit()
 
         # delete context
-        query_context_controller.drop_query_context('job', record.id)
-        query_context_controller.drop_query_context('job-if', record.id)
+        query_context_controller.drop_query_context("job", record.id)
+        query_context_controller.drop_query_context("job-if", record.id)
 
     def _delete_record(self, record):
        record.deleted_at = dt.datetime.now()
 
     def get_list(self, project_name=None):
-
-        query = db.session.query(db.Jobs).filter_by(
-            company_id=ctx.company_id,
-            deleted_at=sa.null()
-        )
+        query = db.session.query(db.Jobs).filter_by(company_id=ctx.company_id, deleted_at=sa.null())
@@ -268,23 +265,22 @@ class JobsController:
             query = query.filter_by(project_id=project.id)
 
         data = []
-        project_names = {
-            i.id: i.name
-            for i in project_controller.get_list()
-        }
+        project_names = {i.id: i.name for i in project_controller.get_list()}
         for record in query:
-            data.append({
-                'id': record.id,
-                'name': record.name,
-                'project': project_names[record.project_id],
-                'start_at': record.start_at,
-                'end_at': record.end_at,
-                'next_run_at': record.next_run_at,
-                'schedule_str': record.schedule_str,
-                'query': record.query_str,
-                'if_query': record.if_query_str,
-                'variables': query_context_controller.get_context_vars('job', record.id),
-            })
+            data.append(
+                {
+                    "id": record.id,
+                    "name": record.name,
+                    "project": project_names[record.project_id],
+                    "start_at": record.start_at,
+                    "end_at": record.end_at,
+                    "next_run_at": record.next_run_at,
+                    "schedule_str": record.schedule_str,
+                    "query": record.query_str,
+                    "if_query": record.if_query_str,
+                    "variables": query_context_controller.get_context_vars("job", record.id),
+                }
+            )
         return data
 
     def get(self, name: str, project_name: str) -> dict:
@@ -298,25 +294,24 @@ class JobsController:
         project_controller = ProjectController()
         project = project_controller.get(name=project_name)
 
-        record = db.session.query(db.Jobs).filter_by(
-            company_id=ctx.company_id,
-            name=name,
-            project_id=project.id,
-            deleted_at=sa.null()
-        ).first()
+        record = (
+            db.session.query(db.Jobs)
+            .filter_by(company_id=ctx.company_id, name=name, project_id=project.id, deleted_at=sa.null())
+            .first()
+        )
 
         if record is not None:
             return {
-                'id': record.id,
-                'name': record.name,
-                'project': project_name,
-                'start_at': record.start_at,
-                'end_at': record.end_at,
-                'next_run_at': record.next_run_at,
-                'schedule_str': record.schedule_str,
-                'query': record.query_str,
-                'if_query': record.if_query_str,
-                'variables': query_context_controller.get_context_vars('job', record.id),
+                "id": record.id,
+                "name": record.name,
+                "project": project_name,
+                "start_at": record.start_at,
+                "end_at": record.end_at,
+                "next_run_at": record.next_run_at,
+                "schedule_str": record.schedule_str,
+                "query": record.query_str,
+                "if_query": record.if_query_str,
+                "variables": query_context_controller.get_context_vars("job", record.id),
             }
 
     def get_history(self, name: str, project_name: str) -> List[dict]:
@@ -331,27 +326,33 @@ class JobsController:
 
         query = Select(
             targets=[Star()],
-            from_table=Identifier('jobs_history'),
-            where=BinaryOperation(op='and', args=[
-                BinaryOperation(op='=', args=[Identifier('name'), Constant(name)]),
-                BinaryOperation(op='=', args=[Identifier('project'), Constant(project_name)]),
-            ])
+            from_table=Identifier("jobs_history"),
+            where=BinaryOperation(
+                op="and",
+                args=[
+                    BinaryOperation(op="=", args=[Identifier("name"), Constant(name)]),
+                    BinaryOperation(op="=", args=[Identifier("project"), Constant(project_name)]),
+                ],
+            ),
         )
         response = logs_db_controller.query(query)
 
-        names = [i['name'] for i in response.columns]
-        return response.data_frame[names].to_dict(orient='records')
+        names = [i["name"] for i in response.columns]
+        return response.data_frame[names].to_dict(orient="records")
 
 
 class JobsExecutor:
-
     def get_next_tasks(self):
         # filter next_run < now
-        query = db.session.query(db.Jobs).filter(
-            db.Jobs.next_run_at < dt.datetime.now(),
-            db.Jobs.deleted_at == sa.null(),
-            db.Jobs.active == True  # noqa
-        ).order_by(db.Jobs.next_run_at)
+        query = (
+            db.session.query(db.Jobs)
+            .filter(
+                db.Jobs.next_run_at < dt.datetime.now(),
+                db.Jobs.deleted_at == sa.null(),
+                db.Jobs.active == True,  # noqa
+            )
+            .order_by(db.Jobs.next_run_at)
+        )
 
         return query.all()
 
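get_next_tasks() above combines two conventions used throughout this file: soft deletion (rows keep a deleted_at timestamp instead of being removed) and a due-date scan ordered by next_run_at. A self-contained sketch of the same query shape against a toy schema (model and engine here are illustrative, not MindsDB's actual tables):

    import datetime as dt
    import sqlalchemy as sa
    from sqlalchemy.orm import DeclarativeBase, Session

    class Base(DeclarativeBase):
        pass

    class Job(Base):
        __tablename__ = "jobs"
        id = sa.Column(sa.Integer, primary_key=True)
        next_run_at = sa.Column(sa.DateTime)
        deleted_at = sa.Column(sa.DateTime)  # soft-delete marker; NULL means live
        active = sa.Column(sa.Boolean, default=True)

    engine = sa.create_engine("sqlite://")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add(Job(next_run_at=dt.datetime(2000, 1, 1)))
        due = (
            session.query(Job)
            .filter(Job.next_run_at < dt.datetime.now(), Job.deleted_at == sa.null(), Job.active == True)  # noqa
            .order_by(Job.next_run_at)
            .all()
        )
        print(len(due))  # 1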
@@ -389,12 +390,7 @@ class JobsExecutor:
         record = db.Jobs.query.get(record_id)
 
         try:
-
-            history_record = db.JobsHistory(
-                job_id=record.id,
-                start_at=record.next_run_at,
-                company_id=record.company_id
-            )
+            history_record = db.JobsHistory(job_id=record.id, start_at=record.next_run_at, company_id=record.company_id)
 
             db.session.add(history_record)
             db.session.commit()
@@ -408,9 +404,7 @@ class JobsExecutor:
 
         # check if it is an old lock
         history_record = db.JobsHistory.query.filter_by(
-            job_id=record.id,
-            start_at=record.next_run_at,
-            company_id=record.company_id
+            job_id=record.id, start_at=record.next_run_at, company_id=record.company_id
         ).first()
         if history_record.updated_at < dt.datetime.now() - dt.timedelta(seconds=30):
             db.session.delete(history_record)
@@ -419,13 +413,14 @@ class JobsExecutor:
             return None
 
     def __fill_variables(self, sql, record, history_record):
-        if '{{PREVIOUS_START_DATETIME}}' in sql:
+        if "{{PREVIOUS_START_DATETIME}}" in sql:
             # get previous run date
-            history_prev = db.session.query(db.JobsHistory.start_at)\
-                .filter(db.JobsHistory.job_id == record.id,
-                        db.JobsHistory.id != history_record.id)\
-                .order_by(db.JobsHistory.id.desc())\
+            history_prev = (
+                db.session.query(db.JobsHistory.start_at)
+                .filter(db.JobsHistory.job_id == record.id, db.JobsHistory.id != history_record.id)
+                .order_by(db.JobsHistory.id.desc())
                 .first()
+            )
             if history_prev is None:
                 # start date of the job
                 value = record.created_at
@@ -433,18 +428,17 @@ class JobsExecutor:
                 # fix for twitter: created_at filter must be minimum of 10 seconds prior to the current time
                 value = history_prev.start_at - dt.timedelta(seconds=60)
             value = value.strftime("%Y-%m-%d %H:%M:%S")
-            sql = sql.replace('{{PREVIOUS_START_DATETIME}}', value)
+            sql = sql.replace("{{PREVIOUS_START_DATETIME}}", value)
 
-        if '{{START_DATE}}' in sql:
+        if "{{START_DATE}}" in sql:
             value = history_record.start_at.strftime("%Y-%m-%d")
-            sql = sql.replace('{{START_DATE}}', value)
-        if '{{START_DATETIME}}' in sql:
+            sql = sql.replace("{{START_DATE}}", value)
+        if "{{START_DATETIME}}" in sql:
             value = history_record.start_at.strftime("%Y-%m-%d %H:%M:%S")
-            sql = sql.replace('{{START_DATETIME}}', value)
+            sql = sql.replace("{{START_DATETIME}}", value)
         return sql
 
     def execute_task_local(self, record_id, history_id=None):
-
         record = db.Jobs.query.get(record_id)
 
         # set up environment
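__fill_variables() is the execution-time counterpart of the validation pass in add(): the same placeholders are now resolved instead of blanked, with {{PREVIOUS_START_DATETIME}} pushed 60 seconds back as the comment above explains. A worked substitution (the query and dates are illustrative):

    import datetime as dt

    sql = "SELECT * FROM demo.events WHERE ts > '{{PREVIOUS_START_DATETIME}}'"
    prev_start = dt.datetime(2025, 7, 1, 12, 0, 0)
    value = (prev_start - dt.timedelta(seconds=60)).strftime("%Y-%m-%d %H:%M:%S")
    print(sql.replace("{{PREVIOUS_START_DATETIME}}", value))
    # SELECT * FROM demo.events WHERE ts > '2025-07-01 11:59:00'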
@@ -470,7 +464,7 @@ class JobsExecutor:
 
         project_controller = ProjectController()
         project = project_controller.get(record.project_id)
-        executed_sql = ''
+        executed_sql = ""
 
         from mindsdb.api.executor.controllers.session_controller import SessionController
         from mindsdb.api.executor.command_executor import ExecuteCommands
@@ -480,8 +474,8 @@ class JobsExecutor:
         command_executor = ExecuteCommands(sql_session)
 
         # job with condition?
-        query_context_controller.set_context('job-if', record.id)
-        error = ''
+        query_context_controller.set_context("job-if", record.id)
+        error = ""
         to_execute_query = True
         if record.if_query_str is not None:
             data = None
@@ -491,7 +485,7 @@ class JobsExecutor:
                 sql = self.__fill_variables(sql, record, history_record)
 
                 query = parse_sql(sql)
-                executed_sql += sql + '; '
+                executed_sql += sql + "; "
 
                 ret = command_executor.execute_command(query)
                 if ret.error_code is not None:
@@ -508,17 +502,16 @@ class JobsExecutor:
         if error or data is None or len(data) == 0:
             to_execute_query = False
 
-        query_context_controller.release_context('job-if', record.id)
+        query_context_controller.release_context("job-if", record.id)
         if to_execute_query:
-
-            query_context_controller.set_context('job', record.id)
+            query_context_controller.set_context("job", record.id)
             for sql in split_sql(record.query_str):
                 try:
                     # fill template variables
                     sql = self.__fill_variables(sql, record, history_record)
 
                     query = parse_sql(sql)
-                    executed_sql += sql + '; '
+                    executed_sql += sql + "; "
 
                     ret = command_executor.execute_command(query)
                     if ret.error_code is not None:
mindsdb/interfaces/knowledge_base/controller.py
@@ -60,6 +60,7 @@ class KnowledgeBaseInputParams(BaseModel):
     is_sparse: bool = False
     vector_size: int | None = None
     reranking_model: Dict[Text, Any] | None = None
+    preprocessing: Dict[Text, Any] | None = None
 
     class Config:
         extra = "forbid"
@@ -244,9 +245,9 @@ class KnowledgeBaseTable:
         keyword_search_cols_and_values = []
         query_text = None
         relevance_threshold = None
-        reranking_enabled_flag = True
         hybrid_search_enabled_flag = False
         query_conditions = db_handler.extract_conditions(query.where)
+        hybrid_search_alpha = None  # Default to None, meaning no alpha weighted blending
         if query_conditions is not None:
             for item in query_conditions:
                 if item.column == "relevance" and item.op.value == FilterOperator.GREATER_THAN_OR_EQUAL.value:
@@ -261,10 +262,8 @@ class KnowledgeBaseTable:
                     logger.error(error_msg)
                     raise ValueError(error_msg)
                 elif item.column == "reranking":
-                    reranking_enabled_flag = item.value
-                    # cast to boolean
-                    if isinstance(reranking_enabled_flag, str):
-                        reranking_enabled_flag = reranking_enabled_flag.lower() not in ("false")
+                    if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
+                        disable_reranking = True
                 elif item.column == "hybrid_search":
                     hybrid_search_enabled_flag = item.value
                     # cast to boolean
@@ -272,6 +271,14 @@ class KnowledgeBaseTable:
                         hybrid_search_enabled_flag = hybrid_search_enabled_flag.lower() not in ("false")
                     if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
                         disable_reranking = True
+                elif item.column == "hybrid_search_alpha":
+                    # validate item.value is a float
+                    if not isinstance(item.value, (float, int)):
+                        raise ValueError(f"Invalid hybrid_search_alpha value: {item.value}. Must be a float or int.")
+                    # validate hybrid search alpha is between 0 and 1
+                    if not (0 <= item.value <= 1):
+                        raise ValueError(f"Invalid hybrid_search_alpha value: {item.value}. Must be between 0 and 1.")
+                    hybrid_search_alpha = item.value
                 elif item.column == "relevance" and item.op.value != FilterOperator.GREATER_THAN_OR_EQUAL.value:
                     raise ValueError(
                         f"Invalid operator for relevance: {item.op.value}. Only GREATER_THAN_OR_EQUAL is allowed."
@@ -345,7 +352,15 @@ class KnowledgeBaseTable:
                     f"Keyword search returned different columns: {df_keyword_select.columns} "
                     f"than expected: {df.columns}"
                 )
+            if hybrid_search_alpha:
+                df_keyword_select[TableField.DISTANCE.value] = (
+                    hybrid_search_alpha * df_keyword_select[TableField.DISTANCE.value]
+                )
+                df[TableField.DISTANCE.value] = (1 - hybrid_search_alpha) * df[TableField.DISTANCE.value]
             df = pd.concat([df, df_keyword_select], ignore_index=True)
+            # sort by distance if distance column exists
+            if TableField.DISTANCE.value in df.columns:
+                df = df.sort_values(by=TableField.DISTANCE.value, ascending=True)
             # if chunk_id column exists remove duplicates based on chunk_id
             if "chunk_id" in df.columns:
                 df = df.drop_duplicates(subset=["chunk_id"])
@@ -519,6 +534,9 @@ class KnowledgeBaseTable:
 
         query.update_columns[emb_col] = Constant(self._content_to_embeddings(content))
 
+        if "metadata" not in query.update_columns:
+            query.update_columns["metadata"] = Constant({})
+
         # TODO search content in where clause?
 
         # set table name
@@ -1010,6 +1028,9 @@ class KnowledgeBaseController:
         :param is_sparse: Whether to use sparse vectors for embeddings
         :param vector_size: Optional size specification for vectors, required when is_sparse=True
         """
+        if not name.islower():
+            raise ValueError(f"The name must be in lower case: {name}")
+
         # fill variables
         params = variables_controller.fill_parameters(params)
 
@@ -1186,6 +1207,13 @@ class KnowledgeBaseController:
         if "provider" not in params:
             raise ValueError("'provider' parameter is required for embedding model")
 
+        # check available providers
+        avail_providers = ("openai", "azure_openai", "bedrock", "gemini", "google")
+        if params["provider"] not in avail_providers:
+            raise ValueError(
+                f"Wrong embedding provider: {params['provider']}. Available providers: {', '.join(avail_providers)}"
+            )
+
         if params["provider"] not in ("openai", "azure_openai"):
             # try use litellm
             try:
mindsdb/interfaces/knowledge_base/evaluate.py
@@ -1,5 +1,6 @@
 import json
 import math
+import re
 import time
 from typing import List
 
@@ -16,15 +17,15 @@ logger = log.getLogger(__name__)
 
 
 GENERATE_QA_SYSTEM_PROMPT = """
-    Your task is to generate question and answer pairs for a search engine. 
+    Your task is to generate question and answer pairs for a search engine.
     The search engine will take your query and return a list of documents.
     You will be given a text and you need to generate a question that can be answered using the information in the text.
     Your questions will be used to evaluate the search engine.
-    Question should always have enough clues to identify the specific text that this question is generated from. 
+    Question should always have enough clues to identify the specific text that this question is generated from.
     Never ask questions like "What license number is associated with Amend 6" because Amend 6 could be found in many documents and the question is not specific enough.
-    Example output 1: {\"query\": \"What processor does the HP 2023 14\" FHD IPS Laptop use?\", \"reference_answer\": \"Ryzen 3 5300U\"} 
+    Example output 1: {\"query\": \"What processor does the HP 2023 14\" FHD IPS Laptop use?\", \"reference_answer\": \"Ryzen 3 5300U\"}
     Example output 2: {\"query\": \"What is the name of the river in Paris?\", \"reference_answer\": \"Seine\"}
-    Don't generate questions like "What is being amended in the application?" because these questions cannot be answered using the text and without knowing which document it refers to. 
+    Don't generate questions like "What is being amended in the application?" because these questions cannot be answered using the text and without knowing which document it refers to.
     The question should be answerable without the text, but the answer should be present in the text.
     Return ONLY a json response. No other text.
 """
@@ -43,6 +44,39 @@ def calc_entropy(values: List[float]) -> float:
     return -sum([pk * math.log(pk) for pk in values])
 
 
+def sanitize_json_response(response: str) -> str:
+    """Remove markdown code block formatting from JSON response and extract valid JSON."""
+    if not response or not response.strip():
+        raise ValueError("Empty response provided.")
+
+    # Remove leading/trailing whitespace
+    response = response.strip()
+
+    # Remove markdown code block markers if present
+    response = re.sub(r"^```(?:json|JSON)?\s*", "", response, flags=re.MULTILINE)
+    response = re.sub(r"\s*```$", "", response, flags=re.MULTILINE)
+    response = response.strip()
+
+    # Find the first opening brace
+    start_idx = response.find("{")
+    if start_idx == -1:
+        raise ValueError("No JSON object found in the response.")
+
+    # Try to parse JSON starting from first { with increasing end positions
+    # This handles nested objects and strings with braces correctly
+    for end_idx in range(len(response), start_idx, -1):  # Start from end and work backwards
+        candidate = response[start_idx:end_idx]
+        try:
+            parsed = json.loads(candidate)
+            # Ensure it's a dictionary (object) not just any valid JSON
+            if isinstance(parsed, dict):
+                return candidate
+        except json.JSONDecodeError:
+            continue
+
+    raise ValueError("No valid JSON object found in the response.")
+
+
 class EvaluateBase:
     DEFAULT_QUESTION_COUNT = 20
     DEFAULT_SAMPLE_SIZE = 10000
@@ -84,7 +118,8 @@ class EvaluateBase:
 
         dn, table_name = self._get_dn_table(query.from_table)
         query.from_table = table_name
-        query.limit = Constant(self.DEFAULT_SAMPLE_SIZE)
+        if query.limit is None:
+            query.limit = Constant(self.DEFAULT_SAMPLE_SIZE)
 
         response = dn.query(query=query, session=self.session)
         df = response.data_frame
@@ -178,6 +213,7 @@ class EvaluateBase:
         test_data = self.read_from_table(test_table)
 
         scores = self.evaluate(test_data)
+        scores["id"] = math.floor(time.time())  # unique ID for the evaluation run
         scores["name"] = self.name
         scores["created_at"] = dt.datetime.now()
 
@@ -237,9 +273,13 @@ class EvaluateRerank(EvaluateBase):
             {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
             {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
         ]
-        answer = self.llm_client.completion(messages)
+        answer = self.llm_client.completion(messages, json_output=True)
+
+        # Sanitize the response by removing markdown code block formatting like ```json
+        sanitized_answer = sanitize_json_response(answer)
+
         try:
-            output = json.loads(answer)
+            output = json.loads(sanitized_answer)
         except json.JSONDecodeError:
             raise ValueError(f"Could not parse response from LLM: {answer}")
 
@@ -448,9 +488,13 @@ class EvaluateDocID(EvaluateBase):
             {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
             {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
         ]
-        answer = self.llm_client.completion(messages)
+        answer = self.llm_client.completion(messages, json_output=True)
+
+        # Sanitize the response by removing markdown code block formatting like ```json
+        sanitized_answer = sanitize_json_response(answer)
+
         try:
-            output = json.loads(answer)
+            output = json.loads(sanitized_answer)
         except json.JSONDecodeError:
             raise ValueError(f"Could not parse response from LLM: {answer}")
 