MindsDB 25.4.1.0__py3-none-any.whl → 25.4.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/command_executor.py +62 -61
- mindsdb/api/executor/data_types/answer.py +9 -12
- mindsdb/api/executor/datahub/classes/response.py +11 -0
- mindsdb/api/executor/datahub/datanodes/datanode.py +4 -4
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +7 -9
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +22 -16
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +20 -20
- mindsdb/api/executor/planner/plan_join.py +1 -1
- mindsdb/api/executor/planner/steps.py +2 -1
- mindsdb/api/executor/sql_query/result_set.py +10 -7
- mindsdb/api/executor/sql_query/sql_query.py +36 -82
- mindsdb/api/executor/sql_query/steps/delete_step.py +2 -3
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +5 -3
- mindsdb/api/executor/sql_query/steps/insert_step.py +2 -2
- mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +20 -8
- mindsdb/api/executor/sql_query/steps/update_step.py +4 -6
- mindsdb/api/http/namespaces/sql.py +4 -1
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/ok_packet.py +1 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +4 -27
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +1 -0
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +38 -37
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -13
- mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +1 -1
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -2
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +4 -4
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +19 -5
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +9 -4
- mindsdb/integrations/handlers/redshift_handler/redshift_handler.py +1 -1
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +18 -11
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -2
- mindsdb/integrations/libs/response.py +9 -4
- mindsdb/integrations/libs/vectordatabase_handler.py +17 -5
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +8 -98
- mindsdb/interfaces/database/log.py +8 -9
- mindsdb/interfaces/database/projects.py +1 -5
- mindsdb/interfaces/functions/controller.py +59 -17
- mindsdb/interfaces/functions/to_markdown.py +194 -0
- mindsdb/interfaces/jobs/jobs_controller.py +3 -3
- mindsdb/interfaces/knowledge_base/controller.py +101 -60
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +3 -14
- mindsdb/interfaces/query_context/context_controller.py +3 -1
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/METADATA +231 -230
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/RECORD +48 -46
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/top_level.txt +0 -0

mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py

@@ -1,5 +1,6 @@
 import ast
 import sys
+import os
 from typing import Dict, List, Optional, Union
 import hashlib
 
@@ -67,6 +68,8 @@ class ChromaDBHandler(VectorStoreHandler):
             "persist_directory": self.persist_directory,
         }
 
+        self._use_handler_storage = False
+
         self.connect()
 
     def validate_connection_parameters(self, name, **kwargs):
@@ -79,11 +82,15 @@ class ChromaDBHandler(VectorStoreHandler):
 
         config = ChromaHandlerConfig(**_config)
 
-        if config.persist_directory and not self.handler_storage.is_temporal:
-            # get full persistence directory from handler storage
-            self.persist_directory = self.handler_storage.folder_get(
-                config.persist_directory
-            )
+        if config.persist_directory:
+            if os.path.isabs(config.persist_directory):
+                self.persist_directory = config.persist_directory
+            elif not self.handler_storage.is_temporal:
+                # get full persistence directory from handler storage
+                self.persist_directory = self.handler_storage.folder_get(
+                    config.persist_directory
+                )
+                self._use_handler_storage = True
 
         return config
 
@@ -105,7 +112,7 @@ class ChromaDBHandler(VectorStoreHandler):
 
     def _sync(self):
         """Sync the database to disk if using persistent storage"""
-        if self.persist_directory:
+        if self.persist_directory and self._use_handler_storage:
             self.handler_storage.folder_sync(self.persist_directory)
 
     def __del__(self):
@@ -162,6 +169,8 @@ class ChromaDBHandler(VectorStoreHandler):
             FilterOperator.LESS_THAN_OR_EQUAL: "$lte",
             FilterOperator.GREATER_THAN: "$gt",
             FilterOperator.GREATER_THAN_OR_EQUAL: "$gte",
+            FilterOperator.IN: "$in",
+            FilterOperator.NOT_IN: "$nin",
         }
 
         if operator not in mapping:
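The two new mappings let SQL IN / NOT IN filters reach ChromaDB's native operators. A rough illustration of the translation follows; the column name and values are invented for the example, not taken from the diff:

# Hypothetical illustration of the operator mapping added above.
# A SQL filter such as  WHERE author IN ('alice', 'bob')
# becomes a Chroma `where` document using the "$in" operator:
where = {"author": {"$in": ["alice", "bob"]}}
# and  WHERE author NOT IN ('alice', 'bob')  maps to "$nin":
where_not = {"author": {"$nin": ["alice", "bob"]}}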
@@ -308,7 +317,7 @@ class ChromaDBHandler(VectorStoreHandler):
         }
 
         if columns is not None:
-            payload = {column: payload[column] for column in columns}
+            payload = {column: payload[column] for column in columns if column != TableField.DISTANCE.value}
 
         # always include distance
         distance_filter = None
@@ -316,10 +325,11 @@ class ChromaDBHandler(VectorStoreHandler):
         if distances is not None:
             payload[distance_col] = distances
 
+        if conditions is not None:
+            for cond in conditions:
+                if cond.column == distance_col:
+                    distance_filter = cond
+                    break
 
         df = pd.DataFrame(payload)
         if distance_filter is not None:
@@ -413,8 +423,8 @@ class ChromaDBHandler(VectorStoreHandler):
             collection.upsert(
                 ids=data_dict[TableField.ID.value],
                 documents=data_dict[TableField.CONTENT.value],
-                embeddings=data_dict.get(TableField.EMBEDDINGS.value),
-                metadatas=data_dict.get(TableField.METADATA.value)
+                embeddings=data_dict.get(TableField.EMBEDDINGS.value, None),
+                metadatas=data_dict.get(TableField.METADATA.value, None)
             )
             self._sync()
         except Exception as e:
mindsdb/integrations/handlers/mssql_handler/mssql_handler.py

@@ -177,7 +177,7 @@ class SqlServerHandler(DatabaseHandler):
                         )
                     )
                 else:
-                    response = Response(RESPONSE_TYPE.OK)
+                    response = Response(RESPONSE_TYPE.OK, affected_rows=cur.rowcount)
                 connection.commit()
             except Exception as e:
                 logger.error(f'Error running query: {query} on {self.database}, {e}!')
mindsdb/integrations/handlers/mysql_handler/mysql_handler.py

@@ -178,10 +178,11 @@ class MySQLHandler(DatabaseHandler):
                         pd.DataFrame(
                             result,
                             columns=[x[0] for x in cur.description]
-                        )
+                        ),
+                        affected_rows=cur.rowcount
                     )
                 else:
-                    response = Response(RESPONSE_TYPE.OK)
+                    response = Response(RESPONSE_TYPE.OK, affected_rows=cur.rowcount)
             except mysql.connector.Error as e:
                 logger.error(f'Error running query: {query} on {self.connection_data["database"]}!')
                 response = Response(
mindsdb/integrations/handlers/oracle_handler/oracle_handler.py

@@ -205,8 +205,10 @@ class OracleHandler(DatabaseHandler):
         with connection.cursor() as cur:
             try:
                 cur.execute(query)
-                result = cur.fetchall()
-                if result:
+                if cur.description is None:
+                    response = Response(RESPONSE_TYPE.OK, affected_rows=cur.rowcount)
+                else:
+                    result = cur.fetchall()
                     response = Response(
                         RESPONSE_TYPE.TABLE,
                         data_frame=pd.DataFrame(
@@ -214,8 +216,6 @@ class OracleHandler(DatabaseHandler):
                             columns=[row[0] for row in cur.description],
                         ),
                     )
-                else:
-                    response = Response(RESPONSE_TYPE.OK)
 
                 connection.commit()
             except DatabaseError as database_error:
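The Oracle change relies on standard DB-API cursor behaviour: `cursor.description` is None for statements that return no result set, while `cursor.rowcount` reports how many rows a DML statement touched. A generic sketch of that pattern (connection setup omitted; this is not MindsDB code):

# Generic DB-API 2.0 pattern, shown only to illustrate the branch above.
cur.execute(query)
if cur.description is None:
    # No result set (INSERT/UPDATE/DELETE/DDL): report affected rows.
    affected = cur.rowcount          # may be -1 if the driver cannot tell
else:
    # A result set is available (SELECT): fetch it.
    rows = cur.fetchall()
    columns = [col[0] for col in cur.description]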
mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py

@@ -149,7 +149,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
         for key, value in filter_conditions.items():
             if key == "embeddings":
                 continue
-            if value['op'].lower()
+            if value['op'].lower() in ('in', 'not in'):
                 values = list(repr(i) for i in value['value'])
                 value['value'] = '({})'.format(', '.join(values))
             else:
@@ -165,9 +165,9 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
 
     @staticmethod
     def _construct_full_after_from_clause(
+        where_clause: str,
         offset_clause: str,
         limit_clause: str,
-        where_clause: str,
     ) -> str:
 
         return f"{where_clause} {offset_clause} {limit_clause}"
@@ -200,10 +200,20 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
             where_clause, offset_clause, limit_clause
         )
 
+        # Handle distance column specially since it's calculated, not stored
+        modified_columns = []
+        has_distance = False
+        if columns is not None:
+            for col in columns:
+                if col == TableField.DISTANCE.value:
+                    has_distance = True
+                else:
+                    modified_columns.append(col)
         else:
+            modified_columns = ['id', 'content', 'embeddings', 'metadata']
+            has_distance = True
+
+        targets = ', '.join(modified_columns)
 
 
         if filter_conditions:
@@ -227,6 +237,10 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
             # Use cosine similarity for dense vectors
             distance_op = "<=>"
 
+            # Calculate distance as part of the query if needed
+            if has_distance:
+                targets = f"{targets}, (embeddings {distance_op} '{search_vector}') as distance"
+
             return f"SELECT {targets} FROM {table_name} ORDER BY embeddings {distance_op} '{search_vector}' ASC {after_from_clause}"
 
         else:
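With `has_distance` set, the handler now computes the cosine distance inline instead of treating it as a stored column. Assuming a table named items and a three-dimensional search vector (both invented for illustration), the generated statement would look roughly like the string built below:

# Illustrative only: the kind of SELECT the new branch produces.
# Table name, vector and the trailing LIMIT stand in for the real
# after-FROM clause assembled elsewhere in the handler.
targets = "id, content, embeddings, metadata"
distance_op = "<=>"                  # pgvector cosine-distance operator
search_vector = "[0.1, 0.2, 0.3]"
sql = (
    f"SELECT {targets}, (embeddings {distance_op} '{search_vector}') as distance "
    f"FROM items ORDER BY embeddings {distance_op} '{search_vector}' ASC LIMIT 5"
)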
mindsdb/integrations/handlers/postgres_handler/postgres_handler.py

@@ -228,7 +228,7 @@ class PostgresHandler(DatabaseHandler):
                 else:
                     cur.execute(query)
                 if cur.pgresult is None or ExecStatus(cur.pgresult.status) == ExecStatus.COMMAND_OK:
-                    response = Response(RESPONSE_TYPE.OK)
+                    response = Response(RESPONSE_TYPE.OK, affected_rows=cur.rowcount)
                 else:
                     result = cur.fetchall()
                     df = DataFrame(
@@ -238,7 +238,8 @@ class PostgresHandler(DatabaseHandler):
                     self._cast_dtypes(df, cur.description)
                     response = Response(
                         RESPONSE_TYPE.TABLE,
-                        df
+                        data_frame=df,
+                        affected_rows=cur.rowcount
                     )
             connection.commit()
         except Exception as e:
@@ -255,15 +256,16 @@ class PostgresHandler(DatabaseHandler):
 
         return response
 
-    def insert(self, table_name: str, df: pd.DataFrame):
+    def insert(self, table_name: str, df: pd.DataFrame) -> Response:
         need_to_close = not self.is_connected
 
         connection = self.connect()
 
         columns = [f'"{c}"' for c in df.columns]
+        rowcount = None
         with connection.cursor() as cur:
             try:
-                with cur.copy(f'copy "{table_name}" ({",".join(columns)}) from STDIN
+                with cur.copy(f'copy "{table_name}" ({",".join(columns)}) from STDIN WITH CSV') as copy:
                     df.to_csv(copy, index=False, header=False)
 
                 connection.commit()
@@ -271,10 +273,13 @@ class PostgresHandler(DatabaseHandler):
                 logger.error(f'Error running insert to {table_name} on {self.database}, {e}!')
                 connection.rollback()
                 raise e
+            rowcount = cur.rowcount
 
         if need_to_close:
             self.disconnect()
 
+        return Response(RESPONSE_TYPE.OK, affected_rows=rowcount)
+
     @profiler.profile()
     def query(self, query: ASTNode) -> Response:
         """
mindsdb/integrations/handlers/redshift_handler/redshift_handler.py

@@ -52,7 +52,7 @@ class RedshiftHandler(PostgresHandler):
         with connection.cursor() as cur:
             try:
                 cur.executemany(query, df.values.tolist())
-                response = Response(RESPONSE_TYPE.OK)
+                response = Response(RESPONSE_TYPE.OK, affected_rows=cur.rowcount)
 
                 connection.commit()
             except Exception as e:
mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py

@@ -230,18 +230,25 @@ class SnowflakeHandler(DatabaseHandler):
                 # Fallback for CREATE/DELETE/UPDATE. These commands returns table with single column,
                 # but it cannot be retrieved as pandas DataFrame.
                 result = cur.fetchall()
+                match result:
+                    case (
+                        [{'number of rows inserted': affected_rows}]
+                        | [{'number of rows deleted': affected_rows}]
+                        | [{'number of rows updated': affected_rows, 'number of multi-joined rows updated': _}]
+                    ):
+                        response = Response(RESPONSE_TYPE.OK, affected_rows=affected_rows)
+                    case list():
+                        response = Response(
+                            RESPONSE_TYPE.TABLE,
+                            DataFrame(
+                                result,
+                                columns=[x[0] for x in cur.description]
+                            )
                         )
-                response = Response(RESPONSE_TYPE.OK)
+                    case _:
+                        # Looks like SnowFlake always returns something in response, so this is suspicious
+                        logger.warning('Snowflake did not return any data in response.')
+                        response = Response(RESPONSE_TYPE.OK)
             except Exception as e:
                 logger.error(f"Error running query: {query} on {self.connection_data.get('database')}, {e}!")
                 response = Response(
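The rewritten fallback uses Python 3.10 structural pattern matching: a single-row result whose dict carries one of Snowflake's row-count keys is treated as a DML acknowledgement, any other list is returned as a table, and anything else falls through to the warning. A standalone sketch of the same dispatch, with invented sample rows:

# Standalone illustration of the match statement above; the input rows
# are hypothetical examples of what a Snowflake dict cursor might return.
def classify(result):
    match result:
        case (
            [{'number of rows inserted': affected_rows}]
            | [{'number of rows deleted': affected_rows}]
        ):
            return f"DML ok, {affected_rows} rows affected"
        case list() if result:
            return f"table with {len(result)} rows"
        case _:
            return "empty response"

print(classify([{'number of rows inserted': 3}]))   # DML ok, 3 rows affected
print(classify([{'a': 1}, {'a': 2}]))               # table with 2 rows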
mindsdb/integrations/libs/ml_handler_process/learn_process.py

@@ -78,8 +78,7 @@ def learn_process(data_integration_ref: dict, problem_definition: dict, fetch_da
     query_ast = parse_sql(fetch_data_query)
     sqlquery = SQLQuery(query_ast, session=sql_session)
 
-    training_data_df = result['result']
+    training_data_df = sqlquery.fetched_data.to_df()
 
     training_data_columns_count, training_data_rows_count = 0, 0
     if training_data_df is not None:
mindsdb/integrations/libs/response.py

@@ -1,3 +1,4 @@
+from typing import Optional
 from pandas import DataFrame
 
 from mindsdb.utilities import log
@@ -8,13 +9,16 @@ from mindsdb_sql_parser.ast import ASTNode
 logger = log.getLogger(__name__)
 
 class HandlerResponse:
-    def __init__(self, resp_type: RESPONSE_TYPE, data_frame: DataFrame = None,
+    def __init__(self, resp_type: RESPONSE_TYPE, data_frame: DataFrame = None, query: ASTNode = 0, error_code: int = 0,
+                 error_message: Optional[str] = None, affected_rows: Optional[int] = None) -> None:
         self.resp_type = resp_type
         self.query = query
         self.data_frame = data_frame
         self.error_code = error_code
         self.error_message = error_message
+        self.affected_rows = affected_rows
+        if isinstance(self.affected_rows, int) is False or self.affected_rows < 0:
+            self.affected_rows = 0
 
     @property
     def type(self):
@@ -35,13 +39,14 @@ class HandlerResponse:
                 "error": self.error_message}
 
    def __repr__(self):
-        return "%s: resp_type=%s, query=%s, data_frame=%s, err_code=%s, error=%s" % (
+        return "%s: resp_type=%s, query=%s, data_frame=%s, err_code=%s, error=%s, affected_rows=%s" % (
            self.__class__.__name__,
            self.resp_type,
            self.query,
            self.data_frame,
            self.error_code,
-            self.error_message
+            self.error_message,
+            self.affected_rows
        )
 
 class HandlerStatusResponse:
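Taken together, these two hunks give every handler response an `affected_rows` attribute that is always a non-negative integer. A short usage sketch, assuming only the constructor shown above:

from mindsdb.integrations.libs.response import RESPONSE_TYPE, HandlerResponse

# A handler reporting a successful DML statement:
ok = HandlerResponse(RESPONSE_TYPE.OK, affected_rows=5)
print(ok.affected_rows)        # 5

# DB-API drivers report rowcount == -1 when the count is unknown, and callers
# may omit the argument entirely; both cases are normalised to 0 by __init__.
print(HandlerResponse(RESPONSE_TYPE.OK, affected_rows=-1).affected_rows)   # 0
print(HandlerResponse(RESPONSE_TYPE.OK).affected_rows)                     # 0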
mindsdb/integrations/libs/vectordatabase_handler.py

@@ -20,7 +20,7 @@ from mindsdb_sql_parser.ast.base import ASTNode
 
 from mindsdb.integrations.libs.response import RESPONSE_TYPE, HandlerResponse
 from mindsdb.utilities import log
-from mindsdb.integrations.utilities.sql_utils import
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
 
 from mindsdb.integrations.utilities.query_traversal import query_traversal
 from .base import BaseHandler
@@ -39,6 +39,7 @@ class TableField(Enum):
     METADATA = "metadata"
     SEARCH_VECTOR = "search_vector"
     DISTANCE = "distance"
+    RELEVANCE = "relevance"
 
 
 class DistanceFunction(Enum):
@@ -69,6 +70,10 @@ class VectorStoreHandler(BaseHandler):
             "name": TableField.METADATA.value,
             "data_type": "json",
         },
+        {
+            "name": TableField.DISTANCE.value,
+            "data_type": "float",
+        },
     ]
 
     def validate_connection_parameters(self, name, **kwargs):
@@ -231,7 +236,7 @@ class VectorStoreHandler(BaseHandler):
 
         return self.do_upsert(table_name, pd.DataFrame(data))
 
-    def
+    def dispatch_update(self, query: Update, conditions: List[FilterCondition] = None):
         """
         Dispatch update query to the appropriate method.
         """
@@ -250,8 +255,15 @@ class VectorStoreHandler(BaseHandler):
                 pass
             row[k] = v
 
+        if conditions is None:
+            where_statement = query.where
+            conditions = self.extract_conditions(where_statement)
+
+        for condition in conditions:
+            if condition.op != FilterOperator.EQUAL:
+                raise NotImplementedError
+
+            row[condition.column] = condition.value
 
         # checks
         if TableField.EMBEDDINGS.value not in row:
@@ -381,7 +393,7 @@ class VectorStoreHandler(BaseHandler):
             CreateTable: self._dispatch_create_table,
             DropTables: self._dispatch_drop_table,
             Insert: self._dispatch_insert,
-            Update: self.
+            Update: self.dispatch_update,
             Delete: self.dispatch_delete,
             Select: self.dispatch_select,
         }
mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py

@@ -54,7 +54,7 @@ class LLMReranker(BaseDocumentCompressor):
             max_retries=2  # Client-level retries
         )
 
-    async def search_relevancy(self, query: str, document: str) -> Any:
+    async def search_relevancy(self, query: str, document: str, custom_event: bool = True) -> Any:
         await self._init_client()
 
         async with self._semaphore:
@@ -82,7 +82,8 @@ class LLMReranker(BaseDocumentCompressor):
                         }
 
                         # Stream reranking update.
-                        dispatch_custom_event("rerank", rerank_data)
+                        if custom_event:
+                            dispatch_custom_event("rerank", rerank_data)
                         return rerank_data
 
                     except Exception as e:
@@ -93,7 +94,7 @@ class LLMReranker(BaseDocumentCompressor):
                         retry_delay = self.retry_delay * (2 ** attempt) + random.uniform(0, 0.1)
                         await asyncio.sleep(retry_delay)
 
-    async def _rank(self, query_document_pairs: List[Tuple[str, str]]) -> List[Tuple[str, float]]:
+    async def _rank(self, query_document_pairs: List[Tuple[str, str]], custom_event: bool = True) -> List[Tuple[str, float]]:
         ranked_results = []
 
         # Process in larger batches for better throughput
@@ -102,7 +103,7 @@ class LLMReranker(BaseDocumentCompressor):
             batch = query_document_pairs[i:i + batch_size]
             try:
                 results = await asyncio.gather(
-                    *[self.search_relevancy(query=query, document=document) for (query, document) in batch],
+                    *[self.search_relevancy(query=query, document=document, custom_event=custom_event) for (query, document) in batch],
                     return_exceptions=True
                 )
 
@@ -227,16 +228,7 @@ class LLMReranker(BaseDocumentCompressor):
             "remove_irrelevant": self.remove_irrelevant,
         }
 
-    def get_scores(self, query: str, documents: list[str],
-        """
-        Get relevance scores for documents given a query.
-        Args:
-            query: The query text
-            documents: List of document texts to score
-            disable_events: Whether to disable event dispatching (default True)
-        Returns:
-            List of relevance scores
-        """
+    def get_scores(self, query: str, documents: list[str], custom_event: bool = False):
         query_document_pairs = [(query, doc) for doc in documents]
         # Create event loop and run async code
         import asyncio
@@ -246,89 +238,7 @@ class LLMReranker(BaseDocumentCompressor):
             # If no running loop exists, create a new one
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
-
-
-            # Create a wrapper function that doesn't dispatch events
-            async def _rank_without_events(query_document_pairs):
-                ranked_results = []
-                # Process in larger batches for better throughput
-                batch_size = min(self.max_concurrent_requests * 2, len(query_document_pairs))
-                for i in range(0, len(query_document_pairs), batch_size):
-                    batch = query_document_pairs[i:i + batch_size]
-                    try:
-                        # Define a no-events version of search_relevancy inside this closure
-                        async def search_relevancy_no_events(query, document):
-                            await self._init_client()
-                            async with self._semaphore:
-                                for attempt in range(self.max_retries):
-                                    try:
-                                        response = await self.client.chat.completions.create(
-                                            model=self.model,
-                                            messages=[
-                                                {"role": "system", "content": "Rate the relevance of the document to the query. Respond with 'yes' or 'no'."},
-                                                {"role": "user", "content": f"Query: {query}\nDocument: {document}\nIs this document relevant?"}
-                                            ],
-                                            temperature=self.temperature,
-                                            n=1,
-                                            logprobs=True,
-                                            max_tokens=1
-                                        )
-                                        # Extract response and confidence score
-                                        answer = response.choices[0].message.content
-                                        logprob = response.choices[0].logprobs.content[0].logprob
-                                        # No event dispatch here
-                                        return {"document": document, "answer": answer, "logprob": logprob}
-                                    except Exception as e:
-                                        if attempt == self.max_retries - 1:
-                                            log.error(f"Failed after {self.max_retries} attempts: {str(e)}")
-                                            raise
-                                        # Exponential backoff with jitter
-                                        retry_delay = self.retry_delay * (2 ** attempt) + random.uniform(0, 0.1)
-                                        await asyncio.sleep(retry_delay)
-                        # Use our no-events version for this batch
-                        results = await asyncio.gather(
-                            *[search_relevancy_no_events(query=query, document=document) for (query, document) in batch],
-                            return_exceptions=True
-                        )
-                        for idx, result in enumerate(results):
-                            if isinstance(result, Exception):
-                                log.error(f"Error processing document {i+idx}: {str(result)}")
-                                ranked_results.append((batch[idx][1], 0.0))
-                                continue
-                            answer = result["answer"]
-                            logprob = result["logprob"]
-                            prob = math.exp(logprob)
-                            # Convert answer to score using the model's confidence
-                            if answer.lower().strip() == "yes":
-                                score = prob  # If yes, use the model's confidence
-                            elif answer.lower().strip() == "no":
-                                score = 1 - prob  # If no, invert the confidence
-                            else:
-                                score = 0.5 * prob  # For unclear answers, reduce confidence
-                            ranked_results.append((batch[idx][1], score))
-                            # Check if we should stop early
-                            try:
-                                high_scoring_docs = [r for r in ranked_results if r[1] >= self.filtering_threshold]
-                                can_stop_early = (
-                                    self.early_stop  # Early stopping is enabled
-                                    and self.num_docs_to_keep  # We have a target number of docs
-                                    and len(high_scoring_docs) >= self.num_docs_to_keep  # Found enough good docs
-                                    and score >= self.early_stop_threshold  # Current doc is good enough
-                                )
-                                if can_stop_early:
-                                    log.info(f"Early stopping after finding {self.num_docs_to_keep} documents with high confidence")
-                                    return ranked_results
-                            except Exception as e:
-                                # Don't let early stopping errors stop the whole process
-                                log.warning(f"Error in early stopping check: {str(e)}")
-                    except Exception as e:
-                        log.error(f"Batch processing error: {str(e)}")
-                        continue
-                return ranked_results
-            # Use our no-events version
-            documents_and_scores = loop.run_until_complete(_rank_without_events(query_document_pairs))
-        else:
-            # Use the original _rank method
-            documents_and_scores = loop.run_until_complete(self._rank(query_document_pairs))
+
+        documents_and_scores = loop.run_until_complete(self._rank(query_document_pairs, custom_event=custom_event))
         scores = [score for _, score in documents_and_scores]
         return scores
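The `custom_event` flag now threads from `get_scores` through `_rank` into `search_relevancy`, replacing the duplicated "no events" code path deleted above. A hedged usage sketch; the constructor arguments are assumptions for illustration, not taken from the diff:

from mindsdb.integrations.utilities.rag.rerankers.reranker_compressor import LLMReranker

# Constructor arguments are illustrative; the class accepts more settings.
reranker = LLMReranker(model="gpt-4o", temperature=0.0)

docs = ["MindsDB connects models to data.", "Unrelated text about cooking."]
scores = reranker.get_scores("What is MindsDB?", docs)                        # no rerank events dispatched
scores_streamed = reranker.get_scores("What is MindsDB?", docs, custom_event=True)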
mindsdb/interfaces/database/log.py

@@ -1,21 +1,21 @@
+from typing import List
 from copy import deepcopy
 from abc import ABC, abstractmethod
-from typing import List, Union, Tuple
 from collections import OrderedDict
 
 import pandas as pd
-
 from mindsdb_sql_parser import parse_sql
 from mindsdb_sql_parser.ast import Select, Identifier, Star, BinaryOperation, Constant, Join, Function
 from mindsdb_sql_parser.utils import JoinType
+
 from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
 from mindsdb.integrations.utilities.query_traversal import query_traversal
-
 from mindsdb.utilities.functions import resolve_table_identifier
 from mindsdb.api.executor.utilities.sql import get_query_tables
 from mindsdb.utilities.exception import EntityNotExistsError
 import mindsdb.interfaces.storage.db as db
 from mindsdb.utilities.context import context as ctx
+from mindsdb.api.executor.datahub.classes.response import DataHubResponse
 from mindsdb.api.executor.datahub.classes.tables_row import (
     TABLES_ROW_TYPE,
     TablesRow,
@@ -223,8 +223,7 @@ class LogDBController:
             for table_name in self._tables.keys()
         ]
 
-    def query(self, query: Select = None, native_query: str = None,
-              session=None, return_as: str = 'split') -> Union[pd.DataFrame, Tuple[pd.DataFrame, list]]:
+    def query(self, query: Select = None, native_query: str = None, session=None) -> DataHubResponse:
         if native_query is not None:
             if query is not None:
                 raise Exception("'query' and 'native_query' arguments can not be used together")
@@ -286,12 +285,12 @@ class LogDBController:
             df[df_column_name] = df[df_column_name].astype(column_type)
         # endregion
 
-        if return_as != 'split':
-            return df
-
         columns_info = [{
             'name': k,
             'type': v
         } for k, v in df.dtypes.items()]
 
-        return
+        return DataHubResponse(
+            data_frame=df,
+            columns=columns_info
+        )
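LogDBController.query (and, per the file list, the other datanodes) now returns a DataHubResponse instead of a bare DataFrame or a (df, columns) tuple. The new class lives in mindsdb/api/executor/datahub/classes/response.py (+11 lines), which is not shown on this page; a minimal sketch consistent with the call site above, whose real definition may differ, is:

from dataclasses import dataclass, field
from typing import List

from pandas import DataFrame


@dataclass
class DataHubResponse:
    # Assumed shape, inferred only from how the object is constructed above.
    data_frame: DataFrame = None
    columns: List[dict] = field(default_factory=list)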
mindsdb/interfaces/database/projects.py

@@ -137,14 +137,10 @@ class Project:
                 view_meta['query_ast'],
                 session=session
            )
+            df = sqlquery.fetched_data.to_df()
        finally:
            query_context_controller.release_context('view', view_meta['id'])
 
-        if result['success'] is False:
-            raise Exception(f"Cant execute view query: {view_meta['query_ast']}")
-        df = result['result']
         # remove duplicated columns
         df = df.loc[:, ~df.columns.duplicated()]
 