MindsDB 25.4.1.0__py3-none-any.whl → 25.4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of MindsDB might be problematic.

Files changed (48):
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/executor/command_executor.py +62 -61
  3. mindsdb/api/executor/data_types/answer.py +9 -12
  4. mindsdb/api/executor/datahub/classes/response.py +11 -0
  5. mindsdb/api/executor/datahub/datanodes/datanode.py +4 -4
  6. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +7 -9
  7. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +22 -16
  8. mindsdb/api/executor/datahub/datanodes/project_datanode.py +20 -20
  9. mindsdb/api/executor/planner/plan_join.py +1 -1
  10. mindsdb/api/executor/planner/steps.py +2 -1
  11. mindsdb/api/executor/sql_query/result_set.py +10 -7
  12. mindsdb/api/executor/sql_query/sql_query.py +36 -82
  13. mindsdb/api/executor/sql_query/steps/delete_step.py +2 -3
  14. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +5 -3
  15. mindsdb/api/executor/sql_query/steps/insert_step.py +2 -2
  16. mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -2
  17. mindsdb/api/executor/sql_query/steps/subselect_step.py +20 -8
  18. mindsdb/api/executor/sql_query/steps/update_step.py +4 -6
  19. mindsdb/api/http/namespaces/sql.py +4 -1
  20. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/ok_packet.py +1 -1
  21. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +4 -27
  22. mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +1 -0
  23. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +38 -37
  24. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -13
  25. mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +1 -1
  26. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -2
  27. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +4 -4
  28. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +19 -5
  29. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +9 -4
  30. mindsdb/integrations/handlers/redshift_handler/redshift_handler.py +1 -1
  31. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +18 -11
  32. mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -2
  33. mindsdb/integrations/libs/response.py +9 -4
  34. mindsdb/integrations/libs/vectordatabase_handler.py +17 -5
  35. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +8 -98
  36. mindsdb/interfaces/database/log.py +8 -9
  37. mindsdb/interfaces/database/projects.py +1 -5
  38. mindsdb/interfaces/functions/controller.py +59 -17
  39. mindsdb/interfaces/functions/to_markdown.py +194 -0
  40. mindsdb/interfaces/jobs/jobs_controller.py +3 -3
  41. mindsdb/interfaces/knowledge_base/controller.py +101 -60
  42. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +3 -14
  43. mindsdb/interfaces/query_context/context_controller.py +3 -1
  44. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/METADATA +231 -230
  45. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/RECORD +48 -46
  46. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/WHEEL +0 -0
  47. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/licenses/LICENSE +0 -0
  48. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/top_level.txt +0 -0

mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py

@@ -1,5 +1,6 @@
 import ast
 import sys
+import os
 from typing import Dict, List, Optional, Union
 import hashlib
 
@@ -67,6 +68,8 @@ class ChromaDBHandler(VectorStoreHandler):
             "persist_directory": self.persist_directory,
         }
 
+        self._use_handler_storage = False
+
         self.connect()
 
     def validate_connection_parameters(self, name, **kwargs):
@@ -79,11 +82,15 @@ class ChromaDBHandler(VectorStoreHandler):
 
         config = ChromaHandlerConfig(**_config)
 
-        if config.persist_directory and not self.handler_storage.is_temporal:
-            # get full persistence directory from handler storage
-            self.persist_directory = self.handler_storage.folder_get(
-                config.persist_directory
-            )
+        if config.persist_directory:
+            if os.path.isabs(config.persist_directory):
+                self.persist_directory = config.persist_directory
+            elif not self.handler_storage.is_temporal:
+                # get full persistence directory from handler storage
+                self.persist_directory = self.handler_storage.folder_get(
+                    config.persist_directory
+                )
+                self._use_handler_storage = True
 
         return config
 
@@ -105,7 +112,7 @@ class ChromaDBHandler(VectorStoreHandler):
 
     def _sync(self):
         """Sync the database to disk if using persistent storage"""
-        if self.persist_directory:
+        if self.persist_directory and self._use_handler_storage:
             self.handler_storage.folder_sync(self.persist_directory)
 
     def __del__(self):
@@ -162,6 +169,8 @@ class ChromaDBHandler(VectorStoreHandler):
             FilterOperator.LESS_THAN_OR_EQUAL: "$lte",
             FilterOperator.GREATER_THAN: "$gt",
             FilterOperator.GREATER_THAN_OR_EQUAL: "$gte",
+            FilterOperator.IN: "$in",
+            FilterOperator.NOT_IN: "$nin",
         }
 
         if operator not in mapping:
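
The new IN / NOT IN entries above translate MindsDB filter operators into ChromaDB's native where-filter operators. A minimal sketch of the resulting filter, assuming a metadata field named "category" (the field name is illustrative, not taken from this diff):

    # FilterOperator.IN over column "category" with values ["a", "b"]
    where_filter = {"category": {"$in": ["a", "b"]}}
    # FilterOperator.NOT_IN maps to the $nin form
    where_filter = {"category": {"$nin": ["a", "b"]}}
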
@@ -308,7 +317,7 @@ class ChromaDBHandler(VectorStoreHandler):
         }
 
         if columns is not None:
-            payload = {column: payload[column] for column in columns}
+            payload = {column: payload[column] for column in columns if column != TableField.DISTANCE.value}
 
         # always include distance
         distance_filter = None
@@ -316,10 +325,11 @@ class ChromaDBHandler(VectorStoreHandler):
         if distances is not None:
             payload[distance_col] = distances
 
-        for cond in conditions:
-            if cond.column == distance_col:
-                distance_filter = cond
-                break
+        if conditions is not None:
+            for cond in conditions:
+                if cond.column == distance_col:
+                    distance_filter = cond
+                    break
 
         df = pd.DataFrame(payload)
         if distance_filter is not None:
@@ -413,8 +423,8 @@ class ChromaDBHandler(VectorStoreHandler):
             collection.upsert(
                 ids=data_dict[TableField.ID.value],
                 documents=data_dict[TableField.CONTENT.value],
-                embeddings=data_dict.get(TableField.EMBEDDINGS.value),
-                metadatas=data_dict.get(TableField.METADATA.value)
+                embeddings=data_dict.get(TableField.EMBEDDINGS.value, None),
+                metadatas=data_dict.get(TableField.METADATA.value, None)
             )
             self._sync()
         except Exception as e:

mindsdb/integrations/handlers/mssql_handler/mssql_handler.py

@@ -177,7 +177,7 @@ class SqlServerHandler(DatabaseHandler):
                        )
                    )
                else:
-                    response = Response(RESPONSE_TYPE.OK)
+                    response = Response(RESPONSE_TYPE.OK, affected_rows=cur.rowcount)
                connection.commit()
            except Exception as e:
                logger.error(f'Error running query: {query} on {self.database}, {e}!')

mindsdb/integrations/handlers/mysql_handler/mysql_handler.py

@@ -178,10 +178,11 @@ class MySQLHandler(DatabaseHandler):
                        pd.DataFrame(
                            result,
                            columns=[x[0] for x in cur.description]
-                        )
+                        ),
+                        affected_rows=cur.rowcount
                    )
                else:
-                    response = Response(RESPONSE_TYPE.OK)
+                    response = Response(RESPONSE_TYPE.OK, affected_rows=cur.rowcount)
            except mysql.connector.Error as e:
                logger.error(f'Error running query: {query} on {self.connection_data["database"]}!')
                response = Response(

mindsdb/integrations/handlers/oracle_handler/oracle_handler.py

@@ -205,8 +205,10 @@ class OracleHandler(DatabaseHandler):
        with connection.cursor() as cur:
            try:
                cur.execute(query)
-                result = cur.fetchall()
-                if result:
+                if cur.description is None:
+                    response = Response(RESPONSE_TYPE.OK, affected_rows=cur.rowcount)
+                else:
+                    result = cur.fetchall()
                    response = Response(
                        RESPONSE_TYPE.TABLE,
                        data_frame=pd.DataFrame(
@@ -214,8 +216,6 @@ class OracleHandler(DatabaseHandler):
                            columns=[row[0] for row in cur.description],
                        ),
                    )
-                else:
-                    response = Response(RESPONSE_TYPE.OK)
 
                connection.commit()
            except DatabaseError as database_error:

mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py

@@ -149,7 +149,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
        for key, value in filter_conditions.items():
            if key == "embeddings":
                continue
-            if value['op'].lower() == 'in':
+            if value['op'].lower() in ('in', 'not in'):
                values = list(repr(i) for i in value['value'])
                value['value'] = '({})'.format(', '.join(values))
            else:
@@ -165,9 +165,9 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
 
    @staticmethod
    def _construct_full_after_from_clause(
+        where_clause: str,
        offset_clause: str,
        limit_clause: str,
-        where_clause: str,
    ) -> str:
 
        return f"{where_clause} {offset_clause} {limit_clause}"
@@ -200,10 +200,20 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
            where_clause, offset_clause, limit_clause
        )
 
-        if columns is None:
-            targets = '*'
+        # Handle distance column specially since it's calculated, not stored
+        modified_columns = []
+        has_distance = False
+        if columns is not None:
+            for col in columns:
+                if col == TableField.DISTANCE.value:
+                    has_distance = True
+                else:
+                    modified_columns.append(col)
        else:
-            targets = ', '.join(columns)
+            modified_columns = ['id', 'content', 'embeddings', 'metadata']
+            has_distance = True
+
+        targets = ', '.join(modified_columns)
 
 
        if filter_conditions:
@@ -227,6 +237,10 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
            # Use cosine similarity for dense vectors
            distance_op = "<=>"
 
+            # Calculate distance as part of the query if needed
+            if has_distance:
+                targets = f"{targets}, (embeddings {distance_op} '{search_vector}') as distance"
+
            return f"SELECT {targets} FROM {table_name} ORDER BY embeddings {distance_op} '{search_vector}' ASC {after_from_clause}"
 
        else:
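
Taken together, the pgvector changes above compute the distance column inside the SELECT list instead of expecting a stored column. A rough sketch of the SQL the handler now emits, assuming an illustrative table, vector, and limit (none of these identifiers come from the diff):

    # Illustrative only; real identifiers are taken from the incoming query.
    sql = (
        "SELECT id, content, (embeddings <=> '[0.1, 0.2]') as distance "
        "FROM my_table ORDER BY embeddings <=> '[0.1, 0.2]' ASC  LIMIT 5"
    )
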

mindsdb/integrations/handlers/postgres_handler/postgres_handler.py

@@ -228,7 +228,7 @@ class PostgresHandler(DatabaseHandler):
                else:
                    cur.execute(query)
                if cur.pgresult is None or ExecStatus(cur.pgresult.status) == ExecStatus.COMMAND_OK:
-                    response = Response(RESPONSE_TYPE.OK)
+                    response = Response(RESPONSE_TYPE.OK, affected_rows=cur.rowcount)
                else:
                    result = cur.fetchall()
                    df = DataFrame(
@@ -238,7 +238,8 @@ class PostgresHandler(DatabaseHandler):
                    self._cast_dtypes(df, cur.description)
                    response = Response(
                        RESPONSE_TYPE.TABLE,
-                        df
+                        data_frame=df,
+                        affected_rows=cur.rowcount
                    )
                connection.commit()
            except Exception as e:
@@ -255,15 +256,16 @@ class PostgresHandler(DatabaseHandler):
 
        return response
 
-    def insert(self, table_name: str, df: pd.DataFrame):
+    def insert(self, table_name: str, df: pd.DataFrame) -> Response:
        need_to_close = not self.is_connected
 
        connection = self.connect()
 
        columns = [f'"{c}"' for c in df.columns]
+        rowcount = None
        with connection.cursor() as cur:
            try:
-                with cur.copy(f'copy "{table_name}" ({",".join(columns)}) from STDIN WITH CSV') as copy:
+                with cur.copy(f'copy "{table_name}" ({",".join(columns)}) from STDIN WITH CSV') as copy:
                    df.to_csv(copy, index=False, header=False)
 
                connection.commit()
@@ -271,10 +273,13 @@ class PostgresHandler(DatabaseHandler):
                logger.error(f'Error running insert to {table_name} on {self.database}, {e}!')
                connection.rollback()
                raise e
+            rowcount = cur.rowcount
 
        if need_to_close:
            self.disconnect()
 
+        return Response(RESPONSE_TYPE.OK, affected_rows=rowcount)
+
    @profiler.profile()
    def query(self, query: ASTNode) -> Response:
        """

mindsdb/integrations/handlers/redshift_handler/redshift_handler.py

@@ -52,7 +52,7 @@ class RedshiftHandler(PostgresHandler):
        with connection.cursor() as cur:
            try:
                cur.executemany(query, df.values.tolist())
-                response = Response(RESPONSE_TYPE.OK)
+                response = Response(RESPONSE_TYPE.OK, affected_rows=cur.rowcount)
 
                connection.commit()
            except Exception as e:

mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py

@@ -230,18 +230,25 @@ class SnowflakeHandler(DatabaseHandler):
                    # Fallback for CREATE/DELETE/UPDATE. These commands returns table with single column,
                    # but it cannot be retrieved as pandas DataFrame.
                    result = cur.fetchall()
-                    if result:
-                        response = Response(
-                            RESPONSE_TYPE.TABLE,
-                            DataFrame(
-                                result,
-                                columns=[x[0] for x in cur.description]
+                    match result:
+                        case (
+                            [{'number of rows inserted': affected_rows}]
+                            | [{'number of rows deleted': affected_rows}]
+                            | [{'number of rows updated': affected_rows, 'number of multi-joined rows updated': _}]
+                        ):
+                            response = Response(RESPONSE_TYPE.OK, affected_rows=affected_rows)
+                        case list():
+                            response = Response(
+                                RESPONSE_TYPE.TABLE,
+                                DataFrame(
+                                    result,
+                                    columns=[x[0] for x in cur.description]
+                                )
                            )
-                        )
-                    else:
-                        # Looks like SnowFlake always returns something in response, so this is suspicious
-                        logger.warning('Snowflake did not return any data in response.')
-                        response = Response(RESPONSE_TYPE.OK)
+                        case _:
+                            # Looks like SnowFlake always returns something in response, so this is suspicious
+                            logger.warning('Snowflake did not return any data in response.')
+                            response = Response(RESPONSE_TYPE.OK)
            except Exception as e:
                logger.error(f"Error running query: {query} on {self.connection_data.get('database')}, {e}!")
                response = Response(
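
The new match statement relies on Python structural pattern matching (3.10+): Snowflake reports DML status as a single-row result whose column name names the operation, and the mapping pattern binds that count directly. A self-contained sketch of the same idea, with an illustrative result value:

    result = [{'number of rows inserted': 3}]
    match result:
        case [{'number of rows inserted': affected_rows}] | [{'number of rows deleted': affected_rows}]:
            print(affected_rows)  # -> 3, surfaced as Response(RESPONSE_TYPE.OK, affected_rows=...)
        case list():
            print("treat as a regular table result")
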

mindsdb/integrations/libs/ml_handler_process/learn_process.py

@@ -78,8 +78,7 @@ def learn_process(data_integration_ref: dict, problem_definition: dict, fetch_da
        query_ast = parse_sql(fetch_data_query)
        sqlquery = SQLQuery(query_ast, session=sql_session)
 
-        result = sqlquery.fetch(view='dataframe')
-        training_data_df = result['result']
+        training_data_df = sqlquery.fetched_data.to_df()
 
        training_data_columns_count, training_data_rows_count = 0, 0
        if training_data_df is not None:

mindsdb/integrations/libs/response.py

@@ -1,3 +1,4 @@
+from typing import Optional
 from pandas import DataFrame
 
 from mindsdb.utilities import log
@@ -8,13 +9,16 @@ from mindsdb_sql_parser.ast import ASTNode
 logger = log.getLogger(__name__)
 
 class HandlerResponse:
-    def __init__(self, resp_type: RESPONSE_TYPE, data_frame: DataFrame = None,
-                 query: ASTNode = 0, error_code: int = 0, error_message: str = None) -> None:
+    def __init__(self, resp_type: RESPONSE_TYPE, data_frame: DataFrame = None, query: ASTNode = 0, error_code: int = 0,
+                 error_message: Optional[str] = None, affected_rows: Optional[int] = None) -> None:
        self.resp_type = resp_type
        self.query = query
        self.data_frame = data_frame
        self.error_code = error_code
        self.error_message = error_message
+        self.affected_rows = affected_rows
+        if isinstance(self.affected_rows, int) is False or self.affected_rows < 0:
+            self.affected_rows = 0
 
    @property
    def type(self):
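
With this change the handler response carries an affected_rows count, and anything that is not a non-negative int is normalized to 0. A small usage sketch (values are illustrative):

    from mindsdb.integrations.libs.response import HandlerResponse, RESPONSE_TYPE

    ok = HandlerResponse(RESPONSE_TYPE.OK, affected_rows=5)
    print(ok.affected_rows)       # 5
    unknown = HandlerResponse(RESPONSE_TYPE.OK)  # affected_rows defaults to None
    print(unknown.affected_rows)  # normalized to 0
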
@@ -35,13 +39,14 @@ class HandlerResponse:
                "error": self.error_message}
 
    def __repr__(self):
-        return "%s: resp_type=%s, query=%s, data_frame=%s, err_code=%s, error=%s" % (
+        return "%s: resp_type=%s, query=%s, data_frame=%s, err_code=%s, error=%s, affected_rows=%s" % (
            self.__class__.__name__,
            self.resp_type,
            self.query,
            self.data_frame,
            self.error_code,
-            self.error_message
+            self.error_message,
+            self.affected_rows
        )
 
class HandlerStatusResponse:

mindsdb/integrations/libs/vectordatabase_handler.py

@@ -20,7 +20,7 @@ from mindsdb_sql_parser.ast.base import ASTNode
 
 from mindsdb.integrations.libs.response import RESPONSE_TYPE, HandlerResponse
 from mindsdb.utilities import log
-from mindsdb.integrations.utilities.sql_utils import conditions_to_filter, FilterCondition, FilterOperator
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
 
 from mindsdb.integrations.utilities.query_traversal import query_traversal
 from .base import BaseHandler
@@ -39,6 +39,7 @@ class TableField(Enum):
    METADATA = "metadata"
    SEARCH_VECTOR = "search_vector"
    DISTANCE = "distance"
+    RELEVANCE = "relevance"
 
 
class DistanceFunction(Enum):
@@ -69,6 +70,10 @@ class VectorStoreHandler(BaseHandler):
            "name": TableField.METADATA.value,
            "data_type": "json",
        },
+        {
+            "name": TableField.DISTANCE.value,
+            "data_type": "float",
+        },
    ]
 
    def validate_connection_parameters(self, name, **kwargs):
@@ -231,7 +236,7 @@ class VectorStoreHandler(BaseHandler):
 
        return self.do_upsert(table_name, pd.DataFrame(data))
 
-    def _dispatch_update(self, query: Update):
+    def dispatch_update(self, query: Update, conditions: List[FilterCondition] = None):
        """
        Dispatch update query to the appropriate method.
        """
@@ -250,8 +255,15 @@ class VectorStoreHandler(BaseHandler):
                pass
            row[k] = v
 
-        filters = conditions_to_filter(query.where)
-        row.update(filters)
+        if conditions is None:
+            where_statement = query.where
+            conditions = self.extract_conditions(where_statement)
+
+        for condition in conditions:
+            if condition.op != FilterOperator.EQUAL:
+                raise NotImplementedError
+
+            row[condition.column] = condition.value
 
        # checks
        if TableField.EMBEDDINGS.value not in row:
@@ -381,7 +393,7 @@ class VectorStoreHandler(BaseHandler):
            CreateTable: self._dispatch_create_table,
            DropTables: self._dispatch_drop_table,
            Insert: self._dispatch_insert,
-            Update: self._dispatch_update,
+            Update: self.dispatch_update,
            Delete: self.dispatch_delete,
            Select: self.dispatch_select,
        }

mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py

@@ -54,7 +54,7 @@ class LLMReranker(BaseDocumentCompressor):
            max_retries=2  # Client-level retries
        )
 
-    async def search_relevancy(self, query: str, document: str) -> Any:
+    async def search_relevancy(self, query: str, document: str, custom_event: bool = True) -> Any:
        await self._init_client()
 
        async with self._semaphore:
@@ -82,7 +82,8 @@ class LLMReranker(BaseDocumentCompressor):
                    }
 
                    # Stream reranking update.
-                    dispatch_custom_event("rerank", rerank_data)
+                    if custom_event:
+                        dispatch_custom_event("rerank", rerank_data)
                    return rerank_data
 
                except Exception as e:
@@ -93,7 +94,7 @@ class LLMReranker(BaseDocumentCompressor):
                    retry_delay = self.retry_delay * (2 ** attempt) + random.uniform(0, 0.1)
                    await asyncio.sleep(retry_delay)
 
-    async def _rank(self, query_document_pairs: List[Tuple[str, str]]) -> List[Tuple[str, float]]:
+    async def _rank(self, query_document_pairs: List[Tuple[str, str]], custom_event: bool = True) -> List[Tuple[str, float]]:
        ranked_results = []
 
        # Process in larger batches for better throughput
@@ -102,7 +103,7 @@ class LLMReranker(BaseDocumentCompressor):
            batch = query_document_pairs[i:i + batch_size]
            try:
                results = await asyncio.gather(
-                    *[self.search_relevancy(query=query, document=document) for (query, document) in batch],
+                    *[self.search_relevancy(query=query, document=document, custom_event=custom_event) for (query, document) in batch],
                    return_exceptions=True
                )
@@ -227,16 +228,7 @@ class LLMReranker(BaseDocumentCompressor):
            "remove_irrelevant": self.remove_irrelevant,
        }
 
-    def get_scores(self, query: str, documents: list[str], disable_events: bool = True):
-        """
-        Get relevance scores for documents given a query.
-        Args:
-            query: The query text
-            documents: List of document texts to score
-            disable_events: Whether to disable event dispatching (default True)
-        Returns:
-            List of relevance scores
-        """
+    def get_scores(self, query: str, documents: list[str], custom_event: bool = False):
        query_document_pairs = [(query, doc) for doc in documents]
        # Create event loop and run async code
        import asyncio
@@ -246,89 +238,7 @@ class LLMReranker(BaseDocumentCompressor):
            # If no running loop exists, create a new one
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
-        # If disable_events is True, we need to modify the _rank function to not use dispatch_custom_event
-        if disable_events:
-            # Create a wrapper function that doesn't dispatch events
-            async def _rank_without_events(query_document_pairs):
-                ranked_results = []
-                # Process in larger batches for better throughput
-                batch_size = min(self.max_concurrent_requests * 2, len(query_document_pairs))
-                for i in range(0, len(query_document_pairs), batch_size):
-                    batch = query_document_pairs[i:i + batch_size]
-                    try:
-                        # Define a no-events version of search_relevancy inside this closure
-                        async def search_relevancy_no_events(query, document):
-                            await self._init_client()
-                            async with self._semaphore:
-                                for attempt in range(self.max_retries):
-                                    try:
-                                        response = await self.client.chat.completions.create(
-                                            model=self.model,
-                                            messages=[
-                                                {"role": "system", "content": "Rate the relevance of the document to the query. Respond with 'yes' or 'no'."},
-                                                {"role": "user", "content": f"Query: {query}\nDocument: {document}\nIs this document relevant?"}
-                                            ],
-                                            temperature=self.temperature,
-                                            n=1,
-                                            logprobs=True,
-                                            max_tokens=1
-                                        )
-                                        # Extract response and confidence score
-                                        answer = response.choices[0].message.content
-                                        logprob = response.choices[0].logprobs.content[0].logprob
-                                        # No event dispatch here
-                                        return {"document": document, "answer": answer, "logprob": logprob}
-                                    except Exception as e:
-                                        if attempt == self.max_retries - 1:
-                                            log.error(f"Failed after {self.max_retries} attempts: {str(e)}")
-                                            raise
-                                        # Exponential backoff with jitter
-                                        retry_delay = self.retry_delay * (2 ** attempt) + random.uniform(0, 0.1)
-                                        await asyncio.sleep(retry_delay)
-                        # Use our no-events version for this batch
-                        results = await asyncio.gather(
-                            *[search_relevancy_no_events(query=query, document=document) for (query, document) in batch],
-                            return_exceptions=True
-                        )
-                        for idx, result in enumerate(results):
-                            if isinstance(result, Exception):
-                                log.error(f"Error processing document {i+idx}: {str(result)}")
-                                ranked_results.append((batch[idx][1], 0.0))
-                                continue
-                            answer = result["answer"]
-                            logprob = result["logprob"]
-                            prob = math.exp(logprob)
-                            # Convert answer to score using the model's confidence
-                            if answer.lower().strip() == "yes":
-                                score = prob  # If yes, use the model's confidence
-                            elif answer.lower().strip() == "no":
-                                score = 1 - prob  # If no, invert the confidence
-                            else:
-                                score = 0.5 * prob  # For unclear answers, reduce confidence
-                            ranked_results.append((batch[idx][1], score))
-                            # Check if we should stop early
-                            try:
-                                high_scoring_docs = [r for r in ranked_results if r[1] >= self.filtering_threshold]
-                                can_stop_early = (
-                                    self.early_stop  # Early stopping is enabled
-                                    and self.num_docs_to_keep  # We have a target number of docs
-                                    and len(high_scoring_docs) >= self.num_docs_to_keep  # Found enough good docs
-                                    and score >= self.early_stop_threshold  # Current doc is good enough
-                                )
-                                if can_stop_early:
-                                    log.info(f"Early stopping after finding {self.num_docs_to_keep} documents with high confidence")
-                                    return ranked_results
-                            except Exception as e:
-                                # Don't let early stopping errors stop the whole process
-                                log.warning(f"Error in early stopping check: {str(e)}")
-                    except Exception as e:
-                        log.error(f"Batch processing error: {str(e)}")
-                        continue
-                return ranked_results
-            # Use our no-events version
-            documents_and_scores = loop.run_until_complete(_rank_without_events(query_document_pairs))
-        else:
-            # Use the original _rank method
-            documents_and_scores = loop.run_until_complete(self._rank(query_document_pairs))
+
+        documents_and_scores = loop.run_until_complete(self._rank(query_document_pairs, custom_event=custom_event))
        scores = [score for _, score in documents_and_scores]
        return scores
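
After this cleanup, get_scores simply delegates to _rank with the new custom_event flag instead of carrying its own duplicated no-events implementation. A minimal usage sketch; the constructor arguments are assumptions, since they are not part of this diff:

    reranker = LLMReranker()  # real construction may require model/config kwargs
    scores = reranker.get_scores("what is mindsdb?", ["doc one", "doc two"])
    # custom_event defaults to False, so no "rerank" events are dispatched while scoring
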

mindsdb/interfaces/database/log.py

@@ -1,21 +1,21 @@
+from typing import List
 from copy import deepcopy
 from abc import ABC, abstractmethod
-from typing import List, Union, Tuple
 from collections import OrderedDict
 
 import pandas as pd
-
 from mindsdb_sql_parser import parse_sql
 from mindsdb_sql_parser.ast import Select, Identifier, Star, BinaryOperation, Constant, Join, Function
 from mindsdb_sql_parser.utils import JoinType
+
 from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
 from mindsdb.integrations.utilities.query_traversal import query_traversal
-
 from mindsdb.utilities.functions import resolve_table_identifier
 from mindsdb.api.executor.utilities.sql import get_query_tables
 from mindsdb.utilities.exception import EntityNotExistsError
 import mindsdb.interfaces.storage.db as db
 from mindsdb.utilities.context import context as ctx
+from mindsdb.api.executor.datahub.classes.response import DataHubResponse
 from mindsdb.api.executor.datahub.classes.tables_row import (
    TABLES_ROW_TYPE,
    TablesRow,
@@ -223,8 +223,7 @@ class LogDBController:
            for table_name in self._tables.keys()
        ]
 
-    def query(self, query: Select = None, native_query: str = None,
-              session=None, return_as: str = 'split') -> Union[pd.DataFrame, Tuple[pd.DataFrame, list]]:
+    def query(self, query: Select = None, native_query: str = None, session=None) -> DataHubResponse:
        if native_query is not None:
            if query is not None:
                raise Exception("'query' and 'native_query' arguments can not be used together")
@@ -286,12 +285,12 @@ class LogDBController:
            df[df_column_name] = df[df_column_name].astype(column_type)
        # endregion
 
-        if return_as != 'split':
-            return df
-
        columns_info = [{
            'name': k,
            'type': v
        } for k, v in df.dtypes.items()]
 
-        return df, columns_info
+        return DataHubResponse(
+            data_frame=df,
+            columns=columns_info
+        )

mindsdb/interfaces/database/projects.py

@@ -137,14 +137,10 @@ class Project:
                view_meta['query_ast'],
                session=session
            )
-            result = sqlquery.fetch(view='dataframe')
-
+            df = sqlquery.fetched_data.to_df()
        finally:
            query_context_controller.release_context('view', view_meta['id'])
 
-        if result['success'] is False:
-            raise Exception(f"Cant execute view query: {view_meta['query_ast']}")
-        df = result['result']
        # remove duplicated columns
        df = df.loc[:, ~df.columns.duplicated()]