MindsDB: mindsdb-25.6.4.0-py3-none-any.whl → mindsdb-25.7.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. (The original page links to more details here; the link target was not preserved in this extract.)

Files changed (61)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +53 -94
  3. mindsdb/api/a2a/agent.py +30 -206
  4. mindsdb/api/a2a/common/server/server.py +26 -27
  5. mindsdb/api/a2a/task_manager.py +93 -227
  6. mindsdb/api/a2a/utils.py +21 -0
  7. mindsdb/api/executor/command_executor.py +8 -6
  8. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  9. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
  10. mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
  11. mindsdb/api/executor/planner/query_prepare.py +68 -87
  12. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
  13. mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
  14. mindsdb/api/executor/utilities/sql.py +97 -21
  15. mindsdb/api/http/namespaces/agents.py +126 -201
  16. mindsdb/api/http/namespaces/config.py +12 -1
  17. mindsdb/api/http/namespaces/file.py +49 -24
  18. mindsdb/api/mcp/start.py +45 -31
  19. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
  20. mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
  21. mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
  22. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
  23. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
  24. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
  25. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  26. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
  27. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
  28. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
  29. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
  30. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
  31. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  32. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  33. mindsdb/integrations/libs/keyword_search_base.py +41 -0
  34. mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
  35. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
  36. mindsdb/integrations/utilities/sql_utils.py +11 -0
  37. mindsdb/interfaces/agents/agents_controller.py +29 -9
  38. mindsdb/interfaces/agents/langchain_agent.py +7 -5
  39. mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
  40. mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
  41. mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
  42. mindsdb/interfaces/database/projects.py +1 -3
  43. mindsdb/interfaces/functions/controller.py +54 -64
  44. mindsdb/interfaces/functions/to_markdown.py +47 -14
  45. mindsdb/interfaces/knowledge_base/controller.py +228 -110
  46. mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
  47. mindsdb/interfaces/knowledge_base/executor.py +346 -0
  48. mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
  49. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
  50. mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
  51. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
  52. mindsdb/interfaces/skills/sql_agent.py +181 -130
  53. mindsdb/interfaces/storage/db.py +9 -7
  54. mindsdb/utilities/config.py +58 -40
  55. mindsdb/utilities/exception.py +58 -7
  56. mindsdb/utilities/security.py +54 -11
  57. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
  58. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
  59. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
  60. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
  61. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  import os
2
2
  import json
3
- from typing import Dict, List, Union, Literal
3
+ from typing import Dict, List, Literal, Tuple
4
4
  from urllib.parse import urlparse
5
5
 
6
6
  import pandas as pd
@@ -16,8 +16,10 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
16
16
  FilterCondition,
17
17
  VectorStoreHandler,
18
18
  DistanceFunction,
19
- TableField
19
+ TableField,
20
20
  )
21
+ from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
22
+ from mindsdb.integrations.utilities.sql_utils import KeywordSearchArgs
21
23
  from mindsdb.utilities import log
22
24
  from mindsdb.utilities.profiler import profiler
23
25
  from mindsdb.utilities.context import context as ctx
@@ -26,19 +28,18 @@ logger = log.getLogger(__name__)
26
28
 
27
29
 
28
30
  # todo Issue #7316 add support for different indexes and search algorithms e.g. cosine similarity or L2 norm
29
- class PgVectorHandler(PostgresHandler, VectorStoreHandler):
31
+ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
30
32
  """This handler handles connection and execution of the PostgreSQL with pgvector extension statements."""
31
33
 
32
34
  name = "pgvector"
33
35
 
34
36
  def __init__(self, name: str, **kwargs):
35
-
36
37
  super().__init__(name=name, **kwargs)
37
38
  self._is_shared_db = False
38
39
  self._is_vector_registered = False
39
40
  # we get these from the connection args on PostgresHandler parent
40
- self._is_sparse = self.connection_args.get('is_sparse', False)
41
- self._vector_size = self.connection_args.get('vector_size', None)
41
+ self._is_sparse = self.connection_args.get("is_sparse", False)
42
+ self._vector_size = self.connection_args.get("vector_size", None)
42
43
 
43
44
  if self._is_sparse:
44
45
  if not self._vector_size:
@@ -48,20 +49,20 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
48
49
  distance_op = "<#>"
49
50
 
50
51
  else:
51
- distance_op = '<=>'
52
- if 'distance' in self.connection_args:
52
+ distance_op = "<=>"
53
+ if "distance" in self.connection_args:
53
54
  distance_ops = {
54
- 'l1': '<+>',
55
- 'l2': '<->',
56
- 'ip': '<#>', # inner product
57
- 'cosine': '<=>',
58
- 'hamming': '<~>',
59
- 'jaccard': '<%>'
55
+ "l1": "<+>",
56
+ "l2": "<->",
57
+ "ip": "<#>", # inner product
58
+ "cosine": "<=>",
59
+ "hamming": "<~>",
60
+ "jaccard": "<%>",
60
61
  }
61
62
 
62
- distance_op = distance_ops.get(self.connection_args['distance'])
63
+ distance_op = distance_ops.get(self.connection_args["distance"])
63
64
  if distance_op is None:
64
- raise ValueError(f'Wrong distance type. Allowed options are {list(distance_ops.keys())}')
65
+ raise ValueError(f"Wrong distance type. Allowed options are {list(distance_ops.keys())}")
65
66
 
66
67
  self.distance_op = distance_op
67
68
  self.connect()
@@ -72,26 +73,26 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
72
73
 
73
74
  """
74
75
  distance_ops_to_metric_type_map = {
75
- '<->': 'vector_l2_ops',
76
- '<#>': 'vector_ip_ops',
77
- '<=>': 'vector_cosine_ops',
78
- '<+>': 'vector_l1_ops',
79
- '<~>': 'bit_hamming_ops',
80
- '<%>': 'bit_jaccard_ops'
76
+ "<->": "vector_l2_ops",
77
+ "<#>": "vector_ip_ops",
78
+ "<=>": "vector_cosine_ops",
79
+ "<+>": "vector_l1_ops",
80
+ "<~>": "bit_hamming_ops",
81
+ "<%>": "bit_jaccard_ops",
81
82
  }
82
- return distance_ops_to_metric_type_map.get(self.distance_op, 'vector_cosine_ops')
83
+ return distance_ops_to_metric_type_map.get(self.distance_op, "vector_cosine_ops")
83
84
 
84
85
  def _make_connection_args(self):
85
- cloud_pgvector_url = os.environ.get('KB_PGVECTOR_URL')
86
+ cloud_pgvector_url = os.environ.get("KB_PGVECTOR_URL")
86
87
  # if no connection args and shared pg vector defined - use it
87
88
  if len(self.connection_args) == 0 and cloud_pgvector_url is not None:
88
89
  result = urlparse(cloud_pgvector_url)
89
90
  self.connection_args = {
90
- 'host': result.hostname,
91
- 'port': result.port,
92
- 'user': result.username,
93
- 'password': result.password,
94
- 'database': result.path[1:]
91
+ "host": result.hostname,
92
+ "port": result.port,
93
+ "user": result.username,
94
+ "password": result.password,
95
+ "database": result.path[1:],
95
96
  }
96
97
  self._is_shared_db = True
97
98
  return super()._make_connection_args()
@@ -132,9 +133,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
132
133
 
133
134
  except psycopg.Error as e:
134
135
  self.connection.rollback()
135
- logger.error(
136
- f"Error loading pg_vector extension, ensure you have installed it before running, {e}!"
137
- )
136
+ logger.error(f"Error loading pg_vector extension, ensure you have installed it before running, {e}!")
138
137
  raise
139
138
 
140
139
  # register vector type with psycopg2 connection
@@ -143,19 +142,33 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
143
142
 
144
143
  return self.connection
145
144
 
145
+ def add_full_text_index(self, table_name: str, column_name: str) -> Response:
146
+ """
147
+ Add a full text index to the specified column of the table.
148
+ Args:
149
+ table_name (str): Name of the table to add the index to.
150
+ column_name (str): Name of the column to add the index to.
151
+ Returns:
152
+ Response: Response object indicating success or failure.
153
+ """
154
+ table_name = self._check_table(table_name)
155
+ query = f"CREATE INDEX IF NOT EXISTS {table_name}_{column_name}_fts_idx ON {table_name} USING gin(to_tsvector('english', {column_name}))"
156
+ self.raw_query(query)
157
+ return Response(RESPONSE_TYPE.OK)
158
+
146
159
  @staticmethod
147
- def _translate_conditions(conditions: List[FilterCondition]) -> Union[dict, None]:
160
+ def _translate_conditions(conditions: List[FilterCondition]) -> Tuple[List[dict], dict]:
148
161
  """
149
162
  Translate filter conditions to a dictionary
150
163
  """
151
164
 
152
165
  if conditions is None:
153
- return {}
166
+ conditions = []
154
167
 
155
- filter_conditions = {}
168
+ filter_conditions = []
169
+ embedding_condition = None
156
170
 
157
171
  for condition in conditions:
158
-
159
172
  parts = condition.column.split(".")
160
173
  key = parts[0]
161
174
  # converts 'col.el1.el2' to col->'el1'->>'el2'
@@ -167,12 +180,25 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
167
180
  # last element
168
181
  key += f" ->> '{parts[-1]}'"
169
182
 
170
- filter_conditions[key] = {
183
+ type_cast = None
184
+ if isinstance(condition.value, int):
185
+ type_cast = "int"
186
+ elif isinstance(condition.value, float):
187
+ type_cast = "float"
188
+ if type_cast is not None:
189
+ key = f"({key})::{type_cast}"
190
+
191
+ item = {
192
+ "name": key,
171
193
  "op": condition.op.value,
172
194
  "value": condition.value,
173
195
  }
196
+ if key == "embeddings":
197
+ embedding_condition = item
198
+ else:
199
+ filter_conditions.append(item)
174
200
 
175
- return filter_conditions
201
+ return filter_conditions, embedding_condition
176
202
 
177
203
  @staticmethod
178
204
  def _construct_where_clause(filter_conditions=None):
@@ -184,15 +210,18 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
184
210
 
185
211
  where_clauses = []
186
212
 
187
- for key, value in filter_conditions.items():
188
- if key == "embeddings":
189
- continue
190
- if value['op'].lower() in ('in', 'not in'):
191
- values = list(repr(i) for i in value['value'])
192
- value['value'] = '({})'.format(', '.join(values))
213
+ for item in filter_conditions:
214
+ key = item["name"]
215
+
216
+ if item["op"].lower() in ("in", "not in"):
217
+ values = list(repr(i) for i in item["value"])
218
+ item["value"] = "({})".format(", ".join(values))
193
219
  else:
194
- value['value'] = repr(value['value'])
195
- where_clauses.append(f'{key} {value["op"]} {value["value"]}')
220
+ if item["value"] is None:
221
+ item["value"] = "null"
222
+ else:
223
+ item["value"] = repr(item["value"])
224
+ where_clauses.append(f"{key} {item['op']} {item['value']}")
196
225
 
197
226
  if len(where_clauses) > 1:
198
227
  return f"WHERE {' AND '.join(where_clauses)}"
@@ -201,15 +230,78 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
201
230
  else:
202
231
  return ""
203
232
 
233
+ @staticmethod
234
+ def _construct_where_clause_with_keywords(filter_conditions=None, keyword_query=None, content_column_name=None):
235
+ if not keyword_query or not content_column_name:
236
+ return PgVectorHandler._construct_where_clause(filter_conditions)
237
+
238
+ keyword_query_condition = (
239
+ f"""to_tsvector('english', {content_column_name}) @@ websearch_to_tsquery('english', '{keyword_query}')"""
240
+ )
241
+ if filter_conditions is None:
242
+ return ""
243
+
244
+ where_clauses = []
245
+
246
+ for item in filter_conditions:
247
+ key = item["name"]
248
+
249
+ if item["op"].lower() in ("in", "not in"):
250
+ values = list(repr(i) for i in item["value"])
251
+ item["value"] = "({})".format(", ".join(values))
252
+ else:
253
+ if item["value"] is None:
254
+ item["value"] = "null"
255
+ else:
256
+ item["value"] = repr(item["value"])
257
+ where_clauses.append(f"{key} {item['op']} {item['value']}")
258
+
259
+ where_clauses.append(keyword_query_condition)
260
+ if len(where_clauses) > 1:
261
+ return f"WHERE {' AND '.join(where_clauses)}"
262
+ elif len(where_clauses) == 1:
263
+ return f"WHERE {where_clauses[0]}"
264
+ else:
265
+ return ""
266
+
204
267
  @staticmethod
205
268
  def _construct_full_after_from_clause(
206
269
  where_clause: str,
207
270
  offset_clause: str,
208
271
  limit_clause: str,
209
272
  ) -> str:
210
-
211
273
  return f"{where_clause} {offset_clause} {limit_clause}"
212
274
 
275
+ def _build_keyword_bm25_query(
276
+ self,
277
+ table_name: str,
278
+ query: str,
279
+ columns: List[str] = None,
280
+ content_column_name: str = "content",
281
+ conditions: List[FilterCondition] = None,
282
+ limit: int = None,
283
+ offset: int = None,
284
+ ):
285
+ if columns is None:
286
+ columns = ["id", "content", "metadata"]
287
+
288
+ filter_conditions, _ = self._translate_conditions(conditions)
289
+
290
+ # given filter conditions, construct where clause
291
+ where_clause = self._construct_where_clause_with_keywords(filter_conditions, query, content_column_name)
292
+
293
+ query = f"""
294
+ SELECT
295
+ {", ".join(columns)},
296
+ ts_rank_cd(to_tsvector('english', {content_column_name}), websearch_to_tsquery('english', '{query}')) as distance
297
+ FROM
298
+ {table_name}
299
+ {where_clause if where_clause else ""}
300
+ {f"LIMIT {limit}" if limit else ""}
301
+ {f"OFFSET {offset}" if offset else ""};"""
302
+
303
+ return query
304
+
213
305
  def _build_select_query(
214
306
  self,
215
307
  table_name: str,
@@ -225,10 +317,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
225
317
  offset_clause = f"OFFSET {offset}" if offset else ""
226
318
 
227
319
  # translate filter conditions to dictionary
228
- filter_conditions = self._translate_conditions(conditions)
229
-
230
- # check if search vector is in filter conditions
231
- embedding_search = filter_conditions.get("embeddings", None)
320
+ filter_conditions, embedding_search = self._translate_conditions(conditions)
232
321
 
233
322
  # given filter conditions, construct where clause
234
323
  where_clause = self._construct_where_clause(filter_conditions)
@@ -243,47 +332,41 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
243
332
  else:
244
333
  modified_columns.append(col)
245
334
  else:
246
- modified_columns = ['id', 'content', 'embeddings', 'metadata']
335
+ modified_columns = ["id", "content", "embeddings", "metadata"]
247
336
  has_distance = True
248
337
 
249
- targets = ', '.join(modified_columns)
250
-
251
- if filter_conditions:
338
+ targets = ", ".join(modified_columns)
252
339
 
253
- if embedding_search:
254
- search_vector = filter_conditions["embeddings"]["value"]
255
- filter_conditions.pop("embeddings")
340
+ if embedding_search:
341
+ search_vector = embedding_search["value"]
256
342
 
257
- if self._is_sparse:
258
- # Convert dict to sparse vector if needed
259
- if isinstance(search_vector, dict):
260
- from pgvector.utils import SparseVector
261
- embedding = SparseVector(search_vector, self._vector_size)
262
- search_vector = embedding.to_text()
263
- else:
264
- # Convert list to vector string if needed
265
- if isinstance(search_vector, list):
266
- search_vector = f"[{','.join(str(x) for x in search_vector)}]"
343
+ if self._is_sparse:
344
+ # Convert dict to sparse vector if needed
345
+ if isinstance(search_vector, dict):
346
+ from pgvector.utils import SparseVector
267
347
 
268
- # Calculate distance as part of the query if needed
269
- if has_distance:
270
- targets = f"{targets}, (embeddings {self.distance_op} '{search_vector}') as distance"
348
+ embedding = SparseVector(search_vector, self._vector_size)
349
+ search_vector = embedding.to_text()
350
+ else:
351
+ # Convert list to vector string if needed
352
+ if isinstance(search_vector, list):
353
+ search_vector = f"[{','.join(str(x) for x in search_vector)}]"
271
354
 
272
- return f"SELECT {targets} FROM {table_name} {where_clause} ORDER BY embeddings {self.distance_op} '{search_vector}' ASC {limit_clause} {offset_clause} "
355
+ # Calculate distance as part of the query if needed
356
+ if has_distance:
357
+ targets = f"{targets}, (embeddings {self.distance_op} '{search_vector}') as distance"
273
358
 
274
- else:
275
- # if filter conditions, return rows that satisfy the conditions
276
- return f"SELECT {targets} FROM {table_name} {where_clause} {limit_clause} {offset_clause}"
359
+ return f"SELECT {targets} FROM {table_name} {where_clause} ORDER BY embeddings {self.distance_op} '{search_vector}' ASC {limit_clause} {offset_clause} "
277
360
 
278
361
  else:
279
- # if no filter conditions, return all rows
280
- return f"SELECT {targets} FROM {table_name} {limit_clause} {offset_clause}"
362
+ # if filter conditions, return rows that satisfy the conditions
363
+ return f"SELECT {targets} FROM {table_name} {where_clause} {limit_clause} {offset_clause}"
281
364
 
282
365
  def _check_table(self, table_name: str):
283
366
  # Apply namespace for a user
284
367
  if self._is_shared_db:
285
- company_id = ctx.company_id or 'x'
286
- return f't_{company_id}_{table_name}'
368
+ company_id = ctx.company_id or "x"
369
+ return f"t_{company_id}_{table_name}"
287
370
  return table_name
288
371
 
289
372
  def select(
@@ -303,6 +386,33 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
303
386
  columns = ["id", "content", "embeddings", "metadata"]
304
387
 
305
388
  query = self._build_select_query(table_name, columns, conditions, limit, offset)
389
+
390
+ result = self.raw_query(query)
391
+
392
+ # ensure embeddings are returned as string so they can be parsed by mindsdb
393
+ if "embeddings" in columns:
394
+ result["embeddings"] = result["embeddings"].astype(str)
395
+
396
+ return result
397
+
398
+ def keyword_select(
399
+ self,
400
+ table_name: str,
401
+ columns: List[str] = None,
402
+ conditions: List[FilterCondition] = None,
403
+ offset: int = None,
404
+ limit: int = None,
405
+ keyword_search_args: KeywordSearchArgs = None,
406
+ ) -> pd.DataFrame:
407
+ table_name = self._check_table(table_name)
408
+
409
+ if columns is None:
410
+ columns = ["id", "content", "embeddings", "metadata"]
411
+ content_column_name = keyword_search_args.column
412
+ query = self._build_keyword_bm25_query(
413
+ table_name, keyword_search_args.query, columns, content_column_name, conditions, limit, offset
414
+ )
415
+
306
416
  result = self.raw_query(query)
307
417
 
308
418
  # ensure embeddings are returned as string so they can be parsed by mindsdb
@@ -318,9 +428,9 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
318
428
  query: str = None,
319
429
  metadata: Dict[str, str] = None,
320
430
  distance_function=DistanceFunction.COSINE_DISTANCE,
321
- **kwargs
431
+ **kwargs,
322
432
  ) -> pd.DataFrame:
323
- '''
433
+ """
324
434
  Executes a hybrid search, combining semantic search and one or both of keyword/metadata search.
325
435
 
326
436
  For insight on the query construction, see: https://docs.pgvecto.rs/use-case/hybrid-search.html#advanced-search-merge-the-results-of-full-text-search-and-vector-search.
@@ -340,23 +450,25 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
340
450
 
341
451
  Returns:
342
452
  df(pd.DataFrame): Hybrid search result, sorted by hybrid search rank
343
- '''
453
+ """
344
454
  if query is None and metadata is None:
345
- raise ValueError('Must provide at least one of: query for keyword search, or metadata filters. For only embeddings search, use normal search instead.')
346
-
347
- id_column_name = kwargs.get('id_column_name', 'id')
348
- content_column_name = kwargs.get('content_column_name', 'content')
349
- embeddings_column_name = kwargs.get('embeddings_column_name', 'embeddings')
350
- metadata_column_name = kwargs.get('metadata_column_name', 'metadata')
455
+ raise ValueError(
456
+ "Must provide at least one of: query for keyword search, or metadata filters. For only embeddings search, use normal search instead."
457
+ )
458
+
459
+ id_column_name = kwargs.get("id_column_name", "id")
460
+ content_column_name = kwargs.get("content_column_name", "content")
461
+ embeddings_column_name = kwargs.get("embeddings_column_name", "embeddings")
462
+ metadata_column_name = kwargs.get("metadata_column_name", "metadata")
351
463
  # Filter by given metadata for semantic search & full text search CTEs, if present.
352
- where_clause = ' WHERE '
464
+ where_clause = " WHERE "
353
465
  if metadata is None:
354
- where_clause = ''
466
+ where_clause = ""
355
467
  metadata = {}
356
468
  for i, (k, v) in enumerate(metadata.items()):
357
469
  where_clause += f"{metadata_column_name}->>'{k}' = '{v}'"
358
470
  if i < len(metadata.items()) - 1:
359
- where_clause += ' AND '
471
+ where_clause += " AND "
360
472
 
361
473
  # See https://docs.pgvecto.rs/use-case/hybrid-search.html#advanced-search-merge-the-results-of-full-text-search-and-vector-search.
362
474
  #
@@ -381,47 +493,51 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
381
493
  # Or, if we are only doing metadata search, we leave out the JOIN & full text search CTEs.
382
494
  #
383
495
  # We calculate the final "hybrid" rank by summing the reciprocals of the ranks from each individual CTE.
384
- semantic_search_cte = f'''WITH semantic_search AS (
496
+ semantic_search_cte = f"""WITH semantic_search AS (
385
497
  SELECT {id_column_name}, {content_column_name}, {embeddings_column_name},
386
498
  RANK () OVER (ORDER BY {embeddings_column_name} {distance_function.value} '{str(embeddings)}') AS rank
387
499
  FROM {table_name}{where_clause}
388
500
  ORDER BY {embeddings_column_name} {distance_function.value} '{str(embeddings)}'::vector
389
- )'''
501
+ )"""
390
502
 
391
- full_text_search_cte = ''
503
+ full_text_search_cte = ""
392
504
  if query is not None:
393
- ts_vector_clause = f"WHERE to_tsvector('english', {content_column_name}) @@ plainto_tsquery('english', '{query}')"
505
+ ts_vector_clause = (
506
+ f"WHERE to_tsvector('english', {content_column_name}) @@ plainto_tsquery('english', '{query}')"
507
+ )
394
508
  if metadata:
395
- ts_vector_clause = f"AND to_tsvector('english', {content_column_name}) @@ plainto_tsquery('english', '{query}')"
396
- full_text_search_cte = f''',
509
+ ts_vector_clause = (
510
+ f"AND to_tsvector('english', {content_column_name}) @@ plainto_tsquery('english', '{query}')"
511
+ )
512
+ full_text_search_cte = f""",
397
513
  full_text_search AS (
398
514
  SELECT {id_column_name}, {content_column_name}, {embeddings_column_name},
399
515
  RANK () OVER (ORDER BY ts_rank(to_tsvector('english', {content_column_name}), plainto_tsquery('english', '{query}')) DESC) AS rank
400
516
  FROM {table_name}{where_clause}
401
517
  {ts_vector_clause}
402
518
  ORDER BY ts_rank(to_tsvector('english', {content_column_name}), plainto_tsquery('english', '{query}')) DESC
403
- )'''
519
+ )"""
404
520
 
405
- hybrid_select = '''
406
- SELECT * FROM semantic_search'''
521
+ hybrid_select = """
522
+ SELECT * FROM semantic_search"""
407
523
  if query is not None:
408
- hybrid_select = f'''
524
+ hybrid_select = f"""
409
525
  SELECT
410
526
  COALESCE(semantic_search.{id_column_name}, full_text_search.{id_column_name}) AS id,
411
527
  COALESCE(semantic_search.{content_column_name}, full_text_search.{content_column_name}) AS content,
412
528
  COALESCE(semantic_search.{embeddings_column_name}, full_text_search.{embeddings_column_name}) AS embeddings,
413
529
  COALESCE(1.0 / (1 + semantic_search.rank), 0.0) + COALESCE(1.0 / (1 + full_text_search.rank), 0.0) AS rank
414
530
  FROM semantic_search FULL OUTER JOIN full_text_search USING ({id_column_name}) ORDER BY rank DESC;
415
- '''
531
+ """
416
532
 
417
- full_search_query = f'{semantic_search_cte}{full_text_search_cte}{hybrid_select}'
533
+ full_search_query = f"{semantic_search_cte}{full_text_search_cte}{hybrid_select}"
418
534
  return self.raw_query(full_search_query)
419
535
 
420
536
  def create_table(self, table_name: str):
421
537
  """Create a table with a vector column."""
422
538
  with self.connection.cursor() as cur:
423
539
  # For sparse vectors, use sparsevec type
424
- vector_column_type = 'sparsevec' if self._is_sparse else 'vector'
540
+ vector_column_type = "sparsevec" if self._is_sparse else "vector"
425
541
 
426
542
  # Vector size is required for sparse vectors, optional for dense
427
543
  if self._is_sparse and not self._vector_size:
@@ -429,8 +545,8 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
429
545
 
430
546
  # Add vector size specification only if provided
431
547
  size_spec = f"({self._vector_size})" if self._vector_size is not None else "()"
432
- if vector_column_type == 'vector':
433
- size_spec = ''
548
+ if vector_column_type == "vector":
549
+ size_spec = ""
434
550
 
435
551
  cur.execute(f"""
436
552
  CREATE TABLE IF NOT EXISTS {table_name} (
@@ -442,16 +558,14 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
442
558
  """)
443
559
  self.connection.commit()
444
560
 
445
- def insert(
446
- self, table_name: str, data: pd.DataFrame
447
- ):
561
+ def insert(self, table_name: str, data: pd.DataFrame):
448
562
  """
449
563
  Insert data into the pgvector table database.
450
564
  """
451
565
  table_name = self._check_table(table_name)
452
566
 
453
- if 'metadata' in data.columns:
454
- data['metadata'] = data['metadata'].apply(json.dumps)
567
+ if "metadata" in data.columns:
568
+ data["metadata"] = data["metadata"].apply(json.dumps)
455
569
 
456
570
  resp = super().insert(table_name, data)
457
571
  if resp.resp_type == RESPONSE_TYPE.ERROR:
@@ -459,9 +573,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
459
573
  if resp.resp_type == RESPONSE_TYPE.TABLE:
460
574
  return resp.data_frame
461
575
 
462
- def update(
463
- self, table_name: str, data: pd.DataFrame, key_columns: List[str] = None
464
- ):
576
+ def update(self, table_name: str, data: pd.DataFrame, key_columns: List[str] = None):
465
577
  """
466
578
  Udate data into the pgvector table database.
467
579
  """
@@ -471,43 +583,32 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
471
583
  update_columns = {}
472
584
 
473
585
  for col in data.columns:
474
- value = Parameter('%s')
586
+ value = Parameter("%s")
475
587
 
476
588
  if col in key_columns:
477
- cond = BinaryOperation(
478
- op='=',
479
- args=[Identifier(col), value]
480
- )
589
+ cond = BinaryOperation(op="=", args=[Identifier(col), value])
481
590
  if where is None:
482
591
  where = cond
483
592
  else:
484
- where = BinaryOperation(
485
- op='AND',
486
- args=[where, cond]
487
- )
593
+ where = BinaryOperation(op="AND", args=[where, cond])
488
594
  else:
489
595
  update_columns[col] = value
490
596
 
491
- query = Update(
492
- table=Identifier(table_name),
493
- update_columns=update_columns,
494
- where=where
495
- )
597
+ query = Update(table=Identifier(table_name), update_columns=update_columns, where=where)
496
598
 
497
599
  if TableField.METADATA.value in data.columns:
600
+
498
601
  def fnc(v):
499
602
  if isinstance(v, dict):
500
603
  return json.dumps(v)
604
+
501
605
  data[TableField.METADATA.value] = data[TableField.METADATA.value].apply(fnc)
502
606
 
503
607
  data = data.astype({TableField.METADATA.value: str})
504
608
 
505
609
  transposed_data = []
506
610
  for _, record in data.iterrows():
507
- row = [
508
- record[col]
509
- for col in update_columns.keys()
510
- ]
611
+ row = [record[col] for col in update_columns.keys()]
511
612
  for key_column in key_columns:
512
613
  row.append(record[key_column])
513
614
  transposed_data.append(row)
@@ -515,17 +616,13 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
515
616
  query_str = self.renderer.get_string(query)
516
617
  self.raw_query(query_str, transposed_data)
517
618
 
518
- def delete(
519
- self, table_name: str, conditions: List[FilterCondition] = None
520
- ):
619
+ def delete(self, table_name: str, conditions: List[FilterCondition] = None):
521
620
  table_name = self._check_table(table_name)
522
621
 
523
- filter_conditions = self._translate_conditions(conditions)
622
+ filter_conditions, _ = self._translate_conditions(conditions)
524
623
  where_clause = self._construct_where_clause(filter_conditions)
525
624
 
526
- query = (
527
- f"DELETE FROM {table_name} {where_clause}"
528
- )
625
+ query = f"DELETE FROM {table_name} {where_clause}"
529
626
  self.raw_query(query)
530
627
 
531
628
  def drop_table(self, table_name: str, if_exists=True):
@@ -535,7 +632,13 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
535
632
  table_name = self._check_table(table_name)
536
633
  self.raw_query(f"DROP TABLE IF EXISTS {table_name}")
537
634
 
538
- def create_index(self, table_name: str, column_name: str = "embeddings", index_type: Literal['ivfflat', 'hnsw'] = "hnsw", metric_type: str = None):
635
+ def create_index(
636
+ self,
637
+ table_name: str,
638
+ column_name: str = "embeddings",
639
+ index_type: Literal["ivfflat", "hnsw"] = "hnsw",
640
+ metric_type: str = None,
641
+ ):
539
642
  """
540
643
  Create an index on the pgvector table.
541
644
  Args:
@@ -547,7 +650,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
547
650
  if metric_type is None:
548
651
  metric_type = self.get_metric_type()
549
652
  # Check if the index type is supported
550
- if index_type not in ['ivfflat', 'hnsw']:
653
+ if index_type not in ["ivfflat", "hnsw"]:
551
654
  raise ValueError("Invalid index type. Supported types are 'ivfflat' and 'hnsw'.")
552
655
  table_name = self._check_table(table_name)
553
656
  # first we make sure embedding dimension is set