MindsDB 25.6.4.0__py3-none-any.whl → 25.7.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +53 -94
- mindsdb/api/a2a/agent.py +30 -206
- mindsdb/api/a2a/common/server/server.py +26 -27
- mindsdb/api/a2a/task_manager.py +93 -227
- mindsdb/api/a2a/utils.py +21 -0
- mindsdb/api/executor/command_executor.py +8 -6
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/planner/query_prepare.py +68 -87
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
- mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
- mindsdb/api/executor/utilities/sql.py +97 -21
- mindsdb/api/http/namespaces/agents.py +126 -201
- mindsdb/api/http/namespaces/config.py +12 -1
- mindsdb/api/http/namespaces/file.py +49 -24
- mindsdb/api/mcp/start.py +45 -31
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/libs/keyword_search_base.py +41 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
- mindsdb/integrations/utilities/sql_utils.py +11 -0
- mindsdb/interfaces/agents/agents_controller.py +29 -9
- mindsdb/interfaces/agents/langchain_agent.py +7 -5
- mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
- mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
- mindsdb/interfaces/database/projects.py +1 -3
- mindsdb/interfaces/functions/controller.py +54 -64
- mindsdb/interfaces/functions/to_markdown.py +47 -14
- mindsdb/interfaces/knowledge_base/controller.py +228 -110
- mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
- mindsdb/interfaces/knowledge_base/executor.py +346 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
- mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
- mindsdb/interfaces/skills/sql_agent.py +181 -130
- mindsdb/interfaces/storage/db.py +9 -7
- mindsdb/utilities/config.py +58 -40
- mindsdb/utilities/exception.py +58 -7
- mindsdb/utilities/security.py +54 -11
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import json
|
|
3
|
-
from typing import Dict, List,
|
|
3
|
+
from typing import Dict, List, Literal, Tuple
|
|
4
4
|
from urllib.parse import urlparse
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
@@ -16,8 +16,10 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
|
|
|
16
16
|
FilterCondition,
|
|
17
17
|
VectorStoreHandler,
|
|
18
18
|
DistanceFunction,
|
|
19
|
-
TableField
|
|
19
|
+
TableField,
|
|
20
20
|
)
|
|
21
|
+
from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
|
|
22
|
+
from mindsdb.integrations.utilities.sql_utils import KeywordSearchArgs
|
|
21
23
|
from mindsdb.utilities import log
|
|
22
24
|
from mindsdb.utilities.profiler import profiler
|
|
23
25
|
from mindsdb.utilities.context import context as ctx
|
|
@@ -26,19 +28,18 @@ logger = log.getLogger(__name__)
|
|
|
26
28
|
|
|
27
29
|
|
|
28
30
|
# todo Issue #7316 add support for different indexes and search algorithms e.g. cosine similarity or L2 norm
|
|
29
|
-
class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
31
|
+
class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
|
|
30
32
|
"""This handler handles connection and execution of the PostgreSQL with pgvector extension statements."""
|
|
31
33
|
|
|
32
34
|
name = "pgvector"
|
|
33
35
|
|
|
34
36
|
def __init__(self, name: str, **kwargs):
|
|
35
|
-
|
|
36
37
|
super().__init__(name=name, **kwargs)
|
|
37
38
|
self._is_shared_db = False
|
|
38
39
|
self._is_vector_registered = False
|
|
39
40
|
# we get these from the connection args on PostgresHandler parent
|
|
40
|
-
self._is_sparse = self.connection_args.get(
|
|
41
|
-
self._vector_size = self.connection_args.get(
|
|
41
|
+
self._is_sparse = self.connection_args.get("is_sparse", False)
|
|
42
|
+
self._vector_size = self.connection_args.get("vector_size", None)
|
|
42
43
|
|
|
43
44
|
if self._is_sparse:
|
|
44
45
|
if not self._vector_size:
|
|
@@ -48,20 +49,20 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
48
49
|
distance_op = "<#>"
|
|
49
50
|
|
|
50
51
|
else:
|
|
51
|
-
distance_op =
|
|
52
|
-
if
|
|
52
|
+
distance_op = "<=>"
|
|
53
|
+
if "distance" in self.connection_args:
|
|
53
54
|
distance_ops = {
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
55
|
+
"l1": "<+>",
|
|
56
|
+
"l2": "<->",
|
|
57
|
+
"ip": "<#>", # inner product
|
|
58
|
+
"cosine": "<=>",
|
|
59
|
+
"hamming": "<~>",
|
|
60
|
+
"jaccard": "<%>",
|
|
60
61
|
}
|
|
61
62
|
|
|
62
|
-
distance_op = distance_ops.get(self.connection_args[
|
|
63
|
+
distance_op = distance_ops.get(self.connection_args["distance"])
|
|
63
64
|
if distance_op is None:
|
|
64
|
-
raise ValueError(f
|
|
65
|
+
raise ValueError(f"Wrong distance type. Allowed options are {list(distance_ops.keys())}")
|
|
65
66
|
|
|
66
67
|
self.distance_op = distance_op
|
|
67
68
|
self.connect()
|
|
@@ -72,26 +73,26 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
72
73
|
|
|
73
74
|
"""
|
|
74
75
|
distance_ops_to_metric_type_map = {
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
76
|
+
"<->": "vector_l2_ops",
|
|
77
|
+
"<#>": "vector_ip_ops",
|
|
78
|
+
"<=>": "vector_cosine_ops",
|
|
79
|
+
"<+>": "vector_l1_ops",
|
|
80
|
+
"<~>": "bit_hamming_ops",
|
|
81
|
+
"<%>": "bit_jaccard_ops",
|
|
81
82
|
}
|
|
82
|
-
return distance_ops_to_metric_type_map.get(self.distance_op,
|
|
83
|
+
return distance_ops_to_metric_type_map.get(self.distance_op, "vector_cosine_ops")
|
|
83
84
|
|
|
84
85
|
def _make_connection_args(self):
|
|
85
|
-
cloud_pgvector_url = os.environ.get(
|
|
86
|
+
cloud_pgvector_url = os.environ.get("KB_PGVECTOR_URL")
|
|
86
87
|
# if no connection args and shared pg vector defined - use it
|
|
87
88
|
if len(self.connection_args) == 0 and cloud_pgvector_url is not None:
|
|
88
89
|
result = urlparse(cloud_pgvector_url)
|
|
89
90
|
self.connection_args = {
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
91
|
+
"host": result.hostname,
|
|
92
|
+
"port": result.port,
|
|
93
|
+
"user": result.username,
|
|
94
|
+
"password": result.password,
|
|
95
|
+
"database": result.path[1:],
|
|
95
96
|
}
|
|
96
97
|
self._is_shared_db = True
|
|
97
98
|
return super()._make_connection_args()
|
|
@@ -132,9 +133,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
132
133
|
|
|
133
134
|
except psycopg.Error as e:
|
|
134
135
|
self.connection.rollback()
|
|
135
|
-
logger.error(
|
|
136
|
-
f"Error loading pg_vector extension, ensure you have installed it before running, {e}!"
|
|
137
|
-
)
|
|
136
|
+
logger.error(f"Error loading pg_vector extension, ensure you have installed it before running, {e}!")
|
|
138
137
|
raise
|
|
139
138
|
|
|
140
139
|
# register vector type with psycopg2 connection
|
|
@@ -143,19 +142,33 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
143
142
|
|
|
144
143
|
return self.connection
|
|
145
144
|
|
|
145
|
+
def add_full_text_index(self, table_name: str, column_name: str) -> Response:
|
|
146
|
+
"""
|
|
147
|
+
Add a full text index to the specified column of the table.
|
|
148
|
+
Args:
|
|
149
|
+
table_name (str): Name of the table to add the index to.
|
|
150
|
+
column_name (str): Name of the column to add the index to.
|
|
151
|
+
Returns:
|
|
152
|
+
Response: Response object indicating success or failure.
|
|
153
|
+
"""
|
|
154
|
+
table_name = self._check_table(table_name)
|
|
155
|
+
query = f"CREATE INDEX IF NOT EXISTS {table_name}_{column_name}_fts_idx ON {table_name} USING gin(to_tsvector('english', {column_name}))"
|
|
156
|
+
self.raw_query(query)
|
|
157
|
+
return Response(RESPONSE_TYPE.OK)
|
|
158
|
+
|
|
146
159
|
@staticmethod
|
|
147
|
-
def _translate_conditions(conditions: List[FilterCondition]) ->
|
|
160
|
+
def _translate_conditions(conditions: List[FilterCondition]) -> Tuple[List[dict], dict]:
|
|
148
161
|
"""
|
|
149
162
|
Translate filter conditions to a dictionary
|
|
150
163
|
"""
|
|
151
164
|
|
|
152
165
|
if conditions is None:
|
|
153
|
-
|
|
166
|
+
conditions = []
|
|
154
167
|
|
|
155
|
-
filter_conditions =
|
|
168
|
+
filter_conditions = []
|
|
169
|
+
embedding_condition = None
|
|
156
170
|
|
|
157
171
|
for condition in conditions:
|
|
158
|
-
|
|
159
172
|
parts = condition.column.split(".")
|
|
160
173
|
key = parts[0]
|
|
161
174
|
# converts 'col.el1.el2' to col->'el1'->>'el2'
|
|
@@ -167,12 +180,25 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
167
180
|
# last element
|
|
168
181
|
key += f" ->> '{parts[-1]}'"
|
|
169
182
|
|
|
170
|
-
|
|
183
|
+
type_cast = None
|
|
184
|
+
if isinstance(condition.value, int):
|
|
185
|
+
type_cast = "int"
|
|
186
|
+
elif isinstance(condition.value, float):
|
|
187
|
+
type_cast = "float"
|
|
188
|
+
if type_cast is not None:
|
|
189
|
+
key = f"({key})::{type_cast}"
|
|
190
|
+
|
|
191
|
+
item = {
|
|
192
|
+
"name": key,
|
|
171
193
|
"op": condition.op.value,
|
|
172
194
|
"value": condition.value,
|
|
173
195
|
}
|
|
196
|
+
if key == "embeddings":
|
|
197
|
+
embedding_condition = item
|
|
198
|
+
else:
|
|
199
|
+
filter_conditions.append(item)
|
|
174
200
|
|
|
175
|
-
return filter_conditions
|
|
201
|
+
return filter_conditions, embedding_condition
|
|
176
202
|
|
|
177
203
|
@staticmethod
|
|
178
204
|
def _construct_where_clause(filter_conditions=None):
|
|
@@ -184,15 +210,18 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
184
210
|
|
|
185
211
|
where_clauses = []
|
|
186
212
|
|
|
187
|
-
for
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
if
|
|
191
|
-
values = list(repr(i) for i in
|
|
192
|
-
|
|
213
|
+
for item in filter_conditions:
|
|
214
|
+
key = item["name"]
|
|
215
|
+
|
|
216
|
+
if item["op"].lower() in ("in", "not in"):
|
|
217
|
+
values = list(repr(i) for i in item["value"])
|
|
218
|
+
item["value"] = "({})".format(", ".join(values))
|
|
193
219
|
else:
|
|
194
|
-
|
|
195
|
-
|
|
220
|
+
if item["value"] is None:
|
|
221
|
+
item["value"] = "null"
|
|
222
|
+
else:
|
|
223
|
+
item["value"] = repr(item["value"])
|
|
224
|
+
where_clauses.append(f"{key} {item['op']} {item['value']}")
|
|
196
225
|
|
|
197
226
|
if len(where_clauses) > 1:
|
|
198
227
|
return f"WHERE {' AND '.join(where_clauses)}"
|
|
@@ -201,15 +230,78 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
201
230
|
else:
|
|
202
231
|
return ""
|
|
203
232
|
|
|
233
|
+
@staticmethod
|
|
234
|
+
def _construct_where_clause_with_keywords(filter_conditions=None, keyword_query=None, content_column_name=None):
|
|
235
|
+
if not keyword_query or not content_column_name:
|
|
236
|
+
return PgVectorHandler._construct_where_clause(filter_conditions)
|
|
237
|
+
|
|
238
|
+
keyword_query_condition = (
|
|
239
|
+
f"""to_tsvector('english', {content_column_name}) @@ websearch_to_tsquery('english', '{keyword_query}')"""
|
|
240
|
+
)
|
|
241
|
+
if filter_conditions is None:
|
|
242
|
+
return ""
|
|
243
|
+
|
|
244
|
+
where_clauses = []
|
|
245
|
+
|
|
246
|
+
for item in filter_conditions:
|
|
247
|
+
key = item["name"]
|
|
248
|
+
|
|
249
|
+
if item["op"].lower() in ("in", "not in"):
|
|
250
|
+
values = list(repr(i) for i in item["value"])
|
|
251
|
+
item["value"] = "({})".format(", ".join(values))
|
|
252
|
+
else:
|
|
253
|
+
if item["value"] is None:
|
|
254
|
+
item["value"] = "null"
|
|
255
|
+
else:
|
|
256
|
+
item["value"] = repr(item["value"])
|
|
257
|
+
where_clauses.append(f"{key} {item['op']} {item['value']}")
|
|
258
|
+
|
|
259
|
+
where_clauses.append(keyword_query_condition)
|
|
260
|
+
if len(where_clauses) > 1:
|
|
261
|
+
return f"WHERE {' AND '.join(where_clauses)}"
|
|
262
|
+
elif len(where_clauses) == 1:
|
|
263
|
+
return f"WHERE {where_clauses[0]}"
|
|
264
|
+
else:
|
|
265
|
+
return ""
|
|
266
|
+
|
|
204
267
|
@staticmethod
|
|
205
268
|
def _construct_full_after_from_clause(
|
|
206
269
|
where_clause: str,
|
|
207
270
|
offset_clause: str,
|
|
208
271
|
limit_clause: str,
|
|
209
272
|
) -> str:
|
|
210
|
-
|
|
211
273
|
return f"{where_clause} {offset_clause} {limit_clause}"
|
|
212
274
|
|
|
275
|
+
def _build_keyword_bm25_query(
|
|
276
|
+
self,
|
|
277
|
+
table_name: str,
|
|
278
|
+
query: str,
|
|
279
|
+
columns: List[str] = None,
|
|
280
|
+
content_column_name: str = "content",
|
|
281
|
+
conditions: List[FilterCondition] = None,
|
|
282
|
+
limit: int = None,
|
|
283
|
+
offset: int = None,
|
|
284
|
+
):
|
|
285
|
+
if columns is None:
|
|
286
|
+
columns = ["id", "content", "metadata"]
|
|
287
|
+
|
|
288
|
+
filter_conditions, _ = self._translate_conditions(conditions)
|
|
289
|
+
|
|
290
|
+
# given filter conditions, construct where clause
|
|
291
|
+
where_clause = self._construct_where_clause_with_keywords(filter_conditions, query, content_column_name)
|
|
292
|
+
|
|
293
|
+
query = f"""
|
|
294
|
+
SELECT
|
|
295
|
+
{", ".join(columns)},
|
|
296
|
+
ts_rank_cd(to_tsvector('english', {content_column_name}), websearch_to_tsquery('english', '{query}')) as distance
|
|
297
|
+
FROM
|
|
298
|
+
{table_name}
|
|
299
|
+
{where_clause if where_clause else ""}
|
|
300
|
+
{f"LIMIT {limit}" if limit else ""}
|
|
301
|
+
{f"OFFSET {offset}" if offset else ""};"""
|
|
302
|
+
|
|
303
|
+
return query
|
|
304
|
+
|
|
213
305
|
def _build_select_query(
|
|
214
306
|
self,
|
|
215
307
|
table_name: str,
|
|
@@ -225,10 +317,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
225
317
|
offset_clause = f"OFFSET {offset}" if offset else ""
|
|
226
318
|
|
|
227
319
|
# translate filter conditions to dictionary
|
|
228
|
-
filter_conditions = self._translate_conditions(conditions)
|
|
229
|
-
|
|
230
|
-
# check if search vector is in filter conditions
|
|
231
|
-
embedding_search = filter_conditions.get("embeddings", None)
|
|
320
|
+
filter_conditions, embedding_search = self._translate_conditions(conditions)
|
|
232
321
|
|
|
233
322
|
# given filter conditions, construct where clause
|
|
234
323
|
where_clause = self._construct_where_clause(filter_conditions)
|
|
@@ -243,47 +332,41 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
243
332
|
else:
|
|
244
333
|
modified_columns.append(col)
|
|
245
334
|
else:
|
|
246
|
-
modified_columns = [
|
|
335
|
+
modified_columns = ["id", "content", "embeddings", "metadata"]
|
|
247
336
|
has_distance = True
|
|
248
337
|
|
|
249
|
-
targets =
|
|
250
|
-
|
|
251
|
-
if filter_conditions:
|
|
338
|
+
targets = ", ".join(modified_columns)
|
|
252
339
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
filter_conditions.pop("embeddings")
|
|
340
|
+
if embedding_search:
|
|
341
|
+
search_vector = embedding_search["value"]
|
|
256
342
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
embedding = SparseVector(search_vector, self._vector_size)
|
|
262
|
-
search_vector = embedding.to_text()
|
|
263
|
-
else:
|
|
264
|
-
# Convert list to vector string if needed
|
|
265
|
-
if isinstance(search_vector, list):
|
|
266
|
-
search_vector = f"[{','.join(str(x) for x in search_vector)}]"
|
|
343
|
+
if self._is_sparse:
|
|
344
|
+
# Convert dict to sparse vector if needed
|
|
345
|
+
if isinstance(search_vector, dict):
|
|
346
|
+
from pgvector.utils import SparseVector
|
|
267
347
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
348
|
+
embedding = SparseVector(search_vector, self._vector_size)
|
|
349
|
+
search_vector = embedding.to_text()
|
|
350
|
+
else:
|
|
351
|
+
# Convert list to vector string if needed
|
|
352
|
+
if isinstance(search_vector, list):
|
|
353
|
+
search_vector = f"[{','.join(str(x) for x in search_vector)}]"
|
|
271
354
|
|
|
272
|
-
|
|
355
|
+
# Calculate distance as part of the query if needed
|
|
356
|
+
if has_distance:
|
|
357
|
+
targets = f"{targets}, (embeddings {self.distance_op} '{search_vector}') as distance"
|
|
273
358
|
|
|
274
|
-
|
|
275
|
-
# if filter conditions, return rows that satisfy the conditions
|
|
276
|
-
return f"SELECT {targets} FROM {table_name} {where_clause} {limit_clause} {offset_clause}"
|
|
359
|
+
return f"SELECT {targets} FROM {table_name} {where_clause} ORDER BY embeddings {self.distance_op} '{search_vector}' ASC {limit_clause} {offset_clause} "
|
|
277
360
|
|
|
278
361
|
else:
|
|
279
|
-
# if
|
|
280
|
-
return f"SELECT {targets} FROM {table_name} {limit_clause} {offset_clause}"
|
|
362
|
+
# if filter conditions, return rows that satisfy the conditions
|
|
363
|
+
return f"SELECT {targets} FROM {table_name} {where_clause} {limit_clause} {offset_clause}"
|
|
281
364
|
|
|
282
365
|
def _check_table(self, table_name: str):
|
|
283
366
|
# Apply namespace for a user
|
|
284
367
|
if self._is_shared_db:
|
|
285
|
-
company_id = ctx.company_id or
|
|
286
|
-
return f
|
|
368
|
+
company_id = ctx.company_id or "x"
|
|
369
|
+
return f"t_{company_id}_{table_name}"
|
|
287
370
|
return table_name
|
|
288
371
|
|
|
289
372
|
def select(
|
|
@@ -303,6 +386,33 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
303
386
|
columns = ["id", "content", "embeddings", "metadata"]
|
|
304
387
|
|
|
305
388
|
query = self._build_select_query(table_name, columns, conditions, limit, offset)
|
|
389
|
+
|
|
390
|
+
result = self.raw_query(query)
|
|
391
|
+
|
|
392
|
+
# ensure embeddings are returned as string so they can be parsed by mindsdb
|
|
393
|
+
if "embeddings" in columns:
|
|
394
|
+
result["embeddings"] = result["embeddings"].astype(str)
|
|
395
|
+
|
|
396
|
+
return result
|
|
397
|
+
|
|
398
|
+
def keyword_select(
|
|
399
|
+
self,
|
|
400
|
+
table_name: str,
|
|
401
|
+
columns: List[str] = None,
|
|
402
|
+
conditions: List[FilterCondition] = None,
|
|
403
|
+
offset: int = None,
|
|
404
|
+
limit: int = None,
|
|
405
|
+
keyword_search_args: KeywordSearchArgs = None,
|
|
406
|
+
) -> pd.DataFrame:
|
|
407
|
+
table_name = self._check_table(table_name)
|
|
408
|
+
|
|
409
|
+
if columns is None:
|
|
410
|
+
columns = ["id", "content", "embeddings", "metadata"]
|
|
411
|
+
content_column_name = keyword_search_args.column
|
|
412
|
+
query = self._build_keyword_bm25_query(
|
|
413
|
+
table_name, keyword_search_args.query, columns, content_column_name, conditions, limit, offset
|
|
414
|
+
)
|
|
415
|
+
|
|
306
416
|
result = self.raw_query(query)
|
|
307
417
|
|
|
308
418
|
# ensure embeddings are returned as string so they can be parsed by mindsdb
|
|
@@ -318,9 +428,9 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
318
428
|
query: str = None,
|
|
319
429
|
metadata: Dict[str, str] = None,
|
|
320
430
|
distance_function=DistanceFunction.COSINE_DISTANCE,
|
|
321
|
-
**kwargs
|
|
431
|
+
**kwargs,
|
|
322
432
|
) -> pd.DataFrame:
|
|
323
|
-
|
|
433
|
+
"""
|
|
324
434
|
Executes a hybrid search, combining semantic search and one or both of keyword/metadata search.
|
|
325
435
|
|
|
326
436
|
For insight on the query construction, see: https://docs.pgvecto.rs/use-case/hybrid-search.html#advanced-search-merge-the-results-of-full-text-search-and-vector-search.
|
|
@@ -340,23 +450,25 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
340
450
|
|
|
341
451
|
Returns:
|
|
342
452
|
df(pd.DataFrame): Hybrid search result, sorted by hybrid search rank
|
|
343
|
-
|
|
453
|
+
"""
|
|
344
454
|
if query is None and metadata is None:
|
|
345
|
-
raise ValueError(
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
455
|
+
raise ValueError(
|
|
456
|
+
"Must provide at least one of: query for keyword search, or metadata filters. For only embeddings search, use normal search instead."
|
|
457
|
+
)
|
|
458
|
+
|
|
459
|
+
id_column_name = kwargs.get("id_column_name", "id")
|
|
460
|
+
content_column_name = kwargs.get("content_column_name", "content")
|
|
461
|
+
embeddings_column_name = kwargs.get("embeddings_column_name", "embeddings")
|
|
462
|
+
metadata_column_name = kwargs.get("metadata_column_name", "metadata")
|
|
351
463
|
# Filter by given metadata for semantic search & full text search CTEs, if present.
|
|
352
|
-
where_clause =
|
|
464
|
+
where_clause = " WHERE "
|
|
353
465
|
if metadata is None:
|
|
354
|
-
where_clause =
|
|
466
|
+
where_clause = ""
|
|
355
467
|
metadata = {}
|
|
356
468
|
for i, (k, v) in enumerate(metadata.items()):
|
|
357
469
|
where_clause += f"{metadata_column_name}->>'{k}' = '{v}'"
|
|
358
470
|
if i < len(metadata.items()) - 1:
|
|
359
|
-
where_clause +=
|
|
471
|
+
where_clause += " AND "
|
|
360
472
|
|
|
361
473
|
# See https://docs.pgvecto.rs/use-case/hybrid-search.html#advanced-search-merge-the-results-of-full-text-search-and-vector-search.
|
|
362
474
|
#
|
|
@@ -381,47 +493,51 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
381
493
|
# Or, if we are only doing metadata search, we leave out the JOIN & full text search CTEs.
|
|
382
494
|
#
|
|
383
495
|
# We calculate the final "hybrid" rank by summing the reciprocals of the ranks from each individual CTE.
|
|
384
|
-
semantic_search_cte = f
|
|
496
|
+
semantic_search_cte = f"""WITH semantic_search AS (
|
|
385
497
|
SELECT {id_column_name}, {content_column_name}, {embeddings_column_name},
|
|
386
498
|
RANK () OVER (ORDER BY {embeddings_column_name} {distance_function.value} '{str(embeddings)}') AS rank
|
|
387
499
|
FROM {table_name}{where_clause}
|
|
388
500
|
ORDER BY {embeddings_column_name} {distance_function.value} '{str(embeddings)}'::vector
|
|
389
|
-
)
|
|
501
|
+
)"""
|
|
390
502
|
|
|
391
|
-
full_text_search_cte =
|
|
503
|
+
full_text_search_cte = ""
|
|
392
504
|
if query is not None:
|
|
393
|
-
ts_vector_clause =
|
|
505
|
+
ts_vector_clause = (
|
|
506
|
+
f"WHERE to_tsvector('english', {content_column_name}) @@ plainto_tsquery('english', '{query}')"
|
|
507
|
+
)
|
|
394
508
|
if metadata:
|
|
395
|
-
ts_vector_clause =
|
|
396
|
-
|
|
509
|
+
ts_vector_clause = (
|
|
510
|
+
f"AND to_tsvector('english', {content_column_name}) @@ plainto_tsquery('english', '{query}')"
|
|
511
|
+
)
|
|
512
|
+
full_text_search_cte = f""",
|
|
397
513
|
full_text_search AS (
|
|
398
514
|
SELECT {id_column_name}, {content_column_name}, {embeddings_column_name},
|
|
399
515
|
RANK () OVER (ORDER BY ts_rank(to_tsvector('english', {content_column_name}), plainto_tsquery('english', '{query}')) DESC) AS rank
|
|
400
516
|
FROM {table_name}{where_clause}
|
|
401
517
|
{ts_vector_clause}
|
|
402
518
|
ORDER BY ts_rank(to_tsvector('english', {content_column_name}), plainto_tsquery('english', '{query}')) DESC
|
|
403
|
-
)
|
|
519
|
+
)"""
|
|
404
520
|
|
|
405
|
-
hybrid_select =
|
|
406
|
-
SELECT * FROM semantic_search
|
|
521
|
+
hybrid_select = """
|
|
522
|
+
SELECT * FROM semantic_search"""
|
|
407
523
|
if query is not None:
|
|
408
|
-
hybrid_select = f
|
|
524
|
+
hybrid_select = f"""
|
|
409
525
|
SELECT
|
|
410
526
|
COALESCE(semantic_search.{id_column_name}, full_text_search.{id_column_name}) AS id,
|
|
411
527
|
COALESCE(semantic_search.{content_column_name}, full_text_search.{content_column_name}) AS content,
|
|
412
528
|
COALESCE(semantic_search.{embeddings_column_name}, full_text_search.{embeddings_column_name}) AS embeddings,
|
|
413
529
|
COALESCE(1.0 / (1 + semantic_search.rank), 0.0) + COALESCE(1.0 / (1 + full_text_search.rank), 0.0) AS rank
|
|
414
530
|
FROM semantic_search FULL OUTER JOIN full_text_search USING ({id_column_name}) ORDER BY rank DESC;
|
|
415
|
-
|
|
531
|
+
"""
|
|
416
532
|
|
|
417
|
-
full_search_query = f
|
|
533
|
+
full_search_query = f"{semantic_search_cte}{full_text_search_cte}{hybrid_select}"
|
|
418
534
|
return self.raw_query(full_search_query)
|
|
419
535
|
|
|
420
536
|
def create_table(self, table_name: str):
|
|
421
537
|
"""Create a table with a vector column."""
|
|
422
538
|
with self.connection.cursor() as cur:
|
|
423
539
|
# For sparse vectors, use sparsevec type
|
|
424
|
-
vector_column_type =
|
|
540
|
+
vector_column_type = "sparsevec" if self._is_sparse else "vector"
|
|
425
541
|
|
|
426
542
|
# Vector size is required for sparse vectors, optional for dense
|
|
427
543
|
if self._is_sparse and not self._vector_size:
|
|
@@ -429,8 +545,8 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
429
545
|
|
|
430
546
|
# Add vector size specification only if provided
|
|
431
547
|
size_spec = f"({self._vector_size})" if self._vector_size is not None else "()"
|
|
432
|
-
if vector_column_type ==
|
|
433
|
-
size_spec =
|
|
548
|
+
if vector_column_type == "vector":
|
|
549
|
+
size_spec = ""
|
|
434
550
|
|
|
435
551
|
cur.execute(f"""
|
|
436
552
|
CREATE TABLE IF NOT EXISTS {table_name} (
|
|
@@ -442,16 +558,14 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
442
558
|
""")
|
|
443
559
|
self.connection.commit()
|
|
444
560
|
|
|
445
|
-
def insert(
|
|
446
|
-
self, table_name: str, data: pd.DataFrame
|
|
447
|
-
):
|
|
561
|
+
def insert(self, table_name: str, data: pd.DataFrame):
|
|
448
562
|
"""
|
|
449
563
|
Insert data into the pgvector table database.
|
|
450
564
|
"""
|
|
451
565
|
table_name = self._check_table(table_name)
|
|
452
566
|
|
|
453
|
-
if
|
|
454
|
-
data[
|
|
567
|
+
if "metadata" in data.columns:
|
|
568
|
+
data["metadata"] = data["metadata"].apply(json.dumps)
|
|
455
569
|
|
|
456
570
|
resp = super().insert(table_name, data)
|
|
457
571
|
if resp.resp_type == RESPONSE_TYPE.ERROR:
|
|
@@ -459,9 +573,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
459
573
|
if resp.resp_type == RESPONSE_TYPE.TABLE:
|
|
460
574
|
return resp.data_frame
|
|
461
575
|
|
|
462
|
-
def update(
|
|
463
|
-
self, table_name: str, data: pd.DataFrame, key_columns: List[str] = None
|
|
464
|
-
):
|
|
576
|
+
def update(self, table_name: str, data: pd.DataFrame, key_columns: List[str] = None):
|
|
465
577
|
"""
|
|
466
578
|
Udate data into the pgvector table database.
|
|
467
579
|
"""
|
|
@@ -471,43 +583,32 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
471
583
|
update_columns = {}
|
|
472
584
|
|
|
473
585
|
for col in data.columns:
|
|
474
|
-
value = Parameter(
|
|
586
|
+
value = Parameter("%s")
|
|
475
587
|
|
|
476
588
|
if col in key_columns:
|
|
477
|
-
cond = BinaryOperation(
|
|
478
|
-
op='=',
|
|
479
|
-
args=[Identifier(col), value]
|
|
480
|
-
)
|
|
589
|
+
cond = BinaryOperation(op="=", args=[Identifier(col), value])
|
|
481
590
|
if where is None:
|
|
482
591
|
where = cond
|
|
483
592
|
else:
|
|
484
|
-
where = BinaryOperation(
|
|
485
|
-
op='AND',
|
|
486
|
-
args=[where, cond]
|
|
487
|
-
)
|
|
593
|
+
where = BinaryOperation(op="AND", args=[where, cond])
|
|
488
594
|
else:
|
|
489
595
|
update_columns[col] = value
|
|
490
596
|
|
|
491
|
-
query = Update(
|
|
492
|
-
table=Identifier(table_name),
|
|
493
|
-
update_columns=update_columns,
|
|
494
|
-
where=where
|
|
495
|
-
)
|
|
597
|
+
query = Update(table=Identifier(table_name), update_columns=update_columns, where=where)
|
|
496
598
|
|
|
497
599
|
if TableField.METADATA.value in data.columns:
|
|
600
|
+
|
|
498
601
|
def fnc(v):
|
|
499
602
|
if isinstance(v, dict):
|
|
500
603
|
return json.dumps(v)
|
|
604
|
+
|
|
501
605
|
data[TableField.METADATA.value] = data[TableField.METADATA.value].apply(fnc)
|
|
502
606
|
|
|
503
607
|
data = data.astype({TableField.METADATA.value: str})
|
|
504
608
|
|
|
505
609
|
transposed_data = []
|
|
506
610
|
for _, record in data.iterrows():
|
|
507
|
-
row = [
|
|
508
|
-
record[col]
|
|
509
|
-
for col in update_columns.keys()
|
|
510
|
-
]
|
|
611
|
+
row = [record[col] for col in update_columns.keys()]
|
|
511
612
|
for key_column in key_columns:
|
|
512
613
|
row.append(record[key_column])
|
|
513
614
|
transposed_data.append(row)
|
|
@@ -515,17 +616,13 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
515
616
|
query_str = self.renderer.get_string(query)
|
|
516
617
|
self.raw_query(query_str, transposed_data)
|
|
517
618
|
|
|
518
|
-
def delete(
|
|
519
|
-
self, table_name: str, conditions: List[FilterCondition] = None
|
|
520
|
-
):
|
|
619
|
+
def delete(self, table_name: str, conditions: List[FilterCondition] = None):
|
|
521
620
|
table_name = self._check_table(table_name)
|
|
522
621
|
|
|
523
|
-
filter_conditions = self._translate_conditions(conditions)
|
|
622
|
+
filter_conditions, _ = self._translate_conditions(conditions)
|
|
524
623
|
where_clause = self._construct_where_clause(filter_conditions)
|
|
525
624
|
|
|
526
|
-
query =
|
|
527
|
-
f"DELETE FROM {table_name} {where_clause}"
|
|
528
|
-
)
|
|
625
|
+
query = f"DELETE FROM {table_name} {where_clause}"
|
|
529
626
|
self.raw_query(query)
|
|
530
627
|
|
|
531
628
|
def drop_table(self, table_name: str, if_exists=True):
|
|
@@ -535,7 +632,13 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
535
632
|
table_name = self._check_table(table_name)
|
|
536
633
|
self.raw_query(f"DROP TABLE IF EXISTS {table_name}")
|
|
537
634
|
|
|
538
|
-
def create_index(
|
|
635
|
+
def create_index(
|
|
636
|
+
self,
|
|
637
|
+
table_name: str,
|
|
638
|
+
column_name: str = "embeddings",
|
|
639
|
+
index_type: Literal["ivfflat", "hnsw"] = "hnsw",
|
|
640
|
+
metric_type: str = None,
|
|
641
|
+
):
|
|
539
642
|
"""
|
|
540
643
|
Create an index on the pgvector table.
|
|
541
644
|
Args:
|
|
@@ -547,7 +650,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
547
650
|
if metric_type is None:
|
|
548
651
|
metric_type = self.get_metric_type()
|
|
549
652
|
# Check if the index type is supported
|
|
550
|
-
if index_type not in [
|
|
653
|
+
if index_type not in ["ivfflat", "hnsw"]:
|
|
551
654
|
raise ValueError("Invalid index type. Supported types are 'ivfflat' and 'hnsw'.")
|
|
552
655
|
table_name = self._check_table(table_name)
|
|
553
656
|
# first we make sure embedding dimension is set
|