MindsDB 25.6.3.1__py3-none-any.whl → 25.7.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/command_executor.py +8 -6
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +72 -44
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +14 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/system_tables.py +314 -1
- mindsdb/api/executor/planner/plan_join.py +1 -1
- mindsdb/api/executor/planner/query_planner.py +7 -1
- mindsdb/api/executor/planner/query_prepare.py +68 -87
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
- mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
- mindsdb/api/http/namespaces/file.py +49 -24
- mindsdb/api/mcp/start.py +45 -31
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
- mindsdb/integrations/handlers/ludwig_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +150 -140
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +2 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/libs/api_handler.py +6 -7
- mindsdb/integrations/libs/vectordatabase_handler.py +86 -77
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
- mindsdb/interfaces/agents/agents_controller.py +29 -9
- mindsdb/interfaces/agents/constants.py +44 -0
- mindsdb/interfaces/agents/langchain_agent.py +15 -6
- mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
- mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +22 -3
- mindsdb/interfaces/knowledge_base/controller.py +121 -102
- mindsdb/interfaces/knowledge_base/evaluate.py +19 -7
- mindsdb/interfaces/knowledge_base/executor.py +346 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
- mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +26 -22
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +40 -28
- mindsdb/interfaces/skills/skill_tool.py +91 -88
- mindsdb/interfaces/skills/sql_agent.py +181 -130
- mindsdb/interfaces/storage/db.py +9 -7
- mindsdb/utilities/config.py +12 -1
- mindsdb/utilities/exception.py +47 -7
- mindsdb/utilities/security.py +54 -11
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/METADATA +239 -251
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/RECORD +55 -54
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import json
|
|
3
|
-
from typing import Dict, List,
|
|
3
|
+
from typing import Dict, List, Literal, Tuple
|
|
4
4
|
from urllib.parse import urlparse
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
@@ -16,7 +16,7 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
|
|
|
16
16
|
FilterCondition,
|
|
17
17
|
VectorStoreHandler,
|
|
18
18
|
DistanceFunction,
|
|
19
|
-
TableField
|
|
19
|
+
TableField,
|
|
20
20
|
)
|
|
21
21
|
from mindsdb.utilities import log
|
|
22
22
|
from mindsdb.utilities.profiler import profiler
|
|
@@ -32,13 +32,12 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
32
32
|
name = "pgvector"
|
|
33
33
|
|
|
34
34
|
def __init__(self, name: str, **kwargs):
|
|
35
|
-
|
|
36
35
|
super().__init__(name=name, **kwargs)
|
|
37
36
|
self._is_shared_db = False
|
|
38
37
|
self._is_vector_registered = False
|
|
39
38
|
# we get these from the connection args on PostgresHandler parent
|
|
40
|
-
self._is_sparse = self.connection_args.get(
|
|
41
|
-
self._vector_size = self.connection_args.get(
|
|
39
|
+
self._is_sparse = self.connection_args.get("is_sparse", False)
|
|
40
|
+
self._vector_size = self.connection_args.get("vector_size", None)
|
|
42
41
|
|
|
43
42
|
if self._is_sparse:
|
|
44
43
|
if not self._vector_size:
|
|
@@ -48,20 +47,20 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
48
47
|
distance_op = "<#>"
|
|
49
48
|
|
|
50
49
|
else:
|
|
51
|
-
distance_op =
|
|
52
|
-
if
|
|
50
|
+
distance_op = "<=>"
|
|
51
|
+
if "distance" in self.connection_args:
|
|
53
52
|
distance_ops = {
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
53
|
+
"l1": "<+>",
|
|
54
|
+
"l2": "<->",
|
|
55
|
+
"ip": "<#>", # inner product
|
|
56
|
+
"cosine": "<=>",
|
|
57
|
+
"hamming": "<~>",
|
|
58
|
+
"jaccard": "<%>",
|
|
60
59
|
}
|
|
61
60
|
|
|
62
|
-
distance_op = distance_ops.get(self.connection_args[
|
|
61
|
+
distance_op = distance_ops.get(self.connection_args["distance"])
|
|
63
62
|
if distance_op is None:
|
|
64
|
-
raise ValueError(f
|
|
63
|
+
raise ValueError(f"Wrong distance type. Allowed options are {list(distance_ops.keys())}")
|
|
65
64
|
|
|
66
65
|
self.distance_op = distance_op
|
|
67
66
|
self.connect()
|
|
@@ -72,26 +71,26 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
72
71
|
|
|
73
72
|
"""
|
|
74
73
|
distance_ops_to_metric_type_map = {
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
74
|
+
"<->": "vector_l2_ops",
|
|
75
|
+
"<#>": "vector_ip_ops",
|
|
76
|
+
"<=>": "vector_cosine_ops",
|
|
77
|
+
"<+>": "vector_l1_ops",
|
|
78
|
+
"<~>": "bit_hamming_ops",
|
|
79
|
+
"<%>": "bit_jaccard_ops",
|
|
81
80
|
}
|
|
82
|
-
return distance_ops_to_metric_type_map.get(self.distance_op,
|
|
81
|
+
return distance_ops_to_metric_type_map.get(self.distance_op, "vector_cosine_ops")
|
|
83
82
|
|
|
84
83
|
def _make_connection_args(self):
|
|
85
|
-
cloud_pgvector_url = os.environ.get(
|
|
84
|
+
cloud_pgvector_url = os.environ.get("KB_PGVECTOR_URL")
|
|
86
85
|
# if no connection args and shared pg vector defined - use it
|
|
87
86
|
if len(self.connection_args) == 0 and cloud_pgvector_url is not None:
|
|
88
87
|
result = urlparse(cloud_pgvector_url)
|
|
89
88
|
self.connection_args = {
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
89
|
+
"host": result.hostname,
|
|
90
|
+
"port": result.port,
|
|
91
|
+
"user": result.username,
|
|
92
|
+
"password": result.password,
|
|
93
|
+
"database": result.path[1:],
|
|
95
94
|
}
|
|
96
95
|
self._is_shared_db = True
|
|
97
96
|
return super()._make_connection_args()
|
|
@@ -132,9 +131,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
132
131
|
|
|
133
132
|
except psycopg.Error as e:
|
|
134
133
|
self.connection.rollback()
|
|
135
|
-
logger.error(
|
|
136
|
-
f"Error loading pg_vector extension, ensure you have installed it before running, {e}!"
|
|
137
|
-
)
|
|
134
|
+
logger.error(f"Error loading pg_vector extension, ensure you have installed it before running, {e}!")
|
|
138
135
|
raise
|
|
139
136
|
|
|
140
137
|
# register vector type with psycopg2 connection
|
|
@@ -143,19 +140,33 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
143
140
|
|
|
144
141
|
return self.connection
|
|
145
142
|
|
|
143
|
+
def add_full_text_index(self, table_name: str, column_name: str) -> Response:
|
|
144
|
+
"""
|
|
145
|
+
Add a full text index to the specified column of the table.
|
|
146
|
+
Args:
|
|
147
|
+
table_name (str): Name of the table to add the index to.
|
|
148
|
+
column_name (str): Name of the column to add the index to.
|
|
149
|
+
Returns:
|
|
150
|
+
Response: Response object indicating success or failure.
|
|
151
|
+
"""
|
|
152
|
+
table_name = self._check_table(table_name)
|
|
153
|
+
query = f"CREATE INDEX IF NOT EXISTS {table_name}_{column_name}_fts_idx ON {table_name} USING gin(to_tsvector('english', {column_name}))"
|
|
154
|
+
self.raw_query(query)
|
|
155
|
+
return Response(RESPONSE_TYPE.OK)
|
|
156
|
+
|
|
146
157
|
@staticmethod
|
|
147
|
-
def _translate_conditions(conditions: List[FilterCondition]) ->
|
|
158
|
+
def _translate_conditions(conditions: List[FilterCondition]) -> Tuple[List[dict], dict]:
|
|
148
159
|
"""
|
|
149
160
|
Translate filter conditions to a dictionary
|
|
150
161
|
"""
|
|
151
162
|
|
|
152
163
|
if conditions is None:
|
|
153
|
-
|
|
164
|
+
conditions = []
|
|
154
165
|
|
|
155
|
-
filter_conditions =
|
|
166
|
+
filter_conditions = []
|
|
167
|
+
embedding_condition = None
|
|
156
168
|
|
|
157
169
|
for condition in conditions:
|
|
158
|
-
|
|
159
170
|
parts = condition.column.split(".")
|
|
160
171
|
key = parts[0]
|
|
161
172
|
# converts 'col.el1.el2' to col->'el1'->>'el2'
|
|
@@ -167,12 +178,25 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
167
178
|
# last element
|
|
168
179
|
key += f" ->> '{parts[-1]}'"
|
|
169
180
|
|
|
170
|
-
|
|
181
|
+
type_cast = None
|
|
182
|
+
if isinstance(condition.value, int):
|
|
183
|
+
type_cast = "int"
|
|
184
|
+
elif isinstance(condition.value, float):
|
|
185
|
+
type_cast = "float"
|
|
186
|
+
if type_cast is not None:
|
|
187
|
+
key = f"({key})::{type_cast}"
|
|
188
|
+
|
|
189
|
+
item = {
|
|
190
|
+
"name": key,
|
|
171
191
|
"op": condition.op.value,
|
|
172
192
|
"value": condition.value,
|
|
173
193
|
}
|
|
194
|
+
if key == "embeddings":
|
|
195
|
+
embedding_condition = item
|
|
196
|
+
else:
|
|
197
|
+
filter_conditions.append(item)
|
|
174
198
|
|
|
175
|
-
return filter_conditions
|
|
199
|
+
return filter_conditions, embedding_condition
|
|
176
200
|
|
|
177
201
|
@staticmethod
|
|
178
202
|
def _construct_where_clause(filter_conditions=None):
|
|
@@ -184,15 +208,18 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
184
208
|
|
|
185
209
|
where_clauses = []
|
|
186
210
|
|
|
187
|
-
for
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
if
|
|
191
|
-
values = list(repr(i) for i in
|
|
192
|
-
|
|
211
|
+
for item in filter_conditions:
|
|
212
|
+
key = item["name"]
|
|
213
|
+
|
|
214
|
+
if item["op"].lower() in ("in", "not in"):
|
|
215
|
+
values = list(repr(i) for i in item["value"])
|
|
216
|
+
item["value"] = "({})".format(", ".join(values))
|
|
193
217
|
else:
|
|
194
|
-
|
|
195
|
-
|
|
218
|
+
if item["value"] is None:
|
|
219
|
+
item["value"] = "null"
|
|
220
|
+
else:
|
|
221
|
+
item["value"] = repr(item["value"])
|
|
222
|
+
where_clauses.append(f"{key} {item['op']} {item['value']}")
|
|
196
223
|
|
|
197
224
|
if len(where_clauses) > 1:
|
|
198
225
|
return f"WHERE {' AND '.join(where_clauses)}"
|
|
@@ -207,7 +234,6 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
207
234
|
offset_clause: str,
|
|
208
235
|
limit_clause: str,
|
|
209
236
|
) -> str:
|
|
210
|
-
|
|
211
237
|
return f"{where_clause} {offset_clause} {limit_clause}"
|
|
212
238
|
|
|
213
239
|
def _build_select_query(
|
|
@@ -225,10 +251,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
225
251
|
offset_clause = f"OFFSET {offset}" if offset else ""
|
|
226
252
|
|
|
227
253
|
# translate filter conditions to dictionary
|
|
228
|
-
filter_conditions = self._translate_conditions(conditions)
|
|
229
|
-
|
|
230
|
-
# check if search vector is in filter conditions
|
|
231
|
-
embedding_search = filter_conditions.get("embeddings", None)
|
|
254
|
+
filter_conditions, embedding_search = self._translate_conditions(conditions)
|
|
232
255
|
|
|
233
256
|
# given filter conditions, construct where clause
|
|
234
257
|
where_clause = self._construct_where_clause(filter_conditions)
|
|
@@ -243,47 +266,41 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
243
266
|
else:
|
|
244
267
|
modified_columns.append(col)
|
|
245
268
|
else:
|
|
246
|
-
modified_columns = [
|
|
269
|
+
modified_columns = ["id", "content", "embeddings", "metadata"]
|
|
247
270
|
has_distance = True
|
|
248
271
|
|
|
249
|
-
targets =
|
|
272
|
+
targets = ", ".join(modified_columns)
|
|
250
273
|
|
|
251
|
-
if
|
|
274
|
+
if embedding_search:
|
|
275
|
+
search_vector = embedding_search["value"]
|
|
252
276
|
|
|
253
|
-
if
|
|
254
|
-
|
|
255
|
-
|
|
277
|
+
if self._is_sparse:
|
|
278
|
+
# Convert dict to sparse vector if needed
|
|
279
|
+
if isinstance(search_vector, dict):
|
|
280
|
+
from pgvector.utils import SparseVector
|
|
256
281
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
else:
|
|
264
|
-
# Convert list to vector string if needed
|
|
265
|
-
if isinstance(search_vector, list):
|
|
266
|
-
search_vector = f"[{','.join(str(x) for x in search_vector)}]"
|
|
282
|
+
embedding = SparseVector(search_vector, self._vector_size)
|
|
283
|
+
search_vector = embedding.to_text()
|
|
284
|
+
else:
|
|
285
|
+
# Convert list to vector string if needed
|
|
286
|
+
if isinstance(search_vector, list):
|
|
287
|
+
search_vector = f"[{','.join(str(x) for x in search_vector)}]"
|
|
267
288
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
289
|
+
# Calculate distance as part of the query if needed
|
|
290
|
+
if has_distance:
|
|
291
|
+
targets = f"{targets}, (embeddings {self.distance_op} '{search_vector}') as distance"
|
|
271
292
|
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
else:
|
|
275
|
-
# if filter conditions, return rows that satisfy the conditions
|
|
276
|
-
return f"SELECT {targets} FROM {table_name} {where_clause} {limit_clause} {offset_clause}"
|
|
293
|
+
return f"SELECT {targets} FROM {table_name} {where_clause} ORDER BY embeddings {self.distance_op} '{search_vector}' ASC {limit_clause} {offset_clause} "
|
|
277
294
|
|
|
278
295
|
else:
|
|
279
|
-
# if
|
|
280
|
-
return f"SELECT {targets} FROM {table_name} {limit_clause} {offset_clause}"
|
|
296
|
+
# if filter conditions, return rows that satisfy the conditions
|
|
297
|
+
return f"SELECT {targets} FROM {table_name} {where_clause} {limit_clause} {offset_clause}"
|
|
281
298
|
|
|
282
299
|
def _check_table(self, table_name: str):
|
|
283
300
|
# Apply namespace for a user
|
|
284
301
|
if self._is_shared_db:
|
|
285
|
-
company_id = ctx.company_id or
|
|
286
|
-
return f
|
|
302
|
+
company_id = ctx.company_id or "x"
|
|
303
|
+
return f"t_{company_id}_{table_name}"
|
|
287
304
|
return table_name
|
|
288
305
|
|
|
289
306
|
def select(
|
|
@@ -318,9 +335,9 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
318
335
|
query: str = None,
|
|
319
336
|
metadata: Dict[str, str] = None,
|
|
320
337
|
distance_function=DistanceFunction.COSINE_DISTANCE,
|
|
321
|
-
**kwargs
|
|
338
|
+
**kwargs,
|
|
322
339
|
) -> pd.DataFrame:
|
|
323
|
-
|
|
340
|
+
"""
|
|
324
341
|
Executes a hybrid search, combining semantic search and one or both of keyword/metadata search.
|
|
325
342
|
|
|
326
343
|
For insight on the query construction, see: https://docs.pgvecto.rs/use-case/hybrid-search.html#advanced-search-merge-the-results-of-full-text-search-and-vector-search.
|
|
@@ -340,23 +357,25 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
340
357
|
|
|
341
358
|
Returns:
|
|
342
359
|
df(pd.DataFrame): Hybrid search result, sorted by hybrid search rank
|
|
343
|
-
|
|
360
|
+
"""
|
|
344
361
|
if query is None and metadata is None:
|
|
345
|
-
raise ValueError(
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
362
|
+
raise ValueError(
|
|
363
|
+
"Must provide at least one of: query for keyword search, or metadata filters. For only embeddings search, use normal search instead."
|
|
364
|
+
)
|
|
365
|
+
|
|
366
|
+
id_column_name = kwargs.get("id_column_name", "id")
|
|
367
|
+
content_column_name = kwargs.get("content_column_name", "content")
|
|
368
|
+
embeddings_column_name = kwargs.get("embeddings_column_name", "embeddings")
|
|
369
|
+
metadata_column_name = kwargs.get("metadata_column_name", "metadata")
|
|
351
370
|
# Filter by given metadata for semantic search & full text search CTEs, if present.
|
|
352
|
-
where_clause =
|
|
371
|
+
where_clause = " WHERE "
|
|
353
372
|
if metadata is None:
|
|
354
|
-
where_clause =
|
|
373
|
+
where_clause = ""
|
|
355
374
|
metadata = {}
|
|
356
375
|
for i, (k, v) in enumerate(metadata.items()):
|
|
357
376
|
where_clause += f"{metadata_column_name}->>'{k}' = '{v}'"
|
|
358
377
|
if i < len(metadata.items()) - 1:
|
|
359
|
-
where_clause +=
|
|
378
|
+
where_clause += " AND "
|
|
360
379
|
|
|
361
380
|
# See https://docs.pgvecto.rs/use-case/hybrid-search.html#advanced-search-merge-the-results-of-full-text-search-and-vector-search.
|
|
362
381
|
#
|
|
@@ -381,47 +400,51 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
381
400
|
# Or, if we are only doing metadata search, we leave out the JOIN & full text search CTEs.
|
|
382
401
|
#
|
|
383
402
|
# We calculate the final "hybrid" rank by summing the reciprocals of the ranks from each individual CTE.
|
|
384
|
-
semantic_search_cte = f
|
|
403
|
+
semantic_search_cte = f"""WITH semantic_search AS (
|
|
385
404
|
SELECT {id_column_name}, {content_column_name}, {embeddings_column_name},
|
|
386
405
|
RANK () OVER (ORDER BY {embeddings_column_name} {distance_function.value} '{str(embeddings)}') AS rank
|
|
387
406
|
FROM {table_name}{where_clause}
|
|
388
407
|
ORDER BY {embeddings_column_name} {distance_function.value} '{str(embeddings)}'::vector
|
|
389
|
-
)
|
|
408
|
+
)"""
|
|
390
409
|
|
|
391
|
-
full_text_search_cte =
|
|
410
|
+
full_text_search_cte = ""
|
|
392
411
|
if query is not None:
|
|
393
|
-
ts_vector_clause =
|
|
412
|
+
ts_vector_clause = (
|
|
413
|
+
f"WHERE to_tsvector('english', {content_column_name}) @@ plainto_tsquery('english', '{query}')"
|
|
414
|
+
)
|
|
394
415
|
if metadata:
|
|
395
|
-
ts_vector_clause =
|
|
396
|
-
|
|
416
|
+
ts_vector_clause = (
|
|
417
|
+
f"AND to_tsvector('english', {content_column_name}) @@ plainto_tsquery('english', '{query}')"
|
|
418
|
+
)
|
|
419
|
+
full_text_search_cte = f""",
|
|
397
420
|
full_text_search AS (
|
|
398
421
|
SELECT {id_column_name}, {content_column_name}, {embeddings_column_name},
|
|
399
422
|
RANK () OVER (ORDER BY ts_rank(to_tsvector('english', {content_column_name}), plainto_tsquery('english', '{query}')) DESC) AS rank
|
|
400
423
|
FROM {table_name}{where_clause}
|
|
401
424
|
{ts_vector_clause}
|
|
402
425
|
ORDER BY ts_rank(to_tsvector('english', {content_column_name}), plainto_tsquery('english', '{query}')) DESC
|
|
403
|
-
)
|
|
426
|
+
)"""
|
|
404
427
|
|
|
405
|
-
hybrid_select =
|
|
406
|
-
SELECT * FROM semantic_search
|
|
428
|
+
hybrid_select = """
|
|
429
|
+
SELECT * FROM semantic_search"""
|
|
407
430
|
if query is not None:
|
|
408
|
-
hybrid_select = f
|
|
431
|
+
hybrid_select = f"""
|
|
409
432
|
SELECT
|
|
410
433
|
COALESCE(semantic_search.{id_column_name}, full_text_search.{id_column_name}) AS id,
|
|
411
434
|
COALESCE(semantic_search.{content_column_name}, full_text_search.{content_column_name}) AS content,
|
|
412
435
|
COALESCE(semantic_search.{embeddings_column_name}, full_text_search.{embeddings_column_name}) AS embeddings,
|
|
413
436
|
COALESCE(1.0 / (1 + semantic_search.rank), 0.0) + COALESCE(1.0 / (1 + full_text_search.rank), 0.0) AS rank
|
|
414
437
|
FROM semantic_search FULL OUTER JOIN full_text_search USING ({id_column_name}) ORDER BY rank DESC;
|
|
415
|
-
|
|
438
|
+
"""
|
|
416
439
|
|
|
417
|
-
full_search_query = f
|
|
440
|
+
full_search_query = f"{semantic_search_cte}{full_text_search_cte}{hybrid_select}"
|
|
418
441
|
return self.raw_query(full_search_query)
|
|
419
442
|
|
|
420
443
|
def create_table(self, table_name: str):
|
|
421
444
|
"""Create a table with a vector column."""
|
|
422
445
|
with self.connection.cursor() as cur:
|
|
423
446
|
# For sparse vectors, use sparsevec type
|
|
424
|
-
vector_column_type =
|
|
447
|
+
vector_column_type = "sparsevec" if self._is_sparse else "vector"
|
|
425
448
|
|
|
426
449
|
# Vector size is required for sparse vectors, optional for dense
|
|
427
450
|
if self._is_sparse and not self._vector_size:
|
|
@@ -429,8 +452,8 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
429
452
|
|
|
430
453
|
# Add vector size specification only if provided
|
|
431
454
|
size_spec = f"({self._vector_size})" if self._vector_size is not None else "()"
|
|
432
|
-
if vector_column_type ==
|
|
433
|
-
size_spec =
|
|
455
|
+
if vector_column_type == "vector":
|
|
456
|
+
size_spec = ""
|
|
434
457
|
|
|
435
458
|
cur.execute(f"""
|
|
436
459
|
CREATE TABLE IF NOT EXISTS {table_name} (
|
|
@@ -442,16 +465,14 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
442
465
|
""")
|
|
443
466
|
self.connection.commit()
|
|
444
467
|
|
|
445
|
-
def insert(
|
|
446
|
-
self, table_name: str, data: pd.DataFrame
|
|
447
|
-
):
|
|
468
|
+
def insert(self, table_name: str, data: pd.DataFrame):
|
|
448
469
|
"""
|
|
449
470
|
Insert data into the pgvector table database.
|
|
450
471
|
"""
|
|
451
472
|
table_name = self._check_table(table_name)
|
|
452
473
|
|
|
453
|
-
if
|
|
454
|
-
data[
|
|
474
|
+
if "metadata" in data.columns:
|
|
475
|
+
data["metadata"] = data["metadata"].apply(json.dumps)
|
|
455
476
|
|
|
456
477
|
resp = super().insert(table_name, data)
|
|
457
478
|
if resp.resp_type == RESPONSE_TYPE.ERROR:
|
|
@@ -459,9 +480,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
459
480
|
if resp.resp_type == RESPONSE_TYPE.TABLE:
|
|
460
481
|
return resp.data_frame
|
|
461
482
|
|
|
462
|
-
def update(
|
|
463
|
-
self, table_name: str, data: pd.DataFrame, key_columns: List[str] = None
|
|
464
|
-
):
|
|
483
|
+
def update(self, table_name: str, data: pd.DataFrame, key_columns: List[str] = None):
|
|
465
484
|
"""
|
|
466
485
|
Update data in the pgvector table database.
|
|
467
486
|
"""
|
|
@@ -471,43 +490,32 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
471
490
|
update_columns = {}
|
|
472
491
|
|
|
473
492
|
for col in data.columns:
|
|
474
|
-
value = Parameter(
|
|
493
|
+
value = Parameter("%s")
|
|
475
494
|
|
|
476
495
|
if col in key_columns:
|
|
477
|
-
cond = BinaryOperation(
|
|
478
|
-
op='=',
|
|
479
|
-
args=[Identifier(col), value]
|
|
480
|
-
)
|
|
496
|
+
cond = BinaryOperation(op="=", args=[Identifier(col), value])
|
|
481
497
|
if where is None:
|
|
482
498
|
where = cond
|
|
483
499
|
else:
|
|
484
|
-
where = BinaryOperation(
|
|
485
|
-
op='AND',
|
|
486
|
-
args=[where, cond]
|
|
487
|
-
)
|
|
500
|
+
where = BinaryOperation(op="AND", args=[where, cond])
|
|
488
501
|
else:
|
|
489
502
|
update_columns[col] = value
|
|
490
503
|
|
|
491
|
-
query = Update(
|
|
492
|
-
table=Identifier(table_name),
|
|
493
|
-
update_columns=update_columns,
|
|
494
|
-
where=where
|
|
495
|
-
)
|
|
504
|
+
query = Update(table=Identifier(table_name), update_columns=update_columns, where=where)
|
|
496
505
|
|
|
497
506
|
if TableField.METADATA.value in data.columns:
|
|
507
|
+
|
|
498
508
|
def fnc(v):
|
|
499
509
|
if isinstance(v, dict):
|
|
500
510
|
return json.dumps(v)
|
|
511
|
+
|
|
501
512
|
data[TableField.METADATA.value] = data[TableField.METADATA.value].apply(fnc)
|
|
502
513
|
|
|
503
514
|
data = data.astype({TableField.METADATA.value: str})
|
|
504
515
|
|
|
505
516
|
transposed_data = []
|
|
506
517
|
for _, record in data.iterrows():
|
|
507
|
-
row = [
|
|
508
|
-
record[col]
|
|
509
|
-
for col in update_columns.keys()
|
|
510
|
-
]
|
|
518
|
+
row = [record[col] for col in update_columns.keys()]
|
|
511
519
|
for key_column in key_columns:
|
|
512
520
|
row.append(record[key_column])
|
|
513
521
|
transposed_data.append(row)
|
|
@@ -515,17 +523,13 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
515
523
|
query_str = self.renderer.get_string(query)
|
|
516
524
|
self.raw_query(query_str, transposed_data)
|
|
517
525
|
|
|
518
|
-
def delete(
|
|
519
|
-
self, table_name: str, conditions: List[FilterCondition] = None
|
|
520
|
-
):
|
|
526
|
+
def delete(self, table_name: str, conditions: List[FilterCondition] = None):
|
|
521
527
|
table_name = self._check_table(table_name)
|
|
522
528
|
|
|
523
|
-
filter_conditions = self._translate_conditions(conditions)
|
|
529
|
+
filter_conditions, _ = self._translate_conditions(conditions)
|
|
524
530
|
where_clause = self._construct_where_clause(filter_conditions)
|
|
525
531
|
|
|
526
|
-
query =
|
|
527
|
-
f"DELETE FROM {table_name} {where_clause}"
|
|
528
|
-
)
|
|
532
|
+
query = f"DELETE FROM {table_name} {where_clause}"
|
|
529
533
|
self.raw_query(query)
|
|
530
534
|
|
|
531
535
|
def drop_table(self, table_name: str, if_exists=True):
|
|
@@ -535,7 +539,13 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
535
539
|
table_name = self._check_table(table_name)
|
|
536
540
|
self.raw_query(f"DROP TABLE IF EXISTS {table_name}")
|
|
537
541
|
|
|
538
|
-
def create_index(
|
|
542
|
+
def create_index(
|
|
543
|
+
self,
|
|
544
|
+
table_name: str,
|
|
545
|
+
column_name: str = "embeddings",
|
|
546
|
+
index_type: Literal["ivfflat", "hnsw"] = "hnsw",
|
|
547
|
+
metric_type: str = None,
|
|
548
|
+
):
|
|
539
549
|
"""
|
|
540
550
|
Create an index on the pgvector table.
|
|
541
551
|
Args:
|
|
@@ -547,7 +557,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
|
|
|
547
557
|
if metric_type is None:
|
|
548
558
|
metric_type = self.get_metric_type()
|
|
549
559
|
# Check if the index type is supported
|
|
550
|
-
if index_type not in [
|
|
560
|
+
if index_type not in ["ivfflat", "hnsw"]:
|
|
551
561
|
raise ValueError("Invalid index type. Supported types are 'ivfflat' and 'hnsw'.")
|
|
552
562
|
table_name = self._check_table(table_name)
|
|
553
563
|
# first we make sure embedding dimension is set
|
|
@@ -476,7 +476,7 @@ class PostgresHandler(MetaDatabaseHandler):
|
|
|
476
476
|
config = self._make_connection_args()
|
|
477
477
|
config["autocommit"] = True
|
|
478
478
|
|
|
479
|
-
conn = psycopg.connect(
|
|
479
|
+
conn = psycopg.connect(**config)
|
|
480
480
|
|
|
481
481
|
# create db trigger
|
|
482
482
|
trigger_name = f"mdb_notify_{table_name}"
|
|
@@ -203,6 +203,8 @@ def create_table_class(resource_name: Text) -> MetaAPIResource:
|
|
|
203
203
|
"column_name": field["name"],
|
|
204
204
|
"data_type": field["type"],
|
|
205
205
|
"is_nullable": field.get("nillable", False),
|
|
206
|
+
"default_value": field.get("defaultValue", ""),
|
|
207
|
+
"description": field.get("inlineHelpText", ""),
|
|
206
208
|
}
|
|
207
209
|
)
|
|
208
210
|
|
|
@@ -433,16 +433,15 @@ class APIHandler(BaseHandler):
|
|
|
433
433
|
Args:
|
|
434
434
|
name (str): the handler name
|
|
435
435
|
"""
|
|
436
|
-
|
|
437
436
|
self._tables = {}
|
|
438
437
|
|
|
439
438
|
def _register_table(self, table_name: str, table_class: Any):
|
|
440
439
|
"""
|
|
441
440
|
Register the data resource. For example, if you are using the Twitter API, it registers the `tweets` resource from `/api/v2/tweets`.
|
|
442
441
|
"""
|
|
443
|
-
if table_name in self._tables:
|
|
442
|
+
if table_name.lower() in self._tables:
|
|
444
443
|
raise TableAlreadyExists(f"Table with name {table_name} already exists for this handler")
|
|
445
|
-
self._tables[table_name] = table_class
|
|
444
|
+
self._tables[table_name.lower()] = table_class
|
|
446
445
|
|
|
447
446
|
def _get_table(self, name: Identifier):
|
|
448
447
|
"""
|
|
@@ -450,10 +449,10 @@ class APIHandler(BaseHandler):
|
|
|
450
449
|
Args:
|
|
451
450
|
name (Identifier): the table name
|
|
452
451
|
"""
|
|
453
|
-
name = name.parts[-1]
|
|
454
|
-
if name
|
|
455
|
-
|
|
456
|
-
|
|
452
|
+
name = name.parts[-1].lower()
|
|
453
|
+
if name in self._tables:
|
|
454
|
+
return self._tables[name]
|
|
455
|
+
raise TableNotFound(f"Table not found: {name}")
|
|
457
456
|
|
|
458
457
|
def query(self, query: ASTNode):
|
|
459
458
|
if isinstance(query, Select):
|