MindsDB 25.1.2.0__py3-none-any.whl → 25.1.5.0__py3-none-any.whl
This diff shows the changes between two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of MindsDB might be problematic.
- {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.5.0.dist-info}/METADATA +258 -255
- {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.5.0.dist-info}/RECORD +98 -85
- {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.5.0.dist-info}/WHEEL +1 -1
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +5 -3
- mindsdb/api/executor/__init__.py +0 -1
- mindsdb/api/executor/command_executor.py +2 -1
- mindsdb/api/executor/data_types/answer.py +1 -1
- mindsdb/api/executor/datahub/datanodes/datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +8 -3
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +9 -26
- mindsdb/api/executor/sql_query/__init__.py +1 -0
- mindsdb/api/executor/sql_query/result_set.py +36 -21
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +1 -1
- mindsdb/api/executor/sql_query/steps/join_step.py +4 -4
- mindsdb/api/executor/sql_query/steps/map_reduce_step.py +6 -39
- mindsdb/api/executor/utilities/sql.py +2 -10
- mindsdb/api/http/namespaces/agents.py +3 -1
- mindsdb/api/http/namespaces/knowledge_bases.py +3 -3
- mindsdb/api/http/namespaces/sql.py +3 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +2 -1
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +7 -0
- mindsdb/api/postgres/postgres_proxy/executor/executor.py +2 -1
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -2
- mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/databricks_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/file_handler/file_handler.py +1 -1
- mindsdb/integrations/handlers/file_handler/requirements.txt +0 -4
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +17 -1
- mindsdb/integrations/handlers/jira_handler/jira_handler.py +15 -1
- mindsdb/integrations/handlers/jira_handler/jira_table.py +52 -31
- mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py +82 -0
- mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +8 -1
- mindsdb/integrations/handlers/langchain_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py +1 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +8 -0
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +49 -12
- mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py +123 -72
- mindsdb/integrations/handlers/pinecone_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +12 -6
- mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +5 -3
- mindsdb/integrations/handlers/slack_handler/slack_handler.py +13 -2
- mindsdb/integrations/handlers/slack_handler/slack_tables.py +21 -1
- mindsdb/integrations/handlers/web_handler/requirements.txt +0 -1
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +2 -2
- mindsdb/integrations/utilities/files/__init__.py +0 -0
- mindsdb/integrations/utilities/files/file_reader.py +258 -0
- mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +2 -1
- mindsdb/integrations/utilities/handlers/auth_utilities/microsoft/ms_graph_api_auth_utilities.py +8 -3
- mindsdb/integrations/utilities/rag/chains/map_reduce_summarizer_chain.py +5 -9
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py +76 -27
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py +18 -1
- mindsdb/integrations/utilities/rag/pipelines/rag.py +84 -20
- mindsdb/integrations/utilities/rag/rag_pipeline_builder.py +16 -1
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +166 -108
- mindsdb/integrations/utilities/rag/retrievers/__init__.py +3 -0
- mindsdb/integrations/utilities/rag/retrievers/multi_hop_retriever.py +85 -0
- mindsdb/integrations/utilities/rag/retrievers/retriever_factory.py +57 -0
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +117 -48
- mindsdb/integrations/utilities/rag/settings.py +190 -17
- mindsdb/integrations/utilities/sql_utils.py +1 -1
- mindsdb/interfaces/agents/agents_controller.py +18 -8
- mindsdb/interfaces/agents/constants.py +1 -0
- mindsdb/interfaces/agents/langchain_agent.py +124 -157
- mindsdb/interfaces/agents/langfuse_callback_handler.py +4 -37
- mindsdb/interfaces/agents/mindsdb_database_agent.py +21 -13
- mindsdb/interfaces/chatbot/chatbot_controller.py +7 -11
- mindsdb/interfaces/chatbot/chatbot_task.py +16 -5
- mindsdb/interfaces/chatbot/memory.py +58 -13
- mindsdb/interfaces/database/integrations.py +5 -1
- mindsdb/interfaces/database/projects.py +55 -16
- mindsdb/interfaces/database/views.py +12 -25
- mindsdb/interfaces/knowledge_base/controller.py +38 -9
- mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +7 -26
- mindsdb/interfaces/model/functions.py +15 -4
- mindsdb/interfaces/model/model_controller.py +4 -7
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +51 -40
- mindsdb/interfaces/skills/retrieval_tool.py +10 -3
- mindsdb/interfaces/skills/skill_tool.py +97 -54
- mindsdb/interfaces/skills/skills_controller.py +7 -3
- mindsdb/interfaces/skills/sql_agent.py +127 -41
- mindsdb/interfaces/storage/db.py +1 -1
- mindsdb/migrations/versions/2025-01-15_c06c35f7e8e1_project_company.py +88 -0
- mindsdb/utilities/cache.py +7 -4
- mindsdb/utilities/context.py +11 -1
- mindsdb/utilities/langfuse.py +279 -0
- mindsdb/utilities/log.py +20 -2
- mindsdb/utilities/otel/__init__.py +206 -0
- mindsdb/utilities/otel/logger.py +25 -0
- mindsdb/utilities/otel/meter.py +19 -0
- mindsdb/utilities/otel/metric_handlers/__init__.py +25 -0
- mindsdb/utilities/otel/tracer.py +16 -0
- mindsdb/utilities/partitioning.py +52 -0
- mindsdb/utilities/render/sqlalchemy_render.py +7 -1
- mindsdb/utilities/utils.py +34 -0
- mindsdb/utilities/otel.py +0 -72
- {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.5.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.5.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/jira_handler/jira_table.py

@@ -10,9 +10,26 @@ from mindsdb_sql_parser import ast
 
 logger = log.getLogger(__name__)
 
+
+def flatten_json(nested_json, parent_key="", separator="."):
+    """
+    Recursively flattens a nested JSON object into a dictionary with dot notation keys.
+    """
+    items = []
+    for k, v in nested_json.items():
+        new_key = f"{parent_key}{separator}{k}" if parent_key else k
+        if isinstance(v, dict):
+            items.extend(flatten_json(v, new_key, separator=separator).items())
+        else:
+            items.append((new_key, v))
+    return dict(items)
+
+
 class JiraProjectsTable(APITable):
     """Jira Projects Table implementation"""
+
     _MAX_API_RESULTS = 100
+
     def select(self, query: ast.Select) -> pd.DataFrame:
         """Pulls data from the Jira "get_all_project_issues" API endpoint
         Parameters
@@ -42,8 +59,8 @@ class JiraProjectsTable(APITable):
 
         for an_order in query.order_by:
             if an_order.field.parts[0] != "key":
-                continue
-            if an_order.field.parts[1] in ["reporter","assignee","status"]:
+                continue
+            if an_order.field.parts[1] in ["reporter", "assignee", "status"]:
                 if issues_kwargs != {}:
                     raise ValueError(
                         "Duplicate order conditions found for reporter,status and assignee"
@@ -61,9 +78,9 @@ class JiraProjectsTable(APITable):
                 raise ValueError(
                     f"Order by unknown column {an_order.field.parts[1]}"
                 )
-        project = self.handler.connection_data[
+        project = self.handler.connection_data["project"]
         jira_project_df = self.call_jira_api(project)
-
+
         selected_columns = []
         for target in query.targets:
             if isinstance(target, ast.Star):
@@ -74,7 +91,6 @@ class JiraProjectsTable(APITable):
             else:
                 raise ValueError(f"Unknown query target {type(target)}")
 
-
         if len(jira_project_df) == 0:
             jira_project_df = pd.DataFrame([], columns=selected_columns)
             return jira_project_df
@@ -88,7 +104,7 @@ class JiraProjectsTable(APITable):
                 by=order_by_conditions["columns"],
                 ascending=order_by_conditions["ascending"],
             )
-
+
         if query.limit:
             jira_project_df = jira_project_df.head(total_results)
 
@@ -102,12 +118,12 @@ class JiraProjectsTable(APITable):
             List of columns
         """
         return [
-
-
-
-
-
-
+            "key",
+            "summary",
+            "status",
+            "reporter",
+            "assignee",
+            "priority",
         ]
 
     def call_jira_api(self, project):
@@ -116,36 +132,41 @@ class JiraProjectsTable(APITable):
         max_records = jira.get_project_issues_count(project)
         max_records = 100
         jql_query = self.handler.construct_jql()
-        max_results = self._MAX_API_RESULTS
+        max_results = self._MAX_API_RESULTS
         start_index = 0
         total = 1
         fields = [
-
-
-
-
-
-
+            "key",
+            "fields.summary",
+            "fields.status.name",
+            "fields.reporter.displayName",
+            "fields.assignee.displayName",
+            "fields.priority.name",
         ]
 
         all_jira_issues_df = pd.DataFrame(columns=fields)
 
         while start_index <= total:
-            results = self.handler.connect().jql(
-
+            results = self.handler.connect().jql(
+                jql_query, start=start_index, limit=max_results
+            )
+            flattened_data = [flatten_json(item) for item in results["issues"]]
+            df = pd.DataFrame(flattened_data)
             df = df[fields]
             start_index += max_results
-            total =
+            total = results["total"]
             all_jira_issues_df = pd.concat([all_jira_issues_df, df], axis=0)
 
+        all_jira_issues_df = all_jira_issues_df.rename(
+            columns={
+                "key": "key",
+                "fields.summary": "summary",
+                "fields.reporter.displayName": "reporter",
+                "fields.assignee.displayName": "assignee",
+                "fields.priority.name": "priority",
+                "fields.status.name": "status",
+            },
+            errors="ignore",
+        )
 
-        all_jira_issues_df = all_jira_issues_df.rename(columns={
-            'key': 'key',
-            'fields.summary': 'summary',
-            'fields.reporter.name':'reporter',
-            'fields.assignee.name':'assignee',
-            'fields.priority.name':'priority',
-            'fields.status.name':'status'})
-
         return all_jira_issues_df
-
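The rewritten call_jira_api works because flatten_json first collapses each nested issue payload into dot-notation columns, which the rename then maps to flat names (note the key fix from fields.reporter.name to fields.reporter.displayName). A minimal sketch of the helper's behavior on a hypothetical issue fragment (the field values are illustrative, not from the diff):

    issue = {
        "key": "PROJ-1",
        "fields": {
            "summary": "Fix login bug",
            "status": {"name": "In Progress"},
            "reporter": {"displayName": "Ada Lovelace"},
        },
    }

    flatten_json(issue)
    # -> {"key": "PROJ-1",
    #     "fields.summary": "Fix login bug",
    #     "fields.status.name": "In Progress",
    #     "fields.reporter.displayName": "Ada Lovelace"}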
mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py (new file)

@@ -0,0 +1,82 @@
+from typing import Any, List
+from langchain_core.embeddings import Embeddings
+import requests
+
+
+class FastAPIEmbeddings(Embeddings):
+    """An embedding extension that interfaces with FAST API. Useful for custom serving solutions."""
+
+    def __init__(
+        self,
+        api_base: str,
+        model: str,
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        """Initialize the embeddings class.
+
+        Args:
+            api_base: Base URL for the VLLM server
+            model: Model name/path to use for embeddings
+            batch_size: Batch size for generating embeddings
+        """
+        super().__init__()
+        self.api_base = api_base
+        self.model = model
+        self.batch_size = batch_size
+
+        # initialize requests here with the api_base
+
+    def _get_embeddings(self, texts: List[str]) -> List[str]:
+        """Get embeddings for a batch of text chunks.
+
+        Returns:
+            List of embeddings as strings. For sparse vectors, returns strings in format
+            "{key:value,...}/size" where size is the dimension of the vector space.
+        """
+
+        headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+        data = {
+            "input": texts,
+            "model": self.model
+        }
+
+        response = requests.post(self.api_base, headers=headers, json=data)
+
+        response.raise_for_status()
+
+        embeddings = []
+        for response_dict in response.json()["data"]:
+            embedding = response_dict["embedding"]
+            embeddings.append(embedding)
+
+        return embeddings
+
+    def embed_documents(self, texts: List[str]) -> List[str]:
+        """Embed a list of documents using vLLM.
+
+        Args:
+            texts: List of documents to embed
+
+        Returns:
+            List of embeddings as strings, one for each document.
+            For sparse embeddings, returns strings in format "{key:value,...}/size"
+            For dense embeddings, returns JSON strings of float lists
+        """
+
+        return self._get_embeddings(texts)
+
+    def embed_query(self, text: str) -> str:
+        """Embed a single query text using vLLM.
+
+        Args:
+            text: Query text to embed
+
+        Returns:
+            Query embedding as a string.
+            For sparse embeddings, returns string in format "{key:value,...}/size"
+            For dense embeddings, returns JSON string of float list
+        """
+
+        return self._get_embeddings([text])[0]
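The new class posts the whole input list in one request and expects an OpenAI-style response body ({"data": [{"embedding": ...}, ...]}); batch_size is stored but not yet used by _get_embeddings. A hedged usage sketch, assuming an OpenAI-compatible /v1/embeddings endpoint (the URL and model name below are placeholders, not part of the diff):

    from mindsdb.integrations.handlers.langchain_embedding_handler.fastapi_embeddings import FastAPIEmbeddings

    embedder = FastAPIEmbeddings(
        api_base="http://localhost:8000/v1/embeddings",  # placeholder endpoint
        model="my-embedding-model",                      # placeholder model name
    )
    doc_vectors = embedder.embed_documents(["first document", "second document"])
    query_vector = embedder.embed_query("a search query")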
mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py

@@ -10,6 +10,7 @@ from mindsdb.integrations.libs.base import BaseMLEngine
 from mindsdb.utilities import log
 from langchain_core.embeddings import Embeddings
 from mindsdb.integrations.handlers.langchain_embedding_handler.vllm_embeddings import VLLMEmbeddings
+from mindsdb.integrations.handlers.langchain_embedding_handler.fastapi_embeddings import FastAPIEmbeddings
 
 logger = log.getLogger(__name__)
 
@@ -20,7 +21,10 @@ logger = log.getLogger(__name__)
 # This is used for the user to select the embedding model
 EMBEDDING_MODELS = {
     'VLLM': 'VLLMEmbeddings',
-    'vllm': 'VLLMEmbeddings'
+    'vllm': 'VLLMEmbeddings',
+    'FastAPI': 'FastAPIEmbeddings',
+    'fastapi': 'FastAPIEmbeddings'
+
 }
 
 try:
@@ -55,6 +59,9 @@ def get_langchain_class(class_name: str) -> Embeddings:
     if class_name == "VLLMEmbeddings":
         return VLLMEmbeddings
 
+    if class_name == "FastAPIEmbeddings":
+        return FastAPIEmbeddings
+
     # Then try langchain_community.embeddings
     try:
         module = importlib.import_module("langchain_community.embeddings")
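With the registry entries and the early-return branch above, resolving the new backend by its user-facing name presumably flows like this (a sketch using only names that appear in the hunks):

    class_name = EMBEDDING_MODELS["fastapi"]          # -> 'FastAPIEmbeddings'
    embeddings_cls = get_langchain_class(class_name)  # returns FastAPIEmbeddings before
                                                      # the langchain_community lookup runs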
mindsdb/integrations/handlers/langchain_handler/requirements.txt

@@ -3,6 +3,6 @@ wikipedia==1.4.0
 tiktoken
 anthropic>=0.26.1
 litellm==1.44.8
-chromadb # Knowledge bases.
+chromadb~=0.6.3 # Knowledge bases.
 -r mindsdb/integrations/handlers/openai_handler/requirements.txt
 -r mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt
mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py

@@ -28,7 +28,7 @@ class MSOneDriveHandler(APIHandler):
     """
 
     name = 'one_drive'
-    supported_file_formats = ['csv', 'tsv', 'json', 'parquet']
+    supported_file_formats = ['csv', 'tsv', 'json', 'parquet', 'pdf', 'txt']
 
     def __init__(self, name: Text, connection_data: Dict, **kwargs: Any) -> None:
         """
mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py

@@ -9,6 +9,8 @@ from mindsdb.integrations.utilities.sql_utils import (
     SortColumn
 )
 
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+
 
 class ListFilesTable(APIResource):
     """
@@ -97,4 +99,10 @@ class FileTable(APIResource):
         elif file_extension == "parquet":
             df = pd.read_parquet(BytesIO(file_content))
 
+        elif file_extension == "pdf":
+            df = FileReader().read_pdf(BytesIO(file_content))
+
+        elif file_extension == "txt":
+            df = FileReader().read_txt(BytesIO(file_content))
+
         return df
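FileTable now delegates the two new formats to the shared FileReader utility introduced in mindsdb/integrations/utilities/files/file_reader.py. A condensed sketch of the resulting dispatch, assuming read_pdf and read_txt take a BytesIO and return a DataFrame (inferred from the hunk, not from the utility's documentation; the wrapper function is illustrative):

    from io import BytesIO
    import pandas as pd
    from mindsdb.integrations.utilities.files.file_reader import FileReader

    def read_one_drive_file(file_content: bytes, file_extension: str) -> pd.DataFrame:
        # Mirrors the branches above; csv/tsv/json/parquet handling omitted.
        if file_extension == "pdf":
            return FileReader().read_pdf(BytesIO(file_content))
        if file_extension == "txt":
            return FileReader().read_txt(BytesIO(file_content))
        raise ValueError(f"unsupported extension: {file_extension}")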
mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py

@@ -37,6 +37,11 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         super().__init__(name=name, **kwargs)
         self._is_shared_db = False
         self._is_vector_registered = False
+        # we get these from the connection args on PostgresHandler parent
+        self._is_sparse = self.connection_args.get('is_sparse', False)
+        self._vector_size = self.connection_args.get('vector_size', None)
+        if self._is_sparse and not self._vector_size:
+            raise ValueError("vector_size is required when is_sparse=True")
         self.connect()
 
     def _make_connection_args(self):
@@ -190,13 +195,30 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         if filter_conditions:
 
             if embedding_search:
-                # if search vector, return similar rows, apply other filters after if any
                 search_vector = filter_conditions["embeddings"]["value"][0]
                 filter_conditions.pop("embeddings")
-
+
+                if self._is_sparse:
+                    # Convert dict to sparse vector if needed
+                    if isinstance(search_vector, dict):
+                        from pgvector.utils import SparseVector
+                        embedding = SparseVector(search_vector, self._vector_size)
+                        search_vector = embedding.to_text()
+                    # Use inner product for sparse vectors
+                    distance_op = "<#>"
+                else:
+                    # Convert list to vector string if needed
+                    if isinstance(search_vector, list):
+                        search_vector = f"[{','.join(str(x) for x in search_vector)}]"
+                    # Use cosine similarity for dense vectors
+                    distance_op = "<=>"
+
+                return f"SELECT {targets} FROM {table_name} ORDER BY embeddings {distance_op} '{search_vector}' ASC {after_from_clause}"
+
             else:
-                # if filter conditions, return
+                # if filter conditions, return rows that satisfy the conditions
                 return f"SELECT {targets} FROM {table_name} {after_from_clause}"
+
         else:
             # if no filter conditions, return all rows
             return f"SELECT {targets} FROM {table_name} {after_from_clause}"
@@ -339,14 +361,30 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         full_search_query = f'{semantic_search_cte}{full_text_search_cte}{hybrid_select}'
         return self.raw_query(full_search_query)
 
-    def create_table(self, table_name: str
-        """
-
-
-
-
-
-
+    def create_table(self, table_name: str):
+        """Create a table with a vector column."""
+        with self.connection.cursor() as cur:
+            # For sparse vectors, use sparsevec type
+            vector_column_type = 'sparsevec' if self._is_sparse else 'vector'
+
+            # Vector size is required for sparse vectors, optional for dense
+            if self._is_sparse and not self._vector_size:
+                raise ValueError("vector_size is required for sparse vectors")
+
+            # Add vector size specification only if provided
+            size_spec = f"({self._vector_size})" if self._vector_size is not None else "()"
+            if vector_column_type == 'vector':
+                size_spec = ''
+
+            cur.execute(f"""
+                CREATE TABLE IF NOT EXISTS {table_name} (
+                    id TEXT PRIMARY KEY,
+                    embeddings {vector_column_type}{size_spec},
+                    content TEXT,
+                    metadata JSONB
+                )
+            """)
+            self.connection.commit()
 
     def insert(
         self, table_name: str, data: pd.DataFrame
@@ -444,4 +482,3 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         """
         table_name = self._check_table(table_name)
         self.raw_query(f"DROP TABLE IF EXISTS {table_name}")
-
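The operator choice above is the crux of the sparse-vector support: in pgvector, <#> is negative inner product (so ORDER BY ... ASC surfaces the largest inner products first) and <=> is cosine distance. A standalone sketch of the literal-and-operator selection, mirroring the handler logic (the function name is illustrative; SparseVector usage follows the diff):

    from typing import Optional, Tuple, Union
    from pgvector.utils import SparseVector

    def prepare_search(search_vector: Union[dict, list, str],
                       is_sparse: bool,
                       vector_size: Optional[int]) -> Tuple[str, str]:
        if is_sparse:
            if isinstance(search_vector, dict):
                # e.g. {3: 0.7, 17: 0.2} plus an explicit dimensionality
                search_vector = SparseVector(search_vector, vector_size).to_text()
            return search_vector, "<#>"  # negative inner product
        if isinstance(search_vector, list):
            search_vector = f"[{','.join(str(x) for x in search_vector)}]"
        return search_vector, "<=>"      # cosine distance

    vec, op = prepare_search([0.1, 0.2, 0.3], is_sparse=False, vector_size=None)
    # -> ('[0.1,0.2,0.3]', '<=>')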