MindsDB: mindsdb-25.6.4.0-py3-none-any.whl → mindsdb-25.7.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +53 -94
- mindsdb/api/a2a/agent.py +30 -206
- mindsdb/api/a2a/common/server/server.py +26 -27
- mindsdb/api/a2a/task_manager.py +93 -227
- mindsdb/api/a2a/utils.py +21 -0
- mindsdb/api/executor/command_executor.py +8 -6
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/planner/query_prepare.py +68 -87
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
- mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
- mindsdb/api/executor/utilities/sql.py +97 -21
- mindsdb/api/http/namespaces/agents.py +126 -201
- mindsdb/api/http/namespaces/config.py +12 -1
- mindsdb/api/http/namespaces/file.py +49 -24
- mindsdb/api/mcp/start.py +45 -31
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/libs/keyword_search_base.py +41 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
- mindsdb/integrations/utilities/sql_utils.py +11 -0
- mindsdb/interfaces/agents/agents_controller.py +29 -9
- mindsdb/interfaces/agents/langchain_agent.py +7 -5
- mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
- mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
- mindsdb/interfaces/database/projects.py +1 -3
- mindsdb/interfaces/functions/controller.py +54 -64
- mindsdb/interfaces/functions/to_markdown.py +47 -14
- mindsdb/interfaces/knowledge_base/controller.py +228 -110
- mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
- mindsdb/interfaces/knowledge_base/executor.py +346 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
- mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
- mindsdb/interfaces/skills/sql_agent.py +181 -130
- mindsdb/interfaces/storage/db.py +9 -7
- mindsdb/utilities/config.py +58 -40
- mindsdb/utilities/exception.py +58 -7
- mindsdb/utilities/security.py +54 -11
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
mindsdb/api/http/namespaces/file.py CHANGED

@@ -3,6 +3,7 @@ import shutil
 import tarfile
 import tempfile
 import zipfile
+from urllib.parse import urlparse
 
 import multipart
 import requests
@@ -13,7 +14,7 @@ from flask_restx import Resource
 from mindsdb.api.http.namespaces.configs.files import ns_conf
 from mindsdb.api.http.utils import http_error
 from mindsdb.metrics.metrics import api_endpoint_metrics
-from mindsdb.utilities.config import
+from mindsdb.utilities.config import config
 from mindsdb.utilities.context import context as ctx
 from mindsdb.utilities import log
 from mindsdb.utilities.security import is_private_url, clear_filename, validate_urls
@@ -105,31 +106,55 @@ class File(Resource):
 
         if data.get("source_type") == "url":
             url = data["source"]
-
-
-
-
+            try:
+                url = urlparse(url)
+                if not (url.scheme and url.netloc):
+                    raise ValueError()
+                url = url.geturl()
+            except Exception:
+                return http_error(
+                    400,
+                    "Invalid URL",
+                    f"The URL is not valid: {data['source']}",
+                )
+
+            url_file_upload_enabled = config["url_file_upload"]["enabled"]
+            if url_file_upload_enabled is False:
+                return http_error(400, "URL file upload is disabled.", "URL file upload is disabled.")
+
+            allowed_origins = config["url_file_upload"]["allowed_origins"]
+            disallowed_origins = config["url_file_upload"]["disallowed_origins"]
+
+            if validate_urls(url, allowed_origins, disallowed_origins) is False:
+                return http_error(
+                    400,
+                    "Invalid URL",
+                    "URL is not allowed for security reasons. Allowed hosts are: "
+                    f"{', '.join(allowed_origins) if allowed_origins else 'not specified'}.",
+                )
+
             data["file"] = clear_filename(data["name"])
             is_cloud = config.get("cloud", False)
-            if is_cloud
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if is_cloud:
+                if is_private_url(url):
+                    return http_error(400, f"URL is private: {url}")
+
+                if ctx.user_class != 1:
+                    info = requests.head(url, timeout=30)
+                    file_size = info.headers.get("Content-Length")
+                    try:
+                        file_size = int(file_size)
+                    except Exception:
+                        pass
+
+                    if file_size is None:
+                        return http_error(
+                            400,
+                            "Error getting file info",
+                            "Сan't determine remote file size",
+                        )
+                    if file_size > MAX_FILE_SIZE:
+                        return http_error(400, "File is too big", f"Upload limit for file is {MAX_FILE_SIZE >> 20} MB")
             with requests.get(url, stream=True) as r:
                 if r.status_code != 200:
                     return http_error(400, "Error getting file", f"Got status code: {r.status_code}")
mindsdb/api/mcp/start.py CHANGED
@@ -1,7 +1,8 @@
 import os
+from typing import Any
+from textwrap import dedent
 from contextlib import asynccontextmanager
 from collections.abc import AsyncIterator
-from typing import Optional, Dict, Any
 from dataclasses import dataclass
 
 import uvicorn
@@ -41,16 +42,32 @@ async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]:
 mcp = FastMCP(
     "MindsDB",
     lifespan=app_lifespan,
-    dependencies=["mindsdb"]  # Add any additional dependencies
+    dependencies=["mindsdb"],  # Add any additional dependencies
 )
 # MCP Queries
 LISTING_QUERY = "SHOW DATABASES"
 
 
-
-
-
-
+query_tool_description = dedent("""\
+    Executes a SQL query against MindsDB.
+
+    A database must be specified either in the `context` parameter or directly in the query string (e.g., `SELECT * FROM my_database.my_table`). Queries like `SELECT * FROM my_table` will fail without a `context`.
+
+    Args:
+        query (str): The SQL query to execute.
+        context (dict, optional): The default database context. For example, `{"db": "my_postgres"}`.
+
+    Returns:
+        A dictionary describing the result.
+        - For a successful query with no data to return (e.g., an `UPDATE` statement), the response is `{"type": "ok"}`.
+        - If the query returns tabular data, the response is a dictionary containing `data` (a list of rows) and `column_names` (a list of column names). For example: `{"type": "table", "data": [[1, "a"], [2, "b"]], "column_names": ["column_a", "column_b"]}`.
+        - In case of an error, a response is `{"type": "error", "error_message": "the error message"}`.
+    """)
+
+
+@mcp.tool(name="query", description=query_tool_description)
+def query(query: str, context: dict | None = None) -> dict[str, Any]:
+    """Execute a SQL query against MindsDB
 
     Args:
         query: The SQL query to execute
@@ -63,7 +80,7 @@ def query(query: str, context: Optional[Dict] = None) -> Dict[str, Any]:
     if context is None:
         context = {}
 
-    logger.debug(f
+    logger.debug(f"Incoming MCP query: {query}")
 
     mysql_proxy = FakeMysqlProxy()
     mysql_proxy.set_context(context)
@@ -78,34 +95,30 @@ def query(query: str, context: Optional[Dict] = None) -> Dict[str, Any]:
             return {
                 "type": SQL_RESPONSE_TYPE.TABLE,
                 "data": result.result_set.to_lists(json_types=True),
-                "column_names": [
-                    column.alias or column.name
-                    for column in result.result_set.columns
-                ],
+                "column_names": [column.alias or column.name for column in result.result_set.columns],
             }
         else:
-            return {
-                "type": SQL_RESPONSE_TYPE.ERROR,
-                "error_code": 0,
-                "error_message": "Unknown response type"
-            }
+            return {"type": SQL_RESPONSE_TYPE.ERROR, "error_code": 0, "error_message": "Unknown response type"}
 
     except Exception as e:
         logger.error(f"Error processing query: {str(e)}")
-        return {
-            "type": SQL_RESPONSE_TYPE.ERROR,
-            "error_code": 0,
-            "error_message": str(e)
-        }
+        return {"type": SQL_RESPONSE_TYPE.ERROR, "error_code": 0, "error_message": str(e)}
 
 
-
-
+list_databases_tool_description = (
+    "Returns a list of all database connections currently available in MindsDB. "
+    + "The tool takes no parameters and responds with a list of database names, "
+    + 'for example: ["my_postgres", "my_mysql", "test_db"].'
+)
+
+
+@mcp.tool(name="list_databases", description=list_databases_tool_description)
+def list_databases() -> list[str]:
     """
-    List all databases in MindsDB
+    List all databases in MindsDB
 
     Returns:
-
+        list[str]: list of databases
     """
 
     mysql_proxy = FakeMysqlProxy()
@@ -124,6 +137,7 @@ def list_databases() -> Dict[str, Any]:
 
         elif result.type == SQL_RESPONSE_TYPE.TABLE:
             data = result.result_set.to_lists(json_types=True)
+            data = [val[0] for val in data]
             return data
 
     except Exception as e:
@@ -135,12 +149,12 @@ def list_databases() -> Dict[str, Any]:
 
 
 class CustomAuthMiddleware(BaseHTTPMiddleware):
-    """Custom middleware to handle authentication basing on header 'Authorization'
-
+    """Custom middleware to handle authentication basing on header 'Authorization'"""
+
     async def dispatch(self, request: Request, call_next):
-        mcp_access_token = os.environ.get(
+        mcp_access_token = os.environ.get("MINDSDB_MCP_ACCESS_TOKEN")
         if mcp_access_token is not None:
-            auth_token = request.headers.get(
+            auth_token = request.headers.get("Authorization", "").partition("Bearer ")[-1]
             if mcp_access_token != auth_token:
                 return Response(status_code=401, content="Unauthorized", media_type="text/plain")
 
@@ -171,8 +185,8 @@ def start(*args, **kwargs):
        port (int): Port to listen on
    """
    config = Config()
-    port = int(config[
-    host = config[
+    port = int(config["api"].get("mcp", {}).get("port", 47337))
+    host = config["api"].get("mcp", {}).get("host", "127.0.0.1")
 
    logger.info(f"Starting MCP server on {host}:{port}")
    mcp.settings.host = host
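Note: the middleware's token extraction relies on `str.partition`, which returns an empty last element when the separator is absent, so a missing or malformed Authorization header compares as an empty token and trips the 401 branch. A small illustration with made-up values:

auth_header = "Bearer my-secret-token"
assert auth_header.partition("Bearer ")[-1] == "my-secret-token"

# request.headers.get("Authorization", "") falls back to "" when the header
# is missing; partition then yields "" and the comparison fails.
assert "".partition("Bearer ")[-1] == ""
assert "my-secret-token".partition("Bearer ")[-1] == ""  # no "Bearer " prefix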
mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py CHANGED

@@ -91,9 +91,7 @@ class ChromaDBHandler(VectorStoreHandler):
             self.persist_directory = config.persist_directory
         elif not self.handler_storage.is_temporal:
             # get full persistence directory from handler storage
-            self.persist_directory = self.handler_storage.folder_get(
-                config.persist_directory
-            )
+            self.persist_directory = self.handler_storage.folder_get(config.persist_directory)
             self._use_handler_storage = True
 
         return config
@@ -141,7 +139,7 @@ class ChromaDBHandler(VectorStoreHandler):
     def disconnect(self):
         """Close the database connection."""
         if self.is_connected:
-            if hasattr(self._client,
+            if hasattr(self._client, "close"):
                 self._client.close()  # Some ChromaDB clients have a close method
             self._client = None
             self.is_connected = False
@@ -182,9 +180,7 @@ class ChromaDBHandler(VectorStoreHandler):
 
         return mapping[operator]
 
-    def _translate_metadata_condition(
-        self, conditions: List[FilterCondition]
-    ) -> Optional[dict]:
+    def _translate_metadata_condition(self, conditions: List[FilterCondition]) -> Optional[dict]:
         """
         Translate a list of FilterCondition objects a dict that can be used by ChromaDB.
         E.g.,
@@ -212,9 +208,7 @@ class ChromaDBHandler(VectorStoreHandler):
         if conditions is None:
             return None
         metadata_conditions = [
-            condition
-            for condition in conditions
-            if condition.column.startswith(TableField.METADATA.value)
+            condition for condition in conditions if condition.column.startswith(TableField.METADATA.value)
         ]
         if len(metadata_conditions) == 0:
             return None
@@ -224,19 +218,11 @@ class ChromaDBHandler(VectorStoreHandler):
         for condition in metadata_conditions:
             metadata_key = condition.column.split(".")[-1]
 
-            chroma_db_conditions.append(
-                {
-                    metadata_key: {
-                        self._get_chromadb_operator(condition.op): condition.value
-                    }
-                }
-            )
+            chroma_db_conditions.append({metadata_key: {self._get_chromadb_operator(condition.op): condition.value}})
 
         # we combine all metadata conditions into a single dict
         metadata_condition = (
-            {"$and": chroma_db_conditions}
-            if len(chroma_db_conditions) > 1
-            else chroma_db_conditions[0]
+            {"$and": chroma_db_conditions} if len(chroma_db_conditions) > 1 else chroma_db_conditions[0]
         )
         return metadata_condition
 
@@ -248,7 +234,6 @@ class ChromaDBHandler(VectorStoreHandler):
         offset: int = None,
         limit: int = None,
     ) -> pd.DataFrame:
-
         collection = self._client.get_collection(table_name)
         filters = self._translate_metadata_condition(conditions)
 
@@ -258,38 +243,43 @@ class ChromaDBHandler(VectorStoreHandler):
         vector_filter = (
             []
             if conditions is None
-            else [
-                condition
-                for condition in conditions
-                if condition.column == TableField.EMBEDDINGS.value
-            ]
+            else [condition for condition in conditions if condition.column == TableField.EMBEDDINGS.value]
         )
 
         if len(vector_filter) > 0:
             vector_filter = vector_filter[0]
         else:
             vector_filter = None
-
+        ids_include = []
+        ids_exclude = []
+
         if conditions is not None:
             for condition in conditions:
                 if condition.column != TableField.ID.value:
                     continue
                 if condition.op == FilterOperator.EQUAL:
-
+                    ids_include.append(condition.value)
                 elif condition.op == FilterOperator.IN:
-
+                    ids_include.extend(condition.value)
+                elif condition.op == FilterOperator.NOT_EQUAL:
+                    ids_exclude.append(condition.value)
+                elif condition.op == FilterOperator.NOT_IN:
+                    ids_exclude.extend(condition.value)
 
         if vector_filter is not None:
             # similarity search
             query_payload = {
                 "where": filters,
-                "query_embeddings": vector_filter.value
-                if vector_filter is not None
-                else None,
+                "query_embeddings": vector_filter.value if vector_filter is not None else None,
                 "include": include + ["distances"],
             }
+
             if limit is not None:
-
+                if len(ids_include) == 0 and len(ids_exclude) == 0:
+                    query_payload["n_results"] = limit
+                else:
+                    # get more results if we have filters by id
+                    query_payload["n_results"] = limit * 10
 
             result = collection.query(**query_payload)
             ids = result["ids"][0]
@@ -301,7 +291,7 @@ class ChromaDBHandler(VectorStoreHandler):
         else:
             # general get query
             result = collection.get(
-                ids=
+                ids=ids_include or None,
                 where=filters,
                 limit=limit,
                 offset=offset,
@@ -337,13 +327,21 @@ class ChromaDBHandler(VectorStoreHandler):
                 break
 
         df = pd.DataFrame(payload)
+        if ids_exclude or ids_include:
+            if ids_exclude:
+                df = df[~df[TableField.ID.value].isin(ids_exclude)]
+            if ids_include:
+                df = df[df[TableField.ID.value].isin(ids_include)]
+            if limit is not None:
+                df = df[:limit]
+
         if distance_filter is not None:
             op_map = {
-
-
-
-
-
+                "<": "__lt__",
+                "<=": "__le__",
+                ">": "__gt__",
+                ">=": "__ge__",
+                "=": "__eq__",
             }
             op = op_map.get(distance_filter.op.value)
             if op:
@@ -393,7 +391,7 @@ class ChromaDBHandler(VectorStoreHandler):
         else:
             # Convert IDs to strings and remove any duplicates
             df[TableField.ID.value] = df[TableField.ID.value].astype(str)
-            df = df.drop_duplicates(subset=[TableField.ID.value], keep=
+            df = df.drop_duplicates(subset=[TableField.ID.value], keep="last")
 
         return df
 
@@ -413,7 +411,7 @@ class ChromaDBHandler(VectorStoreHandler):
         df = df.dropna(subset=[TableField.METADATA.value])
 
         # Convert embeddings from string to list if they are strings
-        if TableField.EMBEDDINGS.value in df.columns and df[TableField.EMBEDDINGS.value].dtype ==
+        if TableField.EMBEDDINGS.value in df.columns and df[TableField.EMBEDDINGS.value].dtype == "object":
             df[TableField.EMBEDDINGS.value] = df[TableField.EMBEDDINGS.value].apply(
                 lambda x: ast.literal_eval(x) if isinstance(x, str) else x
             )
@@ -429,7 +427,7 @@ class ChromaDBHandler(VectorStoreHandler):
                 ids=data_dict[TableField.ID.value],
                 documents=data_dict[TableField.CONTENT.value],
                 embeddings=data_dict.get(TableField.EMBEDDINGS.value, None),
-                metadatas=data_dict.get(TableField.METADATA.value, None)
+                metadatas=data_dict.get(TableField.METADATA.value, None),
             )
             self._sync()
         except Exception as e:
@@ -467,16 +465,10 @@ class ChromaDBHandler(VectorStoreHandler):
         )
         self._sync()
 
-    def delete(
-        self, table_name: str, conditions: List[FilterCondition] = None
-    ):
+    def delete(self, table_name: str, conditions: List[FilterCondition] = None):
         filters = self._translate_metadata_condition(conditions)
         # get id filters
-        id_filters = [
-            condition.value
-            for condition in conditions
-            if condition.column == TableField.ID.value
-        ] or None
+        id_filters = [condition.value for condition in conditions if condition.column == TableField.ID.value] or None
 
         if filters is None and id_filters is None:
             raise Exception("Delete query must have at least one condition!")
@@ -488,8 +480,9 @@ class ChromaDBHandler(VectorStoreHandler):
         """
         Create a collection with the given name in the ChromaDB database.
         """
-        self._client.create_collection(
-
+        self._client.create_collection(
+            table_name, get_or_create=if_not_exists, metadata=self.create_collection_metadata
+        )
         self._sync()
 
     def drop_table(self, table_name: str, if_exists=True):
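Note: when id constraints are present, the handler now asks Chroma for `limit * 10` results and enforces the include/exclude id filters afterwards in pandas, trimming back to the requested limit. A sketch of that post-filtering step on a hypothetical result frame ("id" stands in for `TableField.ID.value`):

import pandas as pd

df = pd.DataFrame({"id": ["a", "b", "c", "d"], "distance": [0.1, 0.2, 0.3, 0.4]})
ids_include, ids_exclude, limit = ["a", "b", "c"], ["b"], 2

if ids_exclude:
    df = df[~df["id"].isin(ids_exclude)]  # drop NOT_EQUAL / NOT_IN ids
if ids_include:
    df = df[df["id"].isin(ids_include)]   # keep EQUAL / IN ids
if limit is not None:
    df = df[:limit]                       # re-apply the requested limit

print(df["id"].tolist())  # -> ['a', 'c']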
mindsdb/integrations/handlers/huggingface_handler/__init__.py CHANGED

@@ -1,20 +1,25 @@
 from mindsdb.integrations.libs.const import HANDLER_TYPE
 
 from .__about__ import __version__ as version, __description__ as description
-try:
-
-
-except Exception as e:
-
-
+# try:
+#     from .huggingface_handler import HuggingFaceHandler as Handler
+#     import_error = None
+# except Exception as e:
+#     Handler = None
+#     import_error = e
 
-
-
+# NOTE: security vulnerability is in `pytorch` v2.7.1, revert changes here and in
+# requirements.txt/requirements_cpu.txt when new version is released
+Handler = None
+import_error = """
+    The `huggingface_handler` is temporary disabled in current version of MindsDB due to security vulnerability.
+"""
+
+title = "Hugging Face"
+name = "huggingface"
 type = HANDLER_TYPE.ML
 icon_path = "icon.svg"
 permanent = False
-execution_method =
+execution_method = "subprocess_keep"
 
-__all__ = [
-    'Handler', 'version', 'name', 'type', 'title', 'description', 'import_error', 'icon_path'
-]
+__all__ = ["Handler", "version", "name", "type", "title", "description", "import_error", "icon_path"]