MindsDB 25.6.3.1__py3-none-any.whl → 25.7.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/command_executor.py +8 -6
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +72 -44
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +14 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/system_tables.py +314 -1
- mindsdb/api/executor/planner/plan_join.py +1 -1
- mindsdb/api/executor/planner/query_planner.py +7 -1
- mindsdb/api/executor/planner/query_prepare.py +68 -87
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
- mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
- mindsdb/api/http/namespaces/file.py +49 -24
- mindsdb/api/mcp/start.py +45 -31
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
- mindsdb/integrations/handlers/ludwig_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +150 -140
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +2 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/libs/api_handler.py +6 -7
- mindsdb/integrations/libs/vectordatabase_handler.py +86 -77
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
- mindsdb/interfaces/agents/agents_controller.py +29 -9
- mindsdb/interfaces/agents/constants.py +44 -0
- mindsdb/interfaces/agents/langchain_agent.py +15 -6
- mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
- mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +22 -3
- mindsdb/interfaces/knowledge_base/controller.py +121 -102
- mindsdb/interfaces/knowledge_base/evaluate.py +19 -7
- mindsdb/interfaces/knowledge_base/executor.py +346 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
- mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +26 -22
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +40 -28
- mindsdb/interfaces/skills/skill_tool.py +91 -88
- mindsdb/interfaces/skills/sql_agent.py +181 -130
- mindsdb/interfaces/storage/db.py +9 -7
- mindsdb/utilities/config.py +12 -1
- mindsdb/utilities/exception.py +47 -7
- mindsdb/utilities/security.py +54 -11
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/METADATA +239 -251
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/RECORD +55 -54
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/top_level.txt +0 -0
|
@@ -3,7 +3,9 @@ import csv
|
|
|
3
3
|
import inspect
|
|
4
4
|
import traceback
|
|
5
5
|
from io import StringIO
|
|
6
|
-
from typing import Iterable, List, Optional, Any
|
|
6
|
+
from typing import Iterable, List, Optional, Any, Tuple
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
import fnmatch
|
|
7
9
|
|
|
8
10
|
import pandas as pd
|
|
9
11
|
from mindsdb_sql_parser import parse_sql
|
|
@@ -75,12 +77,84 @@ def split_table_name(table_name: str) -> List[str]:
|
|
|
75
77
|
if current:
|
|
76
78
|
result.append(current.strip("`"))
|
|
77
79
|
|
|
78
|
-
# ensure we split the table name
|
|
79
|
-
# result = [r.split(".") for r in result][0]
|
|
80
|
-
|
|
81
80
|
return result
|
|
82
81
|
|
|
83
82
|
|
|
83
|
+
class TablesCollection:
|
|
84
|
+
"""
|
|
85
|
+
Collection of identifiers.
|
|
86
|
+
Supports wildcard in tables name.
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
def __init__(self, items: List[Identifier | str] = None, default_db=None):
|
|
90
|
+
if items is None:
|
|
91
|
+
items = []
|
|
92
|
+
|
|
93
|
+
self.items = items
|
|
94
|
+
self._dbs = defaultdict(set)
|
|
95
|
+
self._schemas = defaultdict(dict)
|
|
96
|
+
self._no_db_tables = set()
|
|
97
|
+
self.has_wildcard = False
|
|
98
|
+
self.databases = set()
|
|
99
|
+
self._default_db = default_db
|
|
100
|
+
|
|
101
|
+
for name in items:
|
|
102
|
+
if not isinstance(name, Identifier):
|
|
103
|
+
name = Identifier(name)
|
|
104
|
+
db, schema, tbl = self._get_paths(name)
|
|
105
|
+
if db is None:
|
|
106
|
+
self._no_db_tables.add(tbl)
|
|
107
|
+
elif schema is None:
|
|
108
|
+
self._dbs[db].add(tbl)
|
|
109
|
+
else:
|
|
110
|
+
if schema not in self._schemas[db]:
|
|
111
|
+
self._schemas[db][schema] = set()
|
|
112
|
+
self._schemas[db][schema].add(tbl)
|
|
113
|
+
|
|
114
|
+
if "*" in tbl:
|
|
115
|
+
self.has_wildcard = True
|
|
116
|
+
self.databases.add(db)
|
|
117
|
+
|
|
118
|
+
def _get_paths(self, table: Identifier) -> Tuple:
|
|
119
|
+
# split identifier to db, schema, table name
|
|
120
|
+
schema = None
|
|
121
|
+
db = None
|
|
122
|
+
|
|
123
|
+
match [x.lower() for x in table.parts]:
|
|
124
|
+
case [tbl]:
|
|
125
|
+
pass
|
|
126
|
+
case [db, tbl]:
|
|
127
|
+
pass
|
|
128
|
+
case [db, schema, tbl]:
|
|
129
|
+
pass
|
|
130
|
+
case _:
|
|
131
|
+
raise NotImplementedError
|
|
132
|
+
return db, schema, tbl.lower()
|
|
133
|
+
|
|
134
|
+
def match(self, table: Identifier) -> bool:
|
|
135
|
+
# Check if input table matches to tables in collection
|
|
136
|
+
|
|
137
|
+
db, schema, tbl = self._get_paths(table)
|
|
138
|
+
if db is None:
|
|
139
|
+
if tbl in self._no_db_tables:
|
|
140
|
+
return True
|
|
141
|
+
if self._default_db is not None:
|
|
142
|
+
return self.match(Identifier(parts=[self._default_db, tbl]))
|
|
143
|
+
|
|
144
|
+
if schema is not None:
|
|
145
|
+
if any([fnmatch.fnmatch(tbl, pattern) for pattern in self._schemas[db].get(schema, [])]):
|
|
146
|
+
return True
|
|
147
|
+
|
|
148
|
+
# table might be specified without schema
|
|
149
|
+
return any([fnmatch.fnmatch(tbl, pattern) for pattern in self._dbs[db]])
|
|
150
|
+
|
|
151
|
+
def __bool__(self):
|
|
152
|
+
return len(self.items) > 0
|
|
153
|
+
|
|
154
|
+
def __repr__(self):
|
|
155
|
+
return f"Tables({self.items})"
|
|
156
|
+
|
|
157
|
+
|
|
84
158
|
class SQLAgent:
|
|
85
159
|
"""
|
|
86
160
|
SQLAgent is a class that handles SQL queries for agents.
|
|
@@ -117,21 +191,23 @@ class SQLAgent:
|
|
|
117
191
|
self._command_executor = command_executor
|
|
118
192
|
self._mindsdb_db_struct = databases_struct
|
|
119
193
|
self.knowledge_base_database = knowledge_base_database # This is a project name, not a database connection
|
|
194
|
+
self._databases = databases
|
|
120
195
|
self._sample_rows_in_table_info = int(sample_rows_in_table_info)
|
|
121
196
|
|
|
122
|
-
self._tables_to_include = include_tables
|
|
123
|
-
self.
|
|
124
|
-
self._knowledge_bases_to_include = include_knowledge_bases
|
|
125
|
-
self._knowledge_bases_to_ignore = []
|
|
126
|
-
self._databases = databases
|
|
127
|
-
if not self._tables_to_include:
|
|
197
|
+
self._tables_to_include = TablesCollection(include_tables)
|
|
198
|
+
if self._tables_to_include:
|
|
128
199
|
# ignore_tables and include_tables should not be used together.
|
|
129
200
|
# include_tables takes priority if it's set.
|
|
130
|
-
|
|
131
|
-
|
|
201
|
+
ignore_tables = []
|
|
202
|
+
self._tables_to_ignore = TablesCollection(ignore_tables)
|
|
203
|
+
|
|
204
|
+
self._knowledge_bases_to_include = TablesCollection(include_knowledge_bases, default_db=knowledge_base_database)
|
|
205
|
+
if self._knowledge_bases_to_include:
|
|
132
206
|
# ignore_knowledge_bases and include_knowledge_bases should not be used together.
|
|
133
207
|
# include_knowledge_bases takes priority if it's set.
|
|
134
|
-
|
|
208
|
+
ignore_knowledge_bases = []
|
|
209
|
+
self._knowledge_bases_to_ignore = TablesCollection(ignore_knowledge_bases, default_db=knowledge_base_database)
|
|
210
|
+
|
|
135
211
|
self._cache = cache
|
|
136
212
|
|
|
137
213
|
from mindsdb.interfaces.skills.skill_tool import SkillToolController
|
|
@@ -159,46 +235,54 @@ class SQLAgent:
|
|
|
159
235
|
if not isinstance(ast_query, (Select, Show, Describe, Explain)):
|
|
160
236
|
raise ValueError(f"Query is not allowed: {ast_query.to_string()}")
|
|
161
237
|
|
|
238
|
+
kb_names = self.get_all_knowledge_base_names()
|
|
239
|
+
|
|
162
240
|
# Check tables
|
|
163
241
|
if self._tables_to_include:
|
|
164
|
-
tables_parts = [split_table_name(x) for x in self._tables_to_include]
|
|
165
|
-
no_schema_parts = []
|
|
166
|
-
for t in tables_parts:
|
|
167
|
-
if len(t) == 3:
|
|
168
|
-
no_schema_parts.append([t[0], t[2]])
|
|
169
|
-
tables_parts += no_schema_parts
|
|
170
242
|
|
|
171
243
|
def _check_f(node, is_table=None, **kwargs):
|
|
172
244
|
if is_table and isinstance(node, Identifier):
|
|
173
245
|
table_name = ".".join(node.parts)
|
|
174
246
|
|
|
175
|
-
# Get the list of available knowledge bases
|
|
176
|
-
kb_names = self.get_usable_knowledge_base_names()
|
|
177
|
-
|
|
178
247
|
# Check if this table is a knowledge base
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
# Check if it's a restricted table
|
|
197
|
-
elif not is_kb and table_name in self._tables_to_ignore:
|
|
198
|
-
raise ValueError(f"Table {table_name} is not allowed.")
|
|
248
|
+
if table_name in kb_names or node.parts[-1] in kb_names:
|
|
249
|
+
# If it's a knowledge base and we have knowledge base restrictions
|
|
250
|
+
self.check_knowledge_base_permission(node)
|
|
251
|
+
else:
|
|
252
|
+
try:
|
|
253
|
+
# Regular table check
|
|
254
|
+
self.check_table_permission(node)
|
|
255
|
+
except ValueError as origin_exc:
|
|
256
|
+
# was it badly quoted by llm?
|
|
257
|
+
if len(node.parts) == 1 and node.is_quoted[0] and "." in node.parts[0]:
|
|
258
|
+
node2 = Identifier(node.parts[0])
|
|
259
|
+
try:
|
|
260
|
+
_check_f(node2, is_table=True)
|
|
261
|
+
return node2
|
|
262
|
+
except ValueError:
|
|
263
|
+
...
|
|
264
|
+
raise origin_exc
|
|
199
265
|
|
|
200
266
|
query_traversal(ast_query, _check_f)
|
|
201
267
|
|
|
268
|
+
def check_knowledge_base_permission(self, node):
|
|
269
|
+
if self._knowledge_bases_to_include and not self._knowledge_bases_to_include.match(node):
|
|
270
|
+
raise ValueError(
|
|
271
|
+
f"Knowledge base {str(node)} not found. Available knowledge bases: {', '.join(self._knowledge_bases_to_include.items)}"
|
|
272
|
+
)
|
|
273
|
+
# Check if it's a restricted knowledge base
|
|
274
|
+
if self._knowledge_bases_to_ignore and self._knowledge_bases_to_ignore.match(node):
|
|
275
|
+
raise ValueError(f"Knowledge base {str(node)} is not allowed.")
|
|
276
|
+
|
|
277
|
+
def check_table_permission(self, node):
|
|
278
|
+
if self._tables_to_include and not self._tables_to_include.match(node):
|
|
279
|
+
raise ValueError(
|
|
280
|
+
f"Table {str(node)} not found. Available tables: {', '.join(self._tables_to_include.items)}"
|
|
281
|
+
)
|
|
282
|
+
# Check if it's a restricted table
|
|
283
|
+
if self._tables_to_ignore and self._tables_to_ignore.match(node):
|
|
284
|
+
raise ValueError(f"Table {str(node)} is not allowed.")
|
|
285
|
+
|
|
202
286
|
def get_usable_table_names(self) -> Iterable[str]:
|
|
203
287
|
"""Get a list of tables that the agent has access to.
|
|
204
288
|
|
|
@@ -213,50 +297,35 @@ class SQLAgent:
|
|
|
213
297
|
if cached_tables:
|
|
214
298
|
return cached_tables
|
|
215
299
|
|
|
216
|
-
if self._tables_to_include:
|
|
217
|
-
|
|
300
|
+
if not self._tables_to_include:
|
|
301
|
+
# no tables allowed
|
|
302
|
+
return []
|
|
303
|
+
if not self._tables_to_include.has_wildcard:
|
|
304
|
+
return self._tables_to_include.items
|
|
218
305
|
|
|
219
306
|
result_tables = []
|
|
220
307
|
|
|
221
|
-
for db_name in self.
|
|
308
|
+
for db_name in self._tables_to_include.databases:
|
|
222
309
|
handler = self._command_executor.session.integration_controller.get_data_handler(db_name)
|
|
223
310
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
raise Exception("default schema and named schemas can not be used in same filter")
|
|
227
|
-
|
|
228
|
-
if None in schemas_names:
|
|
229
|
-
# get tables only from default schema
|
|
230
|
-
response = handler.get_tables()
|
|
231
|
-
tables_in_default_schema = list(response.data_frame.table_name)
|
|
232
|
-
schema_tables_restrictions = self._mindsdb_db_struct[db_name][None] # None - is default schema
|
|
233
|
-
if schema_tables_restrictions is None:
|
|
234
|
-
for table_name in tables_in_default_schema:
|
|
235
|
-
result_tables.append([db_name, table_name])
|
|
236
|
-
else:
|
|
237
|
-
for table_name in schema_tables_restrictions:
|
|
238
|
-
if table_name in tables_in_default_schema:
|
|
239
|
-
result_tables.append([db_name, table_name])
|
|
311
|
+
if "all" in inspect.signature(handler.get_tables).parameters:
|
|
312
|
+
response = handler.get_tables(all=True)
|
|
240
313
|
else:
|
|
241
|
-
|
|
242
|
-
|
|
314
|
+
response = handler.get_tables()
|
|
315
|
+
df = response.data_frame
|
|
316
|
+
col_name = "table_name"
|
|
317
|
+
if col_name not in df.columns:
|
|
318
|
+
# get first column if not found
|
|
319
|
+
col_name = df.columns[0]
|
|
320
|
+
|
|
321
|
+
for _, row in df.iterrows():
|
|
322
|
+
if "table_schema" in row:
|
|
323
|
+
parts = [db_name, row["table_schema"], row[col_name]]
|
|
243
324
|
else:
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
raise Exception("There are no allowed schemas in ds")
|
|
249
|
-
|
|
250
|
-
for schema_name in schemas_intersection:
|
|
251
|
-
schema_sub_df = response.data_frame[response.data_frame["table_schema"] == schema_name]
|
|
252
|
-
if self._mindsdb_db_struct[db_name][schema_name] is None:
|
|
253
|
-
# all tables from schema allowed
|
|
254
|
-
for row in schema_sub_df:
|
|
255
|
-
result_tables.append([db_name, schema_name, row["table_name"]])
|
|
256
|
-
else:
|
|
257
|
-
for table_name in self._mindsdb_db_struct[db_name][schema_name]:
|
|
258
|
-
if table_name in schema_sub_df["table_name"].values:
|
|
259
|
-
result_tables.append([db_name, schema_name, table_name])
|
|
325
|
+
parts = [db_name, row[col_name]]
|
|
326
|
+
if self._tables_to_include.match(Identifier(parts=parts)):
|
|
327
|
+
if not self._tables_to_ignore.match(Identifier(parts=parts)):
|
|
328
|
+
result_tables.append(parts)
|
|
260
329
|
|
|
261
330
|
result_tables = [".".join(x) for x in result_tables]
|
|
262
331
|
if self._cache:
|
|
@@ -269,7 +338,28 @@ class SQLAgent:
|
|
|
269
338
|
Returns:
|
|
270
339
|
Iterable[str]: list with knowledge base names
|
|
271
340
|
"""
|
|
272
|
-
|
|
341
|
+
|
|
342
|
+
if not self._knowledge_bases_to_include and not self._knowledge_bases_to_ignore:
|
|
343
|
+
# white or black list have to be set
|
|
344
|
+
return []
|
|
345
|
+
|
|
346
|
+
# Filter knowledge bases based on ignore list
|
|
347
|
+
kb_names = []
|
|
348
|
+
for kb_name in self.get_all_knowledge_base_names():
|
|
349
|
+
kb = Identifier(parts=[self.knowledge_base_database, kb_name])
|
|
350
|
+
if self._knowledge_bases_to_include and not self._knowledge_bases_to_include.match(kb):
|
|
351
|
+
continue
|
|
352
|
+
if not self._knowledge_bases_to_ignore.match(kb):
|
|
353
|
+
kb_names.append(kb_name)
|
|
354
|
+
return kb_names
|
|
355
|
+
|
|
356
|
+
def get_all_knowledge_base_names(self) -> Iterable[str]:
|
|
357
|
+
"""Get a list of all knowledge bases
|
|
358
|
+
|
|
359
|
+
Returns:
|
|
360
|
+
Iterable[str]: list with knowledge base names
|
|
361
|
+
"""
|
|
362
|
+
# cache_key = f"{ctx.company_id}_{self.knowledge_base_database}_knowledge_bases"
|
|
273
363
|
|
|
274
364
|
# todo we need to fix the cache, file cache can potentially store out of data information
|
|
275
365
|
# # first check cache and return if found
|
|
@@ -278,58 +368,18 @@ class SQLAgent:
|
|
|
278
368
|
# if cached_kbs:
|
|
279
369
|
# return cached_kbs
|
|
280
370
|
|
|
281
|
-
if self._knowledge_bases_to_include:
|
|
282
|
-
return self._knowledge_bases_to_include
|
|
283
|
-
|
|
284
371
|
try:
|
|
285
372
|
# Query to get all knowledge bases
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
result = self._call_engine(query, database=self.knowledge_base_database)
|
|
289
|
-
except Exception as e:
|
|
290
|
-
# If the direct query fails, try a different approach
|
|
291
|
-
# This handles the case where knowledge_base_database is not a valid integration
|
|
292
|
-
logger.warning(f"Error querying knowledge bases from {self.knowledge_base_database}: {str(e)}")
|
|
293
|
-
# Try to get knowledge bases directly from the project database
|
|
294
|
-
try:
|
|
295
|
-
# Get knowledge bases from the project database
|
|
296
|
-
kb_controller = self._command_executor.session.kb_controller
|
|
297
|
-
kb_names = [kb["name"] for kb in kb_controller.list()]
|
|
298
|
-
|
|
299
|
-
# Filter knowledge bases based on include list
|
|
300
|
-
if self._knowledge_bases_to_include:
|
|
301
|
-
kb_names = [kb_name for kb_name in kb_names if kb_name in self._knowledge_bases_to_include]
|
|
302
|
-
if not kb_names:
|
|
303
|
-
logger.warning(
|
|
304
|
-
f"No knowledge bases found in the include list: {self._knowledge_bases_to_include}"
|
|
305
|
-
)
|
|
306
|
-
return []
|
|
307
|
-
|
|
308
|
-
return kb_names
|
|
309
|
-
|
|
310
|
-
# Filter knowledge bases based on ignore list
|
|
311
|
-
kb_names = [kb_name for kb_name in kb_names if kb_name not in self._knowledge_bases_to_ignore]
|
|
312
|
-
|
|
313
|
-
if self._cache:
|
|
314
|
-
self._cache.set(cache_key, set(kb_names))
|
|
315
|
-
|
|
316
|
-
return kb_names
|
|
317
|
-
except Exception as inner_e:
|
|
318
|
-
logger.error(f"Error getting knowledge bases from kb_controller: {str(inner_e)}")
|
|
319
|
-
return []
|
|
320
|
-
|
|
321
|
-
if not result:
|
|
322
|
-
return []
|
|
373
|
+
ast_query = Show(category="Knowledge Bases")
|
|
374
|
+
result = self._command_executor.execute_command(ast_query, database_name=self.knowledge_base_database)
|
|
323
375
|
|
|
324
376
|
# Filter knowledge bases based on ignore list
|
|
325
377
|
kb_names = []
|
|
326
|
-
for row in result:
|
|
327
|
-
|
|
328
|
-
if kb_name not in self._knowledge_bases_to_ignore:
|
|
329
|
-
kb_names.append(kb_name)
|
|
378
|
+
for row in result.data.records:
|
|
379
|
+
kb_names.append(row["NAME"])
|
|
330
380
|
|
|
331
|
-
if self._cache:
|
|
332
|
-
|
|
381
|
+
# if self._cache:
|
|
382
|
+
# self._cache.set(cache_key, set(kb_names))
|
|
333
383
|
|
|
334
384
|
return kb_names
|
|
335
385
|
except Exception as e:
|
|
@@ -369,7 +419,7 @@ class SQLAgent:
|
|
|
369
419
|
table_identifier = tables_idx.get(tuple(table_parts))
|
|
370
420
|
|
|
371
421
|
if table_identifier is None:
|
|
372
|
-
raise ValueError(f"Table {
|
|
422
|
+
raise ValueError(f"Table {table_name} not found in the database")
|
|
373
423
|
tables.append(table_identifier)
|
|
374
424
|
|
|
375
425
|
return tables
|
|
@@ -411,13 +461,14 @@ class SQLAgent:
|
|
|
411
461
|
if len(parts) == 1:
|
|
412
462
|
raise ValueError(f"Invalid table name: {name}. Expected format is 'database.table'.")
|
|
413
463
|
|
|
414
|
-
database_table_map
|
|
464
|
+
database_table_map.setdefault(parts[0], []).append(parts[1])
|
|
415
465
|
|
|
416
466
|
data_catalog_str = ""
|
|
417
467
|
for database_name, table_names in database_table_map.items():
|
|
418
468
|
data_catalog_reader = DataCatalogReader(database_name=database_name, table_names=table_names)
|
|
419
469
|
|
|
420
|
-
|
|
470
|
+
result = data_catalog_reader.read_metadata_as_string()
|
|
471
|
+
data_catalog_str += str(result or "")
|
|
421
472
|
|
|
422
473
|
return data_catalog_str
|
|
423
474
|
|
|
@@ -430,7 +481,7 @@ class SQLAgent:
|
|
|
430
481
|
|
|
431
482
|
split = name.split(".")
|
|
432
483
|
if len(split) > 1:
|
|
433
|
-
all_tables.append(Identifier(parts=[split[0], split[1]]))
|
|
484
|
+
all_tables.append(Identifier(parts=[split[0], split[-1]]))
|
|
434
485
|
else:
|
|
435
486
|
all_tables.append(Identifier(name))
|
|
436
487
|
|
mindsdb/interfaces/storage/db.py
CHANGED
|
@@ -684,10 +684,10 @@ class MetaColumns(Base):
|
|
|
684
684
|
if self.default_value:
|
|
685
685
|
column_info += f"\n{pad}- Default Value: {self.default_value}"
|
|
686
686
|
|
|
687
|
-
|
|
687
|
+
stats = self.meta_column_statistics or []
|
|
688
|
+
if stats and callable(getattr(stats[0], "as_string", None)):
|
|
688
689
|
column_info += f"\n\n{pad}- Column Statistics:"
|
|
689
|
-
column_info += f"\n{
|
|
690
|
-
|
|
690
|
+
column_info += f"\n{stats[0].as_string(indent + 4)}"
|
|
691
691
|
return column_info
|
|
692
692
|
|
|
693
693
|
|
|
@@ -708,18 +708,20 @@ class MetaColumnStatistics(Base):
|
|
|
708
708
|
inner_pad = " " * (indent + 4)
|
|
709
709
|
|
|
710
710
|
column_statistics = ""
|
|
711
|
+
most_common_values = self.most_common_values or []
|
|
712
|
+
most_common_frequencies = self.most_common_frequencies or []
|
|
711
713
|
|
|
712
|
-
if
|
|
714
|
+
if most_common_values and most_common_frequencies:
|
|
713
715
|
column_statistics += f"{pad}- Top 10 Most Common Values and Frequencies:"
|
|
714
|
-
for i in range(min(10, len(
|
|
715
|
-
freq =
|
|
716
|
+
for i in range(min(10, len(most_common_values))):
|
|
717
|
+
freq = most_common_frequencies[i]
|
|
716
718
|
try:
|
|
717
719
|
percent = float(freq) * 100
|
|
718
720
|
freq_str = f"{percent:.2f}%"
|
|
719
721
|
except (ValueError, TypeError):
|
|
720
722
|
freq_str = str(freq)
|
|
721
723
|
|
|
722
|
-
column_statistics += f"\n{inner_pad}- {
|
|
724
|
+
column_statistics += f"\n{inner_pad}- {most_common_values[i]}: {freq_str}"
|
|
723
725
|
column_statistics += "\n"
|
|
724
726
|
|
|
725
727
|
if self.null_percentage:
|
mindsdb/utilities/config.py
CHANGED
|
@@ -199,7 +199,8 @@ class Config:
|
|
|
199
199
|
},
|
|
200
200
|
"cache": {"type": "local"},
|
|
201
201
|
"ml_task_queue": {"type": "local"},
|
|
202
|
-
"
|
|
202
|
+
"url_file_upload": {"enabled": True, "allowed_origins": [], "disallowed_origins": []},
|
|
203
|
+
"file_upload_domains": [], # deprecated, use config[url_file_upload][allowed_origins] instead
|
|
203
204
|
"web_crawling_allowed_sites": [],
|
|
204
205
|
"cloud": False,
|
|
205
206
|
"jobs": {"disable": False},
|
|
@@ -548,6 +549,16 @@ class Config:
|
|
|
548
549
|
"Use 'MINDSDB_HTTP_SERVER_TYPE' instead."
|
|
549
550
|
)
|
|
550
551
|
|
|
552
|
+
file_upload_domains = self._config.get("file_upload_domains")
|
|
553
|
+
if isinstance(file_upload_domains, list) and len(file_upload_domains) > 0:
|
|
554
|
+
allowed_origins = self._config["url_file_upload"]["allowed_origins"]
|
|
555
|
+
if isinstance(allowed_origins, list) and len(allowed_origins) == 0:
|
|
556
|
+
self._config["url_file_upload"]["allowed_origins"] = file_upload_domains
|
|
557
|
+
logger.warning(
|
|
558
|
+
'Config option "file_upload_domains" is deprecated, '
|
|
559
|
+
'use config["url_file_upload"]["allowed_origins"] instead.'
|
|
560
|
+
)
|
|
561
|
+
|
|
551
562
|
for env_name in ("MINDSDB_HTTP_SERVER_TYPE", "MINDSDB_DEFAULT_SERVER"):
|
|
552
563
|
env_value = os.environ.get(env_name, "")
|
|
553
564
|
if env_value.lower() not in ("waitress", "flask", "gunicorn", ""):
|
mindsdb/utilities/exception.py
CHANGED
|
@@ -1,29 +1,69 @@
|
|
|
1
|
+
from textwrap import indent
|
|
2
|
+
|
|
3
|
+
|
|
1
4
|
class BaseEntityException(Exception):
|
|
2
5
|
"""Base exception for entitys errors
|
|
3
6
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
+
Attributes:
|
|
8
|
+
message (str): error message
|
|
9
|
+
entity_name (str): entity name
|
|
7
10
|
"""
|
|
11
|
+
|
|
8
12
|
def __init__(self, message: str, entity_name: str = None) -> None:
|
|
9
13
|
self.message = message
|
|
10
|
-
self.entity_name = entity_name or
|
|
14
|
+
self.entity_name = entity_name or "unknown"
|
|
11
15
|
|
|
12
16
|
def __str__(self) -> str:
|
|
13
|
-
return f
|
|
17
|
+
return f"{self.message}: {self.entity_name}"
|
|
14
18
|
|
|
15
19
|
|
|
16
20
|
class EntityExistsError(BaseEntityException):
|
|
17
21
|
"""Raise when entity exists, but should not"""
|
|
22
|
+
|
|
18
23
|
def __init__(self, message: str = None, entity_name: str = None) -> None:
|
|
19
24
|
if message is None:
|
|
20
|
-
message =
|
|
25
|
+
message = "Entity exists error"
|
|
21
26
|
super().__init__(message, entity_name)
|
|
22
27
|
|
|
23
28
|
|
|
24
29
|
class EntityNotExistsError(BaseEntityException):
|
|
25
30
|
"""Raise when entity not exists, but should"""
|
|
31
|
+
|
|
26
32
|
def __init__(self, message: str = None, entity_name: str = None) -> None:
|
|
27
33
|
if message is None:
|
|
28
|
-
message =
|
|
34
|
+
message = "Entity does not exists error"
|
|
29
35
|
super().__init__(message, entity_name)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def format_db_error_message(
|
|
39
|
+
db_name: str | None = None,
|
|
40
|
+
db_type: str | None = None,
|
|
41
|
+
db_error_msg: str | None = None,
|
|
42
|
+
failed_query: str | None = None,
|
|
43
|
+
) -> str:
|
|
44
|
+
"""Format the error message for the database query.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
db_name (str | None): The name of the database.
|
|
48
|
+
db_type (str | None): The type of the database.
|
|
49
|
+
db_error_msg (str | None): The error message.
|
|
50
|
+
failed_query (str | None): The failed query.
|
|
51
|
+
|
|
52
|
+
Returns:
|
|
53
|
+
str: The formatted error message.
|
|
54
|
+
"""
|
|
55
|
+
error_message = "Failed to execute external database query during query processing."
|
|
56
|
+
if db_name is not None or db_type is not None:
|
|
57
|
+
error_message += "\n\nDatabase Details:"
|
|
58
|
+
if db_name is not None:
|
|
59
|
+
error_message += f"\n- Name: {db_name}"
|
|
60
|
+
if db_type is not None:
|
|
61
|
+
error_message += f"\n- Type: {db_type}"
|
|
62
|
+
|
|
63
|
+
if db_error_msg is not None:
|
|
64
|
+
error_message += f"\n\nError:\n{indent(db_error_msg, ' ')}"
|
|
65
|
+
|
|
66
|
+
if failed_query is not None:
|
|
67
|
+
error_message += f"\n\nFailed Query:\n{indent(failed_query, ' ')}"
|
|
68
|
+
|
|
69
|
+
return error_message
|
mindsdb/utilities/security.py
CHANGED
|
@@ -27,28 +27,71 @@ def clear_filename(filename: str) -> str:
|
|
|
27
27
|
|
|
28
28
|
if not filename:
|
|
29
29
|
return filename
|
|
30
|
-
badchars = '
|
|
30
|
+
badchars = '\\/:*?"<>|'
|
|
31
31
|
for c in badchars:
|
|
32
|
-
filename = filename.replace(c,
|
|
32
|
+
filename = filename.replace(c, "")
|
|
33
33
|
return filename
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
def
|
|
36
|
+
def _split_url(url: str) -> tuple[str, str]:
|
|
37
|
+
"""
|
|
38
|
+
Splits the URL into scheme and netloc.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
url (str): The URL to split.
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
tuple[str, str]: The scheme and netloc of the URL.
|
|
45
|
+
|
|
46
|
+
Raises:
|
|
47
|
+
ValueError: If the URL does not include protocol and host name.
|
|
48
|
+
"""
|
|
49
|
+
parsed_url = urlparse(url)
|
|
50
|
+
if not (parsed_url.scheme and parsed_url.netloc):
|
|
51
|
+
raise ValueError(f"URL must include protocol and host name: {url}")
|
|
52
|
+
return parsed_url.scheme.lower(), parsed_url.netloc.lower()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def validate_urls(urls: str | list[str], allowed_urls: list[str], disallowed_urls: list[str] | None = None) -> bool:
|
|
37
56
|
"""
|
|
38
57
|
Checks if the provided URL(s) is/are from an allowed host.
|
|
39
58
|
|
|
40
|
-
This function parses the URL(s) and checks the
|
|
59
|
+
This function parses the URL(s) and checks the origin (scheme + netloc)
|
|
41
60
|
against a list of allowed hosts.
|
|
42
61
|
|
|
43
|
-
:
|
|
44
|
-
|
|
45
|
-
|
|
62
|
+
Examples:
|
|
63
|
+
validate_urls("http://site.com/file", ["site.com"]) -> Exception
|
|
64
|
+
validate_urls("https://site.com/file", ["https://site.com"]) -> True
|
|
65
|
+
validate_urls("http://site.com/file", ["https://site.com"]) -> False
|
|
66
|
+
validate_urls("https://site.com/file", ["https://example.com"]) -> False
|
|
67
|
+
validate_urls("site.com/file", ["https://site.com"]) -> Exception
|
|
68
|
+
|
|
69
|
+
Args:
|
|
70
|
+
urls (str | list[str]): The URL(s) to check. Can be a single URL (str) or a list of URLs (list).
|
|
71
|
+
allowed_urls (list[str]): The list of allowed URLs.
|
|
72
|
+
disallowed_urls (list[str]): The list of disallowed URLs. If provided, the function
|
|
73
|
+
will return False if the URL is in the disallowed list.
|
|
74
|
+
|
|
75
|
+
Returns:
|
|
76
|
+
bool: True if the URL(s) is/are from an allowed host and not in the disallowed list, False otherwise.
|
|
46
77
|
"""
|
|
47
|
-
|
|
78
|
+
if disallowed_urls is None:
|
|
79
|
+
disallowed_urls = []
|
|
80
|
+
|
|
81
|
+
allowed_origins = [_split_url(url) for url in allowed_urls]
|
|
82
|
+
disallowed_origins = [_split_url(url) for url in disallowed_urls]
|
|
48
83
|
|
|
49
84
|
if isinstance(urls, str):
|
|
50
85
|
urls = [urls]
|
|
51
86
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
87
|
+
if allowed_origins:
|
|
88
|
+
for url in urls:
|
|
89
|
+
if _split_url(url) not in allowed_origins:
|
|
90
|
+
return False
|
|
91
|
+
|
|
92
|
+
if disallowed_origins:
|
|
93
|
+
for url in urls:
|
|
94
|
+
if _split_url(url) in disallowed_origins:
|
|
95
|
+
return False
|
|
96
|
+
|
|
97
|
+
return True
|