MindsDB 25.6.3.1__py3-none-any.whl → 25.7.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Advisory: this release of MindsDB has been flagged as potentially problematic by the registry's automated analysis. Review the changes below and consult the registry's advisory page for details before upgrading.

Files changed (55):
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/executor/command_executor.py +8 -6
  3. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +72 -44
  4. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +14 -1
  5. mindsdb/api/executor/datahub/datanodes/project_datanode.py +1 -1
  6. mindsdb/api/executor/datahub/datanodes/system_tables.py +314 -1
  7. mindsdb/api/executor/planner/plan_join.py +1 -1
  8. mindsdb/api/executor/planner/query_planner.py +7 -1
  9. mindsdb/api/executor/planner/query_prepare.py +68 -87
  10. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
  11. mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
  12. mindsdb/api/http/namespaces/file.py +49 -24
  13. mindsdb/api/mcp/start.py +45 -31
  14. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
  15. mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
  16. mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
  17. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
  18. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
  19. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
  20. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  21. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
  22. mindsdb/integrations/handlers/ludwig_handler/requirements.txt +1 -1
  23. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +150 -140
  24. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
  25. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +2 -0
  26. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  27. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  28. mindsdb/integrations/libs/api_handler.py +6 -7
  29. mindsdb/integrations/libs/vectordatabase_handler.py +86 -77
  30. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
  31. mindsdb/interfaces/agents/agents_controller.py +29 -9
  32. mindsdb/interfaces/agents/constants.py +44 -0
  33. mindsdb/interfaces/agents/langchain_agent.py +15 -6
  34. mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
  35. mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
  36. mindsdb/interfaces/data_catalog/data_catalog_reader.py +22 -3
  37. mindsdb/interfaces/knowledge_base/controller.py +121 -102
  38. mindsdb/interfaces/knowledge_base/evaluate.py +19 -7
  39. mindsdb/interfaces/knowledge_base/executor.py +346 -0
  40. mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
  41. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
  42. mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
  43. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +26 -22
  44. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +40 -28
  45. mindsdb/interfaces/skills/skill_tool.py +91 -88
  46. mindsdb/interfaces/skills/sql_agent.py +181 -130
  47. mindsdb/interfaces/storage/db.py +9 -7
  48. mindsdb/utilities/config.py +12 -1
  49. mindsdb/utilities/exception.py +47 -7
  50. mindsdb/utilities/security.py +54 -11
  51. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/METADATA +239 -251
  52. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/RECORD +55 -54
  53. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/WHEEL +0 -0
  54. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/licenses/LICENSE +0 -0
  55. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,9 @@ import csv
3
3
  import inspect
4
4
  import traceback
5
5
  from io import StringIO
6
- from typing import Iterable, List, Optional, Any
6
+ from typing import Iterable, List, Optional, Any, Tuple
7
+ from collections import defaultdict
8
+ import fnmatch
7
9
 
8
10
  import pandas as pd
9
11
  from mindsdb_sql_parser import parse_sql
@@ -75,12 +77,84 @@ def split_table_name(table_name: str) -> List[str]:
75
77
  if current:
76
78
  result.append(current.strip("`"))
77
79
 
78
- # ensure we split the table name
79
- # result = [r.split(".") for r in result][0]
80
-
81
80
  return result
82
81
 
83
82
 
83
+ class TablesCollection:
84
+ """
85
+ Collection of identifiers.
86
+ Supports wildcard in tables name.
87
+ """
88
+
89
+ def __init__(self, items: List[Identifier | str] = None, default_db=None):
90
+ if items is None:
91
+ items = []
92
+
93
+ self.items = items
94
+ self._dbs = defaultdict(set)
95
+ self._schemas = defaultdict(dict)
96
+ self._no_db_tables = set()
97
+ self.has_wildcard = False
98
+ self.databases = set()
99
+ self._default_db = default_db
100
+
101
+ for name in items:
102
+ if not isinstance(name, Identifier):
103
+ name = Identifier(name)
104
+ db, schema, tbl = self._get_paths(name)
105
+ if db is None:
106
+ self._no_db_tables.add(tbl)
107
+ elif schema is None:
108
+ self._dbs[db].add(tbl)
109
+ else:
110
+ if schema not in self._schemas[db]:
111
+ self._schemas[db][schema] = set()
112
+ self._schemas[db][schema].add(tbl)
113
+
114
+ if "*" in tbl:
115
+ self.has_wildcard = True
116
+ self.databases.add(db)
117
+
118
+ def _get_paths(self, table: Identifier) -> Tuple:
119
+ # split identifier to db, schema, table name
120
+ schema = None
121
+ db = None
122
+
123
+ match [x.lower() for x in table.parts]:
124
+ case [tbl]:
125
+ pass
126
+ case [db, tbl]:
127
+ pass
128
+ case [db, schema, tbl]:
129
+ pass
130
+ case _:
131
+ raise NotImplementedError
132
+ return db, schema, tbl.lower()
133
+
134
+ def match(self, table: Identifier) -> bool:
135
+ # Check if input table matches to tables in collection
136
+
137
+ db, schema, tbl = self._get_paths(table)
138
+ if db is None:
139
+ if tbl in self._no_db_tables:
140
+ return True
141
+ if self._default_db is not None:
142
+ return self.match(Identifier(parts=[self._default_db, tbl]))
143
+
144
+ if schema is not None:
145
+ if any([fnmatch.fnmatch(tbl, pattern) for pattern in self._schemas[db].get(schema, [])]):
146
+ return True
147
+
148
+ # table might be specified without schema
149
+ return any([fnmatch.fnmatch(tbl, pattern) for pattern in self._dbs[db]])
150
+
151
+ def __bool__(self):
152
+ return len(self.items) > 0
153
+
154
+ def __repr__(self):
155
+ return f"Tables({self.items})"
156
+
157
+
84
158
  class SQLAgent:
85
159
  """
86
160
  SQLAgent is a class that handles SQL queries for agents.
@@ -117,21 +191,23 @@ class SQLAgent:
117
191
  self._command_executor = command_executor
118
192
  self._mindsdb_db_struct = databases_struct
119
193
  self.knowledge_base_database = knowledge_base_database # This is a project name, not a database connection
194
+ self._databases = databases
120
195
  self._sample_rows_in_table_info = int(sample_rows_in_table_info)
121
196
 
122
- self._tables_to_include = include_tables
123
- self._tables_to_ignore = []
124
- self._knowledge_bases_to_include = include_knowledge_bases
125
- self._knowledge_bases_to_ignore = []
126
- self._databases = databases
127
- if not self._tables_to_include:
197
+ self._tables_to_include = TablesCollection(include_tables)
198
+ if self._tables_to_include:
128
199
  # ignore_tables and include_tables should not be used together.
129
200
  # include_tables takes priority if it's set.
130
- self._tables_to_ignore = ignore_tables or []
131
- if not self._knowledge_bases_to_include:
201
+ ignore_tables = []
202
+ self._tables_to_ignore = TablesCollection(ignore_tables)
203
+
204
+ self._knowledge_bases_to_include = TablesCollection(include_knowledge_bases, default_db=knowledge_base_database)
205
+ if self._knowledge_bases_to_include:
132
206
  # ignore_knowledge_bases and include_knowledge_bases should not be used together.
133
207
  # include_knowledge_bases takes priority if it's set.
134
- self._knowledge_bases_to_ignore = ignore_knowledge_bases or []
208
+ ignore_knowledge_bases = []
209
+ self._knowledge_bases_to_ignore = TablesCollection(ignore_knowledge_bases, default_db=knowledge_base_database)
210
+
135
211
  self._cache = cache
136
212
 
137
213
  from mindsdb.interfaces.skills.skill_tool import SkillToolController
@@ -159,46 +235,54 @@ class SQLAgent:
159
235
  if not isinstance(ast_query, (Select, Show, Describe, Explain)):
160
236
  raise ValueError(f"Query is not allowed: {ast_query.to_string()}")
161
237
 
238
+ kb_names = self.get_all_knowledge_base_names()
239
+
162
240
  # Check tables
163
241
  if self._tables_to_include:
164
- tables_parts = [split_table_name(x) for x in self._tables_to_include]
165
- no_schema_parts = []
166
- for t in tables_parts:
167
- if len(t) == 3:
168
- no_schema_parts.append([t[0], t[2]])
169
- tables_parts += no_schema_parts
170
242
 
171
243
  def _check_f(node, is_table=None, **kwargs):
172
244
  if is_table and isinstance(node, Identifier):
173
245
  table_name = ".".join(node.parts)
174
246
 
175
- # Get the list of available knowledge bases
176
- kb_names = self.get_usable_knowledge_base_names()
177
-
178
247
  # Check if this table is a knowledge base
179
- is_kb = table_name in kb_names
180
-
181
- # If it's a knowledge base and we have knowledge base restrictions
182
- if is_kb and self._knowledge_bases_to_include:
183
- kb_parts = [split_table_name(x) for x in self._knowledge_bases_to_include]
184
- if node.parts not in kb_parts:
185
- raise ValueError(
186
- f"Knowledge base {table_name} not found. Available knowledge bases: {', '.join(self._knowledge_bases_to_include)}"
187
- )
188
- # Regular table check
189
- elif not is_kb and self._tables_to_include and node.parts not in tables_parts:
190
- raise ValueError(
191
- f"Table {table_name} not found. Available tables: {', '.join(self._tables_to_include)}"
192
- )
193
- # Check if it's a restricted knowledge base
194
- elif is_kb and table_name in self._knowledge_bases_to_ignore:
195
- raise ValueError(f"Knowledge base {table_name} is not allowed.")
196
- # Check if it's a restricted table
197
- elif not is_kb and table_name in self._tables_to_ignore:
198
- raise ValueError(f"Table {table_name} is not allowed.")
248
+ if table_name in kb_names or node.parts[-1] in kb_names:
249
+ # If it's a knowledge base and we have knowledge base restrictions
250
+ self.check_knowledge_base_permission(node)
251
+ else:
252
+ try:
253
+ # Regular table check
254
+ self.check_table_permission(node)
255
+ except ValueError as origin_exc:
256
+ # was it badly quoted by llm?
257
+ if len(node.parts) == 1 and node.is_quoted[0] and "." in node.parts[0]:
258
+ node2 = Identifier(node.parts[0])
259
+ try:
260
+ _check_f(node2, is_table=True)
261
+ return node2
262
+ except ValueError:
263
+ ...
264
+ raise origin_exc
199
265
 
200
266
  query_traversal(ast_query, _check_f)
201
267
 
268
+ def check_knowledge_base_permission(self, node):
269
+ if self._knowledge_bases_to_include and not self._knowledge_bases_to_include.match(node):
270
+ raise ValueError(
271
+ f"Knowledge base {str(node)} not found. Available knowledge bases: {', '.join(self._knowledge_bases_to_include.items)}"
272
+ )
273
+ # Check if it's a restricted knowledge base
274
+ if self._knowledge_bases_to_ignore and self._knowledge_bases_to_ignore.match(node):
275
+ raise ValueError(f"Knowledge base {str(node)} is not allowed.")
276
+
277
+ def check_table_permission(self, node):
278
+ if self._tables_to_include and not self._tables_to_include.match(node):
279
+ raise ValueError(
280
+ f"Table {str(node)} not found. Available tables: {', '.join(self._tables_to_include.items)}"
281
+ )
282
+ # Check if it's a restricted table
283
+ if self._tables_to_ignore and self._tables_to_ignore.match(node):
284
+ raise ValueError(f"Table {str(node)} is not allowed.")
285
+
202
286
  def get_usable_table_names(self) -> Iterable[str]:
203
287
  """Get a list of tables that the agent has access to.
204
288
 
@@ -213,50 +297,35 @@ class SQLAgent:
213
297
  if cached_tables:
214
298
  return cached_tables
215
299
 
216
- if self._tables_to_include:
217
- return self._tables_to_include
300
+ if not self._tables_to_include:
301
+ # no tables allowed
302
+ return []
303
+ if not self._tables_to_include.has_wildcard:
304
+ return self._tables_to_include.items
218
305
 
219
306
  result_tables = []
220
307
 
221
- for db_name in self._mindsdb_db_struct:
308
+ for db_name in self._tables_to_include.databases:
222
309
  handler = self._command_executor.session.integration_controller.get_data_handler(db_name)
223
310
 
224
- schemas_names = list(self._mindsdb_db_struct[db_name].keys())
225
- if len(schemas_names) > 1 and None in schemas_names:
226
- raise Exception("default schema and named schemas can not be used in same filter")
227
-
228
- if None in schemas_names:
229
- # get tables only from default schema
230
- response = handler.get_tables()
231
- tables_in_default_schema = list(response.data_frame.table_name)
232
- schema_tables_restrictions = self._mindsdb_db_struct[db_name][None] # None - is default schema
233
- if schema_tables_restrictions is None:
234
- for table_name in tables_in_default_schema:
235
- result_tables.append([db_name, table_name])
236
- else:
237
- for table_name in schema_tables_restrictions:
238
- if table_name in tables_in_default_schema:
239
- result_tables.append([db_name, table_name])
311
+ if "all" in inspect.signature(handler.get_tables).parameters:
312
+ response = handler.get_tables(all=True)
240
313
  else:
241
- if "all" in inspect.signature(handler.get_tables).parameters:
242
- response = handler.get_tables(all=True)
314
+ response = handler.get_tables()
315
+ df = response.data_frame
316
+ col_name = "table_name"
317
+ if col_name not in df.columns:
318
+ # get first column if not found
319
+ col_name = df.columns[0]
320
+
321
+ for _, row in df.iterrows():
322
+ if "table_schema" in row:
323
+ parts = [db_name, row["table_schema"], row[col_name]]
243
324
  else:
244
- response = handler.get_tables()
245
- response_schema_names = list(response.data_frame.table_schema.unique())
246
- schemas_intersection = set(schemas_names) & set(response_schema_names)
247
- if len(schemas_intersection) == 0:
248
- raise Exception("There are no allowed schemas in ds")
249
-
250
- for schema_name in schemas_intersection:
251
- schema_sub_df = response.data_frame[response.data_frame["table_schema"] == schema_name]
252
- if self._mindsdb_db_struct[db_name][schema_name] is None:
253
- # all tables from schema allowed
254
- for row in schema_sub_df:
255
- result_tables.append([db_name, schema_name, row["table_name"]])
256
- else:
257
- for table_name in self._mindsdb_db_struct[db_name][schema_name]:
258
- if table_name in schema_sub_df["table_name"].values:
259
- result_tables.append([db_name, schema_name, table_name])
325
+ parts = [db_name, row[col_name]]
326
+ if self._tables_to_include.match(Identifier(parts=parts)):
327
+ if not self._tables_to_ignore.match(Identifier(parts=parts)):
328
+ result_tables.append(parts)
260
329
 
261
330
  result_tables = [".".join(x) for x in result_tables]
262
331
  if self._cache:
@@ -269,7 +338,28 @@ class SQLAgent:
269
338
  Returns:
270
339
  Iterable[str]: list with knowledge base names
271
340
  """
272
- cache_key = f"{ctx.company_id}_{self.knowledge_base_database}_knowledge_bases"
341
+
342
+ if not self._knowledge_bases_to_include and not self._knowledge_bases_to_ignore:
343
+ # white or black list have to be set
344
+ return []
345
+
346
+ # Filter knowledge bases based on ignore list
347
+ kb_names = []
348
+ for kb_name in self.get_all_knowledge_base_names():
349
+ kb = Identifier(parts=[self.knowledge_base_database, kb_name])
350
+ if self._knowledge_bases_to_include and not self._knowledge_bases_to_include.match(kb):
351
+ continue
352
+ if not self._knowledge_bases_to_ignore.match(kb):
353
+ kb_names.append(kb_name)
354
+ return kb_names
355
+
356
+ def get_all_knowledge_base_names(self) -> Iterable[str]:
357
+ """Get a list of all knowledge bases
358
+
359
+ Returns:
360
+ Iterable[str]: list with knowledge base names
361
+ """
362
+ # cache_key = f"{ctx.company_id}_{self.knowledge_base_database}_knowledge_bases"
273
363
 
274
364
  # todo we need to fix the cache, file cache can potentially store out of data information
275
365
  # # first check cache and return if found
@@ -278,58 +368,18 @@ class SQLAgent:
278
368
  # if cached_kbs:
279
369
  # return cached_kbs
280
370
 
281
- if self._knowledge_bases_to_include:
282
- return self._knowledge_bases_to_include
283
-
284
371
  try:
285
372
  # Query to get all knowledge bases
286
- query = f"SHOW KNOWLEDGE_BASES FROM {self.knowledge_base_database};"
287
- try:
288
- result = self._call_engine(query, database=self.knowledge_base_database)
289
- except Exception as e:
290
- # If the direct query fails, try a different approach
291
- # This handles the case where knowledge_base_database is not a valid integration
292
- logger.warning(f"Error querying knowledge bases from {self.knowledge_base_database}: {str(e)}")
293
- # Try to get knowledge bases directly from the project database
294
- try:
295
- # Get knowledge bases from the project database
296
- kb_controller = self._command_executor.session.kb_controller
297
- kb_names = [kb["name"] for kb in kb_controller.list()]
298
-
299
- # Filter knowledge bases based on include list
300
- if self._knowledge_bases_to_include:
301
- kb_names = [kb_name for kb_name in kb_names if kb_name in self._knowledge_bases_to_include]
302
- if not kb_names:
303
- logger.warning(
304
- f"No knowledge bases found in the include list: {self._knowledge_bases_to_include}"
305
- )
306
- return []
307
-
308
- return kb_names
309
-
310
- # Filter knowledge bases based on ignore list
311
- kb_names = [kb_name for kb_name in kb_names if kb_name not in self._knowledge_bases_to_ignore]
312
-
313
- if self._cache:
314
- self._cache.set(cache_key, set(kb_names))
315
-
316
- return kb_names
317
- except Exception as inner_e:
318
- logger.error(f"Error getting knowledge bases from kb_controller: {str(inner_e)}")
319
- return []
320
-
321
- if not result:
322
- return []
373
+ ast_query = Show(category="Knowledge Bases")
374
+ result = self._command_executor.execute_command(ast_query, database_name=self.knowledge_base_database)
323
375
 
324
376
  # Filter knowledge bases based on ignore list
325
377
  kb_names = []
326
- for row in result:
327
- kb_name = row["name"]
328
- if kb_name not in self._knowledge_bases_to_ignore:
329
- kb_names.append(kb_name)
378
+ for row in result.data.records:
379
+ kb_names.append(row["NAME"])
330
380
 
331
- if self._cache:
332
- self._cache.set(cache_key, set(kb_names))
381
+ # if self._cache:
382
+ # self._cache.set(cache_key, set(kb_names))
333
383
 
334
384
  return kb_names
335
385
  except Exception as e:
@@ -369,7 +419,7 @@ class SQLAgent:
369
419
  table_identifier = tables_idx.get(tuple(table_parts))
370
420
 
371
421
  if table_identifier is None:
372
- raise ValueError(f"Table {table} not found in the database")
422
+ raise ValueError(f"Table {table_name} not found in the database")
373
423
  tables.append(table_identifier)
374
424
 
375
425
  return tables
@@ -411,13 +461,14 @@ class SQLAgent:
411
461
  if len(parts) == 1:
412
462
  raise ValueError(f"Invalid table name: {name}. Expected format is 'database.table'.")
413
463
 
414
- database_table_map[parts[0]] = database_table_map.get(parts[0], []) + [parts[1]]
464
+ database_table_map.setdefault(parts[0], []).append(parts[1])
415
465
 
416
466
  data_catalog_str = ""
417
467
  for database_name, table_names in database_table_map.items():
418
468
  data_catalog_reader = DataCatalogReader(database_name=database_name, table_names=table_names)
419
469
 
420
- data_catalog_str += data_catalog_reader.read_metadata_as_string()
470
+ result = data_catalog_reader.read_metadata_as_string()
471
+ data_catalog_str += str(result or "")
421
472
 
422
473
  return data_catalog_str
423
474
 
@@ -430,7 +481,7 @@ class SQLAgent:
430
481
 
431
482
  split = name.split(".")
432
483
  if len(split) > 1:
433
- all_tables.append(Identifier(parts=[split[0], split[1]]))
484
+ all_tables.append(Identifier(parts=[split[0], split[-1]]))
434
485
  else:
435
486
  all_tables.append(Identifier(name))
436
487
 
@@ -684,10 +684,10 @@ class MetaColumns(Base):
684
684
  if self.default_value:
685
685
  column_info += f"\n{pad}- Default Value: {self.default_value}"
686
686
 
687
- if self.meta_column_statistics:
687
+ stats = self.meta_column_statistics or []
688
+ if stats and callable(getattr(stats[0], "as_string", None)):
688
689
  column_info += f"\n\n{pad}- Column Statistics:"
689
- column_info += f"\n{self.meta_column_statistics[0].as_string(indent + 4)}"
690
-
690
+ column_info += f"\n{stats[0].as_string(indent + 4)}"
691
691
  return column_info
692
692
 
693
693
 
@@ -708,18 +708,20 @@ class MetaColumnStatistics(Base):
708
708
  inner_pad = " " * (indent + 4)
709
709
 
710
710
  column_statistics = ""
711
+ most_common_values = self.most_common_values or []
712
+ most_common_frequencies = self.most_common_frequencies or []
711
713
 
712
- if any(self.most_common_values) and any(self.most_common_frequencies):
714
+ if most_common_values and most_common_frequencies:
713
715
  column_statistics += f"{pad}- Top 10 Most Common Values and Frequencies:"
714
- for i in range(min(10, len(self.most_common_values))):
715
- freq = self.most_common_frequencies[i]
716
+ for i in range(min(10, len(most_common_values))):
717
+ freq = most_common_frequencies[i]
716
718
  try:
717
719
  percent = float(freq) * 100
718
720
  freq_str = f"{percent:.2f}%"
719
721
  except (ValueError, TypeError):
720
722
  freq_str = str(freq)
721
723
 
722
- column_statistics += f"\n{inner_pad}- {self.most_common_values[i]}: {freq_str}"
724
+ column_statistics += f"\n{inner_pad}- {most_common_values[i]}: {freq_str}"
723
725
  column_statistics += "\n"
724
726
 
725
727
  if self.null_percentage:
@@ -199,7 +199,8 @@ class Config:
199
199
  },
200
200
  "cache": {"type": "local"},
201
201
  "ml_task_queue": {"type": "local"},
202
- "file_upload_domains": [],
202
+ "url_file_upload": {"enabled": True, "allowed_origins": [], "disallowed_origins": []},
203
+ "file_upload_domains": [], # deprecated, use config[url_file_upload][allowed_origins] instead
203
204
  "web_crawling_allowed_sites": [],
204
205
  "cloud": False,
205
206
  "jobs": {"disable": False},
@@ -548,6 +549,16 @@ class Config:
548
549
  "Use 'MINDSDB_HTTP_SERVER_TYPE' instead."
549
550
  )
550
551
 
552
+ file_upload_domains = self._config.get("file_upload_domains")
553
+ if isinstance(file_upload_domains, list) and len(file_upload_domains) > 0:
554
+ allowed_origins = self._config["url_file_upload"]["allowed_origins"]
555
+ if isinstance(allowed_origins, list) and len(allowed_origins) == 0:
556
+ self._config["url_file_upload"]["allowed_origins"] = file_upload_domains
557
+ logger.warning(
558
+ 'Config option "file_upload_domains" is deprecated, '
559
+ 'use config["url_file_upload"]["allowed_origins"] instead.'
560
+ )
561
+
551
562
  for env_name in ("MINDSDB_HTTP_SERVER_TYPE", "MINDSDB_DEFAULT_SERVER"):
552
563
  env_value = os.environ.get(env_name, "")
553
564
  if env_value.lower() not in ("waitress", "flask", "gunicorn", ""):
@@ -1,29 +1,69 @@
1
+ from textwrap import indent
2
+
3
+
1
4
  class BaseEntityException(Exception):
2
5
  """Base exception for entitys errors
3
6
 
4
- Attributes:
5
- message (str): error message
6
- entity_name (str): entity name
7
+ Attributes:
8
+ message (str): error message
9
+ entity_name (str): entity name
7
10
  """
11
+
8
12
  def __init__(self, message: str, entity_name: str = None) -> None:
9
13
  self.message = message
10
- self.entity_name = entity_name or 'unknown'
14
+ self.entity_name = entity_name or "unknown"
11
15
 
12
16
  def __str__(self) -> str:
13
- return f'{self.message}: {self.entity_name}'
17
+ return f"{self.message}: {self.entity_name}"
14
18
 
15
19
 
16
20
  class EntityExistsError(BaseEntityException):
17
21
  """Raise when entity exists, but should not"""
22
+
18
23
  def __init__(self, message: str = None, entity_name: str = None) -> None:
19
24
  if message is None:
20
- message = 'Entity exists error'
25
+ message = "Entity exists error"
21
26
  super().__init__(message, entity_name)
22
27
 
23
28
 
24
29
  class EntityNotExistsError(BaseEntityException):
25
30
  """Raise when entity not exists, but should"""
31
+
26
32
  def __init__(self, message: str = None, entity_name: str = None) -> None:
27
33
  if message is None:
28
- message = 'Entity does not exists error'
34
+ message = "Entity does not exists error"
29
35
  super().__init__(message, entity_name)
36
+
37
+
38
+ def format_db_error_message(
39
+ db_name: str | None = None,
40
+ db_type: str | None = None,
41
+ db_error_msg: str | None = None,
42
+ failed_query: str | None = None,
43
+ ) -> str:
44
+ """Format the error message for the database query.
45
+
46
+ Args:
47
+ db_name (str | None): The name of the database.
48
+ db_type (str | None): The type of the database.
49
+ db_error_msg (str | None): The error message.
50
+ failed_query (str | None): The failed query.
51
+
52
+ Returns:
53
+ str: The formatted error message.
54
+ """
55
+ error_message = "Failed to execute external database query during query processing."
56
+ if db_name is not None or db_type is not None:
57
+ error_message += "\n\nDatabase Details:"
58
+ if db_name is not None:
59
+ error_message += f"\n- Name: {db_name}"
60
+ if db_type is not None:
61
+ error_message += f"\n- Type: {db_type}"
62
+
63
+ if db_error_msg is not None:
64
+ error_message += f"\n\nError:\n{indent(db_error_msg, ' ')}"
65
+
66
+ if failed_query is not None:
67
+ error_message += f"\n\nFailed Query:\n{indent(failed_query, ' ')}"
68
+
69
+ return error_message
@@ -27,28 +27,71 @@ def clear_filename(filename: str) -> str:
27
27
 
28
28
  if not filename:
29
29
  return filename
30
- badchars = '\\/:*?\"<>|'
30
+ badchars = '\\/:*?"<>|'
31
31
  for c in badchars:
32
- filename = filename.replace(c, '')
32
+ filename = filename.replace(c, "")
33
33
  return filename
34
34
 
35
35
 
36
- def validate_urls(urls, allowed_urls):
36
+ def _split_url(url: str) -> tuple[str, str]:
37
+ """
38
+ Splits the URL into scheme and netloc.
39
+
40
+ Args:
41
+ url (str): The URL to split.
42
+
43
+ Returns:
44
+ tuple[str, str]: The scheme and netloc of the URL.
45
+
46
+ Raises:
47
+ ValueError: If the URL does not include protocol and host name.
48
+ """
49
+ parsed_url = urlparse(url)
50
+ if not (parsed_url.scheme and parsed_url.netloc):
51
+ raise ValueError(f"URL must include protocol and host name: {url}")
52
+ return parsed_url.scheme.lower(), parsed_url.netloc.lower()
53
+
54
+
55
+ def validate_urls(urls: str | list[str], allowed_urls: list[str], disallowed_urls: list[str] | None = None) -> bool:
37
56
  """
38
57
  Checks if the provided URL(s) is/are from an allowed host.
39
58
 
40
- This function parses the URL(s) and checks the network location part (netloc)
59
+ This function parses the URL(s) and checks the origin (scheme + netloc)
41
60
  against a list of allowed hosts.
42
61
 
43
- :param urls: The URL(s) to check. Can be a single URL (str) or a list of URLs (list).
44
- :param allowed_urls: The list of allowed URLs.
45
- :return bool: True if the URL(s) is/are from an allowed host, False otherwise.
62
+ Examples:
63
+ validate_urls("http://site.com/file", ["site.com"]) -> Exception
64
+ validate_urls("https://site.com/file", ["https://site.com"]) -> True
65
+ validate_urls("http://site.com/file", ["https://site.com"]) -> False
66
+ validate_urls("https://site.com/file", ["https://example.com"]) -> False
67
+ validate_urls("site.com/file", ["https://site.com"]) -> Exception
68
+
69
+ Args:
70
+ urls (str | list[str]): The URL(s) to check. Can be a single URL (str) or a list of URLs (list).
71
+ allowed_urls (list[str]): The list of allowed URLs.
72
+ disallowed_urls (list[str]): The list of disallowed URLs. If provided, the function
73
+ will return False if the URL is in the disallowed list.
74
+
75
+ Returns:
76
+ bool: True if the URL(s) is/are from an allowed host and not in the disallowed list, False otherwise.
46
77
  """
47
- allowed_netlocs = [urlparse(allowed_url).netloc for allowed_url in allowed_urls]
78
+ if disallowed_urls is None:
79
+ disallowed_urls = []
80
+
81
+ allowed_origins = [_split_url(url) for url in allowed_urls]
82
+ disallowed_origins = [_split_url(url) for url in disallowed_urls]
48
83
 
49
84
  if isinstance(urls, str):
50
85
  urls = [urls]
51
86
 
52
- # Check if all provided URLs are from the allowed sites
53
- valid = all(urlparse(url).netloc in allowed_netlocs for url in urls)
54
- return valid
87
+ if allowed_origins:
88
+ for url in urls:
89
+ if _split_url(url) not in allowed_origins:
90
+ return False
91
+
92
+ if disallowed_origins:
93
+ for url in urls:
94
+ if _split_url(url) in disallowed_origins:
95
+ return False
96
+
97
+ return True