MindsDB 25.6.4.0__py3-none-any.whl → 25.7.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (61)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +53 -94
  3. mindsdb/api/a2a/agent.py +30 -206
  4. mindsdb/api/a2a/common/server/server.py +26 -27
  5. mindsdb/api/a2a/task_manager.py +93 -227
  6. mindsdb/api/a2a/utils.py +21 -0
  7. mindsdb/api/executor/command_executor.py +8 -6
  8. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  9. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
  10. mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
  11. mindsdb/api/executor/planner/query_prepare.py +68 -87
  12. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
  13. mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
  14. mindsdb/api/executor/utilities/sql.py +97 -21
  15. mindsdb/api/http/namespaces/agents.py +126 -201
  16. mindsdb/api/http/namespaces/config.py +12 -1
  17. mindsdb/api/http/namespaces/file.py +49 -24
  18. mindsdb/api/mcp/start.py +45 -31
  19. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
  20. mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
  21. mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
  22. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
  23. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
  24. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
  25. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  26. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
  27. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
  28. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
  29. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
  30. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
  31. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  32. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  33. mindsdb/integrations/libs/keyword_search_base.py +41 -0
  34. mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
  35. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
  36. mindsdb/integrations/utilities/sql_utils.py +11 -0
  37. mindsdb/interfaces/agents/agents_controller.py +29 -9
  38. mindsdb/interfaces/agents/langchain_agent.py +7 -5
  39. mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
  40. mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
  41. mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
  42. mindsdb/interfaces/database/projects.py +1 -3
  43. mindsdb/interfaces/functions/controller.py +54 -64
  44. mindsdb/interfaces/functions/to_markdown.py +47 -14
  45. mindsdb/interfaces/knowledge_base/controller.py +228 -110
  46. mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
  47. mindsdb/interfaces/knowledge_base/executor.py +346 -0
  48. mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
  49. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
  50. mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
  51. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
  52. mindsdb/interfaces/skills/sql_agent.py +181 -130
  53. mindsdb/interfaces/storage/db.py +9 -7
  54. mindsdb/utilities/config.py +58 -40
  55. mindsdb/utilities/exception.py +58 -7
  56. mindsdb/utilities/security.py +54 -11
  57. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
  58. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
  59. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
  60. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
  61. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/knowledge_base/preprocessing/models.py

@@ -18,45 +18,31 @@ class PreprocessorType(Enum):
 
 class BasePreprocessingConfig(BaseModel):
     """Base configuration for preprocessing"""
+
     chunk_size: int = Field(default=DEFAULT_CHUNK_SIZE, description="Size of document chunks")
     chunk_overlap: int = Field(default=DEFAULT_CHUNK_OVERLAP, description="Overlap between chunks")
+    doc_id_column_name: str = Field(default="_original_doc_id", description="Name of doc_id columns in metadata")
 
 
 class ContextualConfig(BasePreprocessingConfig):
     """Configuration specific to contextual preprocessing"""
+
     llm_config: LLMConfig = Field(
-        default_factory=LLMConfig,
-        description="LLM configuration to use for context generation"
-    )
-    context_template: Optional[str] = Field(
-        default=None,
-        description="Custom template for context generation"
-    )
-    summarize: Optional[bool] = Field(
-        default=False,
-        description="Whether to return chunks as summarizations"
+        default_factory=LLMConfig, description="LLM configuration to use for context generation"
     )
+    context_template: Optional[str] = Field(default=None, description="Custom template for context generation")
+    summarize: Optional[bool] = Field(default=False, description="Whether to return chunks as summarizations")
 
 
-class TextChunkingConfig(BaseModel):
+class TextChunkingConfig(BasePreprocessingConfig):
     """Configuration for text chunking preprocessor using Pydantic"""
-    chunk_size: int = Field(
-        default=1000,
-        description="The target size of each text chunk",
-        gt=0
-    )
-    chunk_overlap: int = Field(
-        default=200,
-        description="The number of characters to overlap between chunks",
-        ge=0
-    )
-    length_function: Callable = Field(
-        default=len,
-        description="Function to measure text length"
-    )
+
+    chunk_size: int = Field(default=1000, description="The target size of each text chunk", gt=0)
+    chunk_overlap: int = Field(default=200, description="The number of characters to overlap between chunks", ge=0)
+    length_function: Callable = Field(default=len, description="Function to measure text length")
     separators: List[str] = Field(
         default=["\n\n", "\n", " ", ""],
-        description="List of separators to use for splitting text, in order of priority"
+        description="List of separators to use for splitting text, in order of priority",
     )
 
     class Config:
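Because TextChunkingConfig now subclasses BasePreprocessingConfig, text chunking inherits the new doc_id_column_name field while keeping its own stricter defaults. A minimal sketch of that effect (pydantic v2 assumed; the DEFAULT_* values are illustrative stand-ins, not the package constants):

from pydantic import BaseModel, Field

DEFAULT_CHUNK_SIZE = 2000     # illustrative stand-in
DEFAULT_CHUNK_OVERLAP = 200   # illustrative stand-in


class BasePreprocessingConfig(BaseModel):
    chunk_size: int = Field(default=DEFAULT_CHUNK_SIZE)
    chunk_overlap: int = Field(default=DEFAULT_CHUNK_OVERLAP)
    doc_id_column_name: str = Field(default="_original_doc_id")


class TextChunkingConfig(BasePreprocessingConfig):
    # overrides the inherited defaults, adds validation constraints
    chunk_size: int = Field(default=1000, gt=0)
    chunk_overlap: int = Field(default=200, ge=0)


cfg = TextChunkingConfig()
assert cfg.chunk_size == 1000
assert cfg.doc_id_column_name == "_original_doc_id"  # inherited from the new base field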
@@ -65,44 +51,28 @@ class TextChunkingConfig(BaseModel):
 
 class JSONChunkingConfig(BasePreprocessingConfig):
     """Configuration for JSON chunking preprocessor"""
-    flatten_nested: bool = Field(
-        default=True,
-        description="Whether to flatten nested JSON structures"
-    )
-    include_metadata: bool = Field(
-        default=True,
-        description="Whether to include original metadata in chunks"
-    )
+
+    flatten_nested: bool = Field(default=True, description="Whether to flatten nested JSON structures")
+    include_metadata: bool = Field(default=True, description="Whether to include original metadata in chunks")
     chunk_by_object: bool = Field(
-        default=True,
-        description="Whether to chunk by top-level objects (True) or create a single document (False)"
-    )
-    exclude_fields: List[str] = Field(
-        default_factory=list,
-        description="List of fields to exclude from chunking"
+        default=True, description="Whether to chunk by top-level objects (True) or create a single document (False)"
     )
+    exclude_fields: List[str] = Field(default_factory=list, description="List of fields to exclude from chunking")
     include_fields: List[str] = Field(
         default_factory=list,
-        description="List of fields to include in chunking (if empty, all fields except excluded ones are included)"
+        description="List of fields to include in chunking (if empty, all fields except excluded ones are included)",
     )
     metadata_fields: List[str] = Field(
         default_factory=list,
         description="List of fields to extract into metadata for filtering "
-        "(can include nested fields using dot notation). "
-        "If empty, all primitive fields will be extracted (top-level fields if available, otherwise all primitive fields in the flattened structure)."
+        "(can include nested fields using dot notation). "
+        "If empty, all primitive fields will be extracted (top-level fields if available, otherwise all primitive fields in the flattened structure).",
     )
     extract_all_primitives: bool = Field(
-        default=False,
-        description="Whether to extract all primitive values (strings, numbers, booleans) into metadata"
-    )
-    nested_delimiter: str = Field(
-        default=".",
-        description="Delimiter for flattened nested field names"
-    )
-    content_column: str = Field(
-        default="content",
-        description="Name of the content column for chunk ID generation"
+        default=False, description="Whether to extract all primitive values (strings, numbers, booleans) into metadata"
     )
+    nested_delimiter: str = Field(default=".", description="Delimiter for flattened nested field names")
+    content_column: str = Field(default="content", description="Name of the content column for chunk ID generation")
 
     class Config:
         arbitrary_types_allowed = True
@@ -110,25 +80,20 @@ class JSONChunkingConfig(BasePreprocessingConfig):
 
 class PreprocessingConfig(BaseModel):
     """Complete preprocessing configuration"""
-    type: PreprocessorType = Field(
-        default=PreprocessorType.TEXT_CHUNKING,
-        description="Type of preprocessing to apply"
-    )
+
+    type: PreprocessorType = Field(default=PreprocessorType.TEXT_CHUNKING, description="Type of preprocessing to apply")
     contextual_config: Optional[ContextualConfig] = Field(
-        default=None,
-        description="Configuration for contextual preprocessing"
+        default=None, description="Configuration for contextual preprocessing"
     )
     text_chunking_config: Optional[TextChunkingConfig] = Field(
-        default=None,
-        description="Configuration for text chunking preprocessing"
+        default=None, description="Configuration for text chunking preprocessing"
     )
     json_chunking_config: Optional[JSONChunkingConfig] = Field(
-        default=None,
-        description="Configuration for JSON chunking preprocessing"
+        default=None, description="Configuration for JSON chunking preprocessing"
     )
 
-    @model_validator(mode='after')
-    def validate_config_presence(self) -> 'PreprocessingConfig':
+    @model_validator(mode="after")
+    def validate_config_presence(self) -> "PreprocessingConfig":
         """Ensure the appropriate config is present for the chosen type"""
         if self.type == PreprocessorType.CONTEXTUAL and not self.contextual_config:
             self.contextual_config = ContextualConfig()
@@ -137,26 +102,28 @@ class PreprocessingConfig(BaseModel):
         if self.type == PreprocessorType.JSON_CHUNKING and not self.json_chunking_config:
             # Import here to avoid circular imports
             from mindsdb.interfaces.knowledge_base.preprocessing.json_chunker import JSONChunkingConfig
+
             self.json_chunking_config = JSONChunkingConfig()
         return self
 
 
 class Document(BaseModel):
-
     """Document model with default metadata handling"""
+
     id: Optional[Union[int, str]] = Field(default=None, description="Unique identifier for the document")
     content: str = Field(description="The document content")
     embeddings: Optional[List[float]] = Field(default=None, description="Vector embeddings of the content")
     metadata: Optional[Dict[str, Any]] = Field(default=None, description="Additional document metadata")
 
-    @model_validator(mode='after')
-    def validate_metadata(self) -> 'Document':
+    @model_validator(mode="after")
+    def validate_metadata(self) -> "Document":
         """Ensure metadata is present and valid"""
         if not self.metadata:
-            self.metadata = {'source': 'default'}
+            self.metadata = {"source": "default"}
         return self
 
 
 class ProcessedChunk(Document):
     """Processed chunk that aligns with VectorStoreHandler schema"""
+
     pass
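The validate_config_presence validator above back-fills a default config object when only the preprocessor type is given. A brief usage sketch of the branch shown in this hunk:

# Selecting a preprocessor type without its config auto-creates the default.
config = PreprocessingConfig(type=PreprocessorType.CONTEXTUAL)
assert config.contextual_config is not None  # filled in by validate_config_presence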
mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py

@@ -106,6 +106,8 @@ class KnowledgeBaseInfoTool(BaseTool):
 
         for kb_name in kb_names:
             try:
+                self.db.check_knowledge_base_permission(Identifier(kb_name))
+
                 # Get knowledge base schema
                 schema_result = self.db.run_no_throw(str(Describe(kb_name, type="knowledge_base")))
 
mindsdb/interfaces/skills/sql_agent.py

@@ -3,7 +3,9 @@ import csv
 import inspect
 import traceback
 from io import StringIO
-from typing import Iterable, List, Optional, Any
+from typing import Iterable, List, Optional, Any, Tuple
+from collections import defaultdict
+import fnmatch
 
 import pandas as pd
 from mindsdb_sql_parser import parse_sql
@@ -75,12 +77,84 @@ def split_table_name(table_name: str) -> List[str]:
     if current:
         result.append(current.strip("`"))
 
-    # ensure we split the table name
-    # result = [r.split(".") for r in result][0]
-
     return result
 
 
+class TablesCollection:
+    """
+    Collection of identifiers.
+    Supports wildcard in tables name.
+    """
+
+    def __init__(self, items: List[Identifier | str] = None, default_db=None):
+        if items is None:
+            items = []
+
+        self.items = items
+        self._dbs = defaultdict(set)
+        self._schemas = defaultdict(dict)
+        self._no_db_tables = set()
+        self.has_wildcard = False
+        self.databases = set()
+        self._default_db = default_db
+
+        for name in items:
+            if not isinstance(name, Identifier):
+                name = Identifier(name)
+            db, schema, tbl = self._get_paths(name)
+            if db is None:
+                self._no_db_tables.add(tbl)
+            elif schema is None:
+                self._dbs[db].add(tbl)
+            else:
+                if schema not in self._schemas[db]:
+                    self._schemas[db][schema] = set()
+                self._schemas[db][schema].add(tbl)
+
+            if "*" in tbl:
+                self.has_wildcard = True
+            self.databases.add(db)
+
+    def _get_paths(self, table: Identifier) -> Tuple:
+        # split identifier to db, schema, table name
+        schema = None
+        db = None
+
+        match [x.lower() for x in table.parts]:
+            case [tbl]:
+                pass
+            case [db, tbl]:
+                pass
+            case [db, schema, tbl]:
+                pass
+            case _:
+                raise NotImplementedError
+        return db, schema, tbl.lower()
+
+    def match(self, table: Identifier) -> bool:
+        # Check if input table matches to tables in collection
+
+        db, schema, tbl = self._get_paths(table)
+        if db is None:
+            if tbl in self._no_db_tables:
+                return True
+            if self._default_db is not None:
+                return self.match(Identifier(parts=[self._default_db, tbl]))
+
+        if schema is not None:
+            if any([fnmatch.fnmatch(tbl, pattern) for pattern in self._schemas[db].get(schema, [])]):
+                return True
+
+        # table might be specified without schema
+        return any([fnmatch.fnmatch(tbl, pattern) for pattern in self._dbs[db]])
+
+    def __bool__(self):
+        return len(self.items) > 0
+
+    def __repr__(self):
+        return f"Tables({self.items})"
+
+
 class SQLAgent:
     """
     SQLAgent is a class that handles SQL queries for agents.
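The new TablesCollection is the core of this release's table/knowledge-base permission rework: identifiers are bucketed by database and schema, and table names are matched with fnmatch so an allow list can contain wildcards. A hedged usage sketch (assuming Identifier is the mindsdb_sql_parser AST class used throughout sql_agent.py, which splits dotted names into parts):

from mindsdb_sql_parser.ast import Identifier

tables = TablesCollection(["mydb.sales_*", "other_db.public.orders"])

assert tables.match(Identifier("mydb.sales_2024"))         # fnmatch against the sales_* pattern
assert not tables.match(Identifier("mydb.customers"))      # not in the collection
assert tables.match(Identifier("other_db.public.orders"))  # exact db.schema.table entry
assert tables.has_wildcard                                 # a "*" was seen in a table name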
@@ -117,21 +191,23 @@ class SQLAgent:
         self._command_executor = command_executor
         self._mindsdb_db_struct = databases_struct
         self.knowledge_base_database = knowledge_base_database  # This is a project name, not a database connection
+        self._databases = databases
         self._sample_rows_in_table_info = int(sample_rows_in_table_info)
 
-        self._tables_to_include = include_tables
-        self._tables_to_ignore = []
-        self._knowledge_bases_to_include = include_knowledge_bases
-        self._knowledge_bases_to_ignore = []
-        self._databases = databases
-        if not self._tables_to_include:
+        self._tables_to_include = TablesCollection(include_tables)
+        if self._tables_to_include:
             # ignore_tables and include_tables should not be used together.
             # include_tables takes priority if it's set.
-            self._tables_to_ignore = ignore_tables or []
-        if not self._knowledge_bases_to_include:
+            ignore_tables = []
+        self._tables_to_ignore = TablesCollection(ignore_tables)
+
+        self._knowledge_bases_to_include = TablesCollection(include_knowledge_bases, default_db=knowledge_base_database)
+        if self._knowledge_bases_to_include:
             # ignore_knowledge_bases and include_knowledge_bases should not be used together.
             # include_knowledge_bases takes priority if it's set.
-            self._knowledge_bases_to_ignore = ignore_knowledge_bases or []
+            ignore_knowledge_bases = []
+        self._knowledge_bases_to_ignore = TablesCollection(ignore_knowledge_bases, default_db=knowledge_base_database)
+
         self._cache = cache
 
         from mindsdb.interfaces.skills.skill_tool import SkillToolController
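The constructor change above keeps the old precedence rule but expresses it through TablesCollection truthiness: when an include list is present, the corresponding ignore list is discarded before being wrapped. A standalone sketch of the pattern:

include_tables = ["db1.t1"]
ignore_tables = ["db1.t2"]

tables_to_include = TablesCollection(include_tables)
if tables_to_include:       # truthy: the collection holds at least one item
    ignore_tables = []      # include list takes priority, ignore list is dropped
tables_to_ignore = TablesCollection(ignore_tables)

assert not tables_to_ignore  # __bool__ is False for an empty collection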
@@ -159,46 +235,54 @@ class SQLAgent:
         if not isinstance(ast_query, (Select, Show, Describe, Explain)):
             raise ValueError(f"Query is not allowed: {ast_query.to_string()}")
 
+        kb_names = self.get_all_knowledge_base_names()
+
         # Check tables
         if self._tables_to_include:
-            tables_parts = [split_table_name(x) for x in self._tables_to_include]
-            no_schema_parts = []
-            for t in tables_parts:
-                if len(t) == 3:
-                    no_schema_parts.append([t[0], t[2]])
-            tables_parts += no_schema_parts
 
             def _check_f(node, is_table=None, **kwargs):
                 if is_table and isinstance(node, Identifier):
                     table_name = ".".join(node.parts)
 
-                    # Get the list of available knowledge bases
-                    kb_names = self.get_usable_knowledge_base_names()
-
                     # Check if this table is a knowledge base
-                    is_kb = table_name in kb_names
-
-                    # If it's a knowledge base and we have knowledge base restrictions
-                    if is_kb and self._knowledge_bases_to_include:
-                        kb_parts = [split_table_name(x) for x in self._knowledge_bases_to_include]
-                        if node.parts not in kb_parts:
-                            raise ValueError(
-                                f"Knowledge base {table_name} not found. Available knowledge bases: {', '.join(self._knowledge_bases_to_include)}"
-                            )
-                    # Regular table check
-                    elif not is_kb and self._tables_to_include and node.parts not in tables_parts:
-                        raise ValueError(
-                            f"Table {table_name} not found. Available tables: {', '.join(self._tables_to_include)}"
-                        )
-                    # Check if it's a restricted knowledge base
-                    elif is_kb and table_name in self._knowledge_bases_to_ignore:
-                        raise ValueError(f"Knowledge base {table_name} is not allowed.")
-                    # Check if it's a restricted table
-                    elif not is_kb and table_name in self._tables_to_ignore:
-                        raise ValueError(f"Table {table_name} is not allowed.")
+                    if table_name in kb_names or node.parts[-1] in kb_names:
+                        # If it's a knowledge base and we have knowledge base restrictions
+                        self.check_knowledge_base_permission(node)
+                    else:
+                        try:
+                            # Regular table check
+                            self.check_table_permission(node)
+                        except ValueError as origin_exc:
+                            # was it badly quoted by llm?
+                            if len(node.parts) == 1 and node.is_quoted[0] and "." in node.parts[0]:
+                                node2 = Identifier(node.parts[0])
+                                try:
+                                    _check_f(node2, is_table=True)
+                                    return node2
+                                except ValueError:
+                                    ...
+                            raise origin_exc
 
             query_traversal(ast_query, _check_f)
 
+    def check_knowledge_base_permission(self, node):
+        if self._knowledge_bases_to_include and not self._knowledge_bases_to_include.match(node):
+            raise ValueError(
+                f"Knowledge base {str(node)} not found. Available knowledge bases: {', '.join(self._knowledge_bases_to_include.items)}"
+            )
+        # Check if it's a restricted knowledge base
+        if self._knowledge_bases_to_ignore and self._knowledge_bases_to_ignore.match(node):
+            raise ValueError(f"Knowledge base {str(node)} is not allowed.")
+
+    def check_table_permission(self, node):
+        if self._tables_to_include and not self._tables_to_include.match(node):
+            raise ValueError(
+                f"Table {str(node)} not found. Available tables: {', '.join(self._tables_to_include.items)}"
+            )
+        # Check if it's a restricted table
+        if self._tables_to_ignore and self._tables_to_ignore.match(node):
+            raise ValueError(f"Table {str(node)} is not allowed.")
+
     def get_usable_table_names(self) -> Iterable[str]:
         """Get a list of tables that the agent has access to.
 
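The except-branch above handles identifiers the LLM quoted as a single token: a backtick-quoted db1.t1 parses as one part containing a dot, so the check re-parses it and retries. A sketch of the re-parse step (same Identifier assumption as above):

quoted = Identifier(parts=["db1.t1"])   # as parsed from a quoted `db1.t1`
assert len(quoted.parts) == 1 and "." in quoted.parts[0]

reparsed = Identifier(quoted.parts[0])  # node2 in _check_f
assert reparsed.parts == ["db1", "t1"]  # now eligible for a db.table permission check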
213
297
  if cached_tables:
214
298
  return cached_tables
215
299
 
216
- if self._tables_to_include:
217
- return self._tables_to_include
300
+ if not self._tables_to_include:
301
+ # no tables allowed
302
+ return []
303
+ if not self._tables_to_include.has_wildcard:
304
+ return self._tables_to_include.items
218
305
 
219
306
  result_tables = []
220
307
 
221
- for db_name in self._mindsdb_db_struct:
308
+ for db_name in self._tables_to_include.databases:
222
309
  handler = self._command_executor.session.integration_controller.get_data_handler(db_name)
223
310
 
224
- schemas_names = list(self._mindsdb_db_struct[db_name].keys())
225
- if len(schemas_names) > 1 and None in schemas_names:
226
- raise Exception("default schema and named schemas can not be used in same filter")
227
-
228
- if None in schemas_names:
229
- # get tables only from default schema
230
- response = handler.get_tables()
231
- tables_in_default_schema = list(response.data_frame.table_name)
232
- schema_tables_restrictions = self._mindsdb_db_struct[db_name][None] # None - is default schema
233
- if schema_tables_restrictions is None:
234
- for table_name in tables_in_default_schema:
235
- result_tables.append([db_name, table_name])
236
- else:
237
- for table_name in schema_tables_restrictions:
238
- if table_name in tables_in_default_schema:
239
- result_tables.append([db_name, table_name])
311
+ if "all" in inspect.signature(handler.get_tables).parameters:
312
+ response = handler.get_tables(all=True)
240
313
  else:
241
- if "all" in inspect.signature(handler.get_tables).parameters:
242
- response = handler.get_tables(all=True)
314
+ response = handler.get_tables()
315
+ df = response.data_frame
316
+ col_name = "table_name"
317
+ if col_name not in df.columns:
318
+ # get first column if not found
319
+ col_name = df.columns[0]
320
+
321
+ for _, row in df.iterrows():
322
+ if "table_schema" in row:
323
+ parts = [db_name, row["table_schema"], row[col_name]]
243
324
  else:
244
- response = handler.get_tables()
245
- response_schema_names = list(response.data_frame.table_schema.unique())
246
- schemas_intersection = set(schemas_names) & set(response_schema_names)
247
- if len(schemas_intersection) == 0:
248
- raise Exception("There are no allowed schemas in ds")
249
-
250
- for schema_name in schemas_intersection:
251
- schema_sub_df = response.data_frame[response.data_frame["table_schema"] == schema_name]
252
- if self._mindsdb_db_struct[db_name][schema_name] is None:
253
- # all tables from schema allowed
254
- for row in schema_sub_df:
255
- result_tables.append([db_name, schema_name, row["table_name"]])
256
- else:
257
- for table_name in self._mindsdb_db_struct[db_name][schema_name]:
258
- if table_name in schema_sub_df["table_name"].values:
259
- result_tables.append([db_name, schema_name, table_name])
325
+ parts = [db_name, row[col_name]]
326
+ if self._tables_to_include.match(Identifier(parts=parts)):
327
+ if not self._tables_to_ignore.match(Identifier(parts=parts)):
328
+ result_tables.append(parts)
260
329
 
261
330
  result_tables = [".".join(x) for x in result_tables]
262
331
  if self._cache:
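The rewritten get_usable_table_names enumerates only the databases named in the include list, probing each handler for an optional all parameter before requesting every schema. A minimal, self-contained sketch of that signature probe:

import inspect


def get_tables(all=False):  # stand-in for a handler method
    return "all schemas" if all else "default schema only"


if "all" in inspect.signature(get_tables).parameters:
    response = get_tables(all=True)
else:
    response = get_tables()

assert response == "all schemas"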
@@ -269,7 +338,28 @@
         Returns:
             Iterable[str]: list with knowledge base names
         """
-        cache_key = f"{ctx.company_id}_{self.knowledge_base_database}_knowledge_bases"
+
+        if not self._knowledge_bases_to_include and not self._knowledge_bases_to_ignore:
+            # white or black list have to be set
+            return []
+
+        # Filter knowledge bases based on ignore list
+        kb_names = []
+        for kb_name in self.get_all_knowledge_base_names():
+            kb = Identifier(parts=[self.knowledge_base_database, kb_name])
+            if self._knowledge_bases_to_include and not self._knowledge_bases_to_include.match(kb):
+                continue
+            if not self._knowledge_bases_to_ignore.match(kb):
+                kb_names.append(kb_name)
+        return kb_names
+
+    def get_all_knowledge_base_names(self) -> Iterable[str]:
+        """Get a list of all knowledge bases
+
+        Returns:
+            Iterable[str]: list with knowledge base names
+        """
+        # cache_key = f"{ctx.company_id}_{self.knowledge_base_database}_knowledge_bases"
 
         # todo we need to fix the cache, file cache can potentially store out of data information
         # # first check cache and return if found
@@ -278,58 +368,18 @@
         # if cached_kbs:
         #     return cached_kbs
 
-        if self._knowledge_bases_to_include:
-            return self._knowledge_bases_to_include
-
         try:
             # Query to get all knowledge bases
-            query = f"SHOW KNOWLEDGE_BASES FROM {self.knowledge_base_database};"
-            try:
-                result = self._call_engine(query, database=self.knowledge_base_database)
-            except Exception as e:
-                # If the direct query fails, try a different approach
-                # This handles the case where knowledge_base_database is not a valid integration
-                logger.warning(f"Error querying knowledge bases from {self.knowledge_base_database}: {str(e)}")
-                # Try to get knowledge bases directly from the project database
-                try:
-                    # Get knowledge bases from the project database
-                    kb_controller = self._command_executor.session.kb_controller
-                    kb_names = [kb["name"] for kb in kb_controller.list()]
-
-                    # Filter knowledge bases based on include list
-                    if self._knowledge_bases_to_include:
-                        kb_names = [kb_name for kb_name in kb_names if kb_name in self._knowledge_bases_to_include]
-                        if not kb_names:
-                            logger.warning(
-                                f"No knowledge bases found in the include list: {self._knowledge_bases_to_include}"
-                            )
-                            return []
-
-                        return kb_names
-
-                    # Filter knowledge bases based on ignore list
-                    kb_names = [kb_name for kb_name in kb_names if kb_name not in self._knowledge_bases_to_ignore]
-
-                    if self._cache:
-                        self._cache.set(cache_key, set(kb_names))
-
-                    return kb_names
-                except Exception as inner_e:
-                    logger.error(f"Error getting knowledge bases from kb_controller: {str(inner_e)}")
-                    return []
-
-            if not result:
-                return []
+            ast_query = Show(category="Knowledge Bases")
+            result = self._command_executor.execute_command(ast_query, database_name=self.knowledge_base_database)
 
             # Filter knowledge bases based on ignore list
             kb_names = []
-            for row in result:
-                kb_name = row["name"]
-                if kb_name not in self._knowledge_bases_to_ignore:
-                    kb_names.append(kb_name)
+            for row in result.data.records:
+                kb_names.append(row["NAME"])
 
-            if self._cache:
-                self._cache.set(cache_key, set(kb_names))
+            # if self._cache:
+            #     self._cache.set(cache_key, set(kb_names))
 
             return kb_names
         except Exception as e:
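The fallback-laden string query is gone: the new code builds the SHOW statement as an AST node and runs it through the command executor. A sketch of the AST construction (assuming Show is importable from mindsdb_sql_parser.ast, as the isinstance checks in this file suggest; the rendered SQL is approximate):

from mindsdb_sql_parser.ast import Show

ast_query = Show(category="Knowledge Bases")
print(ast_query.to_string())  # renders roughly as: SHOW KNOWLEDGE BASES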
@@ -369,7 +419,7 @@
             table_identifier = tables_idx.get(tuple(table_parts))
 
             if table_identifier is None:
-                raise ValueError(f"Table {table} not found in the database")
+                raise ValueError(f"Table {table_name} not found in the database")
             tables.append(table_identifier)
 
         return tables
@@ -411,13 +461,14 @@
         if len(parts) == 1:
             raise ValueError(f"Invalid table name: {name}. Expected format is 'database.table'.")
 
-        database_table_map[parts[0]] = database_table_map.get(parts[0], []) + [parts[1]]
+        database_table_map.setdefault(parts[0], []).append(parts[1])
 
         data_catalog_str = ""
         for database_name, table_names in database_table_map.items():
             data_catalog_reader = DataCatalogReader(database_name=database_name, table_names=table_names)
 
-            data_catalog_str += data_catalog_reader.read_metadata_as_string()
+            result = data_catalog_reader.read_metadata_as_string()
+            data_catalog_str += str(result or "")
 
         return data_catalog_str
 
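The setdefault change is behavior-preserving but appends in place instead of rebuilding the list on every insert, and the str(result or "") guard tolerates a None return from the catalog reader. A tiny sketch of the mapping pattern:

database_table_map = {}
for db, table in [("db1", "t1"), ("db1", "t2"), ("db2", "t3")]:
    database_table_map.setdefault(db, []).append(table)

assert database_table_map == {"db1": ["t1", "t2"], "db2": ["t3"]}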
@@ -430,7 +481,7 @@
 
         split = name.split(".")
         if len(split) > 1:
-            all_tables.append(Identifier(parts=[split[0], split[1]]))
+            all_tables.append(Identifier(parts=[split[0], split[-1]]))
         else:
             all_tables.append(Identifier(name))
 
mindsdb/interfaces/storage/db.py

@@ -684,10 +684,10 @@ class MetaColumns(Base):
         if self.default_value:
             column_info += f"\n{pad}- Default Value: {self.default_value}"
 
-        if self.meta_column_statistics:
+        stats = self.meta_column_statistics or []
+        if stats and callable(getattr(stats[0], "as_string", None)):
             column_info += f"\n\n{pad}- Column Statistics:"
-            column_info += f"\n{self.meta_column_statistics[0].as_string(indent + 4)}"
-
+            column_info += f"\n{stats[0].as_string(indent + 4)}"
         return column_info
 
 
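The guard above avoids an AttributeError when the relationship is empty or its first entry lacks a usable as_string method. A standalone sketch of the pattern:

stats = []  # stand-in for self.meta_column_statistics or []
if stats and callable(getattr(stats[0], "as_string", None)):
    rendered = stats[0].as_string(4)
else:
    rendered = ""  # nothing to render; the old code could raise here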
@@ -708,18 +708,20 @@ class MetaColumnStatistics(Base):
         inner_pad = " " * (indent + 4)
 
         column_statistics = ""
+        most_common_values = self.most_common_values or []
+        most_common_frequencies = self.most_common_frequencies or []
 
-        if any(self.most_common_values) and any(self.most_common_frequencies):
+        if most_common_values and most_common_frequencies:
             column_statistics += f"{pad}- Top 10 Most Common Values and Frequencies:"
-            for i in range(min(10, len(self.most_common_values))):
-                freq = self.most_common_frequencies[i]
+            for i in range(min(10, len(most_common_values))):
+                freq = most_common_frequencies[i]
                 try:
                     percent = float(freq) * 100
                     freq_str = f"{percent:.2f}%"
                 except (ValueError, TypeError):
                     freq_str = str(freq)
 
-                column_statistics += f"\n{inner_pad}- {self.most_common_values[i]}: {freq_str}"
+                column_statistics += f"\n{inner_pad}- {most_common_values[i]}: {freq_str}"
             column_statistics += "\n"
 
         if self.null_percentage:
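The loop above formats each stored frequency as a percentage, falling back to the raw string when the value is not numeric. A compact sketch of that conversion:

for freq in ("0.4215", "n/a"):
    try:
        freq_str = f"{float(freq) * 100:.2f}%"
    except (ValueError, TypeError):
        freq_str = str(freq)
    print(freq_str)  # prints 42.15% then n/a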