MindsDB 25.6.4.0-py3-none-any.whl → 25.7.1.0-py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/command_executor.py +8 -6
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/planner/query_prepare.py +68 -87
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
- mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
- mindsdb/api/http/namespaces/file.py +49 -24
- mindsdb/api/mcp/start.py +45 -31
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +150 -140
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +86 -77
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
- mindsdb/interfaces/agents/agents_controller.py +29 -9
- mindsdb/interfaces/agents/langchain_agent.py +7 -5
- mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
- mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
- mindsdb/interfaces/knowledge_base/controller.py +115 -89
- mindsdb/interfaces/knowledge_base/evaluate.py +16 -4
- mindsdb/interfaces/knowledge_base/executor.py +346 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
- mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
- mindsdb/interfaces/skills/sql_agent.py +181 -130
- mindsdb/interfaces/storage/db.py +9 -7
- mindsdb/utilities/config.py +12 -1
- mindsdb/utilities/exception.py +47 -7
- mindsdb/utilities/security.py +54 -11
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/METADATA +248 -262
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/RECORD +46 -45
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/knowledge_base/preprocessing/models.py
CHANGED
@@ -18,45 +18,31 @@ class PreprocessorType(Enum):
 
 class BasePreprocessingConfig(BaseModel):
     """Base configuration for preprocessing"""
+
     chunk_size: int = Field(default=DEFAULT_CHUNK_SIZE, description="Size of document chunks")
     chunk_overlap: int = Field(default=DEFAULT_CHUNK_OVERLAP, description="Overlap between chunks")
+    doc_id_column_name: str = Field(default="_original_doc_id", description="Name of doc_id columns in metadata")
 
 
 class ContextualConfig(BasePreprocessingConfig):
     """Configuration specific to contextual preprocessing"""
+
     llm_config: LLMConfig = Field(
-        default_factory=LLMConfig,
-        description="LLM configuration to use for context generation"
-    )
-    context_template: Optional[str] = Field(
-        default=None,
-        description="Custom template for context generation"
-    )
-    summarize: Optional[bool] = Field(
-        default=False,
-        description="Whether to return chunks as summarizations"
+        default_factory=LLMConfig, description="LLM configuration to use for context generation"
     )
+    context_template: Optional[str] = Field(default=None, description="Custom template for context generation")
+    summarize: Optional[bool] = Field(default=False, description="Whether to return chunks as summarizations")
 
 
-class TextChunkingConfig(BaseModel):
+class TextChunkingConfig(BasePreprocessingConfig):
     """Configuration for text chunking preprocessor using Pydantic"""
-    chunk_size: int = Field(
-        default=1000,
-        description="The target size of each text chunk",
-        gt=0
-    )
-    chunk_overlap: int = Field(
-        default=200,
-        description="The number of characters to overlap between chunks",
-        ge=0
-    )
-    length_function: Callable = Field(
-        default=len,
-        description="Function to measure text length"
-    )
+
+    chunk_size: int = Field(default=1000, description="The target size of each text chunk", gt=0)
+    chunk_overlap: int = Field(default=200, description="The number of characters to overlap between chunks", ge=0)
+    length_function: Callable = Field(default=len, description="Function to measure text length")
     separators: List[str] = Field(
         default=["\n\n", "\n", " ", ""],
-        description="List of separators to use for splitting text, in order of priority"
+        description="List of separators to use for splitting text, in order of priority",
     )
 
     class Config:
@@ -65,44 +51,28 @@ class TextChunkingConfig(BaseModel):
 
 class JSONChunkingConfig(BasePreprocessingConfig):
     """Configuration for JSON chunking preprocessor"""
-    flatten_nested: bool = Field(
-        default=True,
-        description="Whether to flatten nested JSON structures"
-    )
-    include_metadata: bool = Field(
-        default=True,
-        description="Whether to include original metadata in chunks"
-    )
+
+    flatten_nested: bool = Field(default=True, description="Whether to flatten nested JSON structures")
+    include_metadata: bool = Field(default=True, description="Whether to include original metadata in chunks")
     chunk_by_object: bool = Field(
-        default=True,
-        description="Whether to chunk by top-level objects (True) or create a single document (False)"
-    )
-    exclude_fields: List[str] = Field(
-        default_factory=list,
-        description="List of fields to exclude from chunking"
+        default=True, description="Whether to chunk by top-level objects (True) or create a single document (False)"
     )
+    exclude_fields: List[str] = Field(default_factory=list, description="List of fields to exclude from chunking")
     include_fields: List[str] = Field(
         default_factory=list,
-        description="List of fields to include in chunking (if empty, all fields except excluded ones are included)"
+        description="List of fields to include in chunking (if empty, all fields except excluded ones are included)",
    )
     metadata_fields: List[str] = Field(
         default_factory=list,
         description="List of fields to extract into metadata for filtering "
-        …
+        "(can include nested fields using dot notation). "
+        "If empty, all primitive fields will be extracted (top-level fields if available, otherwise all primitive fields in the flattened structure).",
     )
     extract_all_primitives: bool = Field(
-        default=False,
-        description="Whether to extract all primitive values (strings, numbers, booleans) into metadata"
-    )
-    nested_delimiter: str = Field(
-        default=".",
-        description="Delimiter for flattened nested field names"
-    )
-    content_column: str = Field(
-        default="content",
-        description="Name of the content column for chunk ID generation"
+        default=False, description="Whether to extract all primitive values (strings, numbers, booleans) into metadata"
     )
+    nested_delimiter: str = Field(default=".", description="Delimiter for flattened nested field names")
+    content_column: str = Field(default="content", description="Name of the content column for chunk ID generation")
 
     class Config:
         arbitrary_types_allowed = True
@@ -110,25 +80,20 @@ class JSONChunkingConfig(BasePreprocessingConfig):
 
 class PreprocessingConfig(BaseModel):
     """Complete preprocessing configuration"""
-    type: PreprocessorType = Field(
-        default=PreprocessorType.TEXT_CHUNKING,
-        description="Type of preprocessing to apply"
-    )
+
+    type: PreprocessorType = Field(default=PreprocessorType.TEXT_CHUNKING, description="Type of preprocessing to apply")
     contextual_config: Optional[ContextualConfig] = Field(
-        default=None,
-        description="Configuration for contextual preprocessing"
+        default=None, description="Configuration for contextual preprocessing"
     )
     text_chunking_config: Optional[TextChunkingConfig] = Field(
-        default=None,
-        description="Configuration for text chunking preprocessing"
+        default=None, description="Configuration for text chunking preprocessing"
     )
     json_chunking_config: Optional[JSONChunkingConfig] = Field(
-        default=None,
-        description="Configuration for JSON chunking preprocessing"
+        default=None, description="Configuration for JSON chunking preprocessing"
     )
 
-    @model_validator(mode='after')
-    def validate_config_presence(self) -> 'PreprocessingConfig':
+    @model_validator(mode="after")
+    def validate_config_presence(self) -> "PreprocessingConfig":
         """Ensure the appropriate config is present for the chosen type"""
         if self.type == PreprocessorType.CONTEXTUAL and not self.contextual_config:
             self.contextual_config = ContextualConfig()
@@ -137,26 +102,28 @@ class PreprocessingConfig(BaseModel):
         if self.type == PreprocessorType.JSON_CHUNKING and not self.json_chunking_config:
             # Import here to avoid circular imports
             from mindsdb.interfaces.knowledge_base.preprocessing.json_chunker import JSONChunkingConfig
+
             self.json_chunking_config = JSONChunkingConfig()
         return self
 
 
 class Document(BaseModel):
-
     """Document model with default metadata handling"""
+
     id: Optional[Union[int, str]] = Field(default=None, description="Unique identifier for the document")
     content: str = Field(description="The document content")
     embeddings: Optional[List[float]] = Field(default=None, description="Vector embeddings of the content")
     metadata: Optional[Dict[str, Any]] = Field(default=None, description="Additional document metadata")
 
-    @model_validator(mode='after')
-    def validate_metadata(self) -> 'Document':
+    @model_validator(mode="after")
+    def validate_metadata(self) -> "Document":
         """Ensure metadata is present and valid"""
         if not self.metadata:
-            self.metadata = {'source': 'default'}
+            self.metadata = {"source": "default"}
         return self
 
 
 class ProcessedChunk(Document):
     """Processed chunk that aligns with VectorStoreHandler schema"""
+
     pass
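For orientation, a minimal sketch (not part of the diff) of how these Pydantic models behave after the change; it assumes only what the hunks above show, namely that the "after" validator back-fills a default sub-config for the chosen preprocessor type:

from mindsdb.interfaces.knowledge_base.preprocessing.models import (
    PreprocessingConfig,
    PreprocessorType,
    TextChunkingConfig,
)

# validate_config_presence creates a default ContextualConfig when the
# type is CONTEXTUAL and no contextual_config was supplied.
cfg = PreprocessingConfig(type=PreprocessorType.CONTEXTUAL)
assert cfg.contextual_config is not None

# Field constraints survive the one-line reflow, e.g. chunk_size gt=0.
TextChunkingConfig(chunk_size=500)   # ok
# TextChunkingConfig(chunk_size=0)   # would raise a ValidationError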
mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py
CHANGED
@@ -106,6 +106,8 @@ class KnowledgeBaseInfoTool(BaseTool):
 
         for kb_name in kb_names:
             try:
+                self.db.check_knowledge_base_permission(Identifier(kb_name))
+
                 # Get knowledge base schema
                 schema_result = self.db.run_no_throw(str(Describe(kb_name, type="knowledge_base")))
 
mindsdb/interfaces/skills/sql_agent.py
CHANGED
@@ -3,7 +3,9 @@ import csv
 import inspect
 import traceback
 from io import StringIO
-from typing import Iterable, List, Optional, Any
+from typing import Iterable, List, Optional, Any, Tuple
+from collections import defaultdict
+import fnmatch
 
 import pandas as pd
 from mindsdb_sql_parser import parse_sql
@@ -75,12 +77,84 @@ def split_table_name(table_name: str) -> List[str]:
     if current:
         result.append(current.strip("`"))
 
-    # ensure we split the table name
-    # result = [r.split(".") for r in result][0]
-
     return result
 
 
+class TablesCollection:
+    """
+    Collection of identifiers.
+    Supports wildcard in tables name.
+    """
+
+    def __init__(self, items: List[Identifier | str] = None, default_db=None):
+        if items is None:
+            items = []
+
+        self.items = items
+        self._dbs = defaultdict(set)
+        self._schemas = defaultdict(dict)
+        self._no_db_tables = set()
+        self.has_wildcard = False
+        self.databases = set()
+        self._default_db = default_db
+
+        for name in items:
+            if not isinstance(name, Identifier):
+                name = Identifier(name)
+            db, schema, tbl = self._get_paths(name)
+            if db is None:
+                self._no_db_tables.add(tbl)
+            elif schema is None:
+                self._dbs[db].add(tbl)
+            else:
+                if schema not in self._schemas[db]:
+                    self._schemas[db][schema] = set()
+                self._schemas[db][schema].add(tbl)
+
+            if "*" in tbl:
+                self.has_wildcard = True
+            self.databases.add(db)
+
+    def _get_paths(self, table: Identifier) -> Tuple:
+        # split identifier to db, schema, table name
+        schema = None
+        db = None
+
+        match [x.lower() for x in table.parts]:
+            case [tbl]:
+                pass
+            case [db, tbl]:
+                pass
+            case [db, schema, tbl]:
+                pass
+            case _:
+                raise NotImplementedError
+        return db, schema, tbl.lower()
+
+    def match(self, table: Identifier) -> bool:
+        # Check if input table matches to tables in collection
+
+        db, schema, tbl = self._get_paths(table)
+        if db is None:
+            if tbl in self._no_db_tables:
+                return True
+            if self._default_db is not None:
+                return self.match(Identifier(parts=[self._default_db, tbl]))
+
+        if schema is not None:
+            if any([fnmatch.fnmatch(tbl, pattern) for pattern in self._schemas[db].get(schema, [])]):
+                return True
+
+        # table might be specified without schema
+        return any([fnmatch.fnmatch(tbl, pattern) for pattern in self._dbs[db]])
+
+    def __bool__(self):
+        return len(self.items) > 0
+
+    def __repr__(self):
+        return f"Tables({self.items})"
+
+
 class SQLAgent:
     """
     SQLAgent is a class that handles SQL queries for agents.
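As a standalone, stdlib-only sketch, this is the shell-style matching that TablesCollection delegates to: fnmatch treats "*" in a stored table name as a wildcard, which is what makes include/ignore entries like db.orders_* work (the patterns below are hypothetical):

import fnmatch

patterns = {"orders_*", "customers"}   # e.g. from an include_tables list

assert any(fnmatch.fnmatch("orders_2024", p) for p in patterns)
assert any(fnmatch.fnmatch("customers", p) for p in patterns)
assert not any(fnmatch.fnmatch("payments", p) for p in patterns)

Note that _get_paths lower-cases identifier parts before matching, so the comparison is effectively case-insensitive.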
@@ -117,21 +191,23 @@ class SQLAgent:
         self._command_executor = command_executor
         self._mindsdb_db_struct = databases_struct
         self.knowledge_base_database = knowledge_base_database  # This is a project name, not a database connection
+        self._databases = databases
         self._sample_rows_in_table_info = int(sample_rows_in_table_info)
 
-        self._tables_to_include = include_tables
-        self.…
-        self._knowledge_bases_to_include = include_knowledge_bases
-        self._knowledge_bases_to_ignore = []
-        self._databases = databases
-        if not self._tables_to_include:
+        self._tables_to_include = TablesCollection(include_tables)
+        if self._tables_to_include:
             # ignore_tables and include_tables should not be used together.
             # include_tables takes priority if it's set.
-            …
-            …
+            ignore_tables = []
+        self._tables_to_ignore = TablesCollection(ignore_tables)
+
+        self._knowledge_bases_to_include = TablesCollection(include_knowledge_bases, default_db=knowledge_base_database)
+        if self._knowledge_bases_to_include:
             # ignore_knowledge_bases and include_knowledge_bases should not be used together.
             # include_knowledge_bases takes priority if it's set.
-            …
+            ignore_knowledge_bases = []
+        self._knowledge_bases_to_ignore = TablesCollection(ignore_knowledge_bases, default_db=knowledge_base_database)
+
         self._cache = cache
 
         from mindsdb.interfaces.skills.skill_tool import SkillToolController
@@ -159,46 +235,54 @@ class SQLAgent:
         if not isinstance(ast_query, (Select, Show, Describe, Explain)):
             raise ValueError(f"Query is not allowed: {ast_query.to_string()}")
 
+        kb_names = self.get_all_knowledge_base_names()
+
         # Check tables
         if self._tables_to_include:
-            tables_parts = [split_table_name(x) for x in self._tables_to_include]
-            no_schema_parts = []
-            for t in tables_parts:
-                if len(t) == 3:
-                    no_schema_parts.append([t[0], t[2]])
-            tables_parts += no_schema_parts
 
         def _check_f(node, is_table=None, **kwargs):
             if is_table and isinstance(node, Identifier):
                 table_name = ".".join(node.parts)
 
-                # Get the list of available knowledge bases
-                kb_names = self.get_usable_knowledge_base_names()
-
                 # Check if this table is a knowledge base
-                …
-                # Check if it's a restricted table
-                elif not is_kb and table_name in self._tables_to_ignore:
-                    raise ValueError(f"Table {table_name} is not allowed.")
+                if table_name in kb_names or node.parts[-1] in kb_names:
+                    # If it's a knowledge base and we have knowledge base restrictions
+                    self.check_knowledge_base_permission(node)
+                else:
+                    try:
+                        # Regular table check
+                        self.check_table_permission(node)
+                    except ValueError as origin_exc:
+                        # was it badly quoted by llm?
+                        if len(node.parts) == 1 and node.is_quoted[0] and "." in node.parts[0]:
+                            node2 = Identifier(node.parts[0])
+                            try:
+                                _check_f(node2, is_table=True)
+                                return node2
+                            except ValueError:
+                                ...
+                        raise origin_exc
 
         query_traversal(ast_query, _check_f)
 
+    def check_knowledge_base_permission(self, node):
+        if self._knowledge_bases_to_include and not self._knowledge_bases_to_include.match(node):
+            raise ValueError(
+                f"Knowledge base {str(node)} not found. Available knowledge bases: {', '.join(self._knowledge_bases_to_include.items)}"
+            )
+        # Check if it's a restricted knowledge base
+        if self._knowledge_bases_to_ignore and self._knowledge_bases_to_ignore.match(node):
+            raise ValueError(f"Knowledge base {str(node)} is not allowed.")
+
+    def check_table_permission(self, node):
+        if self._tables_to_include and not self._tables_to_include.match(node):
+            raise ValueError(
+                f"Table {str(node)} not found. Available tables: {', '.join(self._tables_to_include.items)}"
+            )
+        # Check if it's a restricted table
+        if self._tables_to_ignore and self._tables_to_ignore.match(node):
+            raise ValueError(f"Table {str(node)} is not allowed.")
+
     def get_usable_table_names(self) -> Iterable[str]:
         """Get a list of tables that the agent has access to.
 
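A small sketch of the "badly quoted by llm" fallback in _check_f above, assuming mindsdb_sql_parser's behavior that Identifier(string) splits a dotted path into parts (which the fallback relies on):

from mindsdb_sql_parser.ast import Identifier

# An LLM that quotes the whole name produces a single quoted part
# containing a dot, e.g. SELECT * FROM "mydb.orders".
node = Identifier(parts=["mydb.orders"])

# Re-parsing the raw string splits it into real parts, so include/ignore
# matching can see the database prefix.
node2 = Identifier(node.parts[0])
print(node2.parts)  # expected: ["mydb", "orders"]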
@@ -213,50 +297,35 @@ class SQLAgent:
         if cached_tables:
             return cached_tables
 
-        if self._tables_to_include:
-            …
+        if not self._tables_to_include:
+            # no tables allowed
+            return []
+        if not self._tables_to_include.has_wildcard:
+            return self._tables_to_include.items
 
         result_tables = []
 
-        for db_name in self.…
+        for db_name in self._tables_to_include.databases:
             handler = self._command_executor.session.integration_controller.get_data_handler(db_name)
 
-            …
-                raise Exception("default schema and named schemas can not be used in same filter")
-            …
-            if None in schemas_names:
-                # get tables only from default schema
-                response = handler.get_tables()
-                tables_in_default_schema = list(response.data_frame.table_name)
-                schema_tables_restrictions = self._mindsdb_db_struct[db_name][None]  # None - is default schema
-                if schema_tables_restrictions is None:
-                    for table_name in tables_in_default_schema:
-                        result_tables.append([db_name, table_name])
-                else:
-                    for table_name in schema_tables_restrictions:
-                        if table_name in tables_in_default_schema:
-                            result_tables.append([db_name, table_name])
+            if "all" in inspect.signature(handler.get_tables).parameters:
+                response = handler.get_tables(all=True)
             else:
-                …
+                response = handler.get_tables()
+            df = response.data_frame
+            col_name = "table_name"
+            if col_name not in df.columns:
+                # get first column if not found
+                col_name = df.columns[0]
+
+            for _, row in df.iterrows():
+                if "table_schema" in row:
+                    parts = [db_name, row["table_schema"], row[col_name]]
                 else:
-                    …
-                        raise Exception("There are no allowed schemas in ds")
-
-                    for schema_name in schemas_intersection:
-                        schema_sub_df = response.data_frame[response.data_frame["table_schema"] == schema_name]
-                        if self._mindsdb_db_struct[db_name][schema_name] is None:
-                            # all tables from schema allowed
-                            for row in schema_sub_df:
-                                result_tables.append([db_name, schema_name, row["table_name"]])
-                        else:
-                            for table_name in self._mindsdb_db_struct[db_name][schema_name]:
-                                if table_name in schema_sub_df["table_name"].values:
-                                    result_tables.append([db_name, schema_name, table_name])
+                    parts = [db_name, row[col_name]]
+                if self._tables_to_include.match(Identifier(parts=parts)):
+                    if not self._tables_to_ignore.match(Identifier(parts=parts)):
+                        result_tables.append(parts)
 
         result_tables = [".".join(x) for x in result_tables]
         if self._cache:
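The schema listing above probes handler capabilities rather than assuming them; this stdlib-only sketch shows the same inspect.signature check with a stand-in function:

import inspect

def get_tables(all=False):
    # Stand-in for a handler method; newer handlers accept all=True.
    return "all schemas" if all else "default schema"

# Pass all=True only when the parameter actually exists.
if "all" in inspect.signature(get_tables).parameters:
    response = get_tables(all=True)
else:
    response = get_tables()
print(response)  # "all schemas"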
@@ -269,7 +338,28 @@ class SQLAgent:
         Returns:
             Iterable[str]: list with knowledge base names
         """
-        …
+
+        if not self._knowledge_bases_to_include and not self._knowledge_bases_to_ignore:
+            # white or black list have to be set
+            return []
+
+        # Filter knowledge bases based on ignore list
+        kb_names = []
+        for kb_name in self.get_all_knowledge_base_names():
+            kb = Identifier(parts=[self.knowledge_base_database, kb_name])
+            if self._knowledge_bases_to_include and not self._knowledge_bases_to_include.match(kb):
+                continue
+            if not self._knowledge_bases_to_ignore.match(kb):
+                kb_names.append(kb_name)
+        return kb_names
+
+    def get_all_knowledge_base_names(self) -> Iterable[str]:
+        """Get a list of all knowledge bases
+
+        Returns:
+            Iterable[str]: list with knowledge base names
+        """
+        # cache_key = f"{ctx.company_id}_{self.knowledge_base_database}_knowledge_bases"
 
         # todo we need to fix the cache, file cache can potentially store out of data information
         # # first check cache and return if found
@@ -278,58 +368,18 @@ class SQLAgent:
         # if cached_kbs:
         #     return cached_kbs
 
-        if self._knowledge_bases_to_include:
-            return self._knowledge_bases_to_include
-
         try:
             # Query to get all knowledge bases
-            …
-            result = self._call_engine(query, database=self.knowledge_base_database)
-        except Exception as e:
-            # If the direct query fails, try a different approach
-            # This handles the case where knowledge_base_database is not a valid integration
-            logger.warning(f"Error querying knowledge bases from {self.knowledge_base_database}: {str(e)}")
-            # Try to get knowledge bases directly from the project database
-            try:
-                # Get knowledge bases from the project database
-                kb_controller = self._command_executor.session.kb_controller
-                kb_names = [kb["name"] for kb in kb_controller.list()]
-
-                # Filter knowledge bases based on include list
-                if self._knowledge_bases_to_include:
-                    kb_names = [kb_name for kb_name in kb_names if kb_name in self._knowledge_bases_to_include]
-                    if not kb_names:
-                        logger.warning(
-                            f"No knowledge bases found in the include list: {self._knowledge_bases_to_include}"
-                        )
-                        return []
-
-                    return kb_names
-
-                # Filter knowledge bases based on ignore list
-                kb_names = [kb_name for kb_name in kb_names if kb_name not in self._knowledge_bases_to_ignore]
-
-                if self._cache:
-                    self._cache.set(cache_key, set(kb_names))
-
-                return kb_names
-            except Exception as inner_e:
-                logger.error(f"Error getting knowledge bases from kb_controller: {str(inner_e)}")
-                return []
-
-        if not result:
-            return []
+            ast_query = Show(category="Knowledge Bases")
+            result = self._command_executor.execute_command(ast_query, database_name=self.knowledge_base_database)
 
             # Filter knowledge bases based on ignore list
             kb_names = []
-            for row in result:
-                …
-                if kb_name not in self._knowledge_bases_to_ignore:
-                    kb_names.append(kb_name)
+            for row in result.data.records:
+                kb_names.append(row["NAME"])
 
-            if self._cache:
-                …
+            # if self._cache:
+            #     self._cache.set(cache_key, set(kb_names))
 
             return kb_names
         except Exception as e:
@@ -369,7 +419,7 @@ class SQLAgent:
             table_identifier = tables_idx.get(tuple(table_parts))
 
             if table_identifier is None:
-                raise ValueError(f"Table {…
+                raise ValueError(f"Table {table_name} not found in the database")
             tables.append(table_identifier)
 
         return tables
@@ -411,13 +461,14 @@ class SQLAgent:
             if len(parts) == 1:
                 raise ValueError(f"Invalid table name: {name}. Expected format is 'database.table'.")
 
-            database_table_map…
+            database_table_map.setdefault(parts[0], []).append(parts[1])
 
         data_catalog_str = ""
         for database_name, table_names in database_table_map.items():
             data_catalog_reader = DataCatalogReader(database_name=database_name, table_names=table_names)
 
-            …
+            result = data_catalog_reader.read_metadata_as_string()
+            data_catalog_str += str(result or "")
 
         return data_catalog_str
 
@@ -430,7 +481,7 @@ class SQLAgent:
 
         split = name.split(".")
         if len(split) > 1:
-            all_tables.append(Identifier(parts=[split[0], split[1]]))
+            all_tables.append(Identifier(parts=[split[0], split[-1]]))
         else:
             all_tables.append(Identifier(name))
 
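The split[0]/split[-1] change above only matters for three-part names; a quick illustration:

name = "pg.public.orders"        # database.schema.table
split = name.split(".")
print([split[0], split[1]])      # ['pg', 'public'] - old code picked the schema
print([split[0], split[-1]])     # ['pg', 'orders'] - new code picks the table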
mindsdb/interfaces/storage/db.py
CHANGED
@@ -684,10 +684,10 @@ class MetaColumns(Base):
         if self.default_value:
             column_info += f"\n{pad}- Default Value: {self.default_value}"
 
-        if self.meta_column_statistics:
+        stats = self.meta_column_statistics or []
+        if stats and callable(getattr(stats[0], "as_string", None)):
             column_info += f"\n\n{pad}- Column Statistics:"
-            column_info += f"\n{…
-            …
+            column_info += f"\n{stats[0].as_string(indent + 4)}"
         return column_info
 
 
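The guard added above avoids an AttributeError when the statistics relationship is empty or its first item lacks an as_string method; a stdlib-only sketch of the same getattr/callable pattern:

class Stat:
    def as_string(self, indent: int) -> str:
        return " " * indent + "- Minimum Value: 0"

for stats in ([], [Stat()]):  # empty vs. populated relationship
    if stats and callable(getattr(stats[0], "as_string", None)):
        print(stats[0].as_string(4))  # runs only for the populated case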
@@ -708,18 +708,20 @@ class MetaColumnStatistics(Base):
         inner_pad = " " * (indent + 4)
 
         column_statistics = ""
+        most_common_values = self.most_common_values or []
+        most_common_frequencies = self.most_common_frequencies or []
 
-        if self.most_common_values and self.most_common_frequencies:
+        if most_common_values and most_common_frequencies:
             column_statistics += f"{pad}- Top 10 Most Common Values and Frequencies:"
-            for i in range(min(10, len(self.most_common_values))):
-                freq = self.most_common_frequencies[i]
+            for i in range(min(10, len(most_common_values))):
+                freq = most_common_frequencies[i]
                 try:
                     percent = float(freq) * 100
                     freq_str = f"{percent:.2f}%"
                 except (ValueError, TypeError):
                     freq_str = str(freq)
 
-                column_statistics += f"\n{inner_pad}- {self.most_common_values[i]}: {freq_str}"
+                column_statistics += f"\n{inner_pad}- {most_common_values[i]}: {freq_str}"
             column_statistics += "\n"
 
         if self.null_percentage: