MindsDB: mindsdb-25.6.2.0-py3-none-any.whl → mindsdb-25.6.3.0-py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
This version of MindsDB has been flagged as potentially problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/a2a/agent.py +25 -4
- mindsdb/api/a2a/task_manager.py +68 -6
- mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +91 -84
- mindsdb/api/http/namespaces/knowledge_bases.py +132 -154
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +219 -28
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -0
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +277 -356
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +94 -8
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +19 -1
- mindsdb/integrations/libs/api_handler.py +19 -1
- mindsdb/integrations/libs/base.py +86 -2
- mindsdb/interfaces/agents/agents_controller.py +32 -6
- mindsdb/interfaces/agents/constants.py +1 -0
- mindsdb/interfaces/agents/mindsdb_database_agent.py +23 -18
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +22 -6
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +4 -0
- mindsdb/interfaces/database/integrations.py +4 -2
- mindsdb/interfaces/knowledge_base/controller.py +3 -15
- mindsdb/interfaces/knowledge_base/evaluate.py +0 -3
- mindsdb/interfaces/skills/skills_controller.py +0 -23
- mindsdb/interfaces/skills/sql_agent.py +8 -4
- mindsdb/interfaces/storage/db.py +20 -4
- mindsdb/utilities/config.py +5 -1
- {mindsdb-25.6.2.0.dist-info → mindsdb-25.6.3.0.dist-info}/METADATA +250 -250
- {mindsdb-25.6.2.0.dist-info → mindsdb-25.6.3.0.dist-info}/RECORD +30 -30
- {mindsdb-25.6.2.0.dist-info → mindsdb-25.6.3.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.2.0.dist-info → mindsdb-25.6.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.2.0.dist-info → mindsdb-25.6.3.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py

@@ -70,8 +70,8 @@ class SalesforceHandler(MetaAPIHandler):
         )
         self.is_connected = True

-
-        for resource_name in
+        resource_tables = self._get_resource_names()
+        for resource_name in resource_tables:
             table_class = create_table_class(resource_name)
             self._register_table(resource_name, table_class(self))
@@ -154,23 +154,109 @@ class SalesforceHandler(MetaAPIHandler):

         return response

-    def _get_resource_names(self) ->
+    def _get_resource_names(self) -> List[str]:
         """
-        Retrieves the names of the Salesforce resources.
-
+        Retrieves the names of the Salesforce resources, with more aggressive filtering to remove tables.
         Returns:
-
+            List[str]: A list of filtered resource names.
         """
         if not self.resource_names:
-
-            self.resource_names = [
+            all_resources = [
                 resource["name"]
                 for resource in self.connection.sobjects.describe()["sobjects"]
                 if resource.get("queryable", False)
             ]

+            # Define patterns for tables to be filtered out.
+            # Expanded suffixes and prefixes and exact matches
+            ignore_suffixes = ("Share", "History", "Feed", "ChangeEvent", "Tag", "Permission", "Setup", "Consent")
+            ignore_prefixes = (
+                "Apex",
+                "CommPlatform",
+                "Lightning",
+                "Flow",
+                "Transaction",
+                "AI",
+                "Aura",
+                "ContentWorkspace",
+                "Collaboration",
+                "Datacloud",
+            )
+            ignore_exact = {
+                "EntityDefinition",
+                "FieldDefinition",
+                "RecordType",
+                "CaseStatus",
+                "UserRole",
+                "UserLicense",
+                "UserPermissionAccess",
+                "UserRecordAccess",
+                "Folder",
+                "Group",
+                "Note",
+                "ProcessDefinition",
+                "ProcessInstance",
+                "ContentFolder",
+                "ContentDocumentSubscription",
+                "DashboardComponent",
+                "Report",
+                "Dashboard",
+                "Topic",
+                "TopicAssignment",
+                "Period",
+                "Partner",
+                "PackageLicense",
+                "ColorDefinition",
+                "DataUsePurpose",
+                "DataUseLegalBasis",
+            }
+
+            ignore_substrings = (
+                "CleanInfo",
+                "Template",
+                "Rule",
+                "Definition",
+                "Status",
+                "Policy",
+                "Setting",
+                "Access",
+                "Config",
+                "Subscription",
+                "DataType",
+                "MilestoneType",
+                "Entitlement",
+                "Auth",
+            )
+
+            filtered = []
+            for r in all_resources:
+                if (
+                    not r.endswith(ignore_suffixes)
+                    and not r.startswith(ignore_prefixes)
+                    and not any(sub in r for sub in ignore_substrings)
+                    and r not in ignore_exact
+                ):
+                    filtered.append(r)
+
+            self.resource_names = [r for r in filtered]
         return self.resource_names

+    def meta_get_handler_info(self, **kwargs) -> str:
+        """
+        Retrieves information about the design and implementation of the API handler.
+        This should include, but not be limited to, the following:
+        - The type of SQL queries and operations that the handler supports.
+        - etc.
+
+        Args:
+            kwargs: Additional keyword arguments that may be used in generating the handler information.
+
+        Returns:
+            str: A string containing information about the API handler's design and implementation.
+        """
+        # TODO: Relationships? Aliases?
+        return "When filtering on a Date or DateTime field, the value MUST be an unquoted literal in YYYY-MM-DD or YYYY-MM-DDThh:mm:ssZ format. For example, CloseDate >= 2025-05-28 is correct; CloseDate >= '2025-05-28' is incorrect."
+
     def meta_get_tables(self, table_names: Optional[List[str]] = None) -> Response:
         """
         Retrieves metadata for the specified tables (or all tables if no list is provided).
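The new filtering is plain string matching over the queryable sObject names. A minimal standalone sketch of the same suffix/prefix/substring/exact logic, with trimmed pattern lists and made-up resource names:

# Minimal sketch of the filtering above; only the matching logic is illustrated.
ignore_suffixes = ("Share", "History", "Feed")
ignore_prefixes = ("Apex", "Aura")
ignore_substrings = ("Template", "Rule")
ignore_exact = {"RecordType", "Folder"}

def keep(name: str) -> bool:
    return (
        not name.endswith(ignore_suffixes)        # endswith/startswith accept a
        and not name.startswith(ignore_prefixes)  # tuple of alternatives
        and not any(sub in name for sub in ignore_substrings)
        and name not in ignore_exact
    )

names = ["Account", "AccountShare", "ApexClass", "EmailTemplate", "RecordType"]
print([n for n in names if keep(n)])  # ['Account']

Passing a tuple to str.endswith and str.startswith checks every alternative in one call, which keeps the filter readable as the pattern lists grow.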
mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py

@@ -6,7 +6,7 @@ from snowflake.sqlalchemy import snowdialect
 from snowflake import connector
 from snowflake.connector.errors import NotSupportedError
 from snowflake.connector.cursor import SnowflakeCursor, ResultMetadata
-from typing import Optional, List
+from typing import Any, Optional, List

 from mindsdb_sql_parser.ast.base import ASTNode
 from mindsdb_sql_parser.ast import Select, Identifier
@@ -706,3 +706,21 @@ class SnowflakeHandler(MetaDatabaseHandler):
         except Exception as e:
             logger.error(f"Exception in meta_get_primary_keys: {e!r}")
             return Response(RESPONSE_TYPE.ERROR, error_message=f"Exception querying primary keys: {e!r}")
+
+    def meta_get_handler_info(self, **kwargs: Any) -> str:
+        """
+        Retrieves information about the design and implementation of the database handler.
+        This should include, but not be limited to, the following:
+        - The type of SQL queries and operations that the handler supports.
+        - etc.
+
+        Args:
+            kwargs: Additional keyword arguments that may be used in generating the handler information.
+
+        Returns:
+            str: A string containing information about the database handler's design and implementation.
+        """
+        return (
+            "To query columns that contain special characters, use ticks around the column name, e.g. `column name`.\n"
+            "DO NOT use double quotes for this purpose."
+        )
mindsdb/integrations/libs/api_handler.py

@@ -457,8 +457,11 @@ class APIHandler(BaseHandler):

     def query(self, query: ASTNode):
         if isinstance(query, Select):
+            # If the list method exists, it should be overridden in the child class.
+            # The APIResource class could be used as a base class by overriding the select method, but not the list method.
             table = self._get_table(query.from_table)
-
+            list_method = getattr(table, "list", None)
+            if not list_method or (list_method and list_method.__func__ is APIResource.list):
                 # for back compatibility, targets wasn't passed in previous version
                 query.targets = [Star()]
             result = self._get_table(query.from_table).select(query)
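The `list_method.__func__ is APIResource.list` comparison is a standard way to test whether a subclass overrode a base method: a bound method's `__func__` is the underlying function object, so identity with the base-class attribute means the method was inherited unchanged. A self-contained sketch with hypothetical classes:

# Sketch of override detection via bound-method __func__ identity.
# Base/WithList/WithoutList are hypothetical, not MindsDB classes.
class Base:
    def list(self):
        raise NotImplementedError

class WithList(Base):
    def list(self):
        return ["row"]

class WithoutList(Base):
    pass

def has_custom_list(obj) -> bool:
    method = getattr(obj, "list", None)
    # Identity with Base.list means the subclass did not override it.
    return method is not None and method.__func__ is not Base.list

print(has_custom_list(WithList()))     # True
print(has_custom_list(WithoutList()))  # False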
@@ -515,6 +518,21 @@ class MetaAPIHandler(APIHandler):
     This class is used when the handler is also needed to store information in the data catalog.
     """

+    def meta_get_handler_info(self, **kwargs) -> str:
+        """
+        Retrieves information about the design and implementation of the API handler.
+        This should include, but not be limited to, the following:
+        - The type of SQL queries and operations that the handler supports.
+        - etc.
+
+        Args:
+            kwargs: Additional keyword arguments that may be used in generating the handler information.
+
+        Returns:
+            str: A string containing information about the API handler's design and implementation.
+        """
+        pass
+
     def meta_get_tables(self, table_names: Optional[List[str]] = None, **kwargs) -> Response:
         """
         Retrieves metadata for the specified tables (or all tables if no list is provided).
mindsdb/integrations/libs/base.py

@@ -1,4 +1,5 @@
 import ast
+import concurrent.futures
 import inspect
 import textwrap
 from _ast import AnnAssign, AugAssign
@@ -8,7 +9,7 @@ import pandas as pd
 from mindsdb_sql_parser.ast.base import ASTNode
 from mindsdb.utilities import log

-from mindsdb.integrations.libs.response import HandlerResponse, HandlerStatusResponse
+from mindsdb.integrations.libs.response import HandlerResponse, HandlerStatusResponse, RESPONSE_TYPE

 logger = log.getLogger(__name__)
@@ -156,6 +157,7 @@ class MetaDatabaseHandler(DatabaseHandler):
     def meta_get_column_statistics(self, table_names: Optional[List[str]]) -> HandlerResponse:
         """
         Returns metadata statisical information about the columns in the tables to be stored in the data catalog.
+        Either this method should be overridden in the handler or `meta_get_column_statistics_for_table` should be implemented.

         Returns:
             HandlerResponse: The response should consist of the following columns:
@@ -168,7 +170,74 @@ class MetaDatabaseHandler(DatabaseHandler):
             - MAXIMUM_VALUE (str): Maximum value in the column (optional).
             - DISTINCT_VALUES_COUNT (int): Count of distinct values in the column (optional).
         """
-
+        method = getattr(self, "meta_get_column_statistics_for_table")
+        if method.__func__ is not MetaDatabaseHandler.meta_get_column_statistics_for_table:
+            meta_columns = self.meta_get_columns(table_names)
+            grouped_columns = (
+                meta_columns.data_frame.groupby("table_name")
+                .agg(
+                    {
+                        "column_name": list,
+                    }
+                )
+                .reset_index()
+            )
+
+            executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
+            futures = []
+
+            results = []
+            with executor:
+                for _, row in grouped_columns.iterrows():
+                    table_name = row["table_name"]
+                    columns = row["column_name"]
+                    futures.append(executor.submit(self.meta_get_column_statistics_for_table, table_name, columns))
+
+                for future in concurrent.futures.as_completed(futures):
+                    try:
+                        result = future.result(timeout=120)
+                        if result.resp_type == RESPONSE_TYPE.TABLE:
+                            results.append(result.data_frame)
+                        else:
+                            logger.error(
+                                f"Error retrieving column statistics for table {table_name}: {result.error_message}"
+                            )
+                    except Exception as e:
+                        logger.error(f"Exception occurred while retrieving column statistics for table {table_name}: {e}")
+
+            if not results:
+                logger.warning("No column statistics could be retrieved for the specified tables.")
+                return HandlerResponse(RESPONSE_TYPE.ERROR, error_message="No column statistics could be retrieved.")
+            return HandlerResponse(
+                RESPONSE_TYPE.TABLE, pd.concat(results, ignore_index=True) if results else pd.DataFrame()
+            )
+
+        else:
+            raise NotImplementedError()
+
+    def meta_get_column_statistics_for_table(
+        self, table_name: str, column_names: Optional[List[str]] = None
+    ) -> HandlerResponse:
+        """
+        Returns metadata statistical information about the columns in a specific table to be stored in the data catalog.
+        Either this method should be implemented in the handler or `meta_get_column_statistics` should be overridden.
+
+        Args:
+            table_name (str): Name of the table.
+            column_names (Optional[List[str]]): List of column names to retrieve statistics for. If None, statistics for all columns will be returned.
+
+        Returns:
+            HandlerResponse: The response should consist of the following columns:
+            - TABLE_NAME (str): Name of the table.
+            - COLUMN_NAME (str): Name of the column.
+            - MOST_COMMON_VALUES (List[str]): Most common values in the column (optional).
+            - MOST_COMMON_FREQUENCIES (List[str]): Frequencies of the most common values in the column (optional).
+            - NULL_PERCENTAGE: Percentage of NULL values in the column (optional).
+            - MINIMUM_VALUE (str): Minimum value in the column (optional).
+            - MAXIMUM_VALUE (str): Maximum value in the column (optional).
+            - DISTINCT_VALUES_COUNT (int): Count of distinct values in the column (optional).
+        """
+        pass

     def meta_get_primary_keys(self, table_names: Optional[List[str]]) -> HandlerResponse:
         """
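The default `meta_get_column_statistics` fans one `meta_get_column_statistics_for_table` call per table out to a thread pool and concatenates the table-shaped results. A reduced sketch of the same submit/as_completed pattern, with a stand-in stats function; mapping futures back to their table name also keeps failure logs accurate, which the loop above can misreport since its `table_name` variable holds the last table by the time `as_completed` yields:

# Reduced sketch of the fan-out/fan-in pattern above; stats_for_table is a
# stand-in for meta_get_column_statistics_for_table.
import concurrent.futures
import pandas as pd

def stats_for_table(table_name: str, columns: list) -> pd.DataFrame:
    return pd.DataFrame({"table_name": [table_name], "column_count": [len(columns)]})

tables = {"orders": ["id", "total"], "users": ["id", "email", "name"]}

results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Map each future back to its table name so failures are attributed correctly.
    futures = {executor.submit(stats_for_table, name, cols): name for name, cols in tables.items()}
    for future in concurrent.futures.as_completed(futures):
        try:
            results.append(future.result(timeout=120))
        except Exception as e:
            print(f"stats failed for {futures[future]}: {e}")

print(pd.concat(results, ignore_index=True))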
@@ -197,6 +266,21 @@ class MetaDatabaseHandler(DatabaseHandler):
         """
         raise NotImplementedError()

+    def meta_get_handler_info(self, **kwargs) -> str:
+        """
+        Retrieves information about the design and implementation of the database handler.
+        This should include, but not be limited to, the following:
+        - The type of SQL queries and operations that the handler supports.
+        - etc.
+
+        Args:
+            kwargs: Additional keyword arguments that may be used in generating the handler information.
+
+        Returns:
+            str: A string containing information about the database handler's design and implementation.
+        """
+        pass
+

 class ArgProbeMixin:
     """
mindsdb/interfaces/agents/agents_controller.py

@@ -10,6 +10,7 @@ import pandas as pd
 from mindsdb.interfaces.storage import db
 from mindsdb.interfaces.storage.db import Predictor
 from mindsdb.utilities.context import context as ctx
+from mindsdb.interfaces.data_catalog.data_catalog_loader import DataCatalogLoader
 from mindsdb.interfaces.database.projects import ProjectController
 from mindsdb.interfaces.model.functions import PredictorRecordNotFound
 from mindsdb.interfaces.model.model_controller import ModelController
@@ -52,7 +53,7 @@ class AgentsController:
         """
         Checks if a model exists, and gets the provider of the model.

-        The provider is either the provider of the model
+        The provider is either the provider of the model or the provider given as an argument.

         Parameters:
             model_name (str): The name of the model
@@ -325,12 +326,37 @@ class AgentsController:
             db.session.rollback()
             raise ValueError(f"Skill with name does not exist: {skill_name}")

-        # Add table restrictions if this is a text2sql skill
-        if existing_skill.type == "sql" and (include_tables or ignore_tables):
-            parameters["tables"] = include_tables or ignore_tables
-
-        # Add knowledge base restrictions if this is a text2sql skill
         if existing_skill.type == "sql":
+            # Run Data Catalog loader if enabled
+            if config.get("data_catalog", {}).get("enabled", False):
+                if include_tables:
+                    database_table_map = {}
+                    for table in include_tables:
+                        parts = table.split(".", 1)
+                        database_table_map[parts[0]] = database_table_map.get(parts[0], []) + [parts[1]]
+
+                    for database_name, table_names in database_table_map.items():
+                        data_catalog_loader = DataCatalogLoader(
+                            database_name=database_name, table_names=table_names
+                        )
+                        data_catalog_loader.load_metadata()
+
+                elif "database" in existing_skill.params:
+                    data_catalog_loader = DataCatalogLoader(
+                        database_name=existing_skill.params["database"],
+                        table_names=parameters["tables"] if "tables" in parameters else None,
+                    )
+                    data_catalog_loader.load_metadata()
+
+                else:
+                    raise ValueError(
+                        "Data Catalog loading is enabled, but the provided parameters are insufficient to load metadata. "
+                    )
+
+            # Add table restrictions if this is a text2sql skill
+            if include_tables or ignore_tables:
+                parameters["tables"] = include_tables or ignore_tables

         # Pass database parameter if provided
         if database and "database" not in parameters:
             parameters["database"] = database
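Grouping the `database.table` names splits on the first dot only, so table names that themselves contain dots stay intact. A sketch of that grouping, using `dict.setdefault` in place of the get-and-concatenate above:

# Sketch of the "database.table" grouping above; illustrative names.
include_tables = ["sales.orders", "sales.customers", "crm.leads"]

database_table_map = {}
for table in include_tables:
    database, table_name = table.split(".", 1)  # split on the first dot only
    database_table_map.setdefault(database, []).append(table_name)

print(database_table_map)
# {'sales': ['orders', 'customers'], 'crm': ['leads']}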
mindsdb/interfaces/agents/constants.py

@@ -221,6 +221,7 @@ You are an AI assistant powered by MindsDB. When answering questions, follow the
 2. For questions about database tables and their contents:
    - Use the sql_db_query to query the tables directly
    - You can join tables if needed to get comprehensive information
+   - You are running on a federated query engine, so joins across multiple databases are allowed and supported
    - **Important Rule for SQL Queries:** If you formulate an SQL query as part of answering a user's question, you *must* then use the `sql_db_query` tool to execute that query and get its results. The SQL query string itself is NOT the final answer to the user unless the user has specifically asked for the query. Your final AI response should be based on the *results* obtained from executing the query.
mindsdb/interfaces/agents/mindsdb_database_agent.py

@@ -1,7 +1,8 @@
 """
-
-
+Wrapper around MindsDB's executor and integration controller following the implementation of the original
+langchain.sql_database.SQLDatabase class to partly replicate its behavior.
 """
+
 import traceback
 from typing import Any, Iterable, List, Optional
@@ -13,26 +14,25 @@ logger = log.getLogger(__name__)


 def extract_essential(input: str) -> str:
-    """
-
+    """Sometimes LLM include to input unnecessary data. We can't control stochastic nature of LLM, so we need to
+    'clean' input somehow. LLM prompt contains instruction to enclose input between '$START$' and '$STOP$'.
     """
-    if
-    input = input.partition(
-    if
-    input = input.partition(
-    return input.strip(
+    if "$START$" in input:
+        input = input.partition("$START$")[-1]
+    if "$STOP$" in input:
+        input = input.partition("$STOP$")[0]
+    return input.strip(" ")


 class MindsDBSQL(SQLDatabase):
     @staticmethod
-    def custom_init(
-        sql_agent: 'SQLAgent'
-    ) -> 'MindsDBSQL':
+    def custom_init(sql_agent: "SQLAgent") -> "MindsDBSQL":
         instance = MindsDBSQL()
         instance._sql_agent = sql_agent
         return instance

     """ Can't modify signature, as LangChain does a Pydantic check."""
+
     def __init__(
         self,
         engine: Optional[Any] = None,
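A quick behavior check for `extract_essential` as shown in the hunk above: only the text between the `$START$`/`$STOP$` sentinels that the prompt instructs the LLM to emit is kept:

# Behavior check for extract_essential as reconstructed above.
def extract_essential(input: str) -> str:
    if "$START$" in input:
        input = input.partition("$START$")[-1]
    if "$STOP$" in input:
        input = input.partition("$STOP$")[0]
    return input.strip(" ")

print(extract_essential("Sure! $START$ SELECT 1 $STOP$ Hope that helps."))  # 'SELECT 1'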
@@ -51,7 +51,7 @@ class MindsDBSQL(SQLDatabase):

     @property
     def dialect(self) -> str:
-        return
+        return "mindsdb"

     @property
     def table_info(self) -> str:
@@ -93,23 +93,26 @@ class MindsDBSQL(SQLDatabase):
         command = extract_essential(command)

         try:
-
             # Log the query for debugging
             logger.info(f"Executing SQL query: {command}")

+            # Removing backticks causes in query execution.
             # remove backticks
-            command = command.replace('`', '')
+            # command = command.replace('`', '')

             # Parse the SQL string to an AST object first
             from mindsdb_sql_parser import parse_sql
+
             ast_query = parse_sql(command)

             # Now execute the parsed query
-            result = self._sql_agent.skill_tool.get_command_executor().execute_command(
+            result = self._sql_agent.skill_tool.get_command_executor().execute_command(
+                ast_query, database_name="mindsdb"
+            )

             # Convert ExecuteAnswer to a DataFrame for easier manipulation
             df = None
-            if hasattr(result,
+            if hasattr(result, "data") and hasattr(result.data, "data_frame"):
                 df = result.data.data_frame
             else:
                 # Fallback to to_df when data_frame attr not available

@@ -130,7 +133,9 @@ class MindsDBSQL(SQLDatabase):
         except Exception as e:
             logger.error(f"Error executing SQL command: {str(e)}\n{traceback.format_exc()}")
             # If this is a knowledge base query, provide a more helpful error message
-            if "knowledge_base" in command.lower() or any(
+            if "knowledge_base" in command.lower() or any(
+                kb in command for kb in self._sql_agent.get_usable_knowledge_base_names()
+            ):
                 return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
             return f"Error: {str(e)}"
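The run path now parses the SQL string into an AST before handing it to the command executor, with execution pinned to the `mindsdb` database. A minimal parse step, assuming the `mindsdb_sql_parser` package is installed:

# Minimal parse step; requires the mindsdb_sql_parser package.
from mindsdb_sql_parser import parse_sql

ast_query = parse_sql("SELECT id, total FROM sales.orders WHERE total > 100")
print(type(ast_query).__name__)  # Select
print(ast_query.from_table)      # the table identifier the executor resolves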
mindsdb/interfaces/data_catalog/data_catalog_loader.py

@@ -60,15 +60,20 @@ class DataCatalogLoader(BaseDataCatalog):
         """
         self.logger.info(f"Loading tables for {self.database_name}")
         response = self.data_handler.meta_get_tables(self.table_names)
-        if response.resp_type
+        if response.resp_type == RESPONSE_TYPE.ERROR:
             self.logger.error(f"Failed to load tables for {self.database_name}: {response.error_message}")
             return []
+        elif response.resp_type == RESPONSE_TYPE.OK:
+            self.logger.error(f"No tables found for {self.database_name}.")
+            return []

         df = response.data_frame
         if df.empty:
             self.logger.info(f"No tables to add for {self.database_name}.")
             return []

+        df.columns = df.columns.str.lower()
+
         # Filter out tables that are already loaded in the data catalog
         if loaded_table_names:
             df = df[~df["table_name"].isin(loaded_table_names)]
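The loader methods now distinguish `RESPONSE_TYPE.ERROR` (a hard failure) from `RESPONSE_TYPE.OK` (a success with no tabular payload), and only proceed on a table result. A sketch of that shared guard with stand-in `Response`/`RESPONSE_TYPE` types (the real ones live in `mindsdb.integrations.libs.response`):

# Stand-in Response/RESPONSE_TYPE types for illustration only.
from dataclasses import dataclass
from enum import Enum
from typing import Any, Optional

class RESPONSE_TYPE(Enum):
    OK = "ok"
    TABLE = "table"
    ERROR = "error"

@dataclass
class Response:
    resp_type: RESPONSE_TYPE
    data_frame: Any = None
    error_message: Optional[str] = None

def frame_or_none(response: Response):
    if response.resp_type == RESPONSE_TYPE.ERROR:
        print(f"failed: {response.error_message}")  # hard failure
        return None
    if response.resp_type == RESPONSE_TYPE.OK:
        print("no tabular payload returned")  # nothing to load
        return None
    return response.data_frame  # TABLE: proceed

print(frame_or_none(Response(RESPONSE_TYPE.OK)))  # logs, then None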
@@ -77,7 +82,6 @@ class DataCatalogLoader(BaseDataCatalog):
             self.logger.info(f"No new tables to load for {self.database_name}.")
             return []

-        df.columns = df.columns.str.lower()
         tables = self._add_table_metadata(df)
         self.logger.info(f"Tables loaded for {self.database_name}.")
         return tables
@@ -117,9 +121,12 @@ class DataCatalogLoader(BaseDataCatalog):
         """
         self.logger.info(f"Loading columns for {self.database_name}")
         response = self.data_handler.meta_get_columns(self.table_names)
-        if response.resp_type
+        if response.resp_type == RESPONSE_TYPE.ERROR:
             self.logger.error(f"Failed to load columns for {self.database_name}: {response.error_message}")
             return []
+        elif response.resp_type == RESPONSE_TYPE.OK:
+            self.logger.error(f"No columns found for {self.database_name}.")
+            return []

         df = response.data_frame
         if df.empty:
@@ -162,9 +169,12 @@ class DataCatalogLoader(BaseDataCatalog):
         """
         self.logger.info(f"Loading column statistics for {self.database_name}")
         response = self.data_handler.meta_get_column_statistics(self.table_names)
-        if response.resp_type
+        if response.resp_type == RESPONSE_TYPE.ERROR:
             self.logger.error(f"Failed to load column statistics for {self.database_name}: {response.error_message}")
             return
+        elif response.resp_type == RESPONSE_TYPE.OK:
+            self.logger.error(f"No column statistics found for {self.database_name}.")
+            return

         df = response.data_frame
         if df.empty:
@@ -222,9 +232,12 @@ class DataCatalogLoader(BaseDataCatalog):
         """
         self.logger.info(f"Loading primary keys for {self.database_name}")
         response = self.data_handler.meta_get_primary_keys(self.table_names)
-        if response.resp_type
+        if response.resp_type == RESPONSE_TYPE.ERROR:
             self.logger.error(f"Failed to load primary keys for {self.database_name}: {response.error_message}")
             return
+        elif response.resp_type == RESPONSE_TYPE.OK:
+            self.logger.error(f"No primary keys found for {self.database_name}.")
+            return

         df = response.data_frame
         if df.empty:
@@ -271,9 +284,12 @@ class DataCatalogLoader(BaseDataCatalog):
         """
         self.logger.info(f"Loading foreign keys for {self.database_name}")
         response = self.data_handler.meta_get_foreign_keys(self.table_names)
-        if response.resp_type
+        if response.resp_type == RESPONSE_TYPE.ERROR:
             self.logger.error(f"Failed to foreign keys for {self.database_name}: {response.error_message}")
             return
+        elif response.resp_type == RESPONSE_TYPE.OK:
+            self.logger.error(f"No foreign keys found for {self.database_name}.")
+            return

         df = response.data_frame
         if df.empty:
mindsdb/interfaces/data_catalog/data_catalog_reader.py

@@ -17,7 +17,11 @@ class DataCatalogReader(BaseDataCatalog):
         if not tables:
             self.logger.warning(f"No metadata found for database '{self.database_name}'")
             return f"No metadata found for database '{self.database_name}'"
+
         metadata_str = "Data Catalog: \n"
+        if hasattr(self.data_handler, "meta_get_handler_info"):
+            metadata_str += self.data_handler.meta_get_handler_info() + "\n\n"
+
         for table in tables:
             metadata_str += table.as_string() + "\n\n"
         return metadata_str
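The `hasattr` check makes the handler preamble an optional capability: only handlers exposing `meta_get_handler_info` contribute to the catalog string. Worth noting that the base-class stubs added elsewhere in this release just `pass`, so they return None and would break the `+ "\n\n"` concatenation above; a sketch with a None guard (hypothetical handler name):

# Sketch of the optional-capability check; ExampleHandler is hypothetical.
class ExampleHandler:
    def meta_get_handler_info(self, **kwargs) -> str:
        return "Dates must be unquoted literals."

def build_catalog_preamble(handler) -> str:
    metadata_str = "Data Catalog: \n"
    if hasattr(handler, "meta_get_handler_info"):
        info = handler.meta_get_handler_info()
        if info:  # base-class stubs return None
            metadata_str += info + "\n\n"
    return metadata_str

print(build_catalog_preamble(ExampleHandler()))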
mindsdb/interfaces/database/integrations.py

@@ -256,7 +256,9 @@ class IntegrationController:

         # Remove the integration metadata from the data catalog (if enabled).
         # TODO: Can this be handled via cascading delete in the database?
-        if
+        if self.get_handler_meta(integration_record.engine).get("type") == HANDLER_TYPE.DATA and Config().get(
+            "data_catalog", {}
+        ).get("enabled", False):
             data_catalog_reader = DataCatalogLoader(database_name=name)
             data_catalog_reader.unload_metadata()
@@ -798,7 +800,7 @@ class IntegrationController:
         for item in code.body:
             if isinstance(item, ast.ClassDef):
                 bases = [base.id for base in item.bases]
-                if "APIHandler" in bases:
+                if "APIHandler" in bases or "MetaAPIHandler" in bases:
                     return "api"
         return "sql"
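The engine-type detection walks the module AST and inspects class bases by name. A self-contained sketch; note that `base.id` only exists on plain-name bases (`ast.Name`), so dotted bases such as `libs.APIHandler` need a guard the original list comprehension does not have:

# Self-contained sketch of detecting handler type from class bases.
import ast

source = """
class MyHandler(MetaAPIHandler):
    pass
"""

for item in ast.parse(source).body:
    if isinstance(item, ast.ClassDef):
        # Guard with ast.Name: dotted bases (ast.Attribute) have no .id.
        bases = [base.id for base in item.bases if isinstance(base, ast.Name)]
        if "APIHandler" in bases or "MetaAPIHandler" in bases:
            print(item.name, "-> api")  # MyHandler -> api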
mindsdb/interfaces/knowledge_base/controller.py

@@ -1201,22 +1201,10 @@ class KnowledgeBaseController:
         project_names = {i.id: i.name for i in project_controller.get_list()}

         for record in query:
-
-
+            kb = record.as_dict(with_secrets=self.session.show_secrets)
+            kb["project_name"] = project_names[record.project_id]

-            data.append(
-                {
-                    "id": record.id,
-                    "name": record.name,
-                    "project_id": record.project_id,
-                    "project_name": project_names[record.project_id],
-                    "embedding_model": embedding_model.name if embedding_model is not None else None,
-                    "vector_database": None if vector_database is None else vector_database.name,
-                    "vector_database_table": record.vector_database_table,
-                    "query_id": record.query_id,
-                    "params": record.params,
-                }
-            )
+            data.append(kb)

         return data
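The listing now builds each row from the record's own `as_dict` instead of a hand-maintained dict, so new model fields surface in the API without touching this loop. A sketch of that pattern with a stand-in record class and a hypothetical secret field:

# Stand-in record class; "api_key" is a hypothetical secret field.
class KnowledgeBaseRecord:
    def __init__(self, id, name, project_id, params):
        self.id, self.name, self.project_id, self.params = id, name, project_id, params

    def as_dict(self, with_secrets: bool = False) -> dict:
        d = {"id": self.id, "name": self.name, "project_id": self.project_id, "params": dict(self.params)}
        if not with_secrets:
            d["params"].pop("api_key", None)  # redact secrets unless requested
        return d

project_names = {1: "mindsdb"}
record = KnowledgeBaseRecord(7, "my_kb", 1, {"api_key": "sk-...", "model": "e5"})
kb = record.as_dict(with_secrets=False)
kb["project_name"] = project_names[record.project_id]
print(kb)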
mindsdb/interfaces/knowledge_base/evaluate.py

@@ -492,8 +492,6 @@ class EvaluateDocID(EvaluateBase):
         total_questions = len(stats)
         total_found = sum([1 for stat in stats if stat["doc_found"]])

-        total_accurately_retrieved = sum([1 for stat in stats if stat["doc_found"]])
-
         accurate_in_top_10 = sum([1 for stat in stats if stat["doc_found"] and stat["doc_position"] < 10])

         # calculate recall curve by position
@@ -512,7 +510,6 @@
         return {
             "total": total_questions,
             "total_found": total_found,
-            "retrieved_in_top_k": total_accurately_retrieved,
             "retrieved_in_top_10": accurate_in_top_10,
             "cumulative_recall": cumulative_recall,
             "avg_query_time": avg_query_time,