MindsDB 25.6.3.0__py3-none-any.whl → 25.6.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +71 -43
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +16 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/system_tables.py +314 -1
- mindsdb/api/executor/planner/plan_join.py +1 -1
- mindsdb/api/executor/planner/query_planner.py +7 -1
- mindsdb/api/executor/utilities/sql.py +18 -19
- mindsdb/integrations/handlers/lindorm_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/ludwig_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +2 -0
- mindsdb/integrations/libs/api_handler.py +6 -7
- mindsdb/interfaces/agents/constants.py +44 -0
- mindsdb/interfaces/agents/langchain_agent.py +8 -1
- mindsdb/interfaces/agents/mindsdb_database_agent.py +5 -17
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +19 -2
- mindsdb/interfaces/knowledge_base/controller.py +23 -13
- mindsdb/interfaces/knowledge_base/evaluate.py +3 -3
- mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +17 -86
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +30 -3
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +40 -28
- mindsdb/interfaces/skills/skill_tool.py +91 -88
- mindsdb/interfaces/skills/sql_agent.py +1 -1
- {mindsdb-25.6.3.0.dist-info → mindsdb-25.6.4.0.dist-info}/METADATA +255 -253
- {mindsdb-25.6.3.0.dist-info → mindsdb-25.6.4.0.dist-info}/RECORD +28 -28
- {mindsdb-25.6.3.0.dist-info → mindsdb-25.6.4.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.3.0.dist-info → mindsdb-25.6.4.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.3.0.dist-info → mindsdb-25.6.4.0.dist-info}/top_level.txt +0 -0
|
@@ -433,16 +433,15 @@ class APIHandler(BaseHandler):
|
|
|
433
433
|
Args:
|
|
434
434
|
name (str): the handler name
|
|
435
435
|
"""
|
|
436
|
-
|
|
437
436
|
self._tables = {}
|
|
438
437
|
|
|
439
438
|
def _register_table(self, table_name: str, table_class: Any):
|
|
440
439
|
"""
|
|
441
440
|
Register the data resource. For example, if you are using the Twitter API, it registers the `tweets` resource from `/api/v2/tweets`.
|
|
442
441
|
"""
|
|
443
|
-
if table_name in self._tables:
|
|
442
|
+
if table_name.lower() in self._tables:
|
|
444
443
|
raise TableAlreadyExists(f"Table with name {table_name} already exists for this handler")
|
|
445
|
-
self._tables[table_name] = table_class
|
|
444
|
+
self._tables[table_name.lower()] = table_class
|
|
446
445
|
|
|
447
446
|
def _get_table(self, name: Identifier):
|
|
448
447
|
"""
|
|
@@ -450,10 +449,10 @@ class APIHandler(BaseHandler):
|
|
|
450
449
|
Args:
|
|
451
450
|
name (Identifier): the table name
|
|
452
451
|
"""
|
|
453
|
-
name = name.parts[-1]
|
|
454
|
-
if name
|
|
455
|
-
|
|
456
|
-
|
|
452
|
+
name = name.parts[-1].lower()
|
|
453
|
+
if name in self._tables:
|
|
454
|
+
return self._tables[name]
|
|
455
|
+
raise TableNotFound(f"Table not found: {name}")
|
|
457
456
|
|
|
458
457
|
def query(self, query: ASTNode):
|
|
459
458
|
if isinstance(query, Select):
|
|
@@ -171,6 +171,8 @@ NVIDIA_NIM_CHAT_MODELS = (
|
|
|
171
171
|
)
|
|
172
172
|
|
|
173
173
|
GOOGLE_GEMINI_CHAT_MODELS = (
|
|
174
|
+
"gemini-2.5-pro",
|
|
175
|
+
"gemini-2.5-flash",
|
|
174
176
|
"gemini-2.5-pro-preview-03-25",
|
|
175
177
|
"gemini-2.0-flash",
|
|
176
178
|
"gemini-2.0-flash-lite",
|
|
@@ -228,3 +230,45 @@ You are an AI assistant powered by MindsDB. When answering questions, follow the
|
|
|
228
230
|
For factual questions, ALWAYS use the available tools to look up information rather than relying on your internal knowledge.
|
|
229
231
|
|
|
230
232
|
"""
|
|
233
|
+
|
|
234
|
+
MINDSDB_PREFIX = """You are an AI assistant powered by MindsDB. When answering questions, follow these guidelines:
|
|
235
|
+
|
|
236
|
+
1. For questions about database tables and their contents:
|
|
237
|
+
- Use the sql_db_query to query the tables directly
|
|
238
|
+
- You can join tables if needed to get comprehensive information
|
|
239
|
+
- You are running on a federated query engine, so joins across multiple databases are allowed and supported
|
|
240
|
+
- **Important Rule for SQL Queries:** If you formulate an SQL query as part of answering a user's question, you *must* then use the `sql_db_query` tool to execute that query and get its results. The SQL query string itself is NOT the final answer to the user unless the user has specifically asked for the query. Your final AI response should be based on the *results* obtained from executing the query.
|
|
241
|
+
|
|
242
|
+
2. For factual questions about specific topics, use the knowledge base tools, if available, in this sequence:
|
|
243
|
+
- First use kb_list_tool to see available knowledge bases
|
|
244
|
+
- Then use kb_info_tool to understand the structure of relevant knowledge bases
|
|
245
|
+
- Finally use kb_query_tool to query the knowledge base for specific information
|
|
246
|
+
|
|
247
|
+
For factual questions, ALWAYS use the available tools to look up information rather than relying on your internal knowledge.
|
|
248
|
+
|
|
249
|
+
Here is the user's question: {{question}}
|
|
250
|
+
|
|
251
|
+
TOOLS:
|
|
252
|
+
------
|
|
253
|
+
|
|
254
|
+
Assistant has access to the following tools:"""
|
|
255
|
+
|
|
256
|
+
EXPLICIT_FORMAT_INSTRUCTIONS = """
|
|
257
|
+
<< TOOL CALLING INSTRUCTIONS >>
|
|
258
|
+
|
|
259
|
+
**It is critical you use the following format to call a tool**
|
|
260
|
+
|
|
261
|
+
```
|
|
262
|
+
Thought: Do I need to use a tool? Yes
|
|
263
|
+
Action: the action to take, should be one of [{tool_names}]
|
|
264
|
+
Action Input: the input to the action
|
|
265
|
+
Observation: the result of the action
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:
|
|
269
|
+
|
|
270
|
+
```
|
|
271
|
+
Thought: Do I need to use a tool? No
|
|
272
|
+
{ai_prefix}: [your response here]
|
|
273
|
+
```
|
|
274
|
+
"""
|
|
@@ -58,6 +58,8 @@ from mindsdb.interfaces.agents.constants import (
|
|
|
58
58
|
TRACE_ID_COLUMN,
|
|
59
59
|
DEFAULT_AGENT_SYSTEM_PROMPT,
|
|
60
60
|
WRITER_CHAT_MODELS,
|
|
61
|
+
MINDSDB_PREFIX,
|
|
62
|
+
EXPLICIT_FORMAT_INSTRUCTIONS,
|
|
61
63
|
)
|
|
62
64
|
from mindsdb.interfaces.skills.skill_tool import skill_tool, SkillData
|
|
63
65
|
from langchain_anthropic import ChatAnthropic
|
|
@@ -426,7 +428,12 @@ class LangchainAgent:
|
|
|
426
428
|
llm,
|
|
427
429
|
agent=agent_type,
|
|
428
430
|
# Use custom output parser to handle flaky LLMs that don't ALWAYS conform to output format.
|
|
429
|
-
agent_kwargs={
|
|
431
|
+
agent_kwargs={
|
|
432
|
+
"output_parser": SafeOutputParser(),
|
|
433
|
+
"prefix": MINDSDB_PREFIX, # Override default "Assistant is a large language model..." text
|
|
434
|
+
"format_instructions": EXPLICIT_FORMAT_INSTRUCTIONS, # More explicit tool calling instructions
|
|
435
|
+
"ai_prefix": "AI",
|
|
436
|
+
},
|
|
430
437
|
# Calls the agent's LLM Chain one final time to generate a final answer based on the previous steps
|
|
431
438
|
early_stopping_method="generate",
|
|
432
439
|
handle_parsing_errors=self._handle_parsing_errors,
|
|
@@ -111,24 +111,12 @@ class MindsDBSQL(SQLDatabase):
|
|
|
111
111
|
)
|
|
112
112
|
|
|
113
113
|
# Convert ExecuteAnswer to a DataFrame for easier manipulation
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
df
|
|
114
|
+
if result.data is not None:
|
|
115
|
+
df = result.data.to_df()
|
|
116
|
+
return df.to_string(index=False)
|
|
117
|
+
|
|
117
118
|
else:
|
|
118
|
-
|
|
119
|
-
try:
|
|
120
|
-
df = result.data.to_df()
|
|
121
|
-
except Exception:
|
|
122
|
-
df = None
|
|
123
|
-
|
|
124
|
-
# Default behaviour (string)
|
|
125
|
-
if df is not None:
|
|
126
|
-
if not df.empty:
|
|
127
|
-
return df.to_string(index=False)
|
|
128
|
-
else:
|
|
129
|
-
return "Query executed successfully, but returned no data."
|
|
130
|
-
|
|
131
|
-
return str(result)
|
|
119
|
+
return "Query executed successfully, but returned no data."
|
|
132
120
|
|
|
133
121
|
except Exception as e:
|
|
134
122
|
logger.error(f"Error executing SQL command: {str(e)}\n{traceback.format_exc()}")
|
|
@@ -11,8 +11,6 @@ class DataCatalogReader(BaseDataCatalog):
|
|
|
11
11
|
"""
|
|
12
12
|
Read the metadata from the data catalog and return it as a string.
|
|
13
13
|
"""
|
|
14
|
-
if not self.is_data_catalog_supported():
|
|
15
|
-
return f"Data catalog is not supported for database '{self.database_name}'."
|
|
16
14
|
tables = self._read_metadata()
|
|
17
15
|
if not tables:
|
|
18
16
|
self.logger.warning(f"No metadata found for database '{self.database_name}'")
|
|
@@ -26,10 +24,29 @@ class DataCatalogReader(BaseDataCatalog):
|
|
|
26
24
|
metadata_str += table.as_string() + "\n\n"
|
|
27
25
|
return metadata_str
|
|
28
26
|
|
|
27
|
+
def read_metadata_as_records(self) -> list:
    """
    Read the metadata from the data catalog and return it as a list of database records.

    Returns:
        list: whatever records `_read_metadata` produced, or an empty list when
            it produced nothing usable.
    """
    tables = self._read_metadata()

    # `_read_metadata` returns a plain message string (not records) when the data
    # catalog is not supported for this database. A non-empty string is truthy, so
    # without this guard it would be handed to callers as if it were the record list.
    if isinstance(tables, str):
        self.logger.warning(tables)
        return []

    if not tables:
        self.logger.warning(f"No metadata found for database '{self.database_name}'")
        return []

    return tables
|
|
36
|
+
|
|
37
|
+
def get_handler_info(self) -> str:
    """
    Return the handler info for the database.

    The value is whatever `meta_get_handler_info()` on this reader's
    `data_handler` returns; nothing is added or transformed here.
    """
    handler = self.data_handler
    return handler.meta_get_handler_info()
|
|
42
|
+
|
|
29
43
|
def _read_metadata(self) -> list:
|
|
30
44
|
"""
|
|
31
45
|
Read the metadata from the data catalog and return it in a structured format.
|
|
32
46
|
"""
|
|
47
|
+
if not self.is_data_catalog_supported():
|
|
48
|
+
return f"Data catalog is not supported for database '{self.database_name}'."
|
|
49
|
+
|
|
33
50
|
query = db.session.query(db.MetaTables).filter_by(integration_id=self.integration_id)
|
|
34
51
|
if self.table_names:
|
|
35
52
|
cleaned_table_names = [name.strip("`").split(".")[-1] for name in self.table_names]
|
|
@@ -9,6 +9,7 @@ import numpy as np
|
|
|
9
9
|
|
|
10
10
|
from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
|
|
11
11
|
from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
|
|
12
|
+
from mindsdb_sql_parser import parse_sql
|
|
12
13
|
|
|
13
14
|
from mindsdb.integrations.utilities.query_traversal import query_traversal
|
|
14
15
|
|
|
@@ -55,8 +56,13 @@ def get_model_params(model_params: dict, default_config_key: str):
|
|
|
55
56
|
combined_model_params = copy.deepcopy(config.get(default_config_key, {}))
|
|
56
57
|
|
|
57
58
|
if model_params:
|
|
59
|
+
if not isinstance(model_params, dict):
|
|
60
|
+
raise ValueError("Model parameters must be passed as a JSON object")
|
|
61
|
+
|
|
58
62
|
combined_model_params.update(model_params)
|
|
59
63
|
|
|
64
|
+
combined_model_params.pop("use_default_llm", None)
|
|
65
|
+
|
|
60
66
|
return combined_model_params
|
|
61
67
|
|
|
62
68
|
|
|
@@ -359,23 +365,30 @@ class KnowledgeBaseTable:
|
|
|
359
365
|
|
|
360
366
|
def insert_query_result(self, query: str, project_name: str):
|
|
361
367
|
"""Process and insert SQL query results"""
|
|
362
|
-
|
|
363
|
-
raise ValueError("Document loader not configured")
|
|
368
|
+
ast_query = parse_sql(query)
|
|
364
369
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
370
|
+
command_executor = ExecuteCommands(self.session)
|
|
371
|
+
response = command_executor.execute_command(ast_query, project_name)
|
|
372
|
+
|
|
373
|
+
if response.error_code is not None:
|
|
374
|
+
raise ValueError(f"Error executing query: {response.error_message}")
|
|
375
|
+
|
|
376
|
+
if response.data is None:
|
|
377
|
+
raise ValueError("Query returned no data")
|
|
378
|
+
|
|
379
|
+
records = response.data.records
|
|
380
|
+
df = pd.DataFrame(records)
|
|
381
|
+
|
|
382
|
+
self.insert(df)
|
|
368
383
|
|
|
369
384
|
def insert_rows(self, rows: List[Dict]):
|
|
370
385
|
"""Process and insert raw data rows"""
|
|
371
386
|
if not rows:
|
|
372
387
|
return
|
|
373
388
|
|
|
374
|
-
|
|
375
|
-
Document(content=row.get("content", ""), id=row.get("id"), metadata=row.get("metadata", {})) for row in rows
|
|
376
|
-
]
|
|
389
|
+
df = pd.DataFrame(rows)
|
|
377
390
|
|
|
378
|
-
self.
|
|
391
|
+
self.insert(df)
|
|
379
392
|
|
|
380
393
|
def insert_documents(self, documents: List[Document]):
|
|
381
394
|
"""Process and insert documents with preprocessing if configured"""
|
|
@@ -944,10 +957,7 @@ class KnowledgeBaseController:
|
|
|
944
957
|
# # it is params for model
|
|
945
958
|
# embedding_params.update(params["embedding_model"])
|
|
946
959
|
|
|
947
|
-
|
|
948
|
-
if not isinstance(params["embedding_model"], dict):
|
|
949
|
-
raise ValueError("embedding_model should be JSON object with model parameters.")
|
|
950
|
-
embedding_params.update(params["embedding_model"])
|
|
960
|
+
embedding_params = get_model_params(params.get("embedding_model", {}), "default_embedding_model")
|
|
951
961
|
|
|
952
962
|
# if model_name is None: # Legacy
|
|
953
963
|
model_name = self._create_embedding_model(
|
|
@@ -168,13 +168,13 @@ class EvaluateBase:
|
|
|
168
168
|
test_data = self.generate_test_data(gen_params)
|
|
169
169
|
|
|
170
170
|
self.save_to_table(test_table, test_data, is_replace=True)
|
|
171
|
-
else:
|
|
172
|
-
test_data = self.read_from_table(test_table)
|
|
173
171
|
|
|
174
172
|
if params.get("evaluate", True) is False:
|
|
175
173
|
# no evaluate is required
|
|
176
174
|
return pd.DataFrame()
|
|
177
175
|
|
|
176
|
+
test_data = self.read_from_table(test_table)
|
|
177
|
+
|
|
178
178
|
scores = self.evaluate(test_data)
|
|
179
179
|
scores["name"] = self.name
|
|
180
180
|
scores["created_at"] = dt.datetime.now()
|
|
@@ -511,6 +511,6 @@ class EvaluateDocID(EvaluateBase):
|
|
|
511
511
|
"total": total_questions,
|
|
512
512
|
"total_found": total_found,
|
|
513
513
|
"retrieved_in_top_10": accurate_in_top_10,
|
|
514
|
-
"cumulative_recall": cumulative_recall,
|
|
514
|
+
"cumulative_recall": json.dumps(cumulative_recall),
|
|
515
515
|
"avg_query_time": avg_query_time,
|
|
516
516
|
}
|
|
@@ -2,7 +2,6 @@ import os
|
|
|
2
2
|
from typing import List, Iterator
|
|
3
3
|
from langchain_core.documents import Document as LangchainDocument
|
|
4
4
|
from langchain_text_splitters import MarkdownHeaderTextSplitter
|
|
5
|
-
import pandas as pd
|
|
6
5
|
|
|
7
6
|
from mindsdb.interfaces.file.file_controller import FileController
|
|
8
7
|
from mindsdb.integrations.utilities.rag.loaders.file_loader import FileLoader
|
|
@@ -20,12 +19,12 @@ class DocumentLoader:
|
|
|
20
19
|
"""Handles loading documents from various sources including SQL queries"""
|
|
21
20
|
|
|
22
21
|
def __init__(
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
22
|
+
self,
|
|
23
|
+
file_controller: FileController,
|
|
24
|
+
file_splitter: FileSplitter,
|
|
25
|
+
markdown_splitter: MarkdownHeaderTextSplitter,
|
|
26
|
+
file_loader_class=FileLoader,
|
|
27
|
+
mysql_proxy=None,
|
|
29
28
|
):
|
|
30
29
|
"""
|
|
31
30
|
Initialize with required dependencies
|
|
@@ -52,8 +51,8 @@ class DocumentLoader:
|
|
|
52
51
|
for doc in loader.lazy_load():
|
|
53
52
|
# Add file extension to metadata for proper splitting
|
|
54
53
|
extension = os.path.splitext(file_path)[1].lower()
|
|
55
|
-
doc.metadata[
|
|
56
|
-
doc.metadata[
|
|
54
|
+
doc.metadata["extension"] = extension
|
|
55
|
+
doc.metadata["source"] = file_name
|
|
57
56
|
|
|
58
57
|
# Use FileSplitter to handle the document based on its type
|
|
59
58
|
split_docs = self.file_splitter.split_documents([doc])
|
|
@@ -62,34 +61,22 @@ class DocumentLoader:
|
|
|
62
61
|
metadata = doc.metadata.copy()
|
|
63
62
|
metadata.update(split_doc.metadata or {})
|
|
64
63
|
|
|
65
|
-
yield Document(
|
|
66
|
-
content=split_doc.page_content,
|
|
67
|
-
metadata=metadata
|
|
68
|
-
)
|
|
64
|
+
yield Document(content=split_doc.page_content, metadata=metadata)
|
|
69
65
|
|
|
70
66
|
def load_web_pages(
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
67
|
+
self,
|
|
68
|
+
urls: List[str],
|
|
69
|
+
crawl_depth: int,
|
|
70
|
+
limit: int,
|
|
71
|
+
filters: List[str] = None,
|
|
76
72
|
) -> Iterator[Document]:
|
|
77
73
|
"""Load and split documents from web pages"""
|
|
78
|
-
websites_df = get_all_websites(
|
|
79
|
-
urls,
|
|
80
|
-
crawl_depth=crawl_depth,
|
|
81
|
-
limit=limit,
|
|
82
|
-
filters=filters
|
|
83
|
-
)
|
|
74
|
+
websites_df = get_all_websites(urls, crawl_depth=crawl_depth, limit=limit, filters=filters)
|
|
84
75
|
|
|
85
76
|
for _, row in websites_df.iterrows():
|
|
86
77
|
# Create a document with HTML extension for proper splitting
|
|
87
78
|
doc = LangchainDocument(
|
|
88
|
-
page_content=row[
|
|
89
|
-
metadata={
|
|
90
|
-
'extension': '.html',
|
|
91
|
-
'url': row['url']
|
|
92
|
-
}
|
|
79
|
+
page_content=row["text_content"], metadata={"extension": ".html", "url": row["url"]}
|
|
93
80
|
)
|
|
94
81
|
|
|
95
82
|
# Use FileSplitter to handle HTML content
|
|
@@ -98,60 +85,4 @@ class DocumentLoader:
|
|
|
98
85
|
metadata = doc.metadata.copy()
|
|
99
86
|
metadata.update(split_doc.metadata or {})
|
|
100
87
|
|
|
101
|
-
yield Document(
|
|
102
|
-
content=split_doc.page_content,
|
|
103
|
-
metadata=metadata
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
def load_query_result(self, query: str, project_name: str) -> Iterator[Document]:
|
|
107
|
-
"""
|
|
108
|
-
Load documents from SQL query results
|
|
109
|
-
|
|
110
|
-
Args:
|
|
111
|
-
query: SQL query to execute
|
|
112
|
-
project_name: Name of the project context
|
|
113
|
-
|
|
114
|
-
Returns:
|
|
115
|
-
Iterator of Document objects
|
|
116
|
-
|
|
117
|
-
Raises:
|
|
118
|
-
ValueError: If mysql_proxy is not configured or query returns no data
|
|
119
|
-
"""
|
|
120
|
-
if not self.mysql_proxy:
|
|
121
|
-
raise ValueError("MySQL proxy not configured")
|
|
122
|
-
|
|
123
|
-
if not query:
|
|
124
|
-
return
|
|
125
|
-
|
|
126
|
-
# Set project context and execute query
|
|
127
|
-
self.mysql_proxy.set_context({'db': project_name})
|
|
128
|
-
query_result = self.mysql_proxy.process_query(query)
|
|
129
|
-
|
|
130
|
-
if query_result.type != 'table':
|
|
131
|
-
raise ValueError('Query returned no data')
|
|
132
|
-
|
|
133
|
-
# Convert query result to DataFrame
|
|
134
|
-
df = query_result.data.to_df()
|
|
135
|
-
|
|
136
|
-
# Process each row into a Document
|
|
137
|
-
for _, row in df.iterrows():
|
|
138
|
-
# Extract id, content and metadata
|
|
139
|
-
content = str(row.get('content', ''))
|
|
140
|
-
id = row.get('id', None)
|
|
141
|
-
|
|
142
|
-
# Convert remaining columns to metadata
|
|
143
|
-
metadata = {
|
|
144
|
-
col: str(row[col])
|
|
145
|
-
for col in df.columns
|
|
146
|
-
if col != 'content' and not pd.isna(row[col])
|
|
147
|
-
}
|
|
148
|
-
metadata['source'] = 'query'
|
|
149
|
-
|
|
150
|
-
# Split content using recursive splitter
|
|
151
|
-
if content:
|
|
152
|
-
|
|
153
|
-
yield Document(
|
|
154
|
-
id=id,
|
|
155
|
-
content=content,
|
|
156
|
-
metadata=metadata
|
|
157
|
-
)
|
|
88
|
+
yield Document(content=split_doc.page_content, metadata=metadata)
|
|
@@ -3,6 +3,28 @@ import re
|
|
|
3
3
|
import json
|
|
4
4
|
from pydantic import BaseModel, Field
|
|
5
5
|
from langchain_core.tools import BaseTool
|
|
6
|
+
from mindsdb_sql_parser.ast import Describe, Select, Identifier, Constant, Star
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def llm_str_strip(s):
    """
    Clean up a string produced by an LLM before it is used as a name or a query.

    Repeatedly removes, from both ends of the string:
      - surrounding whitespace (spaces, tabs, ``\\r``/``\\n``),
      - a leading and/or trailing triple-backtick code fence,
      - a single unpaired quote character (``"``, ``'`` or backtick).

    Quote characters that occur more than once are left alone, so quoted
    literals inside a query are not touched.

    Args:
        s (str): raw text taken from the LLM output.

    Returns:
        str: the cleaned text.
    """
    previous_length = -1
    # Each removal can expose something else to remove (e.g. a newline behind a
    # fence), so keep going until a whole pass leaves the length unchanged.
    while previous_length != len(s):
        previous_length = len(s)

        # Strip all surrounding whitespace, not only "\n": a trailing space or
        # "\r\n" after the closing fence would otherwise hide it from endswith().
        s = s.strip()

        # remove ```
        if s.startswith("```"):
            s = s[3:]
        if s.endswith("```"):
            s = s[:-3]

        # remove extra quotes: only a lone, unbalanced quote is dropped
        for quote in ('"', "'", "`"):
            if s.count(quote) == 1:
                s = s.strip(quote)
    return s
|
|
6
28
|
|
|
7
29
|
|
|
8
30
|
class KnowledgeBaseListToolInput(BaseModel):
|
|
@@ -63,12 +85,14 @@ class KnowledgeBaseInfoTool(BaseTool):
|
|
|
63
85
|
return [kb.strip() for kb in tool_input.split(",")]
|
|
64
86
|
# If it's just a single string without formatting, return it as a single item
|
|
65
87
|
if tool_input.strip():
|
|
66
|
-
return [tool_input
|
|
88
|
+
return [llm_str_strip(tool_input)]
|
|
67
89
|
return []
|
|
68
90
|
|
|
69
91
|
# Extract and clean the knowledge base names
|
|
70
92
|
kb_names_str = match.group(1).strip()
|
|
71
93
|
kb_names = re.findall(r"`([^`]+)`", kb_names_str)
|
|
94
|
+
|
|
95
|
+
kb_names = [llm_str_strip(n) for n in kb_names]
|
|
72
96
|
return kb_names
|
|
73
97
|
|
|
74
98
|
def _run(self, tool_input: str) -> str:
|
|
@@ -83,7 +107,7 @@ class KnowledgeBaseInfoTool(BaseTool):
|
|
|
83
107
|
for kb_name in kb_names:
|
|
84
108
|
try:
|
|
85
109
|
# Get knowledge base schema
|
|
86
|
-
schema_result = self.db.run_no_throw(
|
|
110
|
+
schema_result = self.db.run_no_throw(str(Describe(kb_name, type="knowledge_base")))
|
|
87
111
|
|
|
88
112
|
if not schema_result:
|
|
89
113
|
results.append(f"Knowledge base `{kb_name}` not found or has no schema information.")
|
|
@@ -111,7 +135,9 @@ class KnowledgeBaseInfoTool(BaseTool):
|
|
|
111
135
|
kb_info += "```\n\n"
|
|
112
136
|
|
|
113
137
|
# Get sample data
|
|
114
|
-
sample_data = self.db.run_no_throw(
|
|
138
|
+
sample_data = self.db.run_no_throw(
|
|
139
|
+
str(Select(targets=[Star()], from_table=Identifier(kb_name), limit=Constant(20)))
|
|
140
|
+
)
|
|
115
141
|
|
|
116
142
|
# Sample data
|
|
117
143
|
kb_info += "### Sample Data:\n"
|
|
@@ -196,6 +222,7 @@ class KnowledgeBaseQueryTool(BaseTool):
|
|
|
196
222
|
|
|
197
223
|
try:
|
|
198
224
|
# Execute the query
|
|
225
|
+
query = llm_str_strip(query)
|
|
199
226
|
result = self.db.run_no_throw(query)
|
|
200
227
|
|
|
201
228
|
if not result:
|
|
@@ -10,25 +10,27 @@ from mindsdb.interfaces.skills.custom.text2sql.mindsdb_sql_tool import MindsDBSQ
|
|
|
10
10
|
from mindsdb.interfaces.skills.custom.text2sql.mindsdb_kb_tools import (
|
|
11
11
|
KnowledgeBaseListTool,
|
|
12
12
|
KnowledgeBaseInfoTool,
|
|
13
|
-
KnowledgeBaseQueryTool
|
|
13
|
+
KnowledgeBaseQueryTool,
|
|
14
14
|
)
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class MindsDBSQLToolkit(SQLDatabaseToolkit):
|
|
18
|
+
include_knowledge_base_tools: bool = True
|
|
18
19
|
|
|
19
|
-
def get_tools(self, prefix=
|
|
20
|
-
|
|
20
|
+
def get_tools(self, prefix="") -> List[BaseTool]:
|
|
21
21
|
current_date_time = datetime.now().strftime("%Y-%m-%d %H:%M")
|
|
22
22
|
|
|
23
23
|
"""Get the tools in the toolkit."""
|
|
24
24
|
list_sql_database_tool = ListSQLDatabaseTool(
|
|
25
|
-
name=f
|
|
25
|
+
name=f"sql_db_list_tables{prefix}",
|
|
26
26
|
db=self.db,
|
|
27
|
-
description=dedent(
|
|
27
|
+
description=dedent(
|
|
28
|
+
"""\n
|
|
28
29
|
Input is an empty string, output is a comma-separated list of tables in the database. Each table name is escaped using backticks.
|
|
29
30
|
Each table name in the list may be in one of two formats: database_name.`table_name` or database_name.schema_name.`table_name`.
|
|
30
31
|
Table names in response to the user must be escaped using backticks.
|
|
31
|
-
"""
|
|
32
|
+
"""
|
|
33
|
+
),
|
|
32
34
|
)
|
|
33
35
|
|
|
34
36
|
info_sql_database_tool_description = (
|
|
@@ -45,11 +47,11 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
|
|
|
45
47
|
" $START$ table1 table2 table3 $STOP$\n"
|
|
46
48
|
)
|
|
47
49
|
info_sql_database_tool = InfoSQLDatabaseTool(
|
|
48
|
-
name=f
|
|
49
|
-
db=self.db, description=info_sql_database_tool_description
|
|
50
|
+
name=f"sql_db_schema{prefix}", db=self.db, description=info_sql_database_tool_description
|
|
50
51
|
)
|
|
51
52
|
|
|
52
|
-
query_sql_database_tool_description = dedent(
|
|
53
|
+
query_sql_database_tool_description = dedent(
|
|
54
|
+
f"""\
|
|
53
55
|
Input: A detailed and well-structured SQL query. The query must be enclosed between the symbols $START$ and $STOP$.
|
|
54
56
|
Output: Database result or error message. For errors, rewrite and retry the query. For 'Unknown column' errors, use '{info_sql_database_tool.name}' to check table fields.
|
|
55
57
|
This system is a highly intelligent and reliable PostgreSQL SQL skill designed to work with databases.
|
|
@@ -93,11 +95,11 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
|
|
|
93
95
|
- When asked about yourself or your maker, state that you are a Data-Mind, created by MindsDB to help answer data questions.
|
|
94
96
|
- When asked about your purpose or how you can help, explore the available data sources and then explain that you can answer questions based on the connected data. Provide a few relevant example questions that you could answer for the user about their data.
|
|
95
97
|
Adhere to these guidelines for all queries and responses. Ask for clarification if needed.
|
|
96
|
-
"""
|
|
98
|
+
"""
|
|
99
|
+
)
|
|
97
100
|
|
|
98
101
|
query_sql_database_tool = QuerySQLDataBaseTool(
|
|
99
|
-
name=f
|
|
100
|
-
db=self.db, description=query_sql_database_tool_description
|
|
102
|
+
name=f"sql_db_query{prefix}", db=self.db, description=query_sql_database_tool_description
|
|
101
103
|
)
|
|
102
104
|
|
|
103
105
|
mindsdb_sql_parser_tool_description = (
|
|
@@ -108,15 +110,24 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
|
|
|
108
110
|
f"ALWAYS run this tool before executing a query with {query_sql_database_tool.name}. "
|
|
109
111
|
)
|
|
110
112
|
mindsdb_sql_parser_tool = MindsDBSQLParserTool(
|
|
111
|
-
name=f
|
|
112
|
-
description=mindsdb_sql_parser_tool_description
|
|
113
|
+
name=f"mindsdb_sql_parser_tool{prefix}", description=mindsdb_sql_parser_tool_description
|
|
113
114
|
)
|
|
114
115
|
|
|
116
|
+
sql_tools = [
|
|
117
|
+
query_sql_database_tool,
|
|
118
|
+
info_sql_database_tool,
|
|
119
|
+
list_sql_database_tool,
|
|
120
|
+
mindsdb_sql_parser_tool,
|
|
121
|
+
]
|
|
122
|
+
if not self.include_knowledge_base_tools:
|
|
123
|
+
return sql_tools
|
|
124
|
+
|
|
115
125
|
# Knowledge base tools
|
|
116
126
|
kb_list_tool = KnowledgeBaseListTool(
|
|
117
|
-
name=f
|
|
127
|
+
name=f"kb_list_tool{prefix}",
|
|
118
128
|
db=self.db,
|
|
119
|
-
description=dedent(
|
|
129
|
+
description=dedent(
|
|
130
|
+
"""\
|
|
120
131
|
Lists all available knowledge bases that can be queried.
|
|
121
132
|
Input: No input required, just call the tool directly.
|
|
122
133
|
Output: A table of all available knowledge bases with their names and creation dates.
|
|
@@ -125,13 +136,15 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
|
|
|
125
136
|
Each knowledge base name is escaped using backticks.
|
|
126
137
|
|
|
127
138
|
Example usage: kb_list_tool()
|
|
128
|
-
"""
|
|
139
|
+
"""
|
|
140
|
+
),
|
|
129
141
|
)
|
|
130
142
|
|
|
131
143
|
kb_info_tool = KnowledgeBaseInfoTool(
|
|
132
|
-
name=f
|
|
144
|
+
name=f"kb_info_tool{prefix}",
|
|
133
145
|
db=self.db,
|
|
134
|
-
description=dedent(
|
|
146
|
+
description=dedent(
|
|
147
|
+
f"""\
|
|
135
148
|
Gets detailed information about specific knowledge bases including their structure and metadata fields.
|
|
136
149
|
|
|
137
150
|
Input: A knowledge base name as a simple string.
|
|
@@ -143,13 +156,15 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
|
|
|
143
156
|
Example usage: kb_info_tool("kb_name")
|
|
144
157
|
|
|
145
158
|
Make sure the knowledge base exists by calling {kb_list_tool.name} first.
|
|
146
|
-
"""
|
|
159
|
+
"""
|
|
160
|
+
),
|
|
147
161
|
)
|
|
148
162
|
|
|
149
163
|
kb_query_tool = KnowledgeBaseQueryTool(
|
|
150
|
-
name=f
|
|
164
|
+
name=f"kb_query_tool{prefix}",
|
|
151
165
|
db=self.db,
|
|
152
|
-
description=dedent(
|
|
166
|
+
description=dedent(
|
|
167
|
+
f"""\
|
|
153
168
|
Queries knowledge bases using SQL syntax to retrieve relevant information.
|
|
154
169
|
|
|
155
170
|
Input: A SQL query string that targets a knowledge base.
|
|
@@ -192,15 +207,12 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
|
|
|
192
207
|
- Always include a semicolon at the end of your SQL query
|
|
193
208
|
|
|
194
209
|
For factual questions, use this tool to retrieve information rather than relying on the model's knowledge.
|
|
195
|
-
"""
|
|
210
|
+
"""
|
|
211
|
+
),
|
|
196
212
|
)
|
|
197
213
|
|
|
198
214
|
# Return standard SQL tools and knowledge base tools
|
|
199
|
-
return [
|
|
200
|
-
query_sql_database_tool,
|
|
201
|
-
info_sql_database_tool,
|
|
202
|
-
list_sql_database_tool,
|
|
203
|
-
mindsdb_sql_parser_tool,
|
|
215
|
+
return sql_tools + [
|
|
204
216
|
kb_list_tool,
|
|
205
217
|
kb_info_tool,
|
|
206
218
|
kb_query_tool,
|