MindsDB 25.6.3.0__py3-none-any.whl → 25.6.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +1 -1
- mindsdb/api/executor/utilities/sql.py +18 -19
- mindsdb/integrations/handlers/lindorm_handler/requirements.txt +1 -1
- mindsdb/interfaces/agents/mindsdb_database_agent.py +5 -17
- mindsdb/interfaces/knowledge_base/controller.py +26 -9
- mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +17 -86
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +28 -3
- mindsdb/interfaces/skills/sql_agent.py +1 -1
- {mindsdb-25.6.3.0.dist-info → mindsdb-25.6.3.1.dist-info}/METADATA +249 -249
- {mindsdb-25.6.3.0.dist-info → mindsdb-25.6.3.1.dist-info}/RECORD +14 -14
- {mindsdb-25.6.3.0.dist-info → mindsdb-25.6.3.1.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.3.0.dist-info → mindsdb-25.6.3.1.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.3.0.dist-info → mindsdb-25.6.3.1.dist-info}/top_level.txt +0 -0
mindsdb/__about__.py
CHANGED
@@ -1,6 +1,6 @@
 __title__ = "MindsDB"
 __package_name__ = "mindsdb"
-__version__ = "25.6.3.0"
+__version__ = "25.6.3.1"
 __description__ = "MindsDB's AI SQL Server enables developers to build AI tools that need access to real-time data to perform their tasks"
 __email__ = "jorge@mindsdb.com"
 __author__ = "MindsDB Inc"
mindsdb/api/executor/datahub/datanodes/project_datanode.py
CHANGED

@@ -154,7 +154,7 @@ class ProjectDataNode(DataNode):
 
                 return DataHubResponse(data_frame=df, columns=columns_info)
 
-            raise EntityNotExistsError(f"Can't select from {query_table} in project")
+            raise EntityNotExistsError(f"Can't select from <{query_table}> in project")
         else:
             raise NotImplementedError(f"Query not supported {query}")
 
mindsdb/api/executor/utilities/sql.py
CHANGED

@@ -64,26 +64,25 @@ def query_df_with_type_infer_fallback(query_str: str, dataframes: dict, user_functions):
     pandas.columns
     """
 
-    …
+    with duckdb.connect(database=":memory:") as con:
+        if user_functions:
+            user_functions.register(con)
+
+        for name, value in dataframes.items():
+            con.register(name, value)
+
+        exception = None
+        for sample_size in [1000, 10000, 1000000]:
+            try:
+                con.execute(f"set global pandas_analyze_sample={sample_size};")
+                result_df = con.execute(query_str).fetchdf()
+            except InvalidInputException as e:
+                exception = e
+            else:
+                break
         else:
-            …
-            raise exception
-        description = con.description
-        con.close()
+            raise exception
+        description = con.description
 
     return result_df, description
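For context, the new implementation retries duckdb's pandas type inference with progressively larger sample sizes before giving up. A minimal standalone sketch of that pattern, assuming only duckdb and pandas are installed (the frame and query here are illustrative):

import duckdb
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

with duckdb.connect(database=":memory:") as con:
    con.register("t", df)  # expose the DataFrame to SQL as a view named "t"

    exception = None
    for sample_size in [1000, 10000, 1000000]:
        try:
            # widen the sample duckdb uses to infer pandas column types
            con.execute(f"set global pandas_analyze_sample={sample_size};")
            result_df = con.execute("SELECT a, b FROM t WHERE a > 1").fetchdf()
        except duckdb.InvalidInputException as e:
            exception = e
        else:
            break
    else:
        raise exception  # every sample size failed
    description = con.description  # read column metadata before the connection closes

print(result_df)

The context manager replaces the explicit con.close() of the previous version, so the connection is released even when the query raises.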
mindsdb/interfaces/agents/mindsdb_database_agent.py
CHANGED

@@ -111,24 +111,12 @@ class MindsDBSQL(SQLDatabase):
             )
 
             # Convert ExecuteAnswer to a DataFrame for easier manipulation
-            …
+            if result.data is not None:
+                df = result.data.to_df()
+                return df.to_string(index=False)
+
             else:
-                …
-                try:
-                    df = result.data.to_df()
-                except Exception:
-                    df = None
-
-                # Default behaviour (string)
-                if df is not None:
-                    if not df.empty:
-                        return df.to_string(index=False)
-                    else:
-                        return "Query executed successfully, but returned no data."
-
-                return str(result)
+                return "Query executed successfully, but returned no data."
 
         except Exception as e:
             logger.error(f"Error executing SQL command: {str(e)}\n{traceback.format_exc()}")
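The simplified branch renders any returned rows with pandas' plain-text formatter. A small illustration of that rendering (the data is hypothetical):

import pandas as pd

df = pd.DataFrame({"name": ["my_kb"], "type": ["knowledge_base"]})
print(df.to_string(index=False))  # prints the frame as a plain table, without the index column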
mindsdb/interfaces/knowledge_base/controller.py
CHANGED

@@ -9,6 +9,7 @@ import numpy as np
 
 from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
 from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
+from mindsdb_sql_parser import parse_sql
 
 from mindsdb.integrations.utilities.query_traversal import query_traversal
 
@@ -52,6 +53,13 @@ def get_model_params(model_params: dict, default_config_key: str):
     """
     Get model parameters by combining default config with user provided parameters.
     """
+    # If the default config key is for reranking and the switch to use the default LLM is enabled,
+    # switch to the default LLM model.
+    if default_config_key == "default_reranking_model" and config.get("default_reranking_model").get(
+        "use_default_llm", False
+    ):
+        default_config_key = "default_llm_model"
+
     combined_model_params = copy.deepcopy(config.get(default_config_key, {}))
 
     if model_params:
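In effect, a reranking lookup is redirected to the default LLM settings when the use_default_llm flag is set. A minimal sketch of that precedence, assuming config behaves like a plain dict (values are illustrative):

import copy

config = {
    "default_llm_model": {"provider": "openai", "model_name": "gpt-4o"},
    "default_reranking_model": {"use_default_llm": True},
}

def get_model_params(model_params, default_config_key):
    # mirrors the new branch: a reranking lookup with use_default_llm set
    # borrows the default LLM config instead
    if default_config_key == "default_reranking_model" and config.get("default_reranking_model").get(
        "use_default_llm", False
    ):
        default_config_key = "default_llm_model"

    combined_model_params = copy.deepcopy(config.get(default_config_key, {}))
    if model_params:
        combined_model_params.update(model_params)  # user-provided params win over defaults
    return combined_model_params

print(get_model_params(None, "default_reranking_model"))
# -> {'provider': 'openai', 'model_name': 'gpt-4o'}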
@@ -97,6 +105,8 @@ def get_reranking_model_from_params(reranking_model_params: dict):
     params_copy["api_key"] = get_api_key(provider, params_copy, strict=False)
     params_copy["model"] = params_copy.pop("model_name", None)
 
+    params_copy.pop("use_default_llm", None)
+
     return BaseLLMReranker(**params_copy)
 
 
@@ -359,23 +369,30 @@ class KnowledgeBaseTable:
 
     def insert_query_result(self, query: str, project_name: str):
         """Process and insert SQL query results"""
-        …
-            raise ValueError("Document loader not configured")
+        ast_query = parse_sql(query)
 
-        …
+        command_executor = ExecuteCommands(self.session)
+        response = command_executor.execute_command(ast_query, project_name)
+
+        if response.error_code is not None:
+            raise ValueError(f"Error executing query: {response.error_message}")
+
+        if response.data is None:
+            raise ValueError("Query returned no data")
+
+        records = response.data.records
+        df = pd.DataFrame(records)
+
+        self.insert(df)
 
     def insert_rows(self, rows: List[Dict]):
         """Process and insert raw data rows"""
         if not rows:
             return
 
-        documents = [
-            Document(content=row.get("content", ""), id=row.get("id"), metadata=row.get("metadata", {})) for row in rows
-        ]
+        df = pd.DataFrame(rows)
 
-        self.insert_documents(documents)
+        self.insert(df)
 
     def insert_documents(self, documents: List[Document]):
         """Process and insert documents with preprocessing if configured"""
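The new ingestion path parses the query into an AST and runs it through the command executor instead of routing through the document loader. A quick sketch of the parsing step only, assuming mindsdb_sql_parser is installed (the query is illustrative):

from mindsdb_sql_parser import parse_sql

ast_query = parse_sql("SELECT id, content FROM my_project.my_table")
print(type(ast_query).__name__)  # Select — an AST node the executor can run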
mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py
CHANGED

@@ -2,7 +2,6 @@ import os
 from typing import List, Iterator
 from langchain_core.documents import Document as LangchainDocument
 from langchain_text_splitters import MarkdownHeaderTextSplitter
-import pandas as pd
 
 from mindsdb.interfaces.file.file_controller import FileController
 from mindsdb.integrations.utilities.rag.loaders.file_loader import FileLoader

@@ -20,12 +19,12 @@ class DocumentLoader:
     """Handles loading documents from various sources including SQL queries"""
 
     def __init__(
-        …
+        self,
+        file_controller: FileController,
+        file_splitter: FileSplitter,
+        markdown_splitter: MarkdownHeaderTextSplitter,
+        file_loader_class=FileLoader,
+        mysql_proxy=None,
     ):
         """
         Initialize with required dependencies

@@ -52,8 +51,8 @@ class DocumentLoader:
         for doc in loader.lazy_load():
             # Add file extension to metadata for proper splitting
             extension = os.path.splitext(file_path)[1].lower()
-            doc.metadata['extension'] = extension
-            doc.metadata['source'] = file_name
+            doc.metadata["extension"] = extension
+            doc.metadata["source"] = file_name
 
             # Use FileSplitter to handle the document based on its type
             split_docs = self.file_splitter.split_documents([doc])

@@ -62,34 +61,22 @@ class DocumentLoader:
             metadata = doc.metadata.copy()
             metadata.update(split_doc.metadata or {})
 
-            yield Document(
-                content=split_doc.page_content,
-                metadata=metadata
-            )
+            yield Document(content=split_doc.page_content, metadata=metadata)
 
     def load_web_pages(
-        …
+        self,
+        urls: List[str],
+        crawl_depth: int,
+        limit: int,
+        filters: List[str] = None,
     ) -> Iterator[Document]:
         """Load and split documents from web pages"""
-        websites_df = get_all_websites(
-            urls,
-            crawl_depth=crawl_depth,
-            limit=limit,
-            filters=filters
-        )
+        websites_df = get_all_websites(urls, crawl_depth=crawl_depth, limit=limit, filters=filters)
 
         for _, row in websites_df.iterrows():
             # Create a document with HTML extension for proper splitting
             doc = LangchainDocument(
-                page_content=row['text_content'],
-                metadata={
-                    'extension': '.html',
-                    'url': row['url']
-                }
+                page_content=row["text_content"], metadata={"extension": ".html", "url": row["url"]}
             )
 
             # Use FileSplitter to handle HTML content

@@ -98,60 +85,4 @@ class DocumentLoader:
             metadata = doc.metadata.copy()
             metadata.update(split_doc.metadata or {})
 
-            yield Document(
-                content=split_doc.page_content,
-                metadata=metadata
-            )
-
-    def load_query_result(self, query: str, project_name: str) -> Iterator[Document]:
-        """
-        Load documents from SQL query results
-
-        Args:
-            query: SQL query to execute
-            project_name: Name of the project context
-
-        Returns:
-            Iterator of Document objects
-
-        Raises:
-            ValueError: If mysql_proxy is not configured or query returns no data
-        """
-        if not self.mysql_proxy:
-            raise ValueError("MySQL proxy not configured")
-
-        if not query:
-            return
-
-        # Set project context and execute query
-        self.mysql_proxy.set_context({'db': project_name})
-        query_result = self.mysql_proxy.process_query(query)
-
-        if query_result.type != 'table':
-            raise ValueError('Query returned no data')
-
-        # Convert query result to DataFrame
-        df = query_result.data.to_df()
-
-        # Process each row into a Document
-        for _, row in df.iterrows():
-            # Extract id, content and metadata
-            content = str(row.get('content', ''))
-            id = row.get('id', None)
-
-            # Convert remaining columns to metadata
-            metadata = {
-                col: str(row[col])
-                for col in df.columns
-                if col != 'content' and not pd.isna(row[col])
-            }
-            metadata['source'] = 'query'
-
-            # Split content using recursive splitter
-            if content:
-
-                yield Document(
-                    id=id,
-                    content=content,
-                    metadata=metadata
-                )
+            yield Document(content=split_doc.page_content, metadata=metadata)
mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py
CHANGED

@@ -3,6 +3,7 @@ import re
 import json
 from pydantic import BaseModel, Field
 from langchain_core.tools import BaseTool
+from mindsdb_sql_parser.ast import Describe, Select, Identifier, Constant, Star
 
 
 class KnowledgeBaseListToolInput(BaseModel):

@@ -55,6 +56,26 @@ class KnowledgeBaseInfoTool(BaseTool):
         except (json.JSONDecodeError, TypeError):
             pass
 
+        def strip(s):
+            length = -1
+            while length != len(s):
+                length = len(s)
+
+                # remove ```
+                if s.startswith("```"):
+                    s = s[3:]
+                if s.endswith("```"):
+                    s = s[:-3]
+
+                # remove trailing new lines
+                s = s.strip("\n")
+
+                # remove extra quotes
+                for q in ('"', "'", "`"):
+                    if s.count(q) == 1:
+                        s = s.strip(q)
+            return s
+
         # Finally, try the original regex pattern for $START$ and $STOP$ markers
         match = re.search(r"\$START\$(.*?)\$STOP\$", tool_input, re.DOTALL)
         if not match:
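The strip helper loops until the input stops changing, peeling code fences, stray newlines, and lone quotes. A standalone illustration (the helper is restated so the example runs on its own; inputs are hypothetical):

def strip(s):
    # restated from the hunk above
    length = -1
    while length != len(s):
        length = len(s)
        if s.startswith("```"):
            s = s[3:]
        if s.endswith("```"):
            s = s[:-3]
        s = s.strip("\n")
        for q in ('"', "'", "`"):
            if s.count(q) == 1:
                s = s.strip(q)
    return s

print(strip("```my_kb```"))  # -> my_kb   (code fences removed)
print(strip("`my_kb"))       # -> my_kb   (a lone, unbalanced backtick is stripped)
print(strip('"my_kb"'))      # -> "my_kb" (balanced quotes are left alone, since count != 1)

Note that a quote is only stripped when it appears exactly once in the string, so properly paired quotes survive.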
@@ -63,12 +84,14 @@ class KnowledgeBaseInfoTool(BaseTool):
             return [kb.strip() for kb in tool_input.split(",")]
         # If it's just a single string without formatting, return it as a single item
         if tool_input.strip():
-            return […
+            return [strip(tool_input)]
         return []
 
         # Extract and clean the knowledge base names
         kb_names_str = match.group(1).strip()
         kb_names = re.findall(r"`([^`]+)`", kb_names_str)
+
+        kb_names = [strip(n) for n in kb_names]
         return kb_names
 
     def _run(self, tool_input: str) -> str:

@@ -83,7 +106,7 @@ class KnowledgeBaseInfoTool(BaseTool):
         for kb_name in kb_names:
             try:
                 # Get knowledge base schema
-                schema_result = self.db.run_no_throw(…
+                schema_result = self.db.run_no_throw(str(Describe(kb_name, type="knowledge_base")))
 
                 if not schema_result:
                     results.append(f"Knowledge base `{kb_name}` not found or has no schema information.")

@@ -111,7 +134,9 @@ class KnowledgeBaseInfoTool(BaseTool):
                 kb_info += "```\n\n"
 
                 # Get sample data
-                sample_data = self.db.run_no_throw(…
+                sample_data = self.db.run_no_throw(
+                    str(Select(targets=[Star()], from_table=Identifier(kb_name), limit=Constant(20)))
+                )
 
                 # Sample data
                 kb_info += "### Sample Data:\n"