mindsdb-25.6.4.0-py3-none-any.whl → mindsdb-25.7.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +53 -94
- mindsdb/api/a2a/agent.py +30 -206
- mindsdb/api/a2a/common/server/server.py +26 -27
- mindsdb/api/a2a/task_manager.py +93 -227
- mindsdb/api/a2a/utils.py +21 -0
- mindsdb/api/executor/command_executor.py +8 -6
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/planner/query_prepare.py +68 -87
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
- mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
- mindsdb/api/executor/utilities/sql.py +97 -21
- mindsdb/api/http/namespaces/agents.py +126 -201
- mindsdb/api/http/namespaces/config.py +12 -1
- mindsdb/api/http/namespaces/file.py +49 -24
- mindsdb/api/mcp/start.py +45 -31
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/libs/keyword_search_base.py +41 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
- mindsdb/integrations/utilities/sql_utils.py +11 -0
- mindsdb/interfaces/agents/agents_controller.py +29 -9
- mindsdb/interfaces/agents/langchain_agent.py +7 -5
- mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
- mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
- mindsdb/interfaces/database/projects.py +1 -3
- mindsdb/interfaces/functions/controller.py +54 -64
- mindsdb/interfaces/functions/to_markdown.py +47 -14
- mindsdb/interfaces/knowledge_base/controller.py +228 -110
- mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
- mindsdb/interfaces/knowledge_base/executor.py +346 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
- mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
- mindsdb/interfaces/skills/sql_agent.py +181 -130
- mindsdb/interfaces/storage/db.py +9 -7
- mindsdb/utilities/config.py +58 -40
- mindsdb/utilities/exception.py +58 -7
- mindsdb/utilities/security.py +54 -11
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/agents/agents_controller.py

@@ -160,7 +160,7 @@ class AgentsController:
         Parameters:
             name (str): The name of the new agent
             project_name (str): The containing project
-            model_name (str): The name of the existing ML model the agent will use
+            model_name (str | dict): The name of the existing ML model the agent will use
             skills (List[Union[str, dict]]): List of existing skill names to add to the new agent, or list of dicts
                 with one of keys is "name", and other is additional parameters for relationship agent<>skill
             provider (str): The provider of the model

@@ -172,6 +172,9 @@ class AgentsController:
             include_knowledge_bases: List of knowledge bases to include for text2sql skills
             ignore_knowledge_bases: List of knowledge bases to ignore for text2sql skills
             <provider>_api_key: API key for the provider (e.g., openai_api_key)
+            data: Dict, data sources for an agent, keys:
+              - knowledge_bases: List of KBs to use (alternative to `include_knowledge_bases`)
+              - tables: list of tables to use (alternative to `include_tables`)

         Returns:
             agent (db.Agents): The created agent

@@ -188,12 +191,17 @@ class AgentsController:
         if agent is not None:
             raise ValueError(f"Agent with name already exists: {name}")

-        if model_name is not None:
-            _, provider = self.check_model_provider(model_name, provider)
-
         # No need to copy params since we're not preserving the original reference
         params = params or {}

+        if isinstance(model_name, dict):
+            # move into params
+            params["model"] = model_name
+            model_name = None
+
+        if model_name is not None:
+            _, provider = self.check_model_provider(model_name, provider)
+
         if model_name is None:
             logger.warning("'model_name' param is not provided. Using default global llm model at runtime.")

@@ -230,6 +238,12 @@ class AgentsController:
         if "database" in params or need_params:
             params["database"] = database

+        if "data" in params:
+            if include_knowledge_bases is None:
+                include_knowledge_bases = params["data"].get("knowledge_bases")
+            if include_tables is None:
+                include_tables = params["data"].get("tables")
+
         if "knowledge_base_database" in params or include_knowledge_bases or ignore_knowledge_bases:
             params["knowledge_base_database"] = knowledge_base_database
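Taken together, the hunks above change how the controller resolves the agent's model and data sources: a dict-valued model_name is folded into params["model"], and a "data" block can stand in for include_knowledge_bases / include_tables when the explicit arguments are omitted. A standalone sketch of that resolution order (the helper below is illustrative, not the actual controller method):

def resolve_agent_inputs(model_name, params=None, include_knowledge_bases=None, include_tables=None):
    params = params or {}

    # dict form of model_name is moved into params["model"] (behaviour added in this release)
    if isinstance(model_name, dict):
        params["model"] = model_name
        model_name = None

    # "data" supplies knowledge bases and tables when the explicit args are not given
    if "data" in params:
        if include_knowledge_bases is None:
            include_knowledge_bases = params["data"].get("knowledge_bases")
        if include_tables is None:
            include_tables = params["data"].get("tables")

    return model_name, params, include_knowledge_bases, include_tables


# Example: dict model plus data sources
print(resolve_agent_inputs(
    {"provider": "openai", "model_name": "gpt-4o"},
    params={"data": {"knowledge_bases": ["my_kb"], "tables": ["mydb.sales"]}},
))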
@@ -549,13 +563,19 @@ class AgentsController:
         agent.deleted_at = datetime.datetime.now()
         db.session.commit()

-    def get_agent_llm_params(self,
+    def get_agent_llm_params(self, agent_params: dict):
         """
         Get agent LLM parameters by combining default config with user provided parameters.
         Similar to how knowledge bases handle default parameters.
         """
         combined_model_params = copy.deepcopy(config.get("default_llm", {}))

+        if "model" in agent_params:
+            model_params = agent_params["model"]
+        else:
+            # params for LLM can be arbitrary
+            model_params = agent_params
+
         if model_params:
             combined_model_params.update(model_params)
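The new get_agent_llm_params starts from the configured default_llm block and overlays the agent's own parameters, preferring an explicit params["model"] dict when present. A minimal sketch of that precedence (the default_llm values here are made up):

import copy

DEFAULT_LLM = {"provider": "openai", "model_name": "gpt-4o", "temperature": 0.0}  # stand-in for config["default_llm"]


def get_agent_llm_params(agent_params: dict) -> dict:
    combined = copy.deepcopy(DEFAULT_LLM)
    # prefer the explicit "model" block; otherwise treat the whole params dict as LLM params
    model_params = agent_params["model"] if "model" in agent_params else agent_params
    if model_params:
        combined.update(model_params)
    return combined


print(get_agent_llm_params({"model": {"provider": "anthropic", "model_name": "claude-3-5-sonnet"}}))
# defaults survive unless the agent overrides them: provider and model_name replaced, temperature kept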
@@ -596,9 +616,9 @@ class AgentsController:
         db.session.commit()

         # Get agent parameters and combine with default LLM parameters at runtime
-
+        llm_params = self.get_agent_llm_params(agent.params)

-        lang_agent = LangchainAgent(agent, model,
+        lang_agent = LangchainAgent(agent, model, llm_params=llm_params)
         return lang_agent.get_completion(messages)

     def _get_completion_stream(

@@ -636,7 +656,7 @@ class AgentsController:
         db.session.commit()

         # Get agent parameters and combine with default LLM parameters at runtime
-
+        llm_params = self.get_agent_llm_params(agent.params)

-        lang_agent = LangchainAgent(agent, model=model,
+        lang_agent = LangchainAgent(agent, model=model, llm_params=llm_params)
         return lang_agent.get_completion(messages, stream=True)
mindsdb/interfaces/agents/langchain_agent.py

@@ -228,7 +228,7 @@ def process_chunk(chunk):


 class LangchainAgent:
-    def __init__(self, agent: db.Agents, model: dict = None,
+    def __init__(self, agent: db.Agents, model: dict = None, llm_params: dict = None):
         self.agent = agent
         self.model = model

@@ -241,12 +241,12 @@ class LangchainAgent:
         self.mdb_langfuse_callback_handler: Optional[object] = None  # custom (see langfuse_callback_handler.py)

         self.langfuse_client_wrapper = LangfuseClientWrapper()
-        self.args = self._initialize_args(
+        self.args = self._initialize_args(llm_params)

         # Back compatibility for old models
         self.provider = self.args.get("provider", get_llm_provider(self.args))

-    def _initialize_args(self,
+    def _initialize_args(self, llm_params: dict = None) -> dict:
         """
         Initialize the arguments for agent execution.

@@ -254,14 +254,16 @@ class LangchainAgent:
         The params are already merged with defaults by AgentsController.get_agent_llm_params.

         Args:
-
+            llm_params: Parameters for agent execution (already merged with defaults)

         Returns:
             dict: Final parameters for agent execution
         """
         # Use the parameters passed to the method (already merged with defaults by AgentsController)
         # No fallback needed as AgentsController.get_agent_llm_params already handles this
-        args = params.copy()
+        args = self.agent.params.copy()
+        if llm_params:
+            args.update(llm_params)

         # Set model name and provider if given in create agent otherwise use global llm defaults
         # AgentsController.get_agent_llm_params
mindsdb/interfaces/agents/mcp_client_agent.py

@@ -71,11 +71,11 @@ class MCPLangchainAgent(LangchainAgent):
         self,
         agent: db.Agents,
         model: dict = None,
-
+        llm_params: dict = None,
         mcp_host: str = "127.0.0.1",
         mcp_port: int = 47337,
     ):
-        super().__init__(agent, model,
+        super().__init__(agent, model, llm_params)
         self.mcp_host = mcp_host
         self.mcp_port = mcp_port
         self.exit_stack = AsyncExitStack()

@@ -251,10 +251,10 @@ def create_mcp_agent(
         raise ValueError(f"Agent {agent_name} not found in project {project_name}")

     # Get merged parameters (defaults + agent params)
-
+    llm_params = agent_controller.get_agent_llm_params(agent_db.params)

     # Create MCP agent with merged parameters
-    mcp_agent = MCPLangchainAgent(agent_db,
+    mcp_agent = MCPLangchainAgent(agent_db, llm_params=llm_params, mcp_host=mcp_host, mcp_port=mcp_port)

     # Wrap for LiteLLM compatibility
     return LiteLLMAgentWrapper(mcp_agent)
mindsdb/interfaces/agents/mindsdb_database_agent.py

@@ -96,27 +96,7 @@ class MindsDBSQL(SQLDatabase):
             # Log the query for debugging
             logger.info(f"Executing SQL query: {command}")

-
-            # remove backticks
-            # command = command.replace('`', '')
-
-            # Parse the SQL string to an AST object first
-            from mindsdb_sql_parser import parse_sql
-
-            ast_query = parse_sql(command)
-
-            # Now execute the parsed query
-            result = self._sql_agent.skill_tool.get_command_executor().execute_command(
-                ast_query, database_name="mindsdb"
-            )
-
-            # Convert ExecuteAnswer to a DataFrame for easier manipulation
-            if result.data is not None:
-                df = result.data.to_df()
-                return df.to_string(index=False)
-
-            else:
-                return "Query executed successfully, but returned no data."
+            return self._sql_agent.query(command)

         except Exception as e:
             logger.error(f"Error executing SQL command: {str(e)}\n{traceback.format_exc()}")

@@ -127,28 +107,6 @@ class MindsDBSQL(SQLDatabase):
                 return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
             return f"Error: {str(e)}"

-    # def run_no_throw(self, command: str, fetch: str = "all") -> str:
-    #     """Execute a SQL command and return the result as a string.
-    #
-    #     This method catches any exceptions and returns an error message instead of raising an exception.
-    #
-    #     Args:
-    #         command: The SQL command to execute
-    #         fetch: Whether to fetch 'all' results or just 'one'
-    #
-    #     Returns:
-    #         A string representation of the result or an error message
-    #     """
-    #     command = extract_essential(command)
-    #     try:
-    #         return self._sql_agent.query_safe(command)
-    #     except Exception as e:
-    #         logger.error(f"Error executing SQL command: {str(e)}")
-    #         # If this is a knowledge base query, provide a more helpful error message
-    #         if "knowledge_base" in command.lower() or any(kb in command for kb in self._sql_agent.get_usable_knowledge_base_names()):
-    #             return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
-    #             return f"Error: {str(e)}"
-
     def get_usable_knowledge_base_names(self) -> List[str]:
         """Get a list of usable knowledge base names.

@@ -160,3 +118,12 @@ class MindsDBSQL(SQLDatabase):
         except Exception as e:
             logger.error(f"Error getting usable knowledge base names: {str(e)}")
             return []
+
+    def check_knowledge_base_permission(self, name):
+        """Get a list of usable knowledge base names.
+
+        Returns:
+            A list of knowledge base names that can be used in queries
+        """
+
+        return self._sql_agent.check_knowledge_base_permission(name)
mindsdb/interfaces/data_catalog/data_catalog_reader.py

@@ -18,7 +18,9 @@ class DataCatalogReader(BaseDataCatalog):

         metadata_str = "Data Catalog: \n"
         if hasattr(self.data_handler, "meta_get_handler_info"):
-
+            info = self.data_handler.meta_get_handler_info()
+            if info:
+                metadata_str += info + "\n\n"

         for table in tables:
             metadata_str += table.as_string() + "\n\n"
mindsdb/interfaces/database/projects.py

@@ -362,9 +362,7 @@ class Project:

                 columns = [ASSISTANT_COLUMN, USER_COLUMN]
             case "KNOWLEDGE_BASE":
-
-
-                columns = list(KB_TO_VECTORDB_COLUMNS.keys()) + ["metadata", "relevance", "distance"]
+                columns = ["id", "chunk_id", "chunk_content", "metadata", "relevance", "distance"]
             case "TABLE":
                 # like 'mindsdb.models'
                 pass
mindsdb/interfaces/functions/controller.py

@@ -7,15 +7,15 @@ from mindsdb.utilities.config import config


 def python_to_duckdb_type(py_type):
-    if py_type ==
+    if py_type == "int":
         return BIGINT
-    elif py_type ==
+    elif py_type == "float":
         return DOUBLE
-    elif py_type ==
+    elif py_type == "str":
         return VARCHAR
-    elif py_type ==
+    elif py_type == "bool":
         return BOOLEAN
-    elif py_type ==
+    elif py_type == "bytes":
         return BLOB
     else:
         # Unknown

@@ -53,8 +53,8 @@ class BYOMFunctionsController:
             # first run
             self.byom_engines = []
             for name, info in self.session.integration_controller.get_all().items():
-                if info[
-                    if info[
+                if info["type"] == "ml" and info["engine"] == "byom":
+                    if info["connection_data"].get("mode") == "custom_function":
                         self.byom_engines.append(name)
         return self.byom_engines

@@ -63,7 +63,7 @@ class BYOMFunctionsController:
         ml_handler = self.session.integration_controller.get_ml_handler(engine)

         storage = HandlerStorage(ml_handler.integration_id)
-        methods = storage.json_get(
+        methods = storage.json_get("methods")
         self.byom_methods[engine] = methods
         self.byom_handlers[engine] = ml_handler

@@ -81,7 +81,7 @@ class BYOMFunctionsController:
             # do nothing
             return

-        new_name = f
+        new_name = f"{node.namespace}_{fnc_name}"
         node.op = new_name

         if new_name in self.callbacks:

@@ -91,16 +91,13 @@ class BYOMFunctionsController:
         def callback(*args):
             return self.method_call(engine, fnc_name, args)

-        input_types = [
-            param['type']
-            for param in methods[fnc_name]['input_params']
-        ]
+        input_types = [param["type"] for param in methods[fnc_name]["input_params"]]

         meta = {
-
-
-
-
+            "name": new_name,
+            "callback": callback,
+            "input_types": input_types,
+            "output_type": methods[fnc_name]["output_type"],
         }

         self.callbacks[new_name] = meta

@@ -114,7 +111,6 @@ class BYOMFunctionsController:


 class FunctionController(BYOMFunctionsController):
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

@@ -124,10 +120,10 @@ class FunctionController(BYOMFunctionsController):
             return meta

         # builtin functions
-        if node.op.lower() ==
+        if node.op.lower() == "llm":
             return self.llm_call_function(node)

-        elif node.op.lower() ==
+        elif node.op.lower() == "to_markdown":
             return self.to_markdown_call_function(node)

     def llm_call_function(self, node):
@@ -141,70 +137,74 @@ class FunctionController(BYOMFunctionsController):
         try:
             from langchain_core.messages import HumanMessage
             from mindsdb.interfaces.agents.langchain_agent import create_chat_model
+
             llm = create_chat_model(chat_model_params)
         except Exception as e:
-            raise RuntimeError(f
+            raise RuntimeError(f"Unable to use LLM function, check ENV variables: {e}")

         def callback(question):
             resp = llm([HumanMessage(question)])
             return resp.content

-        meta = {
-            'name': name,
-            'callback': callback,
-            'input_types': ['str'],
-            'output_type': 'str'
-        }
+        meta = {"name": name, "callback": callback, "input_types": ["str"], "output_type": "str"}
         self.callbacks[name] = meta
         return meta

     def to_markdown_call_function(self, node):
         # load on-demand because lib is heavy
         from mindsdb.interfaces.functions.to_markdown import ToMarkdown
+
         name = node.op.lower()

         if name in self.callbacks:
             return self.callbacks[name]

-        def
-
-
+        def prepare_chat_model_params(chat_model_params: dict) -> dict:
+            """
+            Parepares the chat model parameters for the ToMarkdown function.
+            """
             params_copy = copy.deepcopy(chat_model_params)
-            params_copy[
-
-
+            params_copy["model"] = params_copy.pop("model_name")
+
+            # Set the base_url for the Google provider.
+            if params_copy["provider"] == "google" and "base_url" not in params_copy:
+                params_copy["base_url"] = "https://generativelanguage.googleapis.com/v1beta/"
+
+            params_copy.pop("api_keys")
+            params_copy.pop("provider")
+
+            return params_copy
+
+        def callback(file_path_or_url):
+            chat_model_params = self._parse_chat_model_params("TO_MARKDOWN_FUNCTION_")
+            chat_model_params = prepare_chat_model_params(chat_model_params)

             to_markdown = ToMarkdown()
-            return to_markdown.call(file_path_or_url, **
+            return to_markdown.call(file_path_or_url, **chat_model_params)

-        meta = {
-            'name': name,
-            'callback': callback,
-            'input_types': ['str'],
-            'output_type': 'str'
-        }
+        meta = {"name": name, "callback": callback, "input_types": ["str"], "output_type": "str"}
         self.callbacks[name] = meta
         return meta

-    def _parse_chat_model_params(self, param_prefix: str =
+    def _parse_chat_model_params(self, param_prefix: str = "LLM_FUNCTION_"):
         """
         Parses the environment variables for chat model parameters.
         """
         chat_model_params = config.get("default_llm") or {}
         for k, v in os.environ.items():
             if k.startswith(param_prefix):
-                param_name = k[len(param_prefix):]
-                if param_name ==
-                    chat_model_params[
+                param_name = k[len(param_prefix) :]
+                if param_name == "MODEL":
+                    chat_model_params["model_name"] = v
                 else:
                     chat_model_params[param_name.lower()] = v

-        if
-            chat_model_params[
+        if "provider" not in chat_model_params:
+            chat_model_params["provider"] = "openai"

-        if
+        if "api_key" in chat_model_params:
             # move to api_keys dict
-            chat_model_params["api_keys"] = {chat_model_params[
+            chat_model_params["api_keys"] = {chat_model_params["provider"]: chat_model_params["api_key"]}

         return chat_model_params
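_parse_chat_model_params above collects every environment variable carrying a given prefix (LLM_FUNCTION_ by default, TO_MARKDOWN_FUNCTION_ for the markdown path): MODEL becomes model_name, anything else is lower-cased, a default provider of "openai" is filled in, and a bare api_key is folded into an api_keys dict keyed by provider. A self-contained sketch of that mapping (variable values are illustrative):

def parse_chat_model_params(environ: dict, param_prefix: str = "LLM_FUNCTION_", defaults: dict = None) -> dict:
    params = dict(defaults or {})
    for k, v in environ.items():
        if k.startswith(param_prefix):
            name = k[len(param_prefix):]
            params["model_name" if name == "MODEL" else name.lower()] = v
    if "provider" not in params:
        params["provider"] = "openai"
    if "api_key" in params:
        # fold the flat key into a per-provider api_keys dict
        params["api_keys"] = {params["provider"]: params["api_key"]}
    return params


env = {"LLM_FUNCTION_MODEL": "gpt-4o", "LLM_FUNCTION_API_KEY": "sk-...", "LLM_FUNCTION_TEMPERATURE": "0"}
print(parse_chat_model_params(env))
# {'model_name': 'gpt-4o', 'api_key': 'sk-...', 'temperature': '0', 'provider': 'openai', 'api_keys': {'openai': 'sk-...'}}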
@@ -215,33 +215,23 @@ class DuckDBFunctions:
         self.functions = {}

     def check_function(self, node):
-
         meta = self.controller.check_function(node)
         if meta is None:
             return

-        name = meta[
+        name = meta["name"]

         if name in self.functions:
             return

-        input_types = [
-            python_to_duckdb_type(param)
-            for param in meta['input_types']
-        ]
+        input_types = [python_to_duckdb_type(param) for param in meta["input_types"]]

         self.functions[name] = {
-
-
-
+            "callback": function_maker(len(input_types), meta["callback"]),
+            "input": input_types,
+            "output": python_to_duckdb_type(meta["output_type"]),
         }

     def register(self, connection):
         for name, info in self.functions.items():
-            connection.create_function(
-                name,
-                info['callback'],
-                info['input'],
-                info['output'],
-                null_handling="special"
-            )
+            connection.create_function(name, info["callback"], info["input"], info["output"], null_handling="special")
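The register refactor above only reflows the connection.create_function call; the DuckDB Python API it relies on is unchanged. A minimal standalone example of registering a scalar Python UDF the same way (function name and types are made up; requires the duckdb package):

import duckdb
from duckdb.typing import BIGINT


def add_one(x):
    # with null_handling="special", NULL reaches the function as None instead of being short-circuited
    return None if x is None else x + 1


con = duckdb.connect()
con.create_function("add_one", add_one, [BIGINT], BIGINT, null_handling="special")
print(con.sql("SELECT add_one(41), add_one(NULL)").fetchall())  # [(42, None)]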
mindsdb/interfaces/functions/to_markdown.py

@@ -2,6 +2,7 @@ from io import BytesIO
 import os
 from typing import Union
 from urllib.parse import urlparse
+import xml.etree.ElementTree as ET

 from aipdf import ocr
 import mimetypes

@@ -12,6 +13,7 @@ class ToMarkdown:
     """
     Extracts the content of documents of various formats in markdown format.
     """
+
     def __init__(self):
         """
         Initializes the ToMarkdown class.

@@ -24,24 +26,28 @@ class ToMarkdown:
         file_extension = self._get_file_extension(file_path_or_url)
         file_content = self._get_file_content(file_path_or_url)

-        if file_extension ==
+        if file_extension == ".pdf":
             return self._pdf_to_markdown(file_content, **kwargs)
+
+        elif file_extension in (".xml", ".nessus"):
+            return self._xml_to_markdown(file_content, **kwargs)
+
         else:
             raise ValueError(f"Unsupported file type: {file_extension}.")

-    def _get_file_content(self, file_path_or_url: str) ->
+    def _get_file_content(self, file_path_or_url: str) -> BytesIO:
         """
         Retrieves the content of a file.
         """
         parsed_url = urlparse(file_path_or_url)
-        if parsed_url.scheme in (
+        if parsed_url.scheme in ("http", "https"):
             response = requests.get(file_path_or_url)
             if response.status_code == 200:
-                return response
+                return BytesIO(response.content)
             else:
-                raise RuntimeError(f
+                raise RuntimeError(f"Unable to retrieve file from URL: {file_path_or_url}")
         else:
-            with open(file_path_or_url,
+            with open(file_path_or_url, "rb") as file:
                 return BytesIO(file.read())

     def _get_file_extension(self, file_path_or_url: str) -> str:
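With the dispatch above, ToMarkdown.call routes .pdf files to the OCR path and .xml/.nessus files to the new local XML converter. A hedged usage sketch (the file name is illustrative; the PDF branch additionally needs the LLM credentials that the function controller passes in as kwargs):

from mindsdb.interfaces.functions.to_markdown import ToMarkdown

converter = ToMarkdown()
# XML/Nessus input is converted locally by _xml_to_markdown, no model call involved
print(converter.call("scan_results.nessus"))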
@@ -49,13 +55,13 @@ class ToMarkdown:
         Retrieves the file extension from a file path or URL.
         """
         parsed_url = urlparse(file_path_or_url)
-        if parsed_url.scheme in (
+        if parsed_url.scheme in ("http", "https"):
             try:
                 # Make a HEAD request to get headers without downloading the file.
                 response = requests.head(file_path_or_url, allow_redirects=True)
-                content_type = response.headers.get(
+                content_type = response.headers.get("Content-Type", "")
                 if content_type:
-                    ext = mimetypes.guess_extension(content_type.split(
+                    ext = mimetypes.guess_extension(content_type.split(";")[0].strip())
                     if ext:
                         return ext

@@ -64,16 +70,43 @@ class ToMarkdown:
                 if ext:
                     return ext
             except requests.RequestException:
-                raise RuntimeError(f
+                raise RuntimeError(f"Unable to retrieve file extension from URL: {file_path_or_url}")
         else:
             return os.path.splitext(file_path_or_url)[1]

-    def _pdf_to_markdown(self, file_content: Union[requests.Response,
+    def _pdf_to_markdown(self, file_content: Union[requests.Response, BytesIO], **kwargs) -> str:
         """
         Converts a PDF file to markdown.
         """
-        if isinstance(file_content, requests.Response):
-            file_content = BytesIO(file_content.content)
-
         markdown_pages = ocr(file_content, **kwargs)
         return "\n\n---\n\n".join(markdown_pages)
+
+    def _xml_to_markdown(self, file_content: Union[requests.Response, BytesIO], **kwargs) -> str:
+        """
+        Converts an XML (or Nessus) file to markdown.
+        """
+
+        def parse_element(element: ET.Element, depth: int = 0) -> str:
+            """
+            Recursively parses an XML element and converts it to markdown.
+            """
+            markdown = []
+            heading = "#" * (depth + 1)
+
+            markdown.append(f"{heading} {element.tag}")
+
+            for key, val in element.attrib.items():
+                markdown.append(f"- **{key}**: {val}")
+
+            text = (element.text or "").strip()
+            if text:
+                markdown.append(f"\n{text}\n")
+
+            for child in element:
+                markdown.append(parse_element(child, depth + 1))
+
+            return "\n".join(markdown)
+
+        root = ET.fromstring(file_content.read().decode("utf-8"))
+        markdown_content = parse_element(root)
+        return markdown_content
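The recursive converter above emits a deeper heading per nesting level, a bullet per attribute, then the element text and its children. A quick self-contained check of the same idea on a tiny made-up XML document:

import xml.etree.ElementTree as ET


def parse_element(element, depth=0):
    # mirrors _xml_to_markdown above: heading per tag, bullets for attributes, then text and children
    out = [f"{'#' * (depth + 1)} {element.tag}"]
    for key, val in element.attrib.items():
        out.append(f"- **{key}**: {val}")
    text = (element.text or "").strip()
    if text:
        out.append(f"\n{text}\n")
    for child in element:
        out.append(parse_element(child, depth + 1))
    return "\n".join(out)


sample = "<report scanner='nessus'><host name='10.0.0.5'>open ports: 22, 443</host></report>"
print(parse_element(ET.fromstring(sample)))
# prints "# report" with its scanner attribute, then "## host" with its name attribute and text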