MindsDB 25.6.4.0__py3-none-any.whl → 25.7.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (61) hide show
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +53 -94
  3. mindsdb/api/a2a/agent.py +30 -206
  4. mindsdb/api/a2a/common/server/server.py +26 -27
  5. mindsdb/api/a2a/task_manager.py +93 -227
  6. mindsdb/api/a2a/utils.py +21 -0
  7. mindsdb/api/executor/command_executor.py +8 -6
  8. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  9. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
  10. mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
  11. mindsdb/api/executor/planner/query_prepare.py +68 -87
  12. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
  13. mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
  14. mindsdb/api/executor/utilities/sql.py +97 -21
  15. mindsdb/api/http/namespaces/agents.py +126 -201
  16. mindsdb/api/http/namespaces/config.py +12 -1
  17. mindsdb/api/http/namespaces/file.py +49 -24
  18. mindsdb/api/mcp/start.py +45 -31
  19. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
  20. mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
  21. mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
  22. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
  23. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
  24. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
  25. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  26. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
  27. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
  28. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
  29. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
  30. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
  31. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  32. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  33. mindsdb/integrations/libs/keyword_search_base.py +41 -0
  34. mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
  35. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
  36. mindsdb/integrations/utilities/sql_utils.py +11 -0
  37. mindsdb/interfaces/agents/agents_controller.py +29 -9
  38. mindsdb/interfaces/agents/langchain_agent.py +7 -5
  39. mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
  40. mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
  41. mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
  42. mindsdb/interfaces/database/projects.py +1 -3
  43. mindsdb/interfaces/functions/controller.py +54 -64
  44. mindsdb/interfaces/functions/to_markdown.py +47 -14
  45. mindsdb/interfaces/knowledge_base/controller.py +228 -110
  46. mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
  47. mindsdb/interfaces/knowledge_base/executor.py +346 -0
  48. mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
  49. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
  50. mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
  51. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
  52. mindsdb/interfaces/skills/sql_agent.py +181 -130
  53. mindsdb/interfaces/storage/db.py +9 -7
  54. mindsdb/utilities/config.py +58 -40
  55. mindsdb/utilities/exception.py +58 -7
  56. mindsdb/utilities/security.py +54 -11
  57. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
  58. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
  59. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
  60. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
  61. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
@@ -160,7 +160,7 @@ class AgentsController:
160
160
  Parameters:
161
161
  name (str): The name of the new agent
162
162
  project_name (str): The containing project
163
- model_name (str): The name of the existing ML model the agent will use
163
+ model_name (str | dict): The name of the existing ML model the agent will use
164
164
  skills (List[Union[str, dict]]): List of existing skill names to add to the new agent, or list of dicts
165
165
  with one of keys is "name", and other is additional parameters for relationship agent<>skill
166
166
  provider (str): The provider of the model
@@ -172,6 +172,9 @@ class AgentsController:
172
172
  include_knowledge_bases: List of knowledge bases to include for text2sql skills
173
173
  ignore_knowledge_bases: List of knowledge bases to ignore for text2sql skills
174
174
  <provider>_api_key: API key for the provider (e.g., openai_api_key)
175
+ data: Dict, data sources for an agent, keys:
176
+ - knowledge_bases: List of KBs to use (alternative to `include_knowledge_bases`)
177
+ - tables: list of tables to use (alternative to `include_tables`)
175
178
 
176
179
  Returns:
177
180
  agent (db.Agents): The created agent
@@ -188,12 +191,17 @@ class AgentsController:
188
191
  if agent is not None:
189
192
  raise ValueError(f"Agent with name already exists: {name}")
190
193
 
191
- if model_name is not None:
192
- _, provider = self.check_model_provider(model_name, provider)
193
-
194
194
  # No need to copy params since we're not preserving the original reference
195
195
  params = params or {}
196
196
 
197
+ if isinstance(model_name, dict):
198
+ # move into params
199
+ params["model"] = model_name
200
+ model_name = None
201
+
202
+ if model_name is not None:
203
+ _, provider = self.check_model_provider(model_name, provider)
204
+
197
205
  if model_name is None:
198
206
  logger.warning("'model_name' param is not provided. Using default global llm model at runtime.")
199
207
 
@@ -230,6 +238,12 @@ class AgentsController:
230
238
  if "database" in params or need_params:
231
239
  params["database"] = database
232
240
 
241
+ if "data" in params:
242
+ if include_knowledge_bases is None:
243
+ include_knowledge_bases = params["data"].get("knowledge_bases")
244
+ if include_tables is None:
245
+ include_tables = params["data"].get("tables")
246
+
233
247
  if "knowledge_base_database" in params or include_knowledge_bases or ignore_knowledge_bases:
234
248
  params["knowledge_base_database"] = knowledge_base_database
235
249
 
@@ -549,13 +563,19 @@ class AgentsController:
549
563
  agent.deleted_at = datetime.datetime.now()
550
564
  db.session.commit()
551
565
 
552
- def get_agent_llm_params(self, model_params: dict):
566
+ def get_agent_llm_params(self, agent_params: dict):
553
567
  """
554
568
  Get agent LLM parameters by combining default config with user provided parameters.
555
569
  Similar to how knowledge bases handle default parameters.
556
570
  """
557
571
  combined_model_params = copy.deepcopy(config.get("default_llm", {}))
558
572
 
573
+ if "model" in agent_params:
574
+ model_params = agent_params["model"]
575
+ else:
576
+ # params for LLM can be arbitrary
577
+ model_params = agent_params
578
+
559
579
  if model_params:
560
580
  combined_model_params.update(model_params)
561
581
 
@@ -596,9 +616,9 @@ class AgentsController:
596
616
  db.session.commit()
597
617
 
598
618
  # Get agent parameters and combine with default LLM parameters at runtime
599
- agent_params = self.get_agent_llm_params(agent.params)
619
+ llm_params = self.get_agent_llm_params(agent.params)
600
620
 
601
- lang_agent = LangchainAgent(agent, model, params=agent_params)
621
+ lang_agent = LangchainAgent(agent, model, llm_params=llm_params)
602
622
  return lang_agent.get_completion(messages)
603
623
 
604
624
  def _get_completion_stream(
@@ -636,7 +656,7 @@ class AgentsController:
636
656
  db.session.commit()
637
657
 
638
658
  # Get agent parameters and combine with default LLM parameters at runtime
639
- agent_params = self.get_agent_llm_params(agent.params)
659
+ llm_params = self.get_agent_llm_params(agent.params)
640
660
 
641
- lang_agent = LangchainAgent(agent, model=model, params=agent_params)
661
+ lang_agent = LangchainAgent(agent, model=model, llm_params=llm_params)
642
662
  return lang_agent.get_completion(messages, stream=True)
@@ -228,7 +228,7 @@ def process_chunk(chunk):
228
228
 
229
229
 
230
230
  class LangchainAgent:
231
- def __init__(self, agent: db.Agents, model: dict = None, params: dict = None):
231
+ def __init__(self, agent: db.Agents, model: dict = None, llm_params: dict = None):
232
232
  self.agent = agent
233
233
  self.model = model
234
234
 
@@ -241,12 +241,12 @@ class LangchainAgent:
241
241
  self.mdb_langfuse_callback_handler: Optional[object] = None # custom (see langfuse_callback_handler.py)
242
242
 
243
243
  self.langfuse_client_wrapper = LangfuseClientWrapper()
244
- self.args = self._initialize_args(params)
244
+ self.args = self._initialize_args(llm_params)
245
245
 
246
246
  # Back compatibility for old models
247
247
  self.provider = self.args.get("provider", get_llm_provider(self.args))
248
248
 
249
- def _initialize_args(self, params: dict = None) -> dict:
249
+ def _initialize_args(self, llm_params: dict = None) -> dict:
250
250
  """
251
251
  Initialize the arguments for agent execution.
252
252
 
@@ -254,14 +254,16 @@ class LangchainAgent:
254
254
  The params are already merged with defaults by AgentsController.get_agent_llm_params.
255
255
 
256
256
  Args:
257
- params: Parameters for agent execution (already merged with defaults)
257
+ llm_params: Parameters for agent execution (already merged with defaults)
258
258
 
259
259
  Returns:
260
260
  dict: Final parameters for agent execution
261
261
  """
262
262
  # Use the parameters passed to the method (already merged with defaults by AgentsController)
263
263
  # No fallback needed as AgentsController.get_agent_llm_params already handles this
264
- args = params.copy() if params else {}
264
+ args = self.agent.params.copy()
265
+ if llm_params:
266
+ args.update(llm_params)
265
267
 
266
268
  # Set model name and provider if given in create agent otherwise use global llm defaults
267
269
  # AgentsController.get_agent_llm_params
@@ -71,11 +71,11 @@ class MCPLangchainAgent(LangchainAgent):
71
71
  self,
72
72
  agent: db.Agents,
73
73
  model: dict = None,
74
- params: dict = None,
74
+ llm_params: dict = None,
75
75
  mcp_host: str = "127.0.0.1",
76
76
  mcp_port: int = 47337,
77
77
  ):
78
- super().__init__(agent, model, params)
78
+ super().__init__(agent, model, llm_params)
79
79
  self.mcp_host = mcp_host
80
80
  self.mcp_port = mcp_port
81
81
  self.exit_stack = AsyncExitStack()
@@ -251,10 +251,10 @@ def create_mcp_agent(
251
251
  raise ValueError(f"Agent {agent_name} not found in project {project_name}")
252
252
 
253
253
  # Get merged parameters (defaults + agent params)
254
- merged_params = agent_controller.get_agent_llm_params(agent_db.params)
254
+ llm_params = agent_controller.get_agent_llm_params(agent_db.params)
255
255
 
256
256
  # Create MCP agent with merged parameters
257
- mcp_agent = MCPLangchainAgent(agent_db, params=merged_params, mcp_host=mcp_host, mcp_port=mcp_port)
257
+ mcp_agent = MCPLangchainAgent(agent_db, llm_params=llm_params, mcp_host=mcp_host, mcp_port=mcp_port)
258
258
 
259
259
  # Wrap for LiteLLM compatibility
260
260
  return LiteLLMAgentWrapper(mcp_agent)
@@ -96,27 +96,7 @@ class MindsDBSQL(SQLDatabase):
96
96
  # Log the query for debugging
97
97
  logger.info(f"Executing SQL query: {command}")
98
98
 
99
- # Removing backticks causes in query execution.
100
- # remove backticks
101
- # command = command.replace('`', '')
102
-
103
- # Parse the SQL string to an AST object first
104
- from mindsdb_sql_parser import parse_sql
105
-
106
- ast_query = parse_sql(command)
107
-
108
- # Now execute the parsed query
109
- result = self._sql_agent.skill_tool.get_command_executor().execute_command(
110
- ast_query, database_name="mindsdb"
111
- )
112
-
113
- # Convert ExecuteAnswer to a DataFrame for easier manipulation
114
- if result.data is not None:
115
- df = result.data.to_df()
116
- return df.to_string(index=False)
117
-
118
- else:
119
- return "Query executed successfully, but returned no data."
99
+ return self._sql_agent.query(command)
120
100
 
121
101
  except Exception as e:
122
102
  logger.error(f"Error executing SQL command: {str(e)}\n{traceback.format_exc()}")
@@ -127,28 +107,6 @@ class MindsDBSQL(SQLDatabase):
127
107
  return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
128
108
  return f"Error: {str(e)}"
129
109
 
130
- # def run_no_throw(self, command: str, fetch: str = "all") -> str:
131
- # """Execute a SQL command and return the result as a string.
132
- #
133
- # This method catches any exceptions and returns an error message instead of raising an exception.
134
- #
135
- # Args:
136
- # command: The SQL command to execute
137
- # fetch: Whether to fetch 'all' results or just 'one'
138
- #
139
- # Returns:
140
- # A string representation of the result or an error message
141
- # """
142
- # command = extract_essential(command)
143
- # try:
144
- # return self._sql_agent.query_safe(command)
145
- # except Exception as e:
146
- # logger.error(f"Error executing SQL command: {str(e)}")
147
- # # If this is a knowledge base query, provide a more helpful error message
148
- # if "knowledge_base" in command.lower() or any(kb in command for kb in self._sql_agent.get_usable_knowledge_base_names()):
149
- # return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
150
- # return f"Error: {str(e)}"
151
-
152
110
  def get_usable_knowledge_base_names(self) -> List[str]:
153
111
  """Get a list of usable knowledge base names.
154
112
 
@@ -160,3 +118,12 @@ class MindsDBSQL(SQLDatabase):
160
118
  except Exception as e:
161
119
  logger.error(f"Error getting usable knowledge base names: {str(e)}")
162
120
  return []
121
+
122
+ def check_knowledge_base_permission(self, name):
123
+ """Get a list of usable knowledge base names.
124
+
125
+ Returns:
126
+ A list of knowledge base names that can be used in queries
127
+ """
128
+
129
+ return self._sql_agent.check_knowledge_base_permission(name)
@@ -18,7 +18,9 @@ class DataCatalogReader(BaseDataCatalog):
18
18
 
19
19
  metadata_str = "Data Catalog: \n"
20
20
  if hasattr(self.data_handler, "meta_get_handler_info"):
21
- metadata_str += self.data_handler.meta_get_handler_info() + "\n\n"
21
+ info = self.data_handler.meta_get_handler_info()
22
+ if info:
23
+ metadata_str += info + "\n\n"
22
24
 
23
25
  for table in tables:
24
26
  metadata_str += table.as_string() + "\n\n"
@@ -362,9 +362,7 @@ class Project:
362
362
 
363
363
  columns = [ASSISTANT_COLUMN, USER_COLUMN]
364
364
  case "KNOWLEDGE_BASE":
365
- from mindsdb.interfaces.knowledge_base.controller import KB_TO_VECTORDB_COLUMNS
366
-
367
- columns = list(KB_TO_VECTORDB_COLUMNS.keys()) + ["metadata", "relevance", "distance"]
365
+ columns = ["id", "chunk_id", "chunk_content", "metadata", "relevance", "distance"]
368
366
  case "TABLE":
369
367
  # like 'mindsdb.models'
370
368
  pass
@@ -7,15 +7,15 @@ from mindsdb.utilities.config import config
7
7
 
8
8
 
9
9
  def python_to_duckdb_type(py_type):
10
- if py_type == 'int':
10
+ if py_type == "int":
11
11
  return BIGINT
12
- elif py_type == 'float':
12
+ elif py_type == "float":
13
13
  return DOUBLE
14
- elif py_type == 'str':
14
+ elif py_type == "str":
15
15
  return VARCHAR
16
- elif py_type == 'bool':
16
+ elif py_type == "bool":
17
17
  return BOOLEAN
18
- elif py_type == 'bytes':
18
+ elif py_type == "bytes":
19
19
  return BLOB
20
20
  else:
21
21
  # Unknown
@@ -53,8 +53,8 @@ class BYOMFunctionsController:
53
53
  # first run
54
54
  self.byom_engines = []
55
55
  for name, info in self.session.integration_controller.get_all().items():
56
- if info['type'] == 'ml' and info['engine'] == 'byom':
57
- if info['connection_data'].get('mode') == 'custom_function':
56
+ if info["type"] == "ml" and info["engine"] == "byom":
57
+ if info["connection_data"].get("mode") == "custom_function":
58
58
  self.byom_engines.append(name)
59
59
  return self.byom_engines
60
60
 
@@ -63,7 +63,7 @@ class BYOMFunctionsController:
63
63
  ml_handler = self.session.integration_controller.get_ml_handler(engine)
64
64
 
65
65
  storage = HandlerStorage(ml_handler.integration_id)
66
- methods = storage.json_get('methods')
66
+ methods = storage.json_get("methods")
67
67
  self.byom_methods[engine] = methods
68
68
  self.byom_handlers[engine] = ml_handler
69
69
 
@@ -81,7 +81,7 @@ class BYOMFunctionsController:
81
81
  # do nothing
82
82
  return
83
83
 
84
- new_name = f'{node.namespace}_{fnc_name}'
84
+ new_name = f"{node.namespace}_{fnc_name}"
85
85
  node.op = new_name
86
86
 
87
87
  if new_name in self.callbacks:
@@ -91,16 +91,13 @@ class BYOMFunctionsController:
91
91
  def callback(*args):
92
92
  return self.method_call(engine, fnc_name, args)
93
93
 
94
- input_types = [
95
- param['type']
96
- for param in methods[fnc_name]['input_params']
97
- ]
94
+ input_types = [param["type"] for param in methods[fnc_name]["input_params"]]
98
95
 
99
96
  meta = {
100
- 'name': new_name,
101
- 'callback': callback,
102
- 'input_types': input_types,
103
- 'output_type': methods[fnc_name]['output_type']
97
+ "name": new_name,
98
+ "callback": callback,
99
+ "input_types": input_types,
100
+ "output_type": methods[fnc_name]["output_type"],
104
101
  }
105
102
 
106
103
  self.callbacks[new_name] = meta
@@ -114,7 +111,6 @@ class BYOMFunctionsController:
114
111
 
115
112
 
116
113
  class FunctionController(BYOMFunctionsController):
117
-
118
114
  def __init__(self, *args, **kwargs):
119
115
  super().__init__(*args, **kwargs)
120
116
 
@@ -124,10 +120,10 @@ class FunctionController(BYOMFunctionsController):
124
120
  return meta
125
121
 
126
122
  # builtin functions
127
- if node.op.lower() == 'llm':
123
+ if node.op.lower() == "llm":
128
124
  return self.llm_call_function(node)
129
125
 
130
- elif node.op.lower() == 'to_markdown':
126
+ elif node.op.lower() == "to_markdown":
131
127
  return self.to_markdown_call_function(node)
132
128
 
133
129
  def llm_call_function(self, node):
@@ -141,70 +137,74 @@ class FunctionController(BYOMFunctionsController):
141
137
  try:
142
138
  from langchain_core.messages import HumanMessage
143
139
  from mindsdb.interfaces.agents.langchain_agent import create_chat_model
140
+
144
141
  llm = create_chat_model(chat_model_params)
145
142
  except Exception as e:
146
- raise RuntimeError(f'Unable to use LLM function, check ENV variables: {e}')
143
+ raise RuntimeError(f"Unable to use LLM function, check ENV variables: {e}")
147
144
 
148
145
  def callback(question):
149
146
  resp = llm([HumanMessage(question)])
150
147
  return resp.content
151
148
 
152
- meta = {
153
- 'name': name,
154
- 'callback': callback,
155
- 'input_types': ['str'],
156
- 'output_type': 'str'
157
- }
149
+ meta = {"name": name, "callback": callback, "input_types": ["str"], "output_type": "str"}
158
150
  self.callbacks[name] = meta
159
151
  return meta
160
152
 
161
153
  def to_markdown_call_function(self, node):
162
154
  # load on-demand because lib is heavy
163
155
  from mindsdb.interfaces.functions.to_markdown import ToMarkdown
156
+
164
157
  name = node.op.lower()
165
158
 
166
159
  if name in self.callbacks:
167
160
  return self.callbacks[name]
168
161
 
169
- def callback(file_path_or_url):
170
- chat_model_params = self._parse_chat_model_params('TO_MARKDOWN_FUNCTION_')
171
-
162
+ def prepare_chat_model_params(chat_model_params: dict) -> dict:
163
+ """
164
+ Parepares the chat model parameters for the ToMarkdown function.
165
+ """
172
166
  params_copy = copy.deepcopy(chat_model_params)
173
- params_copy['model'] = params_copy.pop('model_name')
174
- params_copy.pop('api_keys')
175
- params_copy.pop('provider')
167
+ params_copy["model"] = params_copy.pop("model_name")
168
+
169
+ # Set the base_url for the Google provider.
170
+ if params_copy["provider"] == "google" and "base_url" not in params_copy:
171
+ params_copy["base_url"] = "https://generativelanguage.googleapis.com/v1beta/"
172
+
173
+ params_copy.pop("api_keys")
174
+ params_copy.pop("provider")
175
+
176
+ return params_copy
177
+
178
+ def callback(file_path_or_url):
179
+ chat_model_params = self._parse_chat_model_params("TO_MARKDOWN_FUNCTION_")
180
+ chat_model_params = prepare_chat_model_params(chat_model_params)
176
181
 
177
182
  to_markdown = ToMarkdown()
178
- return to_markdown.call(file_path_or_url, **params_copy)
183
+ return to_markdown.call(file_path_or_url, **chat_model_params)
179
184
 
180
- meta = {
181
- 'name': name,
182
- 'callback': callback,
183
- 'input_types': ['str'],
184
- 'output_type': 'str'
185
- }
185
+ meta = {"name": name, "callback": callback, "input_types": ["str"], "output_type": "str"}
186
186
  self.callbacks[name] = meta
187
187
  return meta
188
188
 
189
- def _parse_chat_model_params(self, param_prefix: str = 'LLM_FUNCTION_'):
189
+ def _parse_chat_model_params(self, param_prefix: str = "LLM_FUNCTION_"):
190
190
  """
191
191
  Parses the environment variables for chat model parameters.
192
192
  """
193
193
  chat_model_params = config.get("default_llm") or {}
194
194
  for k, v in os.environ.items():
195
195
  if k.startswith(param_prefix):
196
- param_name = k[len(param_prefix):]
197
- if param_name == 'MODEL':
198
- chat_model_params['model_name'] = v
196
+ param_name = k[len(param_prefix) :]
197
+ if param_name == "MODEL":
198
+ chat_model_params["model_name"] = v
199
199
  else:
200
200
  chat_model_params[param_name.lower()] = v
201
201
 
202
- if 'provider' not in chat_model_params:
203
- chat_model_params['provider'] = 'openai'
202
+ if "provider" not in chat_model_params:
203
+ chat_model_params["provider"] = "openai"
204
204
 
205
- if 'api_key' in chat_model_params:
205
+ if "api_key" in chat_model_params:
206
206
  # move to api_keys dict
207
- chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
207
+ chat_model_params["api_keys"] = {chat_model_params["provider"]: chat_model_params["api_key"]}
208
208
 
209
209
  return chat_model_params
210
210
 
@@ -215,33 +215,23 @@ class DuckDBFunctions:
215
215
  self.functions = {}
216
216
 
217
217
  def check_function(self, node):
218
-
219
218
  meta = self.controller.check_function(node)
220
219
  if meta is None:
221
220
  return
222
221
 
223
- name = meta['name']
222
+ name = meta["name"]
224
223
 
225
224
  if name in self.functions:
226
225
  return
227
226
 
228
- input_types = [
229
- python_to_duckdb_type(param)
230
- for param in meta['input_types']
231
- ]
227
+ input_types = [python_to_duckdb_type(param) for param in meta["input_types"]]
232
228
 
233
229
  self.functions[name] = {
234
- 'callback': function_maker(len(input_types), meta['callback']),
235
- 'input': input_types,
236
- 'output': python_to_duckdb_type(meta['output_type'])
230
+ "callback": function_maker(len(input_types), meta["callback"]),
231
+ "input": input_types,
232
+ "output": python_to_duckdb_type(meta["output_type"]),
237
233
  }
238
234
 
239
235
  def register(self, connection):
240
236
  for name, info in self.functions.items():
241
- connection.create_function(
242
- name,
243
- info['callback'],
244
- info['input'],
245
- info['output'],
246
- null_handling="special"
247
- )
237
+ connection.create_function(name, info["callback"], info["input"], info["output"], null_handling="special")
@@ -2,6 +2,7 @@ from io import BytesIO
2
2
  import os
3
3
  from typing import Union
4
4
  from urllib.parse import urlparse
5
+ import xml.etree.ElementTree as ET
5
6
 
6
7
  from aipdf import ocr
7
8
  import mimetypes
@@ -12,6 +13,7 @@ class ToMarkdown:
12
13
  """
13
14
  Extracts the content of documents of various formats in markdown format.
14
15
  """
16
+
15
17
  def __init__(self):
16
18
  """
17
19
  Initializes the ToMarkdown class.
@@ -24,24 +26,28 @@ class ToMarkdown:
24
26
  file_extension = self._get_file_extension(file_path_or_url)
25
27
  file_content = self._get_file_content(file_path_or_url)
26
28
 
27
- if file_extension == '.pdf':
29
+ if file_extension == ".pdf":
28
30
  return self._pdf_to_markdown(file_content, **kwargs)
31
+
32
+ elif file_extension in (".xml", ".nessus"):
33
+ return self._xml_to_markdown(file_content, **kwargs)
34
+
29
35
  else:
30
36
  raise ValueError(f"Unsupported file type: {file_extension}.")
31
37
 
32
- def _get_file_content(self, file_path_or_url: str) -> str:
38
+ def _get_file_content(self, file_path_or_url: str) -> BytesIO:
33
39
  """
34
40
  Retrieves the content of a file.
35
41
  """
36
42
  parsed_url = urlparse(file_path_or_url)
37
- if parsed_url.scheme in ('http', 'https'):
43
+ if parsed_url.scheme in ("http", "https"):
38
44
  response = requests.get(file_path_or_url)
39
45
  if response.status_code == 200:
40
- return response
46
+ return BytesIO(response.content)
41
47
  else:
42
- raise RuntimeError(f'Unable to retrieve file from URL: {file_path_or_url}')
48
+ raise RuntimeError(f"Unable to retrieve file from URL: {file_path_or_url}")
43
49
  else:
44
- with open(file_path_or_url, 'rb') as file:
50
+ with open(file_path_or_url, "rb") as file:
45
51
  return BytesIO(file.read())
46
52
 
47
53
  def _get_file_extension(self, file_path_or_url: str) -> str:
@@ -49,13 +55,13 @@ class ToMarkdown:
49
55
  Retrieves the file extension from a file path or URL.
50
56
  """
51
57
  parsed_url = urlparse(file_path_or_url)
52
- if parsed_url.scheme in ('http', 'https'):
58
+ if parsed_url.scheme in ("http", "https"):
53
59
  try:
54
60
  # Make a HEAD request to get headers without downloading the file.
55
61
  response = requests.head(file_path_or_url, allow_redirects=True)
56
- content_type = response.headers.get('Content-Type', '')
62
+ content_type = response.headers.get("Content-Type", "")
57
63
  if content_type:
58
- ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
64
+ ext = mimetypes.guess_extension(content_type.split(";")[0].strip())
59
65
  if ext:
60
66
  return ext
61
67
 
@@ -64,16 +70,43 @@ class ToMarkdown:
64
70
  if ext:
65
71
  return ext
66
72
  except requests.RequestException:
67
- raise RuntimeError(f'Unable to retrieve file extension from URL: {file_path_or_url}')
73
+ raise RuntimeError(f"Unable to retrieve file extension from URL: {file_path_or_url}")
68
74
  else:
69
75
  return os.path.splitext(file_path_or_url)[1]
70
76
 
71
- def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes], **kwargs) -> str:
77
+ def _pdf_to_markdown(self, file_content: Union[requests.Response, BytesIO], **kwargs) -> str:
72
78
  """
73
79
  Converts a PDF file to markdown.
74
80
  """
75
- if isinstance(file_content, requests.Response):
76
- file_content = BytesIO(file_content.content)
77
-
78
81
  markdown_pages = ocr(file_content, **kwargs)
79
82
  return "\n\n---\n\n".join(markdown_pages)
83
+
84
+ def _xml_to_markdown(self, file_content: Union[requests.Response, BytesIO], **kwargs) -> str:
85
+ """
86
+ Converts an XML (or Nessus) file to markdown.
87
+ """
88
+
89
+ def parse_element(element: ET.Element, depth: int = 0) -> str:
90
+ """
91
+ Recursively parses an XML element and converts it to markdown.
92
+ """
93
+ markdown = []
94
+ heading = "#" * (depth + 1)
95
+
96
+ markdown.append(f"{heading} {element.tag}")
97
+
98
+ for key, val in element.attrib.items():
99
+ markdown.append(f"- **{key}**: {val}")
100
+
101
+ text = (element.text or "").strip()
102
+ if text:
103
+ markdown.append(f"\n{text}\n")
104
+
105
+ for child in element:
106
+ markdown.append(parse_element(child, depth + 1))
107
+
108
+ return "\n".join(markdown)
109
+
110
+ root = ET.fromstring(file_content.read().decode("utf-8"))
111
+ markdown_content = parse_element(root)
112
+ return markdown_content