MindsDB 25.7.1.0__py3-none-any.whl → 25.7.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. The original page linked to further details, which are not reproduced in this text.

@@ -18,6 +18,8 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
18
18
  DistanceFunction,
19
19
  TableField,
20
20
  )
21
+ from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
22
+ from mindsdb.integrations.utilities.sql_utils import KeywordSearchArgs
21
23
  from mindsdb.utilities import log
22
24
  from mindsdb.utilities.profiler import profiler
23
25
  from mindsdb.utilities.context import context as ctx
@@ -26,7 +28,7 @@ logger = log.getLogger(__name__)
26
28
 
27
29
 
28
30
  # todo Issue #7316 add support for different indexes and search algorithms e.g. cosine similarity or L2 norm
29
- class PgVectorHandler(PostgresHandler, VectorStoreHandler):
31
+ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
30
32
  """This handler handles connection and execution of the PostgreSQL with pgvector extension statements."""
31
33
 
32
34
  name = "pgvector"
@@ -228,6 +230,40 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
228
230
  else:
229
231
  return ""
230
232
 
233
+ @staticmethod
234
+ def _construct_where_clause_with_keywords(filter_conditions=None, keyword_query=None, content_column_name=None):
235
+ if not keyword_query or not content_column_name:
236
+ return PgVectorHandler._construct_where_clause(filter_conditions)
237
+
238
+ keyword_query_condition = (
239
+ f"""to_tsvector('english', {content_column_name}) @@ websearch_to_tsquery('english', '{keyword_query}')"""
240
+ )
241
+ if filter_conditions is None:
242
+ return ""
243
+
244
+ where_clauses = []
245
+
246
+ for item in filter_conditions:
247
+ key = item["name"]
248
+
249
+ if item["op"].lower() in ("in", "not in"):
250
+ values = list(repr(i) for i in item["value"])
251
+ item["value"] = "({})".format(", ".join(values))
252
+ else:
253
+ if item["value"] is None:
254
+ item["value"] = "null"
255
+ else:
256
+ item["value"] = repr(item["value"])
257
+ where_clauses.append(f"{key} {item['op']} {item['value']}")
258
+
259
+ where_clauses.append(keyword_query_condition)
260
+ if len(where_clauses) > 1:
261
+ return f"WHERE {' AND '.join(where_clauses)}"
262
+ elif len(where_clauses) == 1:
263
+ return f"WHERE {where_clauses[0]}"
264
+ else:
265
+ return ""
266
+
231
267
  @staticmethod
232
268
  def _construct_full_after_from_clause(
233
269
  where_clause: str,
@@ -236,6 +272,36 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
236
272
  ) -> str:
237
273
  return f"{where_clause} {offset_clause} {limit_clause}"
238
274
 
275
+ def _build_keyword_bm25_query(
276
+ self,
277
+ table_name: str,
278
+ query: str,
279
+ columns: List[str] = None,
280
+ content_column_name: str = "content",
281
+ conditions: List[FilterCondition] = None,
282
+ limit: int = None,
283
+ offset: int = None,
284
+ ):
285
+ if columns is None:
286
+ columns = ["id", "content", "metadata"]
287
+
288
+ filter_conditions, _ = self._translate_conditions(conditions)
289
+
290
+ # given filter conditions, construct where clause
291
+ where_clause = self._construct_where_clause_with_keywords(filter_conditions, query, content_column_name)
292
+
293
+ query = f"""
294
+ SELECT
295
+ {", ".join(columns)},
296
+ ts_rank_cd(to_tsvector('english', {content_column_name}), websearch_to_tsquery('english', '{query}')) as distance
297
+ FROM
298
+ {table_name}
299
+ {where_clause if where_clause else ""}
300
+ {f"LIMIT {limit}" if limit else ""}
301
+ {f"OFFSET {offset}" if offset else ""};"""
302
+
303
+ return query
304
+
239
305
  def _build_select_query(
240
306
  self,
241
307
  table_name: str,
@@ -320,6 +386,33 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
320
386
  columns = ["id", "content", "embeddings", "metadata"]
321
387
 
322
388
  query = self._build_select_query(table_name, columns, conditions, limit, offset)
389
+
390
+ result = self.raw_query(query)
391
+
392
+ # ensure embeddings are returned as string so they can be parsed by mindsdb
393
+ if "embeddings" in columns:
394
+ result["embeddings"] = result["embeddings"].astype(str)
395
+
396
+ return result
397
+
398
+ def keyword_select(
399
+ self,
400
+ table_name: str,
401
+ columns: List[str] = None,
402
+ conditions: List[FilterCondition] = None,
403
+ offset: int = None,
404
+ limit: int = None,
405
+ keyword_search_args: KeywordSearchArgs = None,
406
+ ) -> pd.DataFrame:
407
+ table_name = self._check_table(table_name)
408
+
409
+ if columns is None:
410
+ columns = ["id", "content", "embeddings", "metadata"]
411
+ content_column_name = keyword_search_args.column
412
+ query = self._build_keyword_bm25_query(
413
+ table_name, keyword_search_args.query, columns, content_column_name, conditions, limit, offset
414
+ )
415
+
323
416
  result = self.raw_query(query)
324
417
 
325
418
  # ensure embeddings are returned as string so they can be parsed by mindsdb
@@ -271,10 +271,11 @@ class SalesforceHandler(MetaAPIHandler):
271
271
 
272
272
  # Retrieve the metadata for all Salesforce resources.
273
273
  main_metadata = connection.sobjects.describe()
274
-
275
274
  if table_names:
276
275
  # Filter the metadata for the specified tables.
277
- main_metadata = [resource for resource in main_metadata["sobjects"] if resource["name"] in table_names]
276
+ main_metadata = [
277
+ resource for resource in main_metadata["sobjects"] if resource["name"].lower() in table_names
278
+ ]
278
279
  else:
279
280
  main_metadata = main_metadata["sobjects"]
280
281
 
@@ -165,7 +165,7 @@ def create_table_class(resource_name: Text) -> MetaAPIResource:
165
165
  client = self.handler.connect()
166
166
 
167
167
  resource_metadata = next(
168
- (resource for resource in main_metadata if resource["name"] == resource_name),
168
+ (resource for resource in main_metadata if resource["name"].lower() == resource_name),
169
169
  )
170
170
 
171
171
  # Get row count if Id column is aggregatable.
@@ -0,0 +1,41 @@
1
+ from mindsdb_sql_parser.ast import Select
2
+ from typing import List
3
+ import pandas as pd
4
+
5
+ from mindsdb.integrations.utilities.sql_utils import FilterCondition, KeywordSearchArgs
6
+
7
+
8
+ class KeywordSearchBase:
9
+ """
10
+ Base class for keyword search integrations.
11
+ This class provides a common interface for keyword search functionality.
12
+ """
13
+
14
+ def __init__(self, *args, **kwargs):
15
+ pass
16
+
17
+ def dispatch_keyword_select(
18
+ self, query: Select, conditions: List[FilterCondition] = None, keyword_search_args: KeywordSearchArgs = None
19
+ ):
20
+ """Dispatches a keyword search select query to the appropriate method."""
21
+ raise NotImplementedError()
22
+
23
+ def keyword_select(
24
+ self,
25
+ table_name: str,
26
+ columns: List[str] = None,
27
+ conditions: List[FilterCondition] = None,
28
+ offset: int = None,
29
+ limit: int = None,
30
+ ) -> pd.DataFrame:
31
+ """Select data from table
32
+
33
+ Args:
34
+ table_name (str): table name
35
+ columns (List[str]): columns to select
36
+ conditions (List[FilterCondition]): conditions to select
37
+
38
+ Returns:
39
+ HandlerResponse
40
+ """
41
+ raise NotImplementedError()
@@ -21,7 +21,7 @@ from mindsdb_sql_parser.ast.base import ASTNode
21
21
 
22
22
  from mindsdb.integrations.libs.response import RESPONSE_TYPE, HandlerResponse
23
23
  from mindsdb.utilities import log
24
- from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
24
+ from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator, KeywordSearchArgs
25
25
 
26
26
  from mindsdb.integrations.utilities.query_traversal import query_traversal
27
27
  from .base import BaseHandler
@@ -372,44 +372,65 @@ class VectorStoreHandler(BaseHandler):
372
372
  return self.delete(table_name, conditions=conditions)
373
373
 
374
374
  def dispatch_select(
375
- self, query: Select, conditions: List[FilterCondition] = None, allowed_metadata_columns: List[str] = None
375
+ self,
376
+ query: Select,
377
+ conditions: Optional[List[FilterCondition]] = None,
378
+ allowed_metadata_columns: List[str] = None,
379
+ keyword_search_args: Optional[KeywordSearchArgs] = None,
376
380
  ):
377
381
  """
378
- Dispatch select query to the appropriate method.
382
+ Dispatches a select query to the appropriate method, handling both
383
+ standard selections and keyword searches based on the provided arguments.
379
384
  """
380
- # parse key arguments
385
+ # 1. Parse common query arguments
381
386
  table_name = query.from_table.parts[-1]
382
- # if targets are star, select all columns
387
+
388
+ # If targets are a star (*), select all schema columns
383
389
  if isinstance(query.targets[0], Star):
384
390
  columns = [col["name"] for col in self.SCHEMA]
385
391
  else:
386
392
  columns = [col.parts[-1] for col in query.targets]
387
393
 
394
+ # 2. Validate columns
388
395
  if not self._is_columns_allowed(columns):
389
- raise Exception(f"Columns {columns} not allowed.Allowed columns are {[col['name'] for col in self.SCHEMA]}")
396
+ allowed_cols = [col["name"] for col in self.SCHEMA]
397
+ raise Exception(f"Columns {columns} not allowed. Allowed columns are {allowed_cols}")
390
398
 
391
- # check if columns are allowed
399
+ # 3. Extract and process conditions
392
400
  if conditions is None:
393
401
  where_statement = query.where
394
402
  conditions = self.extract_conditions(where_statement)
395
403
  self._convert_metadata_filters(conditions, allowed_metadata_columns=allowed_metadata_columns)
396
404
 
397
- # get offset and limit
405
+ # 4. Get offset and limit
398
406
  offset = query.offset.value if query.offset is not None else None
399
407
  limit = query.limit.value if query.limit is not None else None
400
408
 
401
- # dispatch select
402
- try:
403
- return self.select(
409
+ # 5. Conditionally dispatch to the correct select method
410
+ if keyword_search_args:
411
+ # It's a keyword search
412
+ return self.keyword_select(
404
413
  table_name,
405
414
  columns=columns,
406
415
  conditions=conditions,
407
416
  offset=offset,
408
417
  limit=limit,
418
+ keyword_search_args=keyword_search_args,
409
419
  )
410
- except Exception as e:
411
- handler_engine = self.__class__.name
412
- raise VectorHandlerException(f"Error in {handler_engine} database: {e}")
420
+ else:
421
+ # It's a standard select
422
+ try:
423
+ return self.select(
424
+ table_name,
425
+ columns=columns,
426
+ conditions=conditions,
427
+ offset=offset,
428
+ limit=limit,
429
+ )
430
+
431
+ except Exception as e:
432
+ handler_engine = self.__class__.name
433
+ raise VectorHandlerException(f"Error in {handler_engine} database: {e}")
413
434
 
414
435
  def _dispatch(self, query: ASTNode) -> HandlerResponse:
415
436
  """
@@ -60,6 +60,17 @@ class FilterCondition:
60
60
  """
61
61
 
62
62
 
63
+ class KeywordSearchArgs:
64
+ def __init__(self, column: str, query: str):
65
+ """
66
+ Args:
67
+ column: The column to search in.
68
+ query: The search query string.
69
+ """
70
+ self.column = column
71
+ self.query = query
72
+
73
+
63
74
  class SortColumn:
64
75
  def __init__(self, column: str, ascending: bool = True):
65
76
  self.column = column
@@ -362,9 +362,7 @@ class Project:
362
362
 
363
363
  columns = [ASSISTANT_COLUMN, USER_COLUMN]
364
364
  case "KNOWLEDGE_BASE":
365
- from mindsdb.interfaces.knowledge_base.controller import KB_TO_VECTORDB_COLUMNS
366
-
367
- columns = list(KB_TO_VECTORDB_COLUMNS.keys()) + ["metadata", "relevance", "distance"]
365
+ columns = ["id", "chunk_id", "chunk_content", "metadata", "relevance", "distance"]
368
366
  case "TABLE":
369
367
  # like 'mindsdb.models'
370
368
  pass
@@ -7,15 +7,15 @@ from mindsdb.utilities.config import config
7
7
 
8
8
 
9
9
  def python_to_duckdb_type(py_type):
10
- if py_type == 'int':
10
+ if py_type == "int":
11
11
  return BIGINT
12
- elif py_type == 'float':
12
+ elif py_type == "float":
13
13
  return DOUBLE
14
- elif py_type == 'str':
14
+ elif py_type == "str":
15
15
  return VARCHAR
16
- elif py_type == 'bool':
16
+ elif py_type == "bool":
17
17
  return BOOLEAN
18
- elif py_type == 'bytes':
18
+ elif py_type == "bytes":
19
19
  return BLOB
20
20
  else:
21
21
  # Unknown
@@ -53,8 +53,8 @@ class BYOMFunctionsController:
53
53
  # first run
54
54
  self.byom_engines = []
55
55
  for name, info in self.session.integration_controller.get_all().items():
56
- if info['type'] == 'ml' and info['engine'] == 'byom':
57
- if info['connection_data'].get('mode') == 'custom_function':
56
+ if info["type"] == "ml" and info["engine"] == "byom":
57
+ if info["connection_data"].get("mode") == "custom_function":
58
58
  self.byom_engines.append(name)
59
59
  return self.byom_engines
60
60
 
@@ -63,7 +63,7 @@ class BYOMFunctionsController:
63
63
  ml_handler = self.session.integration_controller.get_ml_handler(engine)
64
64
 
65
65
  storage = HandlerStorage(ml_handler.integration_id)
66
- methods = storage.json_get('methods')
66
+ methods = storage.json_get("methods")
67
67
  self.byom_methods[engine] = methods
68
68
  self.byom_handlers[engine] = ml_handler
69
69
 
@@ -81,7 +81,7 @@ class BYOMFunctionsController:
81
81
  # do nothing
82
82
  return
83
83
 
84
- new_name = f'{node.namespace}_{fnc_name}'
84
+ new_name = f"{node.namespace}_{fnc_name}"
85
85
  node.op = new_name
86
86
 
87
87
  if new_name in self.callbacks:
@@ -91,16 +91,13 @@ class BYOMFunctionsController:
91
91
  def callback(*args):
92
92
  return self.method_call(engine, fnc_name, args)
93
93
 
94
- input_types = [
95
- param['type']
96
- for param in methods[fnc_name]['input_params']
97
- ]
94
+ input_types = [param["type"] for param in methods[fnc_name]["input_params"]]
98
95
 
99
96
  meta = {
100
- 'name': new_name,
101
- 'callback': callback,
102
- 'input_types': input_types,
103
- 'output_type': methods[fnc_name]['output_type']
97
+ "name": new_name,
98
+ "callback": callback,
99
+ "input_types": input_types,
100
+ "output_type": methods[fnc_name]["output_type"],
104
101
  }
105
102
 
106
103
  self.callbacks[new_name] = meta
@@ -114,7 +111,6 @@ class BYOMFunctionsController:
114
111
 
115
112
 
116
113
  class FunctionController(BYOMFunctionsController):
117
-
118
114
  def __init__(self, *args, **kwargs):
119
115
  super().__init__(*args, **kwargs)
120
116
 
@@ -124,10 +120,10 @@ class FunctionController(BYOMFunctionsController):
124
120
  return meta
125
121
 
126
122
  # builtin functions
127
- if node.op.lower() == 'llm':
123
+ if node.op.lower() == "llm":
128
124
  return self.llm_call_function(node)
129
125
 
130
- elif node.op.lower() == 'to_markdown':
126
+ elif node.op.lower() == "to_markdown":
131
127
  return self.to_markdown_call_function(node)
132
128
 
133
129
  def llm_call_function(self, node):
@@ -141,70 +137,74 @@ class FunctionController(BYOMFunctionsController):
141
137
  try:
142
138
  from langchain_core.messages import HumanMessage
143
139
  from mindsdb.interfaces.agents.langchain_agent import create_chat_model
140
+
144
141
  llm = create_chat_model(chat_model_params)
145
142
  except Exception as e:
146
- raise RuntimeError(f'Unable to use LLM function, check ENV variables: {e}')
143
+ raise RuntimeError(f"Unable to use LLM function, check ENV variables: {e}")
147
144
 
148
145
  def callback(question):
149
146
  resp = llm([HumanMessage(question)])
150
147
  return resp.content
151
148
 
152
- meta = {
153
- 'name': name,
154
- 'callback': callback,
155
- 'input_types': ['str'],
156
- 'output_type': 'str'
157
- }
149
+ meta = {"name": name, "callback": callback, "input_types": ["str"], "output_type": "str"}
158
150
  self.callbacks[name] = meta
159
151
  return meta
160
152
 
161
153
  def to_markdown_call_function(self, node):
162
154
  # load on-demand because lib is heavy
163
155
  from mindsdb.interfaces.functions.to_markdown import ToMarkdown
156
+
164
157
  name = node.op.lower()
165
158
 
166
159
  if name in self.callbacks:
167
160
  return self.callbacks[name]
168
161
 
169
- def callback(file_path_or_url):
170
- chat_model_params = self._parse_chat_model_params('TO_MARKDOWN_FUNCTION_')
171
-
162
+ def prepare_chat_model_params(chat_model_params: dict) -> dict:
163
+ """
164
+ Parepares the chat model parameters for the ToMarkdown function.
165
+ """
172
166
  params_copy = copy.deepcopy(chat_model_params)
173
- params_copy['model'] = params_copy.pop('model_name')
174
- params_copy.pop('api_keys')
175
- params_copy.pop('provider')
167
+ params_copy["model"] = params_copy.pop("model_name")
168
+
169
+ # Set the base_url for the Google provider.
170
+ if params_copy["provider"] == "google" and "base_url" not in params_copy:
171
+ params_copy["base_url"] = "https://generativelanguage.googleapis.com/v1beta/"
172
+
173
+ params_copy.pop("api_keys")
174
+ params_copy.pop("provider")
175
+
176
+ return params_copy
177
+
178
+ def callback(file_path_or_url):
179
+ chat_model_params = self._parse_chat_model_params("TO_MARKDOWN_FUNCTION_")
180
+ chat_model_params = prepare_chat_model_params(chat_model_params)
176
181
 
177
182
  to_markdown = ToMarkdown()
178
- return to_markdown.call(file_path_or_url, **params_copy)
183
+ return to_markdown.call(file_path_or_url, **chat_model_params)
179
184
 
180
- meta = {
181
- 'name': name,
182
- 'callback': callback,
183
- 'input_types': ['str'],
184
- 'output_type': 'str'
185
- }
185
+ meta = {"name": name, "callback": callback, "input_types": ["str"], "output_type": "str"}
186
186
  self.callbacks[name] = meta
187
187
  return meta
188
188
 
189
- def _parse_chat_model_params(self, param_prefix: str = 'LLM_FUNCTION_'):
189
+ def _parse_chat_model_params(self, param_prefix: str = "LLM_FUNCTION_"):
190
190
  """
191
191
  Parses the environment variables for chat model parameters.
192
192
  """
193
193
  chat_model_params = config.get("default_llm") or {}
194
194
  for k, v in os.environ.items():
195
195
  if k.startswith(param_prefix):
196
- param_name = k[len(param_prefix):]
197
- if param_name == 'MODEL':
198
- chat_model_params['model_name'] = v
196
+ param_name = k[len(param_prefix) :]
197
+ if param_name == "MODEL":
198
+ chat_model_params["model_name"] = v
199
199
  else:
200
200
  chat_model_params[param_name.lower()] = v
201
201
 
202
- if 'provider' not in chat_model_params:
203
- chat_model_params['provider'] = 'openai'
202
+ if "provider" not in chat_model_params:
203
+ chat_model_params["provider"] = "openai"
204
204
 
205
- if 'api_key' in chat_model_params:
205
+ if "api_key" in chat_model_params:
206
206
  # move to api_keys dict
207
- chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
207
+ chat_model_params["api_keys"] = {chat_model_params["provider"]: chat_model_params["api_key"]}
208
208
 
209
209
  return chat_model_params
210
210
 
@@ -215,33 +215,23 @@ class DuckDBFunctions:
215
215
  self.functions = {}
216
216
 
217
217
  def check_function(self, node):
218
-
219
218
  meta = self.controller.check_function(node)
220
219
  if meta is None:
221
220
  return
222
221
 
223
- name = meta['name']
222
+ name = meta["name"]
224
223
 
225
224
  if name in self.functions:
226
225
  return
227
226
 
228
- input_types = [
229
- python_to_duckdb_type(param)
230
- for param in meta['input_types']
231
- ]
227
+ input_types = [python_to_duckdb_type(param) for param in meta["input_types"]]
232
228
 
233
229
  self.functions[name] = {
234
- 'callback': function_maker(len(input_types), meta['callback']),
235
- 'input': input_types,
236
- 'output': python_to_duckdb_type(meta['output_type'])
230
+ "callback": function_maker(len(input_types), meta["callback"]),
231
+ "input": input_types,
232
+ "output": python_to_duckdb_type(meta["output_type"]),
237
233
  }
238
234
 
239
235
  def register(self, connection):
240
236
  for name, info in self.functions.items():
241
- connection.create_function(
242
- name,
243
- info['callback'],
244
- info['input'],
245
- info['output'],
246
- null_handling="special"
247
- )
237
+ connection.create_function(name, info["callback"], info["input"], info["output"], null_handling="special")
@@ -2,6 +2,7 @@ from io import BytesIO
2
2
  import os
3
3
  from typing import Union
4
4
  from urllib.parse import urlparse
5
+ import xml.etree.ElementTree as ET
5
6
 
6
7
  from aipdf import ocr
7
8
  import mimetypes
@@ -12,6 +13,7 @@ class ToMarkdown:
12
13
  """
13
14
  Extracts the content of documents of various formats in markdown format.
14
15
  """
16
+
15
17
  def __init__(self):
16
18
  """
17
19
  Initializes the ToMarkdown class.
@@ -24,24 +26,28 @@ class ToMarkdown:
24
26
  file_extension = self._get_file_extension(file_path_or_url)
25
27
  file_content = self._get_file_content(file_path_or_url)
26
28
 
27
- if file_extension == '.pdf':
29
+ if file_extension == ".pdf":
28
30
  return self._pdf_to_markdown(file_content, **kwargs)
31
+
32
+ elif file_extension in (".xml", ".nessus"):
33
+ return self._xml_to_markdown(file_content, **kwargs)
34
+
29
35
  else:
30
36
  raise ValueError(f"Unsupported file type: {file_extension}.")
31
37
 
32
- def _get_file_content(self, file_path_or_url: str) -> str:
38
+ def _get_file_content(self, file_path_or_url: str) -> BytesIO:
33
39
  """
34
40
  Retrieves the content of a file.
35
41
  """
36
42
  parsed_url = urlparse(file_path_or_url)
37
- if parsed_url.scheme in ('http', 'https'):
43
+ if parsed_url.scheme in ("http", "https"):
38
44
  response = requests.get(file_path_or_url)
39
45
  if response.status_code == 200:
40
- return response
46
+ return BytesIO(response.content)
41
47
  else:
42
- raise RuntimeError(f'Unable to retrieve file from URL: {file_path_or_url}')
48
+ raise RuntimeError(f"Unable to retrieve file from URL: {file_path_or_url}")
43
49
  else:
44
- with open(file_path_or_url, 'rb') as file:
50
+ with open(file_path_or_url, "rb") as file:
45
51
  return BytesIO(file.read())
46
52
 
47
53
  def _get_file_extension(self, file_path_or_url: str) -> str:
@@ -49,13 +55,13 @@ class ToMarkdown:
49
55
  Retrieves the file extension from a file path or URL.
50
56
  """
51
57
  parsed_url = urlparse(file_path_or_url)
52
- if parsed_url.scheme in ('http', 'https'):
58
+ if parsed_url.scheme in ("http", "https"):
53
59
  try:
54
60
  # Make a HEAD request to get headers without downloading the file.
55
61
  response = requests.head(file_path_or_url, allow_redirects=True)
56
- content_type = response.headers.get('Content-Type', '')
62
+ content_type = response.headers.get("Content-Type", "")
57
63
  if content_type:
58
- ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
64
+ ext = mimetypes.guess_extension(content_type.split(";")[0].strip())
59
65
  if ext:
60
66
  return ext
61
67
 
@@ -64,16 +70,43 @@ class ToMarkdown:
64
70
  if ext:
65
71
  return ext
66
72
  except requests.RequestException:
67
- raise RuntimeError(f'Unable to retrieve file extension from URL: {file_path_or_url}')
73
+ raise RuntimeError(f"Unable to retrieve file extension from URL: {file_path_or_url}")
68
74
  else:
69
75
  return os.path.splitext(file_path_or_url)[1]
70
76
 
71
- def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes], **kwargs) -> str:
77
+ def _pdf_to_markdown(self, file_content: Union[requests.Response, BytesIO], **kwargs) -> str:
72
78
  """
73
79
  Converts a PDF file to markdown.
74
80
  """
75
- if isinstance(file_content, requests.Response):
76
- file_content = BytesIO(file_content.content)
77
-
78
81
  markdown_pages = ocr(file_content, **kwargs)
79
82
  return "\n\n---\n\n".join(markdown_pages)
83
+
84
+ def _xml_to_markdown(self, file_content: Union[requests.Response, BytesIO], **kwargs) -> str:
85
+ """
86
+ Converts an XML (or Nessus) file to markdown.
87
+ """
88
+
89
+ def parse_element(element: ET.Element, depth: int = 0) -> str:
90
+ """
91
+ Recursively parses an XML element and converts it to markdown.
92
+ """
93
+ markdown = []
94
+ heading = "#" * (depth + 1)
95
+
96
+ markdown.append(f"{heading} {element.tag}")
97
+
98
+ for key, val in element.attrib.items():
99
+ markdown.append(f"- **{key}**: {val}")
100
+
101
+ text = (element.text or "").strip()
102
+ if text:
103
+ markdown.append(f"\n{text}\n")
104
+
105
+ for child in element:
106
+ markdown.append(parse_element(child, depth + 1))
107
+
108
+ return "\n".join(markdown)
109
+
110
+ root = ET.fromstring(file_content.read().decode("utf-8"))
111
+ markdown_content = parse_element(root)
112
+ return markdown_content