MindsDB: mindsdb-25.7.1.0-py3-none-any.whl → mindsdb-25.7.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +53 -94
- mindsdb/api/a2a/agent.py +30 -206
- mindsdb/api/a2a/common/server/server.py +26 -27
- mindsdb/api/a2a/task_manager.py +93 -227
- mindsdb/api/a2a/utils.py +21 -0
- mindsdb/api/executor/utilities/sql.py +97 -21
- mindsdb/api/http/namespaces/agents.py +126 -201
- mindsdb/api/http/namespaces/config.py +12 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +94 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
- mindsdb/integrations/libs/keyword_search_base.py +41 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +35 -14
- mindsdb/integrations/utilities/sql_utils.py +11 -0
- mindsdb/interfaces/database/projects.py +1 -3
- mindsdb/interfaces/functions/controller.py +54 -64
- mindsdb/interfaces/functions/to_markdown.py +47 -14
- mindsdb/interfaces/knowledge_base/controller.py +127 -35
- mindsdb/interfaces/knowledge_base/evaluate.py +2 -2
- mindsdb/utilities/config.py +46 -39
- mindsdb/utilities/exception.py +11 -0
- {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +244 -244
- {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +27 -25
- {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
--- a/mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
+++ b/mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
@@ -18,6 +18,8 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
     DistanceFunction,
     TableField,
 )
+from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
+from mindsdb.integrations.utilities.sql_utils import KeywordSearchArgs
 from mindsdb.utilities import log
 from mindsdb.utilities.profiler import profiler
 from mindsdb.utilities.context import context as ctx
@@ -26,7 +28,7 @@ logger = log.getLogger(__name__)
 
 
 # todo Issue #7316 add support for different indexes and search algorithms e.g. cosine similarity or L2 norm
-class PgVectorHandler(PostgresHandler, VectorStoreHandler):
+class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
     """This handler handles connection and execution of the PostgreSQL with pgvector extension statements."""
 
     name = "pgvector"
@@ -228,6 +230,40 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
         else:
             return ""
 
+    @staticmethod
+    def _construct_where_clause_with_keywords(filter_conditions=None, keyword_query=None, content_column_name=None):
+        if not keyword_query or not content_column_name:
+            return PgVectorHandler._construct_where_clause(filter_conditions)
+
+        keyword_query_condition = (
+            f"""to_tsvector('english', {content_column_name}) @@ websearch_to_tsquery('english', '{keyword_query}')"""
+        )
+        if filter_conditions is None:
+            return ""
+
+        where_clauses = []
+
+        for item in filter_conditions:
+            key = item["name"]
+
+            if item["op"].lower() in ("in", "not in"):
+                values = list(repr(i) for i in item["value"])
+                item["value"] = "({})".format(", ".join(values))
+            else:
+                if item["value"] is None:
+                    item["value"] = "null"
+                else:
+                    item["value"] = repr(item["value"])
+            where_clauses.append(f"{key} {item['op']} {item['value']}")
+
+        where_clauses.append(keyword_query_condition)
+        if len(where_clauses) > 1:
+            return f"WHERE {' AND '.join(where_clauses)}"
+        elif len(where_clauses) == 1:
+            return f"WHERE {where_clauses[0]}"
+        else:
+            return ""
+
     @staticmethod
     def _construct_full_after_from_clause(
         where_clause: str,
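To see what the new helper emits, here is a hedged trace; the filter values below are invented, but the resulting string follows directly from the hunk above. Note one quirk visible in the code: when `filter_conditions` is `None` the method returns an empty string, so the keyword predicate only reaches the WHERE clause when filter conditions accompany it.

```python
# Hypothetical call tracing _construct_where_clause_with_keywords (values invented).
filters = [{"name": "author", "op": "=", "value": "alice"}]
clause = PgVectorHandler._construct_where_clause_with_keywords(
    filter_conditions=filters,
    keyword_query="vector index",
    content_column_name="content",
)
# clause == (
#     "WHERE author = 'alice' AND "
#     "to_tsvector('english', content) @@ websearch_to_tsquery('english', 'vector index')"
# )
```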
--- a/mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
+++ b/mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
@@ -236,6 +272,36 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
     ) -> str:
         return f"{where_clause} {offset_clause} {limit_clause}"
 
+    def _build_keyword_bm25_query(
+        self,
+        table_name: str,
+        query: str,
+        columns: List[str] = None,
+        content_column_name: str = "content",
+        conditions: List[FilterCondition] = None,
+        limit: int = None,
+        offset: int = None,
+    ):
+        if columns is None:
+            columns = ["id", "content", "metadata"]
+
+        filter_conditions, _ = self._translate_conditions(conditions)
+
+        # given filter conditions, construct where clause
+        where_clause = self._construct_where_clause_with_keywords(filter_conditions, query, content_column_name)
+
+        query = f"""
+            SELECT
+                {", ".join(columns)},
+                ts_rank_cd(to_tsvector('english', {content_column_name}), websearch_to_tsquery('english', '{query}')) as distance
+            FROM
+                {table_name}
+            {where_clause if where_clause else ""}
+            {f"LIMIT {limit}" if limit else ""}
+            {f"OFFSET {offset}" if offset else ""};"""
+
+        return query
+
     def _build_select_query(
         self,
         table_name: str,
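Tracing `_build_keyword_bm25_query` with invented values shows the SQL it assembles. Because `conditions` is `None` here, the helper above returns an empty WHERE clause and `ts_rank_cd` simply scores every row:

```python
# Hypothetical trace: handler is assumed to be a connected PgVectorHandler,
# and kb_chunks an existing table.
sql = handler._build_keyword_bm25_query(
    table_name="kb_chunks",
    query="connection timeout",
    limit=5,
)
# sql is roughly:
# SELECT
#     id, content, metadata,
#     ts_rank_cd(to_tsvector('english', content),
#                websearch_to_tsquery('english', 'connection timeout')) as distance
# FROM
#     kb_chunks
# LIMIT 5;
```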
--- a/mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
+++ b/mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
@@ -320,6 +386,33 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
             columns = ["id", "content", "embeddings", "metadata"]
 
         query = self._build_select_query(table_name, columns, conditions, limit, offset)
+
+        result = self.raw_query(query)
+
+        # ensure embeddings are returned as string so they can be parsed by mindsdb
+        if "embeddings" in columns:
+            result["embeddings"] = result["embeddings"].astype(str)
+
+        return result
+
+    def keyword_select(
+        self,
+        table_name: str,
+        columns: List[str] = None,
+        conditions: List[FilterCondition] = None,
+        offset: int = None,
+        limit: int = None,
+        keyword_search_args: KeywordSearchArgs = None,
+    ) -> pd.DataFrame:
+        table_name = self._check_table(table_name)
+
+        if columns is None:
+            columns = ["id", "content", "embeddings", "metadata"]
+        content_column_name = keyword_search_args.column
+        query = self._build_keyword_bm25_query(
+            table_name, keyword_search_args.query, columns, content_column_name, conditions, limit, offset
+        )
+
         result = self.raw_query(query)
 
         # ensure embeddings are returned as string so they can be parsed by mindsdb
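A hedged usage sketch tying `keyword_select` to `KeywordSearchArgs` (added in sql_utils.py further down); the handler instance and table name are invented:

```python
from mindsdb.integrations.utilities.sql_utils import KeywordSearchArgs

# Hypothetical: handler is a connected PgVectorHandler, kb_chunks an existing table.
args = KeywordSearchArgs(column="content", query="failed login attempts")
df = handler.keyword_select(
    "kb_chunks",
    columns=["id", "content", "metadata"],
    keyword_search_args=args,
    limit=10,
)
# df carries the requested columns plus the ts_rank_cd score in "distance".
```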
--- a/mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py
+++ b/mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py
@@ -271,10 +271,11 @@ class SalesforceHandler(MetaAPIHandler):
 
         # Retrieve the metadata for all Salesforce resources.
         main_metadata = connection.sobjects.describe()
-
         if table_names:
             # Filter the metadata for the specified tables.
-            main_metadata = [resource for resource in main_metadata["sobjects"] if resource["name"] in table_names]
+            main_metadata = [
+                resource for resource in main_metadata["sobjects"] if resource["name"].lower() in table_names
+            ]
         else:
             main_metadata = main_metadata["sobjects"]
 
--- a/mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py
+++ b/mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py
@@ -165,7 +165,7 @@ def create_table_class(resource_name: Text) -> MetaAPIResource:
         client = self.handler.connect()
 
         resource_metadata = next(
-            (resource for resource in main_metadata if resource["name"] == resource_name),
+            (resource for resource in main_metadata if resource["name"].lower() == resource_name),
         )
 
         # Get row count if Id column is aggregatable.
--- /dev/null
+++ b/mindsdb/integrations/libs/keyword_search_base.py
@@ -0,0 +1,41 @@
+from mindsdb_sql_parser.ast import Select
+from typing import List
+import pandas as pd
+
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, KeywordSearchArgs
+
+
+class KeywordSearchBase:
+    """
+    Base class for keyword search integrations.
+    This class provides a common interface for keyword search functionality.
+    """
+
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def dispatch_keyword_select(
+        self, query: Select, conditions: List[FilterCondition] = None, keyword_search_args: KeywordSearchArgs = None
+    ):
+        """Dispatches a keyword search select query to the appropriate method."""
+        raise NotImplementedError()
+
+    def keyword_select(
+        self,
+        table_name: str,
+        columns: List[str] = None,
+        conditions: List[FilterCondition] = None,
+        offset: int = None,
+        limit: int = None,
+    ) -> pd.DataFrame:
+        """Select data from table
+
+        Args:
+            table_name (str): table name
+            columns (List[str]): columns to select
+            conditions (List[FilterCondition]): conditions to select
+
+        Returns:
+            HandlerResponse
+        """
+        raise NotImplementedError()
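Both methods on the base class raise `NotImplementedError`, so an integration is expected to override them. A minimal sketch of a conforming subclass (hypothetical, not part of this release; it follows pgvector's extended `keyword_select` signature):

```python
import pandas as pd
from typing import List

from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
from mindsdb.integrations.utilities.sql_utils import FilterCondition, KeywordSearchArgs


class InMemoryKeywordStore(KeywordSearchBase):
    """Hypothetical integration: naive substring matching over in-memory rows."""

    def __init__(self, tables: dict, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tables = tables  # {table_name: list of row dicts}

    def keyword_select(
        self,
        table_name: str,
        columns: List[str] = None,
        conditions: List[FilterCondition] = None,
        offset: int = None,
        limit: int = None,
        keyword_search_args: KeywordSearchArgs = None,
    ) -> pd.DataFrame:
        rows = self.tables.get(table_name, [])
        if keyword_search_args is not None:
            needle = keyword_search_args.query.lower()
            rows = [r for r in rows if needle in str(r.get(keyword_search_args.column, "")).lower()]
        start = offset or 0
        end = start + limit if limit else None
        return pd.DataFrame(rows[start:end], columns=columns)
```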
--- a/mindsdb/integrations/libs/vectordatabase_handler.py
+++ b/mindsdb/integrations/libs/vectordatabase_handler.py
@@ -21,7 +21,7 @@ from mindsdb_sql_parser.ast.base import ASTNode
 
 from mindsdb.integrations.libs.response import RESPONSE_TYPE, HandlerResponse
 from mindsdb.utilities import log
-from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator, KeywordSearchArgs
 
 from mindsdb.integrations.utilities.query_traversal import query_traversal
 from .base import BaseHandler
@@ -372,44 +372,65 @@ class VectorStoreHandler(BaseHandler):
         return self.delete(table_name, conditions=conditions)
 
     def dispatch_select(
-        self, query: Select, conditions: List[FilterCondition] = None, allowed_metadata_columns: List[str] = None
+        self,
+        query: Select,
+        conditions: Optional[List[FilterCondition]] = None,
+        allowed_metadata_columns: List[str] = None,
+        keyword_search_args: Optional[KeywordSearchArgs] = None,
     ):
         """
-
+        Dispatches a select query to the appropriate method, handling both
+        standard selections and keyword searches based on the provided arguments.
         """
-        #
+        # 1. Parse common query arguments
         table_name = query.from_table.parts[-1]
-
+
+        # If targets are a star (*), select all schema columns
         if isinstance(query.targets[0], Star):
             columns = [col["name"] for col in self.SCHEMA]
         else:
             columns = [col.parts[-1] for col in query.targets]
 
+        # 2. Validate columns
         if not self._is_columns_allowed(columns):
-
+            allowed_cols = [col["name"] for col in self.SCHEMA]
+            raise Exception(f"Columns {columns} not allowed. Allowed columns are {allowed_cols}")
 
-        #
+        # 3. Extract and process conditions
         if conditions is None:
             where_statement = query.where
             conditions = self.extract_conditions(where_statement)
         self._convert_metadata_filters(conditions, allowed_metadata_columns=allowed_metadata_columns)
 
-        #
+        # 4. Get offset and limit
         offset = query.offset.value if query.offset is not None else None
         limit = query.limit.value if query.limit is not None else None
 
-        # dispatch select
-
-
+        # 5. Conditionally dispatch to the correct select method
+        if keyword_search_args:
+            # It's a keyword search
+            return self.keyword_select(
                 table_name,
                 columns=columns,
                 conditions=conditions,
                 offset=offset,
                 limit=limit,
+                keyword_search_args=keyword_search_args,
            )
-
-
-
+        else:
+            # It's a standard select
+            try:
+                return self.select(
+                    table_name,
+                    columns=columns,
+                    conditions=conditions,
+                    offset=offset,
+                    limit=limit,
+                )
+
+            except Exception as e:
+                handler_engine = self.__class__.name
+                raise VectorHandlerException(f"Error in {handler_engine} database: {e}")
 
     def _dispatch(self, query: ASTNode) -> HandlerResponse:
         """
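Behaviourally, the reworked dispatcher routes on `keyword_search_args` and wraps standard-select failures in `VectorHandlerException` (presumably the addition to mindsdb/utilities/exception.py listed in the file summary). A hedged sketch, with a hypothetical handler instance:

```python
from mindsdb_sql_parser import parse_sql
from mindsdb.integrations.utilities.sql_utils import KeywordSearchArgs

query = parse_sql("SELECT id, content FROM kb_chunks LIMIT 10")

# Standard path: falls through to select(); errors re-raise as VectorHandlerException.
df = handler.dispatch_select(query)

# Keyword path: the same query, routed to keyword_select() instead.
df_kw = handler.dispatch_select(
    query,
    keyword_search_args=KeywordSearchArgs(column="content", query="timeout"),
)
```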
--- a/mindsdb/integrations/utilities/sql_utils.py
+++ b/mindsdb/integrations/utilities/sql_utils.py
@@ -60,6 +60,17 @@ class FilterCondition:
     """
 
 
+class KeywordSearchArgs:
+    def __init__(self, column: str, query: str):
+        """
+        Args:
+            column: The column to search in.
+            query: The search query string.
+        """
+        self.column = column
+        self.query = query
+
+
 class SortColumn:
     def __init__(self, column: str, ascending: bool = True):
         self.column = column
--- a/mindsdb/interfaces/database/projects.py
+++ b/mindsdb/interfaces/database/projects.py
@@ -362,9 +362,7 @@ class Project:
 
                 columns = [ASSISTANT_COLUMN, USER_COLUMN]
             case "KNOWLEDGE_BASE":
-
-
-                columns = list(KB_TO_VECTORDB_COLUMNS.keys()) + ["metadata", "relevance", "distance"]
+                columns = ["id", "chunk_id", "chunk_content", "metadata", "relevance", "distance"]
             case "TABLE":
                 # like 'mindsdb.models'
                 pass
--- a/mindsdb/interfaces/functions/controller.py
+++ b/mindsdb/interfaces/functions/controller.py
@@ -7,15 +7,15 @@ from mindsdb.utilities.config import config
 
 
 def python_to_duckdb_type(py_type):
-    if py_type == 'int':
+    if py_type == "int":
         return BIGINT
-    elif py_type == 'float':
+    elif py_type == "float":
         return DOUBLE
-    elif py_type == 'str':
+    elif py_type == "str":
         return VARCHAR
-    elif py_type == 'bool':
+    elif py_type == "bool":
         return BOOLEAN
-    elif py_type == 'bytes':
+    elif py_type == "bytes":
         return BLOB
     else:
         # Unknown
@@ -53,8 +53,8 @@ class BYOMFunctionsController:
             # first run
             self.byom_engines = []
             for name, info in self.session.integration_controller.get_all().items():
-                if info['type'] == 'ml' and info['engine'] == 'byom':
-                    if info['connection_data'].get('mode') == 'custom_function':
+                if info["type"] == "ml" and info["engine"] == "byom":
+                    if info["connection_data"].get("mode") == "custom_function":
                         self.byom_engines.append(name)
         return self.byom_engines
 
@@ -63,7 +63,7 @@ class BYOMFunctionsController:
         ml_handler = self.session.integration_controller.get_ml_handler(engine)
 
         storage = HandlerStorage(ml_handler.integration_id)
-        methods = storage.json_get('methods')
+        methods = storage.json_get("methods")
         self.byom_methods[engine] = methods
         self.byom_handlers[engine] = ml_handler
 
@@ -81,7 +81,7 @@ class BYOMFunctionsController:
             # do nothing
             return
 
-        new_name = f'{node.namespace}_{fnc_name}'
+        new_name = f"{node.namespace}_{fnc_name}"
         node.op = new_name
 
         if new_name in self.callbacks:
@@ -91,16 +91,13 @@ class BYOMFunctionsController:
         def callback(*args):
             return self.method_call(engine, fnc_name, args)
 
-        input_types = [
-            param['type']
-            for param in methods[fnc_name]['input_params']
-        ]
+        input_types = [param["type"] for param in methods[fnc_name]["input_params"]]
 
         meta = {
-            'name': new_name,
-            'callback': callback,
-            'input_types': input_types,
-            'output_type': methods[fnc_name]['output_type'],
+            "name": new_name,
+            "callback": callback,
+            "input_types": input_types,
+            "output_type": methods[fnc_name]["output_type"],
         }
 
         self.callbacks[new_name] = meta
@@ -114,7 +111,6 @@ class BYOMFunctionsController:
 
 
 class FunctionController(BYOMFunctionsController):
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
@@ -124,10 +120,10 @@ class FunctionController(BYOMFunctionsController):
             return meta
 
         # builtin functions
-        if node.op.lower() == 'llm':
+        if node.op.lower() == "llm":
             return self.llm_call_function(node)
 
-        elif node.op.lower() == 'to_markdown':
+        elif node.op.lower() == "to_markdown":
             return self.to_markdown_call_function(node)
 
     def llm_call_function(self, node):
@@ -141,70 +137,74 @@ class FunctionController(BYOMFunctionsController):
         try:
             from langchain_core.messages import HumanMessage
             from mindsdb.interfaces.agents.langchain_agent import create_chat_model
+
             llm = create_chat_model(chat_model_params)
         except Exception as e:
-            raise RuntimeError(f'Unable to use LLM function, check ENV variables: {e}')
+            raise RuntimeError(f"Unable to use LLM function, check ENV variables: {e}")
 
         def callback(question):
             resp = llm([HumanMessage(question)])
             return resp.content
 
-        meta = {
-            'name': name,
-            'callback': callback,
-            'input_types': ['str'],
-            'output_type': 'str'
-        }
+        meta = {"name": name, "callback": callback, "input_types": ["str"], "output_type": "str"}
         self.callbacks[name] = meta
         return meta
 
     def to_markdown_call_function(self, node):
         # load on-demand because lib is heavy
        from mindsdb.interfaces.functions.to_markdown import ToMarkdown
+
         name = node.op.lower()
 
         if name in self.callbacks:
             return self.callbacks[name]
 
-        def
-
-
+        def prepare_chat_model_params(chat_model_params: dict) -> dict:
+            """
+            Parepares the chat model parameters for the ToMarkdown function.
+            """
             params_copy = copy.deepcopy(chat_model_params)
-            params_copy[
-
-
+            params_copy["model"] = params_copy.pop("model_name")
+
+            # Set the base_url for the Google provider.
+            if params_copy["provider"] == "google" and "base_url" not in params_copy:
+                params_copy["base_url"] = "https://generativelanguage.googleapis.com/v1beta/"
+
+            params_copy.pop("api_keys")
+            params_copy.pop("provider")
+
+            return params_copy
+
+        def callback(file_path_or_url):
+            chat_model_params = self._parse_chat_model_params("TO_MARKDOWN_FUNCTION_")
+            chat_model_params = prepare_chat_model_params(chat_model_params)
 
             to_markdown = ToMarkdown()
-            return to_markdown.call(file_path_or_url, **
+            return to_markdown.call(file_path_or_url, **chat_model_params)
 
-        meta = {
-            'name': name,
-            'callback': callback,
-            'input_types': ['str'],
-            'output_type': 'str'
-        }
+        meta = {"name": name, "callback": callback, "input_types": ["str"], "output_type": "str"}
         self.callbacks[name] = meta
         return meta
 
-    def _parse_chat_model_params(self, param_prefix: str = 'LLM_FUNCTION_'):
+    def _parse_chat_model_params(self, param_prefix: str = "LLM_FUNCTION_"):
         """
         Parses the environment variables for chat model parameters.
         """
         chat_model_params = config.get("default_llm") or {}
         for k, v in os.environ.items():
             if k.startswith(param_prefix):
-                param_name = k[len(param_prefix):]
-                if param_name == 'MODEL':
-                    chat_model_params['model_name'] = v
+                param_name = k[len(param_prefix) :]
+                if param_name == "MODEL":
+                    chat_model_params["model_name"] = v
                 else:
                     chat_model_params[param_name.lower()] = v
 
-        if 'provider' not in chat_model_params:
-            chat_model_params['provider'] = 'openai'
+        if "provider" not in chat_model_params:
+            chat_model_params["provider"] = "openai"
 
-        if 'api_key' in chat_model_params:
+        if "api_key" in chat_model_params:
             # move to api_keys dict
-            chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
+            chat_model_params["api_keys"] = {chat_model_params["provider"]: chat_model_params["api_key"]}
 
         return chat_model_params
 
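A concrete trace of `_parse_chat_model_params` may help; the environment values are invented, `controller` stands for a FunctionController instance, and no `default_llm` entry is assumed in the config:

```python
import os

# Hypothetical environment; the prefix matches the to_markdown path above.
os.environ["TO_MARKDOWN_FUNCTION_MODEL"] = "gpt-4o"
os.environ["TO_MARKDOWN_FUNCTION_API_KEY"] = "sk-test"

params = controller._parse_chat_model_params("TO_MARKDOWN_FUNCTION_")
# params == {
#     "model_name": "gpt-4o",             # MODEL is mapped to model_name
#     "api_key": "sk-test",               # other keys are lower-cased
#     "provider": "openai",               # defaulted when absent
#     "api_keys": {"openai": "sk-test"},  # api_key copied into the per-provider dict
# }
```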
--- a/mindsdb/interfaces/functions/controller.py
+++ b/mindsdb/interfaces/functions/controller.py
@@ -215,33 +215,23 @@ class DuckDBFunctions:
         self.functions = {}
 
     def check_function(self, node):
-
         meta = self.controller.check_function(node)
         if meta is None:
             return
 
-        name = meta['name']
+        name = meta["name"]
 
         if name in self.functions:
             return
 
-        input_types = [
-            python_to_duckdb_type(param)
-            for param in meta['input_types']
-        ]
+        input_types = [python_to_duckdb_type(param) for param in meta["input_types"]]
 
         self.functions[name] = {
-            'callback': function_maker(len(input_types), meta['callback']),
-            'input': input_types,
-            'output': python_to_duckdb_type(meta['output_type']),
+            "callback": function_maker(len(input_types), meta["callback"]),
+            "input": input_types,
+            "output": python_to_duckdb_type(meta["output_type"]),
         }
 
     def register(self, connection):
         for name, info in self.functions.items():
-            connection.create_function(
-                name,
-                info['callback'],
-                info['input'],
-                info['output'],
-                null_handling="special"
-            )
+            connection.create_function(name, info["callback"], info["input"], info["output"], null_handling="special")
--- a/mindsdb/interfaces/functions/to_markdown.py
+++ b/mindsdb/interfaces/functions/to_markdown.py
@@ -2,6 +2,7 @@ from io import BytesIO
 import os
 from typing import Union
 from urllib.parse import urlparse
+import xml.etree.ElementTree as ET
 
 from aipdf import ocr
 import mimetypes
@@ -12,6 +13,7 @@ class ToMarkdown:
     """
     Extracts the content of documents of various formats in markdown format.
     """
+
     def __init__(self):
         """
         Initializes the ToMarkdown class.
@@ -24,24 +26,28 @@ class ToMarkdown:
         file_extension = self._get_file_extension(file_path_or_url)
         file_content = self._get_file_content(file_path_or_url)
 
-        if file_extension == '.pdf':
+        if file_extension == ".pdf":
             return self._pdf_to_markdown(file_content, **kwargs)
+
+        elif file_extension in (".xml", ".nessus"):
+            return self._xml_to_markdown(file_content, **kwargs)
+
         else:
             raise ValueError(f"Unsupported file type: {file_extension}.")
 
-    def _get_file_content(self, file_path_or_url: str) ->
+    def _get_file_content(self, file_path_or_url: str) -> BytesIO:
         """
         Retrieves the content of a file.
         """
         parsed_url = urlparse(file_path_or_url)
-        if parsed_url.scheme in ('http', 'https'):
+        if parsed_url.scheme in ("http", "https"):
             response = requests.get(file_path_or_url)
             if response.status_code == 200:
-                return response
+                return BytesIO(response.content)
             else:
-                raise RuntimeError(f'Unable to retrieve file from URL: {file_path_or_url}')
+                raise RuntimeError(f"Unable to retrieve file from URL: {file_path_or_url}")
         else:
-            with open(file_path_or_url, 'rb') as file:
+            with open(file_path_or_url, "rb") as file:
                 return BytesIO(file.read())
 
     def _get_file_extension(self, file_path_or_url: str) -> str:
@@ -49,13 +55,13 @@ class ToMarkdown:
         Retrieves the file extension from a file path or URL.
         """
         parsed_url = urlparse(file_path_or_url)
-        if parsed_url.scheme in ('http', 'https'):
+        if parsed_url.scheme in ("http", "https"):
             try:
                 # Make a HEAD request to get headers without downloading the file.
                 response = requests.head(file_path_or_url, allow_redirects=True)
-                content_type = response.headers.get('Content-Type', '')
+                content_type = response.headers.get("Content-Type", "")
                 if content_type:
-                    ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
+                    ext = mimetypes.guess_extension(content_type.split(";")[0].strip())
                     if ext:
                         return ext
 
@@ -64,16 +70,43 @@ class ToMarkdown:
             if ext:
                 return ext
         except requests.RequestException:
-            raise RuntimeError(f'Unable to retrieve file extension from URL: {file_path_or_url}')
+            raise RuntimeError(f"Unable to retrieve file extension from URL: {file_path_or_url}")
         else:
             return os.path.splitext(file_path_or_url)[1]
 
-    def _pdf_to_markdown(self, file_content: Union[requests.Response,
+    def _pdf_to_markdown(self, file_content: Union[requests.Response, BytesIO], **kwargs) -> str:
         """
         Converts a PDF file to markdown.
         """
-        if isinstance(file_content, requests.Response):
-            file_content = BytesIO(file_content.content)
-
         markdown_pages = ocr(file_content, **kwargs)
         return "\n\n---\n\n".join(markdown_pages)
+
+    def _xml_to_markdown(self, file_content: Union[requests.Response, BytesIO], **kwargs) -> str:
+        """
+        Converts an XML (or Nessus) file to markdown.
+        """
+
+        def parse_element(element: ET.Element, depth: int = 0) -> str:
+            """
+            Recursively parses an XML element and converts it to markdown.
+            """
+            markdown = []
+            heading = "#" * (depth + 1)
+
+            markdown.append(f"{heading} {element.tag}")
+
+            for key, val in element.attrib.items():
+                markdown.append(f"- **{key}**: {val}")
+
+            text = (element.text or "").strip()
+            if text:
+                markdown.append(f"\n{text}\n")
+
+            for child in element:
+                markdown.append(parse_element(child, depth + 1))
+
+            return "\n".join(markdown)
+
+        root = ET.fromstring(file_content.read().decode("utf-8"))
+        markdown_content = parse_element(root)
+        return markdown_content
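Finally, the shape of `_xml_to_markdown`'s output, traced on an invented two-element document:

```python
import xml.etree.ElementTree as ET

# Hypothetical input run through parse_element above (depth 0 -> "#", depth 1 -> "##").
root = ET.fromstring('<report id="1"><host name="db01">ok</host></report>')
# parse_element(root) yields:
# # report
# - **id**: 1
# ## host
# - **name**: db01
#
# ok
```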