MindsDB: mindsdb-25.4.1.0-py3-none-any.whl → mindsdb-25.4.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/command_executor.py +62 -61
- mindsdb/api/executor/data_types/answer.py +9 -12
- mindsdb/api/executor/datahub/classes/response.py +11 -0
- mindsdb/api/executor/datahub/datanodes/datanode.py +4 -4
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +7 -9
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +22 -16
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +20 -20
- mindsdb/api/executor/planner/plan_join.py +1 -1
- mindsdb/api/executor/planner/steps.py +2 -1
- mindsdb/api/executor/sql_query/result_set.py +10 -7
- mindsdb/api/executor/sql_query/sql_query.py +36 -82
- mindsdb/api/executor/sql_query/steps/delete_step.py +2 -3
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +5 -3
- mindsdb/api/executor/sql_query/steps/insert_step.py +2 -2
- mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +20 -8
- mindsdb/api/executor/sql_query/steps/update_step.py +4 -6
- mindsdb/api/http/namespaces/sql.py +4 -1
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/ok_packet.py +1 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +4 -27
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +1 -0
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +38 -37
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -13
- mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +1 -1
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -2
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +4 -4
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +19 -5
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +9 -4
- mindsdb/integrations/handlers/redshift_handler/redshift_handler.py +1 -1
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +18 -11
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -2
- mindsdb/integrations/libs/response.py +9 -4
- mindsdb/integrations/libs/vectordatabase_handler.py +17 -5
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +8 -98
- mindsdb/interfaces/database/log.py +8 -9
- mindsdb/interfaces/database/projects.py +1 -5
- mindsdb/interfaces/functions/controller.py +59 -17
- mindsdb/interfaces/functions/to_markdown.py +194 -0
- mindsdb/interfaces/jobs/jobs_controller.py +3 -3
- mindsdb/interfaces/knowledge_base/controller.py +101 -60
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +3 -14
- mindsdb/interfaces/query_context/context_controller.py +3 -1
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/METADATA +231 -230
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/RECORD +48 -46
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/functions/controller.py

@@ -1,6 +1,7 @@
 import os
 
 from duckdb.typing import BIGINT, DOUBLE, VARCHAR, BLOB, BOOLEAN
+from mindsdb.interfaces.functions.to_markdown import ToMarkdown
 from mindsdb.interfaces.storage.model_fs import HandlerStorage
 
 
@@ -121,32 +122,20 @@ class FunctionController(BYOMFunctionsController):
         if meta is not None:
             return meta
 
-        # builtin
+        # builtin functions
         if node.op.lower() == 'llm':
             return self.llm_call_function(node)
 
+        elif node.op.lower() == 'to_markdown':
+            return self.to_markdown_call_function(node)
+
     def llm_call_function(self, node):
         name = node.op.lower()
 
         if name in self.callbacks:
             return self.callbacks[name]
 
-
-        chat_model_params = {}
-        for k, v in os.environ.items():
-            if k.startswith(param_prefix):
-                param_name = k[len(param_prefix):]
-                if param_name == 'MODEL':
-                    chat_model_params['model_name'] = v
-                else:
-                    chat_model_params[param_name.lower()] = v
-
-        if 'provider' not in chat_model_params:
-            chat_model_params['provider'] = 'openai'
-
-        if 'api_key' in chat_model_params:
-            # move to api_keys dict
-            chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
+        chat_model_params = self._parse_chat_model_params()
 
         try:
             from langchain_core.messages import HumanMessage
@@ -168,6 +157,59 @@ class FunctionController(BYOMFunctionsController):
         self.callbacks[name] = meta
         return meta
 
+    def to_markdown_call_function(self, node):
+        name = node.op.lower()
+
+        if name in self.callbacks:
+            return self.callbacks[name]
+
+        def callback(file_path_or_url, use_llm):
+            chat_model_params = self._parse_chat_model_params()
+
+            llm_client = None
+            llm_model = None
+            try:
+                from mindsdb.interfaces.agents.langchain_agent import create_chat_model
+                llm = create_chat_model(chat_model_params)
+                llm_client = llm.root_client
+                llm_model = llm.model_name
+            except Exception:
+                pass
+
+            to_markdown = ToMarkdown(use_llm, llm_client, llm_model)
+            return to_markdown.call(file_path_or_url)
+
+        meta = {
+            'name': name,
+            'callback': callback,
+            'input_types': ['str', 'bool'],
+            'output_type': 'str'
+        }
+        self.callbacks[name] = meta
+        return meta
+
+    def _parse_chat_model_params(self, param_prefix: str = 'LLM_FUNCTION_'):
+        """
+        Parses the environment variables for chat model parameters.
+        """
+        chat_model_params = {}
+        for k, v in os.environ.items():
+            if k.startswith(param_prefix):
+                param_name = k[len(param_prefix):]
+                if param_name == 'MODEL':
+                    chat_model_params['model_name'] = v
+                else:
+                    chat_model_params[param_name.lower()] = v
+
+        if 'provider' not in chat_model_params:
+            chat_model_params['provider'] = 'openai'
+
+        if 'api_key' in chat_model_params:
+            # move to api_keys dict
+            chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
+
+        return chat_model_params
+
 
 class DuckDBFunctions:
     def __init__(self, controller):
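For reference, the refactored helper keeps the old inline behavior. A minimal sketch of what it produces; the specific environment variable values below are illustrative, only the LLM_FUNCTION_ prefix comes from the diff:

import os

# Illustrative values only.
os.environ['LLM_FUNCTION_MODEL'] = 'gpt-4o'
os.environ['LLM_FUNCTION_API_KEY'] = 'sk-...'
os.environ['LLM_FUNCTION_TEMPERATURE'] = '0.2'

prefix = 'LLM_FUNCTION_'
params = {}
for k, v in os.environ.items():
    if k.startswith(prefix):
        name = k[len(prefix):]
        # MODEL becomes model_name; any other suffix is lower-cased as-is
        params['model_name' if name == 'MODEL' else name.lower()] = v

params.setdefault('provider', 'openai')
if 'api_key' in params:
    # mirrored from the helper: api_key is duplicated into a per-provider dict
    params['api_keys'] = {params['provider']: params['api_key']}

# params == {'model_name': 'gpt-4o', 'api_key': 'sk-...', 'temperature': '0.2',
#            'provider': 'openai', 'api_keys': {'openai': 'sk-...'}}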
mindsdb/interfaces/functions/to_markdown.py (new file)

@@ -0,0 +1,194 @@
+import base64
+from io import BytesIO
+import os
+from typing import Union
+from urllib.parse import urlparse
+
+import fitz  # PyMuPDF
+from markitdown import MarkItDown
+import mimetypes
+from openai import OpenAI
+import requests
+
+
+class ToMarkdown:
+    """
+    Extracts the content of documents of various formats in markdown format.
+    """
+    def __init__(self, use_llm: bool, llm_client: OpenAI = None, llm_model: str = None):
+        """
+        Initializes the ToMarkdown class.
+        """
+        # If use_llm is True, llm_client and llm_model must be provided.
+        if use_llm and (llm_client is None or llm_model is None):
+            raise ValueError('LLM client and model must be provided when use_llm is True.')
+
+        # If use_llm is False, set llm_client and llm_model to None even if they are provided.
+        if not use_llm:
+            llm_client = None
+            llm_model = None
+
+        # Only OpenAI is supported for now.
+        # TODO: Add support for other LLMs.
+        if llm_client is not None and not isinstance(llm_client, OpenAI):
+            raise ValueError('Only OpenAI models are supported at the moment.')
+
+        self.use_llm = use_llm
+        self.llm_client = llm_client
+        self.llm_model = llm_model
+
+    def call(self, file_path_or_url: str) -> str:
+        """
+        Converts a file to markdown.
+        """
+        file_extension = self._get_file_extension(file_path_or_url)
+        file = self._get_file_content(file_path_or_url)
+
+        if file_extension == '.pdf':
+            return self._pdf_to_markdown(file)
+        elif file_extension in ['.jpg', '.jpeg', '.png', '.gif']:
+            return self._image_to_markdown(file)
+        else:
+            return self._other_to_markdown(file)
+
+    def _get_file_content(self, file_path_or_url: str) -> str:
+        """
+        Retrieves the content of a file.
+        """
+        parsed_url = urlparse(file_path_or_url)
+        if parsed_url.scheme in ('http', 'https'):
+            response = requests.get(file_path_or_url)
+            if response.status_code == 200:
+                return response
+            else:
+                raise RuntimeError(f'Unable to retrieve file from URL: {file_path_or_url}')
+        else:
+            with open(file_path_or_url, 'rb') as file:
+                return BytesIO(file.read())
+
+    def _get_file_extension(self, file_path_or_url: str) -> str:
+        """
+        Retrieves the file extension from a file path or URL.
+        """
+        parsed_url = urlparse(file_path_or_url)
+        if parsed_url.scheme in ('http', 'https'):
+            try:
+                # Make a HEAD request to get headers without downloading the file.
+                response = requests.head(file_path_or_url, allow_redirects=True)
+                content_type = response.headers.get('Content-Type', '')
+                if content_type:
+                    ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
+                    if ext:
+                        return ext
+
+                # Fallback to extracting extension from the URL path
+                ext = os.path.splitext(parsed_url.path)[1]
+                if ext:
+                    return ext
+            except requests.RequestException:
+                raise RuntimeError(f'Unable to retrieve file extension from URL: {file_path_or_url}')
+        else:
+            return os.path.splitext(file_path_or_url)[1]
+
+    def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes]) -> str:
+        """
+        Converts a PDF file to markdown.
+        """
+        if self.llm_client is None:
+            return self._pdf_to_markdown_no_llm(file_content)
+        else:
+            return self._pdf_to_markdown_llm(file_content)
+
+    def _pdf_to_markdown_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts a PDF file to markdown using LLM.
+        The LLM is used mainly for the purpose of generating descriptions of any images in the PDF.
+        """
+        if isinstance(file_content, requests.Response):
+            file_content = BytesIO(file_content.content)
+
+        document = fitz.open(stream=file_content, filetype="pdf")
+
+        markdown_content = []
+        for page_num in range(len(document)):
+            page = document.load_page(page_num)
+
+            # Get text blocks with coordinates.
+            page_content = []
+            blocks = page.get_text("blocks")
+            for block in blocks:
+                x0, y0, x1, y1, text, _, _ = block
+                if text.strip():  # Skip empty or whitespace blocks.
+                    page_content.append((y0, text.strip()))
+
+            # Extract images from the page.
+            image_list = page.get_images(full=True)
+            for img_index, img in enumerate(image_list):
+                xref = img[0]
+                base_image = document.extract_image(xref)
+                image_bytes = base_image["image"]
+
+                # Use actual image y-coordinate if available.
+                y0 = float(base_image.get("y", 0))
+                image_description = self._generate_image_description(image_bytes)
+                page_content.append((y0, f"![{image_description}]()"))
+
+            # Sort the content by y0 coordinate
+            page_content.sort(key=lambda x: x[0])
+
+            # Add sorted content to the markdown
+            for _, text in page_content:
+                markdown_content.append(text)
+            markdown_content.append("\n")
+
+        document.close()
+
+        return "\n".join(markdown_content)
+
+    def _generate_image_description(self, image_bytes: bytes) -> str:
+        """
+        Generates a description of the image using LLM.
+        """
+        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+
+        response = self.llm_client.chat.completions.create(
+            model=self.llm_model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Describe this image"},
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                    ],
+                }
+            ],
+        )
+        description = response.choices[0].message.content
+        return description
+
+    def _pdf_to_markdown_no_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts a PDF file to markdown without using LLM.
+        """
+        md = MarkItDown(enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
+
+    def _image_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts images to markdown.
+        """
+        if not self.use_llm or self.llm_client is None:
+            raise ValueError('LLM client must be enabled to convert images to markdown.')
+
+        md = MarkItDown(llm_client=self.llm_client, llm_model=self.llm_model, enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
+
+    def _other_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts other file formats to markdown.
+        """
+        md = MarkItDown(enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
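A minimal usage sketch of the new class, assuming a local PDF and no LLM (the file name is hypothetical):

from mindsdb.interfaces.functions.to_markdown import ToMarkdown

# With use_llm=False, PDFs take the MarkItDown path and image inputs would
# raise, since image conversion requires an OpenAI client.
converter = ToMarkdown(use_llm=False)
markdown = converter.call('report.pdf')  # hypothetical local file
print(markdown[:200])

Inside MindsDB the class is reached through the to_markdown() function registered in controller.py above, whose callback receives (file_path_or_url, use_llm) per the declared input_types.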
mindsdb/interfaces/jobs/jobs_controller.py

@@ -337,10 +337,10 @@ class JobsController:
                 BinaryOperation(op='=', args=[Identifier('project'), Constant(project_name)])
             ])
         )
-
+        response = logs_db_controller.query(query)
 
-        names = [i['name'] for i in columns]
-        return
+        names = [i['name'] for i in response.columns]
+        return response.data_frame[names].to_dict(orient='records')
 
 
 class JobsExecutor:
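This reflects the new datanode response type (see mindsdb/api/executor/datahub/classes/response.py in the file list): query() now returns a single object instead of a (data, columns) tuple. A rough sketch of the shape the caller relies on; the class name is assumed here, and only the two attributes used above are shown:

from dataclasses import dataclass
import pandas as pd

@dataclass
class DataNodeResponse:  # assumed name, for illustration
    data_frame: pd.DataFrame
    columns: list  # e.g. [{'name': 'run_start'}, {'name': 'error'}]

def history_rows(response: DataNodeResponse) -> list:
    # Same pattern as the rewritten JobsController code above.
    names = [c['name'] for c in response.columns]
    return response.data_frame[names].to_dict(orient='records')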
mindsdb/interfaces/knowledge_base/controller.py

@@ -35,6 +35,7 @@ from mindsdb.interfaces.knowledge_base.preprocessing.models import Preprocessing
 from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import PreprocessorFactory
 from mindsdb.interfaces.model.functions import PredictorRecordNotFound
 from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
 
 from mindsdb.api.executor.command_executor import ExecuteCommands
 from mindsdb.utilities import log
@@ -85,88 +86,124 @@ class KnowledgeBaseTable:
         """
         logger.debug(f"Processing select query: {query}")
 
-        #
-
-
+        # Extract the content query text for potential reranking
+
+        db_handler = self.get_vector_db()
 
+        logger.debug("Replaced content with embeddings in where clause")
         # set table name
         query.from_table = Identifier(parts=[self._kb.vector_database_table])
         logger.debug(f"Set table name to: {self._kb.vector_database_table}")
 
-
-        targets = []
+        requested_kb_columns = []
         for target in query.targets:
             if isinstance(target, Star):
-
-
-
-
-
-
-
-
+                requested_kb_columns = None
+                break
+            else:
+                requested_kb_columns.append(target.parts[-1].lower())
+
+        query.targets = [
+            Identifier(TableField.ID.value),
+            Identifier(TableField.CONTENT.value),
+            Identifier(TableField.METADATA.value),
+            Identifier(TableField.DISTANCE.value),
+        ]
 
         # Get response from vector db
-        db_handler = self.get_vector_db()
         logger.debug(f"Using vector db handler: {type(db_handler)}")
 
-        conditions
+        # extract values from conditions and prepare for vectordb
+        conditions = []
+        query_text = None
+        reranking_threshold = None
+        query_conditions = db_handler.extract_conditions(query.where)
+        if query_conditions is not None:
+            for item in query_conditions:
+                if item.column == "reranking_threshold" and item.op.value == "=":
+                    try:
+                        reranking_threshold = float(item.value)
+                        # Validate range: must be between 0 and 1
+                        if not (0 <= reranking_threshold <= 1):
+                            raise ValueError(f"reranking_threshold must be between 0 and 1, got: {reranking_threshold}")
+                        logger.debug(f"Found reranking_threshold in query: {reranking_threshold}")
+                    except (ValueError, TypeError) as e:
+                        error_msg = f"Invalid reranking_threshold value: {item.value}. {str(e)}"
+                        logger.error(error_msg)
+                        raise ValueError(error_msg)
+                elif item.column == TableField.CONTENT.value:
+                    query_text = item.value
+
+                    # replace content with embeddings
+                    conditions.append(FilterCondition(
+                        column=TableField.EMBEDDINGS.value,
+                        value=self._content_to_embeddings(item.value),
+                        op=FilterOperator.EQUAL,
+                    ))
+                else:
+                    conditions.append(item)
+
+        logger.debug(f"Extracted query text: {query_text}")
+
         self.addapt_conditions_columns(conditions)
         df = db_handler.dispatch_select(query, conditions)
+        df = self.addapt_result_columns(df)
 
-
+        logger.debug(f"Query returned {len(df)} rows")
+        logger.debug(f"Columns in response: {df.columns.tolist()}")
+        # Check if we have a rerank_model configured in KB params
 
-
-
-
-
-
-
-
+        df = self.add_relevance(df, query_text, reranking_threshold)
+
+        # filter by targets
+        if requested_kb_columns is not None:
+            df = df[requested_kb_columns]
+        return df
+
+    def add_relevance(self, df, query_text, reranking_threshold=None):
+        relevance_column = TableField.RELEVANCE.value
 
         rerank_model = self._kb.params.get("rerank_model")
-        if rerank_model and
+        if rerank_model and query_text and len(df) > 0:
+            # Use reranker for relevance score
             try:
-                logger.info(f"Using reranker model
-
-                #
-
-
-
-
-
-
-
-                    is_binary_op = isinstance(node, BinaryOperation)
-                    is_identifier = isinstance(node.args[0], Identifier)
-                    is_content = node.args[0].parts[-1].lower() == 'content'
-                    is_constant = isinstance(node.args[1], Constant)
-                    if is_binary_op and is_identifier and is_content and is_constant:
-                        query_text = node.args[1].value
-                query_traversal(query.where, extract_content)
-                logger.debug(f"Extracted query text: {query_text}")
-                # Get scores from reranker
+                logger.info(f"Using reranker model {rerank_model} for relevance calculation")
+                reranker_params = {"model": rerank_model}
+                # Apply custom filtering threshold if provided
+                if reranking_threshold is not None:
+                    reranker_params["filtering_threshold"] = reranking_threshold
+                    logger.info(f"Using custom filtering threshold: {reranking_threshold}")
+
+                reranker = LLMReranker(**reranker_params)
+                # Get documents to rerank
+                documents = df['chunk_content'].tolist()
+                # Use the get_scores method with disable_events=True
                 scores = reranker.get_scores(query_text, documents)
-                # Add scores as
+                # Add scores as the relevance column
+                df[relevance_column] = scores
+
+                # Filter by threshold
                 scores_array = np.array(scores)
-                # Add temporary column for sorting
-                df['_relevance_score'] = scores
-                # Filter by score threshold using numpy array for element-wise comparison
                 df = df[scores_array > reranker.filtering_threshold]
-
-                df = df.sort_values(by='_relevance_score', ascending=False)
-                # Remove temporary column
-                # df = df.drop(columns=['_relevance_score'])
-                # Apply original limit if it exists
-                if query.limit and len(df) > query.limit.value:
-                    df = df.iloc[:query.limit.value]
-                logger.debug(f"Applied reranking with model {rerank_model}")
+                logger.debug(f"Applied reranking with model {rerank_model}, threshold: {reranker.filtering_threshold}")
             except Exception as e:
                 logger.error(f"Error during reranking: {str(e)}")
+                # Fallback to distance-based relevance
+                if 'distance' in df.columns:
+                    df[relevance_column] = 1 / (1 + df['distance'])
+                else:
+                    logger.info("No distance or reranker available")
 
-
+        elif 'distance' in df.columns:
+            # Calculate relevance from distance
+            logger.info("Calculating relevance from vector distance")
+            df[relevance_column] = 1 / (1 + df['distance'])
+
+        else:
+            df[relevance_column] = None
+            df['distance'] = None
+        # Sort by relevance
+        df = df.sort_values(by=relevance_column, ascending=False)
         return df
 
     def addapt_conditions_columns(self, conditions):
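The relevance logic above boils down to: reranker scores when a rerank_model is configured, otherwise 1 / (1 + distance). A self-contained sketch of the fallback and the threshold filter; the data is made up, the column names follow the diff:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'chunk_content': ['a', 'b', 'c'],
    'distance': [0.1, 0.5, 2.0],  # made-up vector distances
})

# Distance fallback: maps any distance >= 0 into (0, 1], monotonically decreasing.
df['relevance'] = 1 / (1 + df['distance'])

# Threshold filter, as applied to reranker scores above.
threshold = 0.4
df = df[np.array(df['relevance']) > threshold]
df = df.sort_values(by='relevance', ascending=False)
print(df)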
@@ -186,7 +223,9 @@ class KnowledgeBaseTable:
 
         columns = list(df.columns)
         # update id, get from metadata
-        df[TableField.ID.value] = df[TableField.METADATA.value].apply(
+        df[TableField.ID.value] = df[TableField.METADATA.value].apply(
+            lambda m: None if m is None else m.get('original_row_id')
+        )
 
         # id on first place
         return df[[TableField.ID.value] + columns]
@@ -276,7 +315,9 @@ class KnowledgeBaseTable:
 
         # send to vectordb
         db_handler = self.get_vector_db()
-        db_handler.
+        conditions = db_handler.extract_conditions(query.where)
+        self.addapt_conditions_columns(conditions)
+        db_handler.dispatch_update(query, conditions)
 
     def delete_query(self, query: Delete):
         """
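For orientation, extract_conditions (part of the vectordatabase_handler changes in the file list) turns the WHERE clause into FilterCondition objects that dispatch_select and dispatch_update consume. For a clause like WHERE id = '42' one would expect roughly the following; the values are illustrative:

from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator

# One FilterCondition per WHERE predicate, using the same keyword
# arguments as in the knowledge-base code above.
conditions = [
    FilterCondition(column='id', value='42', op=FilterOperator.EQUAL),
]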
mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py

@@ -92,9 +92,7 @@ class DocumentPreprocessor:
 
     def _generate_chunk_id(
         self,
-        content: str,
         chunk_index: Optional[int] = None,
-        content_column: str = None,
         provided_id: str = None,
     ) -> str:
         """Generate deterministic ID for a chunk"""
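The body of _generate_chunk_id is not part of this diff; the signature change means IDs now depend only on chunk_index and provided_id, not on the chunk text or the source content column. A deterministic scheme consistent with the new signature could look like this sketch, which is illustrative and not the shipped implementation:

import hashlib
from typing import Optional

def generate_chunk_id(chunk_index: Optional[int] = None, provided_id: str = None) -> str:
    # Hash only the stable inputs, so re-ingesting the same document
    # with the same provided id reproduces the same chunk ids.
    base = f'{provided_id}:{chunk_index}'
    return hashlib.sha256(base.encode('utf-8')).hexdigest()[:16]

assert generate_chunk_id(0, 'doc-1') == generate_chunk_id(0, 'doc-1')  # deterministic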
@@ -262,15 +260,8 @@ Please give a short succinct context to situate this chunk within the overall document
         if doc.metadata:
             metadata.update(doc.metadata)
 
-        # Pass through doc.id and content_column
-        content_column = (
-            doc.metadata.get("content_column") if doc.metadata else None
-        )
         chunk_id = self._generate_chunk_id(
-
-            chunk_index,
-            content_column=content_column,
-            provided_id=doc.id,
+            chunk_index=chunk_index, provided_id=doc.id
         )
         processed_chunks.append(
             ProcessedChunk(
@@ -335,7 +326,7 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
 
         # Pass through doc.id and content_column
         id = self._generate_chunk_id(
-
+            chunk_index=0, provided_id=doc.id
         )
         processed_chunks.append(
             ProcessedChunk(
@@ -358,9 +349,7 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
 
         # Pass through doc.id and content_column
         chunk_id = self._generate_chunk_id(
-
-            i,
-            content_column=content_column,
+            chunk_index=i,
             provided_id=doc.id,
         )
         processed_chunks.append(
mindsdb/interfaces/query_context/context_controller.py

@@ -156,10 +156,12 @@ class QueryContextController:
         last_values = {}
         for query, info in l_query.get_init_queries():
 
-
+            response = dn.query(
                 query=query,
                 session=session
             )
+            data = response.data_frame
+            columns_info = response.columns
 
             if len(data) == 0:
                 value = None