MindsDB 25.4.1.0__py3-none-any.whl → 25.4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (48)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/executor/command_executor.py +62 -61
  3. mindsdb/api/executor/data_types/answer.py +9 -12
  4. mindsdb/api/executor/datahub/classes/response.py +11 -0
  5. mindsdb/api/executor/datahub/datanodes/datanode.py +4 -4
  6. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +7 -9
  7. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +22 -16
  8. mindsdb/api/executor/datahub/datanodes/project_datanode.py +20 -20
  9. mindsdb/api/executor/planner/plan_join.py +1 -1
  10. mindsdb/api/executor/planner/steps.py +2 -1
  11. mindsdb/api/executor/sql_query/result_set.py +10 -7
  12. mindsdb/api/executor/sql_query/sql_query.py +36 -82
  13. mindsdb/api/executor/sql_query/steps/delete_step.py +2 -3
  14. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +5 -3
  15. mindsdb/api/executor/sql_query/steps/insert_step.py +2 -2
  16. mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -2
  17. mindsdb/api/executor/sql_query/steps/subselect_step.py +20 -8
  18. mindsdb/api/executor/sql_query/steps/update_step.py +4 -6
  19. mindsdb/api/http/namespaces/sql.py +4 -1
  20. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/ok_packet.py +1 -1
  21. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +4 -27
  22. mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +1 -0
  23. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +38 -37
  24. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -13
  25. mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +1 -1
  26. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -2
  27. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +4 -4
  28. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +19 -5
  29. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +9 -4
  30. mindsdb/integrations/handlers/redshift_handler/redshift_handler.py +1 -1
  31. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +18 -11
  32. mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -2
  33. mindsdb/integrations/libs/response.py +9 -4
  34. mindsdb/integrations/libs/vectordatabase_handler.py +17 -5
  35. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +8 -98
  36. mindsdb/interfaces/database/log.py +8 -9
  37. mindsdb/interfaces/database/projects.py +1 -5
  38. mindsdb/interfaces/functions/controller.py +59 -17
  39. mindsdb/interfaces/functions/to_markdown.py +194 -0
  40. mindsdb/interfaces/jobs/jobs_controller.py +3 -3
  41. mindsdb/interfaces/knowledge_base/controller.py +101 -60
  42. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +3 -14
  43. mindsdb/interfaces/query_context/context_controller.py +3 -1
  44. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/METADATA +231 -230
  45. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/RECORD +48 -46
  46. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/WHEEL +0 -0
  47. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/licenses/LICENSE +0 -0
  48. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
1
1
  import os
2
2
 
3
3
  from duckdb.typing import BIGINT, DOUBLE, VARCHAR, BLOB, BOOLEAN
4
+ from mindsdb.interfaces.functions.to_markdown import ToMarkdown
4
5
  from mindsdb.interfaces.storage.model_fs import HandlerStorage
5
6
 
6
7
 
@@ -121,32 +122,20 @@ class FunctionController(BYOMFunctionsController):
121
122
  if meta is not None:
122
123
  return meta
123
124
 
124
- # builtin function
125
+ # builtin functions
125
126
  if node.op.lower() == 'llm':
126
127
  return self.llm_call_function(node)
127
128
 
129
+ elif node.op.lower() == 'to_markdown':
130
+ return self.to_markdown_call_function(node)
131
+
128
132
  def llm_call_function(self, node):
129
133
  name = node.op.lower()
130
134
 
131
135
  if name in self.callbacks:
132
136
  return self.callbacks[name]
133
137
 
134
- param_prefix = 'LLM_FUNCTION_'
135
- chat_model_params = {}
136
- for k, v in os.environ.items():
137
- if k.startswith(param_prefix):
138
- param_name = k[len(param_prefix):]
139
- if param_name == 'MODEL':
140
- chat_model_params['model_name'] = v
141
- else:
142
- chat_model_params[param_name.lower()] = v
143
-
144
- if 'provider' not in chat_model_params:
145
- chat_model_params['provider'] = 'openai'
146
-
147
- if 'api_key' in chat_model_params:
148
- # move to api_keys dict
149
- chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
138
+ chat_model_params = self._parse_chat_model_params()
150
139
 
151
140
  try:
152
141
  from langchain_core.messages import HumanMessage
@@ -168,6 +157,59 @@ class FunctionController(BYOMFunctionsController):
168
157
  self.callbacks[name] = meta
169
158
  return meta
170
159
 
160
def to_markdown_call_function(self, node):
    """
    Returns (and caches) the metadata of the builtin `to_markdown` function.

    The metadata is stored in `self.callbacks` under the lower-cased
    operation name, so repeated calls return the same dict.

    Args:
        node: function node; only `node.op` is read.

    Returns:
        dict with the keys 'name', 'callback', 'input_types' and 'output_type'.
    """
    name = node.op.lower()

    if name in self.callbacks:
        return self.callbacks[name]

    def callback(file_path_or_url, use_llm):
        llm_client = None
        llm_model = None

        # ToMarkdown discards the client and the model when use_llm is falsy,
        # so the chat model is only built when it is going to be used.
        if use_llm:
            chat_model_params = self._parse_chat_model_params()
            try:
                from mindsdb.interfaces.agents.langchain_agent import create_chat_model
                llm = create_chat_model(chat_model_params)
                llm_client = llm.root_client
                llm_model = llm.model_name
            except Exception:
                # Still best-effort: ToMarkdown raises ValueError when the LLM is
                # requested but missing. Log the root cause instead of hiding it.
                import logging
                logging.getLogger(__name__).warning(
                    "Unable to create chat model for the 'to_markdown' function",
                    exc_info=True,
                )
                llm_client = None
                llm_model = None

        to_markdown = ToMarkdown(use_llm, llm_client, llm_model)
        return to_markdown.call(file_path_or_url)

    meta = {
        'name': name,
        'callback': callback,
        'input_types': ['str', 'bool'],
        'output_type': 'str'
    }
    self.callbacks[name] = meta
    return meta
190
+
191
+ def _parse_chat_model_params(self, param_prefix: str = 'LLM_FUNCTION_'):
192
+ """
193
+ Parses the environment variables for chat model parameters.
194
+ """
195
+ chat_model_params = {}
196
+ for k, v in os.environ.items():
197
+ if k.startswith(param_prefix):
198
+ param_name = k[len(param_prefix):]
199
+ if param_name == 'MODEL':
200
+ chat_model_params['model_name'] = v
201
+ else:
202
+ chat_model_params[param_name.lower()] = v
203
+
204
+ if 'provider' not in chat_model_params:
205
+ chat_model_params['provider'] = 'openai'
206
+
207
+ if 'api_key' in chat_model_params:
208
+ # move to api_keys dict
209
+ chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
210
+
211
+ return chat_model_params
212
+
171
213
 
172
214
  class DuckDBFunctions:
173
215
  def __init__(self, controller):
@@ -0,0 +1,194 @@
1
+ import base64
2
+ from io import BytesIO
3
+ import os
4
+ from typing import Union
5
+ from urllib.parse import urlparse
6
+
7
+ import fitz # PyMuPDF
8
+ from markitdown import MarkItDown
9
+ import mimetypes
10
+ from openai import OpenAI
11
+ import requests
12
+
13
+
14
class ToMarkdown:
    """
    Extracts the content of documents of various formats in markdown format.

    PDFs are converted either with MarkItDown (no LLM) or page by page with
    PyMuPDF plus LLM-generated image descriptions. Images require an LLM.
    Every other format is handed to MarkItDown.
    """

    # Seconds to wait for a remote server; `requests` has no default timeout.
    REQUEST_TIMEOUT = 30

    # Extensions routed to the image converter (compared lower-cased).
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif')

    def __init__(self, use_llm: bool, llm_client: OpenAI = None, llm_model: str = None):
        """
        Initializes the ToMarkdown class.

        Args:
            use_llm: whether an LLM may be used during conversion.
            llm_client: OpenAI client; required when `use_llm` is True.
            llm_model: model name; required when `use_llm` is True.

        Raises:
            ValueError: if `use_llm` is True and the client or model is missing,
                or if the client is not an `OpenAI` instance.
        """
        # If use_llm is True, llm_client and llm_model must be provided.
        if use_llm and (llm_client is None or llm_model is None):
            raise ValueError('LLM client and model must be provided when use_llm is True.')

        # If use_llm is False, set llm_client and llm_model to None even if they are provided.
        if not use_llm:
            llm_client = None
            llm_model = None

        # Only OpenAI is supported for now.
        # TODO: Add support for other LLMs.
        if llm_client is not None and not isinstance(llm_client, OpenAI):
            raise ValueError('Only OpenAI models are supported at the moment.')

        self.use_llm = use_llm
        self.llm_client = llm_client
        self.llm_model = llm_model

    def call(self, file_path_or_url: str) -> str:
        """
        Converts a file to markdown.

        Args:
            file_path_or_url: local path or http(s) URL of the document.

        Returns:
            The markdown representation of the document.
        """
        # Lower-case so that '.PDF' or '.JPG' are routed like '.pdf' / '.jpg'.
        file_extension = (self._get_file_extension(file_path_or_url) or '').lower()
        file = self._get_file_content(file_path_or_url)

        if file_extension == '.pdf':
            return self._pdf_to_markdown(file)
        elif file_extension in self.IMAGE_EXTENSIONS:
            return self._image_to_markdown(file)
        else:
            return self._other_to_markdown(file)

    def _get_file_content(self, file_path_or_url: str) -> Union[requests.Response, BytesIO]:
        """
        Retrieves the content of a file.

        Returns:
            The `requests.Response` for http(s) URLs, otherwise a `BytesIO`
            holding the bytes of the local file.

        Raises:
            RuntimeError: if the URL does not answer with status 200.
        """
        parsed_url = urlparse(file_path_or_url)
        if parsed_url.scheme in ('http', 'https'):
            response = requests.get(file_path_or_url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code == 200:
                return response
            raise RuntimeError(f'Unable to retrieve file from URL: {file_path_or_url}')

        with open(file_path_or_url, 'rb') as file:
            return BytesIO(file.read())

    def _get_file_extension(self, file_path_or_url: str) -> str:
        """
        Retrieves the file extension from a file path or URL.

        For URLs the Content-Type header of a HEAD request is preferred; the
        extension of the URL path is the fallback.

        Returns:
            The extension including the leading dot, or '' when none is found.

        Raises:
            RuntimeError: if the HEAD request fails.
        """
        parsed_url = urlparse(file_path_or_url)
        if parsed_url.scheme not in ('http', 'https'):
            return os.path.splitext(file_path_or_url)[1]

        try:
            # Make a HEAD request to get headers without downloading the file.
            response = requests.head(
                file_path_or_url, allow_redirects=True, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as e:
            raise RuntimeError(f'Unable to retrieve file extension from URL: {file_path_or_url}') from e

        content_type = response.headers.get('Content-Type', '')
        if content_type:
            ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
            if ext:
                return ext

        # Fallback to extracting extension from the URL path ('' if there is none).
        return os.path.splitext(parsed_url.path)[1]

    def _pdf_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
        """
        Converts a PDF file to markdown, using the LLM only if a client is set.
        """
        if self.llm_client is None:
            return self._pdf_to_markdown_no_llm(file_content)
        else:
            return self._pdf_to_markdown_llm(file_content)

    def _pdf_to_markdown_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
        """
        Converts a PDF file to markdown using LLM.
        The LLM is used mainly for the purpose of generating descriptions of any images in the PDF.
        """
        if isinstance(file_content, requests.Response):
            file_content = BytesIO(file_content.content)

        document = fitz.open(stream=file_content, filetype="pdf")

        markdown_content = []
        try:
            for page_num in range(len(document)):
                page = document.load_page(page_num)

                # Get text blocks with coordinates.
                page_content = []
                for block in page.get_text("blocks"):
                    y0, text = block[1], block[4]
                    if text.strip():  # Skip empty or whitespace blocks.
                        page_content.append((y0, text.strip()))

                # Extract images from the page.
                for img_index, img in enumerate(page.get_images(full=True)):
                    xref = img[0]
                    base_image = document.extract_image(xref)
                    image_bytes = base_image["image"]

                    y0 = self._get_image_top(page, xref)
                    image_description = self._generate_image_description(image_bytes)
                    page_content.append((y0, f"![{image_description}](image_{page_num + 1}_{img_index + 1}.png)"))

                # Sort the content by y0 coordinate
                page_content.sort(key=lambda x: x[0])

                # Add sorted content to the markdown
                for _, text in page_content:
                    markdown_content.append(text)
                markdown_content.append("\n")
        finally:
            # Release the document even if extraction or the LLM call fails.
            document.close()

        return "\n".join(markdown_content)

    @staticmethod
    def _get_image_top(page, xref) -> float:
        """
        Returns the top y-coordinate of the image `xref` on `page`, or 0.0 if unknown.

        The position is looked up on the page because the image dict is not
        expected to carry a "y" entry (NOTE(review): confirm against the
        PyMuPDF `extract_image` documentation).
        """
        try:
            rects = page.get_image_rects(xref)
        except (AttributeError, ValueError):
            # Older PyMuPDF without get_image_rects, or image not locatable on the page.
            return 0.0
        return float(rects[0].y0) if rects else 0.0

    def _generate_image_description(self, image_bytes: bytes) -> str:
        """
        Generates a description of the image using LLM.
        """
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        response = self.llm_client.chat.completions.create(
            model=self.llm_model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Describe this image"},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                    ],
                }
            ],
        )
        description = response.choices[0].message.content
        return description

    def _pdf_to_markdown_no_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
        """
        Converts a PDF file to markdown without using LLM.
        """
        md = MarkItDown(enable_plugins=True)
        result = md.convert(file_content)
        return result.markdown

    def _image_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
        """
        Converts images to markdown.

        Raises:
            ValueError: if the LLM is disabled or no client is available.
        """
        if not self.use_llm or self.llm_client is None:
            raise ValueError('LLM client must be enabled to convert images to markdown.')

        md = MarkItDown(llm_client=self.llm_client, llm_model=self.llm_model, enable_plugins=True)
        result = md.convert(file_content)
        return result.markdown

    def _other_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
        """
        Converts other file formats to markdown.
        """
        md = MarkItDown(enable_plugins=True)
        result = md.convert(file_content)
        return result.markdown
@@ -337,10 +337,10 @@ class JobsController:
337
337
  BinaryOperation(op='=', args=[Identifier('project'), Constant(project_name)])
338
338
  ])
339
339
  )
340
- data, columns = logs_db_controller.query(query)
340
+ response = logs_db_controller.query(query)
341
341
 
342
- names = [i['name'] for i in columns]
343
- return data[names].to_dict(orient='records')
342
+ names = [i['name'] for i in response.columns]
343
+ return response.data_frame[names].to_dict(orient='records')
344
344
 
345
345
 
346
346
  class JobsExecutor:
@@ -35,6 +35,7 @@ from mindsdb.interfaces.knowledge_base.preprocessing.models import Preprocessing
35
35
  from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import PreprocessorFactory
36
36
  from mindsdb.interfaces.model.functions import PredictorRecordNotFound
37
37
  from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
38
+ from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
38
39
 
39
40
  from mindsdb.api.executor.command_executor import ExecuteCommands
40
41
  from mindsdb.utilities import log
@@ -85,88 +86,124 @@ class KnowledgeBaseTable:
85
86
  """
86
87
  logger.debug(f"Processing select query: {query}")
87
88
 
88
- # replace content with embeddings
89
- query_traversal(query.where, self._replace_query_content)
90
- logger.debug("Replaced content with embeddings in where clause")
89
+ # Extract the content query text for potential reranking
90
+
91
+ db_handler = self.get_vector_db()
91
92
 
93
+ logger.debug("Replaced content with embeddings in where clause")
92
94
  # set table name
93
95
  query.from_table = Identifier(parts=[self._kb.vector_database_table])
94
96
  logger.debug(f"Set table name to: {self._kb.vector_database_table}")
95
97
 
96
- # remove embeddings from result
97
- targets = []
98
+ requested_kb_columns = []
98
99
  for target in query.targets:
99
100
  if isinstance(target, Star):
100
- targets.extend([
101
- Identifier(TableField.ID.value),
102
- Identifier(TableField.CONTENT.value),
103
- Identifier(TableField.METADATA.value),
104
- ])
105
- elif isinstance(target, Identifier) and target.parts[-1].lower() != TableField.EMBEDDINGS.value:
106
- targets.append(target)
107
- query.targets = targets
108
- logger.debug(f"Modified query targets: {targets}")
101
+ requested_kb_columns = None
102
+ break
103
+ else:
104
+ requested_kb_columns.append(target.parts[-1].lower())
105
+
106
+ query.targets = [
107
+ Identifier(TableField.ID.value),
108
+ Identifier(TableField.CONTENT.value),
109
+ Identifier(TableField.METADATA.value),
110
+ Identifier(TableField.DISTANCE.value),
111
+ ]
109
112
 
110
113
  # Get response from vector db
111
- db_handler = self.get_vector_db()
112
114
  logger.debug(f"Using vector db handler: {type(db_handler)}")
113
115
 
114
- conditions = db_handler.extract_conditions(query.where)
116
+ # extract values from conditions and prepare for vectordb
117
+ conditions = []
118
+ query_text = None
119
+ reranking_threshold = None
120
+ query_conditions = db_handler.extract_conditions(query.where)
121
+ if query_conditions is not None:
122
+ for item in query_conditions:
123
+ if item.column == "reranking_threshold" and item.op.value == "=":
124
+ try:
125
+ reranking_threshold = float(item.value)
126
+ # Validate range: must be between 0 and 1
127
+ if not (0 <= reranking_threshold <= 1):
128
+ raise ValueError(f"reranking_threshold must be between 0 and 1, got: {reranking_threshold}")
129
+ logger.debug(f"Found reranking_threshold in query: {reranking_threshold}")
130
+ except (ValueError, TypeError) as e:
131
+ error_msg = f"Invalid reranking_threshold value: {item.value}. {str(e)}"
132
+ logger.error(error_msg)
133
+ raise ValueError(error_msg)
134
+ elif item.column == TableField.CONTENT.value:
135
+ query_text = item.value
136
+
137
+ # replace content with embeddings
138
+ conditions.append(FilterCondition(
139
+ column=TableField.EMBEDDINGS.value,
140
+ value=self._content_to_embeddings(item.value),
141
+ op=FilterOperator.EQUAL,
142
+ ))
143
+ else:
144
+ conditions.append(item)
145
+
146
+ logger.debug(f"Extracted query text: {query_text}")
147
+
115
148
  self.addapt_conditions_columns(conditions)
116
149
  df = db_handler.dispatch_select(query, conditions)
150
+ df = self.addapt_result_columns(df)
117
151
 
118
- if df is not None:
152
+ logger.debug(f"Query returned {len(df)} rows")
153
+ logger.debug(f"Columns in response: {df.columns.tolist()}")
154
+ # Check if we have a rerank_model configured in KB params
119
155
 
120
- logger.debug(f"Query returned {len(df)} rows")
121
- logger.debug(f"Columns in response: {df.columns.tolist()}")
122
- # Log a sample of IDs to help diagnose issues
123
- if not df.empty:
124
- logger.debug(f"Sample of IDs in response: {df['id'].head().tolist()}")
125
- else:
126
- logger.warning("Query returned no data")
156
+ df = self.add_relevance(df, query_text, reranking_threshold)
157
+
158
+ # filter by targets
159
+ if requested_kb_columns is not None:
160
+ df = df[requested_kb_columns]
161
+ return df
162
+
163
+ def add_relevance(self, df, query_text, reranking_threshold=None):
164
+ relevance_column = TableField.RELEVANCE.value
127
165
 
128
166
  rerank_model = self._kb.params.get("rerank_model")
129
- if rerank_model and df is not None and not df.empty:
167
+ if rerank_model and query_text and len(df) > 0:
168
+ # Use reranker for relevance score
130
169
  try:
131
- logger.info(f"Using reranker model: {rerank_model}")
132
- reranker = LLMReranker(model=rerank_model)
133
- # convert response from a dataframe to a list of strings
134
- content_column = df[TableField.CONTENT.value]
135
- # convert to list
136
- documents = content_column.tolist()
137
- # Extract query text from WHERE clause if it exists
138
- query_text = ""
139
- if query.where:
140
- def extract_content(node, **kwargs):
141
- nonlocal query_text
142
- is_binary_op = isinstance(node, BinaryOperation)
143
- is_identifier = isinstance(node.args[0], Identifier)
144
- is_content = node.args[0].parts[-1].lower() == 'content'
145
- is_constant = isinstance(node.args[1], Constant)
146
- if is_binary_op and is_identifier and is_content and is_constant:
147
- query_text = node.args[1].value
148
- query_traversal(query.where, extract_content)
149
- logger.debug(f"Extracted query text: {query_text}")
150
- # Get scores from reranker
170
+ logger.info(f"Using reranker model {rerank_model} for relevance calculation")
171
+ reranker_params = {"model": rerank_model}
172
+ # Apply custom filtering threshold if provided
173
+ if reranking_threshold is not None:
174
+ reranker_params["filtering_threshold"] = reranking_threshold
175
+ logger.info(f"Using custom filtering threshold: {reranking_threshold}")
176
+
177
+ reranker = LLMReranker(**reranker_params)
178
+ # Get documents to rerank
179
+ documents = df['chunk_content'].tolist()
180
+ # Use the get_scores method with disable_events=True
151
181
  scores = reranker.get_scores(query_text, documents)
152
- # Add scores as a new column for filtering
182
+ # Add scores as the relevance column
183
+ df[relevance_column] = scores
184
+
185
+ # Filter by threshold
153
186
  scores_array = np.array(scores)
154
- # Add temporary column for sorting
155
- df['_relevance_score'] = scores
156
- # Filter by score threshold using numpy array for element-wise comparison
157
187
  df = df[scores_array > reranker.filtering_threshold]
158
- # Sort by relevance (higher score = more relevant)
159
- df = df.sort_values(by='_relevance_score', ascending=False)
160
- # Remove temporary column
161
- # df = df.drop(columns=['_relevance_score'])
162
- # Apply original limit if it exists
163
- if query.limit and len(df) > query.limit.value:
164
- df = df.iloc[:query.limit.value]
165
- logger.debug(f"Applied reranking with model {rerank_model}")
188
+ logger.debug(f"Applied reranking with model {rerank_model}, threshold: {reranker.filtering_threshold}")
166
189
  except Exception as e:
167
190
  logger.error(f"Error during reranking: {str(e)}")
191
+ # Fallback to distance-based relevance
192
+ if 'distance' in df.columns:
193
+ df[relevance_column] = 1 / (1 + df['distance'])
194
+ else:
195
+ logger.info("No distance or reranker available")
168
196
 
169
- df = self.addapt_result_columns(df)
197
+ elif 'distance' in df.columns:
198
+ # Calculate relevance from distance
199
+ logger.info("Calculating relevance from vector distance")
200
+ df[relevance_column] = 1 / (1 + df['distance'])
201
+
202
+ else:
203
+ df[relevance_column] = None
204
+ df['distance'] = None
205
+ # Sort by relevance
206
+ df = df.sort_values(by=relevance_column, ascending=False)
170
207
  return df
171
208
 
172
209
  def addapt_conditions_columns(self, conditions):
@@ -186,7 +223,9 @@ class KnowledgeBaseTable:
186
223
 
187
224
  columns = list(df.columns)
188
225
  # update id, get from metadata
189
- df[TableField.ID.value] = df[TableField.METADATA.value].apply(lambda m: m.get('original_row_id'))
226
+ df[TableField.ID.value] = df[TableField.METADATA.value].apply(
227
+ lambda m: None if m is None else m.get('original_row_id')
228
+ )
190
229
 
191
230
  # id on first place
192
231
  return df[[TableField.ID.value] + columns]
@@ -276,7 +315,9 @@ class KnowledgeBaseTable:
276
315
 
277
316
  # send to vectordb
278
317
  db_handler = self.get_vector_db()
279
- db_handler.query(query)
318
+ conditions = db_handler.extract_conditions(query.where)
319
+ self.addapt_conditions_columns(conditions)
320
+ db_handler.dispatch_update(query, conditions)
280
321
 
281
322
  def delete_query(self, query: Delete):
282
323
  """
@@ -92,9 +92,7 @@ class DocumentPreprocessor:
92
92
 
93
93
  def _generate_chunk_id(
94
94
  self,
95
- content: str,
96
95
  chunk_index: Optional[int] = None,
97
- content_column: str = None,
98
96
  provided_id: str = None,
99
97
  ) -> str:
100
98
  """Generate deterministic ID for a chunk"""
@@ -262,15 +260,8 @@ Please give a short succinct context to situate this chunk within the overall do
262
260
  if doc.metadata:
263
261
  metadata.update(doc.metadata)
264
262
 
265
- # Pass through doc.id and content_column
266
- content_column = (
267
- doc.metadata.get("content_column") if doc.metadata else None
268
- )
269
263
  chunk_id = self._generate_chunk_id(
270
- processed_content,
271
- chunk_index,
272
- content_column=content_column,
273
- provided_id=doc.id,
264
+ chunk_index=chunk_index, provided_id=doc.id
274
265
  )
275
266
  processed_chunks.append(
276
267
  ProcessedChunk(
@@ -335,7 +326,7 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
335
326
 
336
327
  # Pass through doc.id and content_column
337
328
  id = self._generate_chunk_id(
338
- chunk_doc.content, content_column=content_column, provided_id=doc.id
329
+ chunk_index=0, provided_id=doc.id
339
330
  )
340
331
  processed_chunks.append(
341
332
  ProcessedChunk(
@@ -358,9 +349,7 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
358
349
 
359
350
  # Pass through doc.id and content_column
360
351
  chunk_id = self._generate_chunk_id(
361
- chunk_doc.content,
362
- i,
363
- content_column=content_column,
352
+ chunk_index=i,
364
353
  provided_id=doc.id,
365
354
  )
366
355
  processed_chunks.append(
@@ -156,10 +156,12 @@ class QueryContextController:
156
156
  last_values = {}
157
157
  for query, info in l_query.get_init_queries():
158
158
 
159
- data, columns_info = dn.query(
159
+ response = dn.query(
160
160
  query=query,
161
161
  session=session
162
162
  )
163
+ data = response.data_frame
164
+ columns_info = response.columns
163
165
 
164
166
  if len(data) == 0:
165
167
  value = None