MindsDB 25.4.1.0__py3-none-any.whl → 25.4.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (63)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/executor/command_executor.py +91 -61
  3. mindsdb/api/executor/data_types/answer.py +9 -12
  4. mindsdb/api/executor/datahub/classes/response.py +11 -0
  5. mindsdb/api/executor/datahub/datanodes/datanode.py +4 -4
  6. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +10 -11
  7. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +22 -16
  8. mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +43 -1
  9. mindsdb/api/executor/datahub/datanodes/project_datanode.py +20 -20
  10. mindsdb/api/executor/planner/plan_join.py +2 -2
  11. mindsdb/api/executor/planner/query_plan.py +1 -0
  12. mindsdb/api/executor/planner/query_planner.py +86 -14
  13. mindsdb/api/executor/planner/steps.py +11 -2
  14. mindsdb/api/executor/sql_query/result_set.py +10 -7
  15. mindsdb/api/executor/sql_query/sql_query.py +69 -84
  16. mindsdb/api/executor/sql_query/steps/__init__.py +1 -0
  17. mindsdb/api/executor/sql_query/steps/delete_step.py +2 -3
  18. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +5 -3
  19. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +288 -0
  20. mindsdb/api/executor/sql_query/steps/insert_step.py +2 -2
  21. mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -2
  22. mindsdb/api/executor/sql_query/steps/subselect_step.py +20 -8
  23. mindsdb/api/executor/sql_query/steps/update_step.py +4 -6
  24. mindsdb/api/http/namespaces/sql.py +4 -1
  25. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/ok_packet.py +1 -1
  26. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +4 -27
  27. mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +1 -0
  28. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +38 -37
  29. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -13
  30. mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +17 -16
  31. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -0
  32. mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +1 -1
  33. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -2
  34. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +4 -4
  35. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +26 -16
  36. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +36 -7
  37. mindsdb/integrations/handlers/redshift_handler/redshift_handler.py +1 -1
  38. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +18 -11
  39. mindsdb/integrations/libs/llm/config.py +11 -1
  40. mindsdb/integrations/libs/llm/utils.py +12 -0
  41. mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -2
  42. mindsdb/integrations/libs/response.py +9 -4
  43. mindsdb/integrations/libs/vectordatabase_handler.py +17 -5
  44. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +8 -98
  45. mindsdb/interfaces/agents/constants.py +12 -1
  46. mindsdb/interfaces/agents/langchain_agent.py +6 -0
  47. mindsdb/interfaces/database/log.py +8 -9
  48. mindsdb/interfaces/database/projects.py +1 -5
  49. mindsdb/interfaces/functions/controller.py +59 -17
  50. mindsdb/interfaces/functions/to_markdown.py +194 -0
  51. mindsdb/interfaces/jobs/jobs_controller.py +3 -3
  52. mindsdb/interfaces/knowledge_base/controller.py +223 -97
  53. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +3 -14
  54. mindsdb/interfaces/query_context/context_controller.py +224 -1
  55. mindsdb/interfaces/storage/db.py +23 -0
  56. mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py +45 -0
  57. mindsdb/utilities/context_executor.py +1 -1
  58. mindsdb/utilities/partitioning.py +35 -20
  59. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/METADATA +227 -224
  60. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/RECORD +63 -59
  61. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/WHEEL +0 -0
  62. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/licenses/LICENSE +0 -0
  63. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/top_level.txt +0 -0
--- a/mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py
+++ b/mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py
@@ -54,7 +54,7 @@ class LLMReranker(BaseDocumentCompressor):
             max_retries=2  # Client-level retries
         )
 
-    async def search_relevancy(self, query: str, document: str) -> Any:
+    async def search_relevancy(self, query: str, document: str, custom_event: bool = True) -> Any:
         await self._init_client()
 
         async with self._semaphore:
@@ -82,7 +82,8 @@ class LLMReranker(BaseDocumentCompressor):
                         }
 
                         # Stream reranking update.
-                        dispatch_custom_event("rerank", rerank_data)
+                        if custom_event:
+                            dispatch_custom_event("rerank", rerank_data)
                         return rerank_data
 
                 except Exception as e:
@@ -93,7 +94,7 @@ class LLMReranker(BaseDocumentCompressor):
                     retry_delay = self.retry_delay * (2 ** attempt) + random.uniform(0, 0.1)
                     await asyncio.sleep(retry_delay)
 
-    async def _rank(self, query_document_pairs: List[Tuple[str, str]]) -> List[Tuple[str, float]]:
+    async def _rank(self, query_document_pairs: List[Tuple[str, str]], custom_event: bool = True) -> List[Tuple[str, float]]:
         ranked_results = []
 
         # Process in larger batches for better throughput
@@ -102,7 +103,7 @@ class LLMReranker(BaseDocumentCompressor):
             batch = query_document_pairs[i:i + batch_size]
             try:
                 results = await asyncio.gather(
-                    *[self.search_relevancy(query=query, document=document) for (query, document) in batch],
+                    *[self.search_relevancy(query=query, document=document, custom_event=custom_event) for (query, document) in batch],
                     return_exceptions=True
                 )
 
@@ -227,16 +228,7 @@ class LLMReranker(BaseDocumentCompressor):
             "remove_irrelevant": self.remove_irrelevant,
         }
 
-    def get_scores(self, query: str, documents: list[str], disable_events: bool = True):
-        """
-        Get relevance scores for documents given a query.
-        Args:
-            query: The query text
-            documents: List of document texts to score
-            disable_events: Whether to disable event dispatching (default True)
-        Returns:
-            List of relevance scores
-        """
+    def get_scores(self, query: str, documents: list[str], custom_event: bool = False):
         query_document_pairs = [(query, doc) for doc in documents]
         # Create event loop and run async code
         import asyncio
@@ -246,89 +238,7 @@ class LLMReranker(BaseDocumentCompressor):
             # If no running loop exists, create a new one
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
-        # If disable_events is True, we need to modify the _rank function to not use dispatch_custom_event
-        if disable_events:
-            # Create a wrapper function that doesn't dispatch events
-            async def _rank_without_events(query_document_pairs):
-                ranked_results = []
-                # Process in larger batches for better throughput
-                batch_size = min(self.max_concurrent_requests * 2, len(query_document_pairs))
-                for i in range(0, len(query_document_pairs), batch_size):
-                    batch = query_document_pairs[i:i + batch_size]
-                    try:
-                        # Define a no-events version of search_relevancy inside this closure
-                        async def search_relevancy_no_events(query, document):
-                            await self._init_client()
-                            async with self._semaphore:
-                                for attempt in range(self.max_retries):
-                                    try:
-                                        response = await self.client.chat.completions.create(
-                                            model=self.model,
-                                            messages=[
-                                                {"role": "system", "content": "Rate the relevance of the document to the query. Respond with 'yes' or 'no'."},
-                                                {"role": "user", "content": f"Query: {query}\nDocument: {document}\nIs this document relevant?"}
-                                            ],
-                                            temperature=self.temperature,
-                                            n=1,
-                                            logprobs=True,
-                                            max_tokens=1
-                                        )
-                                        # Extract response and confidence score
-                                        answer = response.choices[0].message.content
-                                        logprob = response.choices[0].logprobs.content[0].logprob
-                                        # No event dispatch here
-                                        return {"document": document, "answer": answer, "logprob": logprob}
-                                    except Exception as e:
-                                        if attempt == self.max_retries - 1:
-                                            log.error(f"Failed after {self.max_retries} attempts: {str(e)}")
-                                            raise
-                                        # Exponential backoff with jitter
-                                        retry_delay = self.retry_delay * (2 ** attempt) + random.uniform(0, 0.1)
-                                        await asyncio.sleep(retry_delay)
-                        # Use our no-events version for this batch
-                        results = await asyncio.gather(
-                            *[search_relevancy_no_events(query=query, document=document) for (query, document) in batch],
-                            return_exceptions=True
-                        )
-                        for idx, result in enumerate(results):
-                            if isinstance(result, Exception):
-                                log.error(f"Error processing document {i+idx}: {str(result)}")
-                                ranked_results.append((batch[idx][1], 0.0))
-                                continue
-                            answer = result["answer"]
-                            logprob = result["logprob"]
-                            prob = math.exp(logprob)
-                            # Convert answer to score using the model's confidence
-                            if answer.lower().strip() == "yes":
-                                score = prob  # If yes, use the model's confidence
-                            elif answer.lower().strip() == "no":
-                                score = 1 - prob  # If no, invert the confidence
-                            else:
-                                score = 0.5 * prob  # For unclear answers, reduce confidence
-                            ranked_results.append((batch[idx][1], score))
-                            # Check if we should stop early
-                            try:
-                                high_scoring_docs = [r for r in ranked_results if r[1] >= self.filtering_threshold]
-                                can_stop_early = (
-                                    self.early_stop  # Early stopping is enabled
-                                    and self.num_docs_to_keep  # We have a target number of docs
-                                    and len(high_scoring_docs) >= self.num_docs_to_keep  # Found enough good docs
-                                    and score >= self.early_stop_threshold  # Current doc is good enough
-                                )
-                                if can_stop_early:
-                                    log.info(f"Early stopping after finding {self.num_docs_to_keep} documents with high confidence")
-                                    return ranked_results
-                            except Exception as e:
-                                # Don't let early stopping errors stop the whole process
-                                log.warning(f"Error in early stopping check: {str(e)}")
-                    except Exception as e:
-                        log.error(f"Batch processing error: {str(e)}")
-                        continue
-                return ranked_results
-            # Use our no-events version
-            documents_and_scores = loop.run_until_complete(_rank_without_events(query_document_pairs))
-        else:
-            # Use the original _rank method
-            documents_and_scores = loop.run_until_complete(self._rank(query_document_pairs))
+
+        documents_and_scores = loop.run_until_complete(self._rank(query_document_pairs, custom_event=custom_event))
         scores = [score for _, score in documents_and_scores]
         return scores
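
With this change, event dispatching is opt-in: `search_relevancy` and `_rank` take a `custom_event` flag, and `get_scores` defaults it to `False`, which replaces the duplicated `_rank_without_events` closure. A minimal usage sketch (the constructor arguments are illustrative assumptions, not shown in this diff):

```python
from mindsdb.integrations.utilities.rag.rerankers.reranker_compressor import LLMReranker

# Illustrative constructor arguments; the class may accept or require others.
reranker = LLMReranker(model="gpt-4o", temperature=0.0)

# custom_event defaults to False here, so no "rerank" events are dispatched
# and get_scores can run outside a LangChain streaming context.
scores = reranker.get_scores(
    query="What does MindsDB do?",
    documents=["MindsDB is an AI data platform.", "Bananas are yellow."],
)

# Inside a streaming context, pass custom_event=True to keep the
# per-document "rerank" callback events.
scores = reranker.get_scores("query", ["doc"], custom_event=True)
```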
--- a/mindsdb/interfaces/agents/constants.py
+++ b/mindsdb/interfaces/agents/constants.py
@@ -15,7 +15,8 @@ SUPPORTED_PROVIDERS = {
     "litellm",
     "ollama",
     "nvidia_nim",
-    "vllm"
+    "vllm",
+    "google"
 }
 # Chat models
 ANTHROPIC_CHAT_MODELS = (
@@ -153,6 +154,15 @@ NVIDIA_NIM_CHAT_MODELS = (
     "ibm/granite-34b-code-instruct",
 )
 
+GOOGLE_GEMINI_CHAT_MODELS = (
+    "gemini-2.5-pro-preview-03-25",
+    "gemini-2.0-flash",
+    "gemini-2.0-flash-lite",
+    "gemini-1.5-flash",
+    "gemini-1.5-flash-8b",
+    "gemini-1.5-pro",
+)
+
 # Define a read-only dictionary mapping providers to their models
 PROVIDER_TO_MODELS = MappingProxyType(
     {
@@ -160,6 +170,7 @@ PROVIDER_TO_MODELS = MappingProxyType(
         "ollama": OLLAMA_CHAT_MODELS,
         "openai": OPEN_AI_CHAT_MODELS,
         "nvidia_nim": NVIDIA_NIM_CHAT_MODELS,
+        "google": GOOGLE_GEMINI_CHAT_MODELS,
     }
 )
 
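A sketch of how the new constants resolve a Gemini model name back to its provider; the `provider_for` helper is hypothetical, written here only to illustrate the read-only mapping:

```python
from types import MappingProxyType

GOOGLE_GEMINI_CHAT_MODELS = ("gemini-2.0-flash", "gemini-1.5-pro")
PROVIDER_TO_MODELS = MappingProxyType({"google": GOOGLE_GEMINI_CHAT_MODELS})

def provider_for(model_name):
    # Hypothetical reverse lookup over the read-only provider mapping.
    for provider, models in PROVIDER_TO_MODELS.items():
        if model_name in models:
            return provider
    return None

assert provider_for("gemini-2.0-flash") == "google"
# MappingProxyType rejects mutation: assigning to PROVIDER_TO_MODELS raises TypeError.
```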
--- a/mindsdb/interfaces/agents/langchain_agent.py
+++ b/mindsdb/interfaces/agents/langchain_agent.py
@@ -15,6 +15,7 @@ from langchain_community.chat_models import (
     ChatAnyscale,
     ChatLiteLLM,
     ChatOllama)
+from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_core.agents import AgentAction, AgentStep
 from langchain_core.callbacks.base import BaseCallbackHandler
 
@@ -50,6 +51,7 @@ from .constants import (
     DEFAULT_TIKTOKEN_MODEL_NAME,
     SUPPORTED_PROVIDERS,
     ANTHROPIC_CHAT_MODELS,
+    GOOGLE_GEMINI_CHAT_MODELS,
     OLLAMA_CHAT_MODELS,
     NVIDIA_NIM_CHAT_MODELS,
     USER_COLUMN,
@@ -85,6 +87,8 @@ def get_llm_provider(args: Dict) -> str:
         return "ollama"
     if args["model_name"] in NVIDIA_NIM_CHAT_MODELS:
         return "nvidia_nim"
+    if args["model_name"] in GOOGLE_GEMINI_CHAT_MODELS:
+        return "google"
 
     # For vLLM, require explicit provider specification
     raise ValueError("Invalid model name. Please define a supported llm provider")
@@ -162,6 +166,8 @@ def create_chat_model(args: Dict):
         return ChatOllama(**model_kwargs)
     if args["provider"] == "nvidia_nim":
         return ChatNVIDIA(**model_kwargs)
+    if args["provider"] == "google":
+        return ChatGoogleGenerativeAI(**model_kwargs)
     if args["provider"] == "mindsdb":
         return ChatMindsdb(**model_kwargs)
     raise ValueError(f'Unknown provider: {args["provider"]}')
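
With both hooks in place, a Gemini model name infers the `google` provider and `create_chat_model` returns a `ChatGoogleGenerativeAI` instance. A usage sketch (assumes `langchain-google-genai` is installed and `GOOGLE_API_KEY` is set; the exact args-to-kwargs mapping inside `create_chat_model` follows the other provider branches and is not shown in this diff):

```python
args = {"model_name": "gemini-2.0-flash"}
args["provider"] = get_llm_provider(args)  # -> "google"

# Sketch: create_chat_model turns args into model kwargs like the other branches do.
llm = create_chat_model({"provider": "google", "model_name": "gemini-2.0-flash"})
print(llm.invoke("Reply with one word: hello").content)
```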
--- a/mindsdb/interfaces/database/log.py
+++ b/mindsdb/interfaces/database/log.py
@@ -1,21 +1,21 @@
+from typing import List
 from copy import deepcopy
 from abc import ABC, abstractmethod
-from typing import List, Union, Tuple
 from collections import OrderedDict
 
 import pandas as pd
-
 from mindsdb_sql_parser import parse_sql
 from mindsdb_sql_parser.ast import Select, Identifier, Star, BinaryOperation, Constant, Join, Function
 from mindsdb_sql_parser.utils import JoinType
+
 from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
 from mindsdb.integrations.utilities.query_traversal import query_traversal
-
 from mindsdb.utilities.functions import resolve_table_identifier
 from mindsdb.api.executor.utilities.sql import get_query_tables
 from mindsdb.utilities.exception import EntityNotExistsError
 import mindsdb.interfaces.storage.db as db
 from mindsdb.utilities.context import context as ctx
+from mindsdb.api.executor.datahub.classes.response import DataHubResponse
 from mindsdb.api.executor.datahub.classes.tables_row import (
     TABLES_ROW_TYPE,
     TablesRow,
@@ -223,8 +223,7 @@ class LogDBController:
             for table_name in self._tables.keys()
         ]
 
-    def query(self, query: Select = None, native_query: str = None,
-              session=None, return_as: str = 'split') -> Union[pd.DataFrame, Tuple[pd.DataFrame, list]]:
+    def query(self, query: Select = None, native_query: str = None, session=None) -> DataHubResponse:
         if native_query is not None:
             if query is not None:
                 raise Exception("'query' and 'native_query' arguments can not be used together")
@@ -286,12 +285,12 @@ class LogDBController:
             df[df_column_name] = df[df_column_name].astype(column_type)
         # endregion
 
-        if return_as != 'split':
-            return df
-
         columns_info = [{
             'name': k,
             'type': v
         } for k, v in df.dtypes.items()]
 
-        return df, columns_info
+        return DataHubResponse(
+            data_frame=df,
+            columns=columns_info
+        )
--- a/mindsdb/interfaces/database/projects.py
+++ b/mindsdb/interfaces/database/projects.py
@@ -137,14 +137,10 @@ class Project:
                 view_meta['query_ast'],
                 session=session
             )
-            result = sqlquery.fetch(view='dataframe')
-
+            df = sqlquery.fetched_data.to_df()
         finally:
             query_context_controller.release_context('view', view_meta['id'])
 
-        if result['success'] is False:
-            raise Exception(f"Cant execute view query: {view_meta['query_ast']}")
-        df = result['result']
         # remove duplicated columns
         df = df.loc[:, ~df.columns.duplicated()]
 
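The view path now reads the executor's result set directly instead of unwrapping a `{'success': ..., 'result': ...}` dict; `SQLQuery` is expected to raise on failure. The new call path, sketched:

```python
# Sketch of the new view materialization path; assumes SQLQuery raises
# on error rather than returning a success flag.
sqlquery = SQLQuery(view_meta['query_ast'], session=session)
df = sqlquery.fetched_data.to_df()          # ResultSet -> pandas DataFrame
df = df.loc[:, ~df.columns.duplicated()]    # drop duplicated column names
```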
--- a/mindsdb/interfaces/functions/controller.py
+++ b/mindsdb/interfaces/functions/controller.py
@@ -1,6 +1,7 @@
 import os
 
 from duckdb.typing import BIGINT, DOUBLE, VARCHAR, BLOB, BOOLEAN
+from mindsdb.interfaces.functions.to_markdown import ToMarkdown
 from mindsdb.interfaces.storage.model_fs import HandlerStorage
 
 
@@ -121,32 +122,20 @@ class FunctionController(BYOMFunctionsController):
         if meta is not None:
             return meta
 
-        # builtin function
+        # builtin functions
         if node.op.lower() == 'llm':
             return self.llm_call_function(node)
 
+        elif node.op.lower() == 'to_markdown':
+            return self.to_markdown_call_function(node)
+
     def llm_call_function(self, node):
         name = node.op.lower()
 
         if name in self.callbacks:
             return self.callbacks[name]
 
-        param_prefix = 'LLM_FUNCTION_'
-        chat_model_params = {}
-        for k, v in os.environ.items():
-            if k.startswith(param_prefix):
-                param_name = k[len(param_prefix):]
-                if param_name == 'MODEL':
-                    chat_model_params['model_name'] = v
-                else:
-                    chat_model_params[param_name.lower()] = v
-
-        if 'provider' not in chat_model_params:
-            chat_model_params['provider'] = 'openai'
-
-        if 'api_key' in chat_model_params:
-            # move to api_keys dict
-            chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
+        chat_model_params = self._parse_chat_model_params()
 
         try:
             from langchain_core.messages import HumanMessage
@@ -168,6 +157,59 @@ class FunctionController(BYOMFunctionsController):
         self.callbacks[name] = meta
         return meta
 
+    def to_markdown_call_function(self, node):
+        name = node.op.lower()
+
+        if name in self.callbacks:
+            return self.callbacks[name]
+
+        def callback(file_path_or_url, use_llm):
+            chat_model_params = self._parse_chat_model_params()
+
+            llm_client = None
+            llm_model = None
+            try:
+                from mindsdb.interfaces.agents.langchain_agent import create_chat_model
+                llm = create_chat_model(chat_model_params)
+                llm_client = llm.root_client
+                llm_model = llm.model_name
+            except Exception:
+                pass
+
+            to_markdown = ToMarkdown(use_llm, llm_client, llm_model)
+            return to_markdown.call(file_path_or_url)
+
+        meta = {
+            'name': name,
+            'callback': callback,
+            'input_types': ['str', 'bool'],
+            'output_type': 'str'
+        }
+        self.callbacks[name] = meta
+        return meta
+
+    def _parse_chat_model_params(self, param_prefix: str = 'LLM_FUNCTION_'):
+        """
+        Parses the environment variables for chat model parameters.
+        """
+        chat_model_params = {}
+        for k, v in os.environ.items():
+            if k.startswith(param_prefix):
+                param_name = k[len(param_prefix):]
+                if param_name == 'MODEL':
+                    chat_model_params['model_name'] = v
+                else:
+                    chat_model_params[param_name.lower()] = v
+
+        if 'provider' not in chat_model_params:
+            chat_model_params['provider'] = 'openai'
+
+        if 'api_key' in chat_model_params:
+            # move to api_keys dict
+            chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
+
+        return chat_model_params
+
 
 class DuckDBFunctions:
     def __init__(self, controller):
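
`_parse_chat_model_params` preserves the old env-var convention, now shared by both builtin functions: every `LLM_FUNCTION_*` environment variable becomes a chat-model parameter. Illustrative values:

```python
import os

# Illustrative environment; any LLM_FUNCTION_* variable is picked up.
os.environ["LLM_FUNCTION_MODEL"] = "gpt-4o"
os.environ["LLM_FUNCTION_API_KEY"] = "sk-example"

# controller: a FunctionController instance
params = controller._parse_chat_model_params()
# -> {'model_name': 'gpt-4o', 'provider': 'openai',
#     'api_key': 'sk-example', 'api_keys': {'openai': 'sk-example'}}
```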
--- /dev/null
+++ b/mindsdb/interfaces/functions/to_markdown.py
@@ -0,0 +1,194 @@
+import base64
+from io import BytesIO
+import os
+from typing import Union
+from urllib.parse import urlparse
+
+import fitz  # PyMuPDF
+from markitdown import MarkItDown
+import mimetypes
+from openai import OpenAI
+import requests
+
+
+class ToMarkdown:
+    """
+    Extracts the content of documents of various formats in markdown format.
+    """
+    def __init__(self, use_llm: bool, llm_client: OpenAI = None, llm_model: str = None):
+        """
+        Initializes the ToMarkdown class.
+        """
+        # If use_llm is True, llm_client and llm_model must be provided.
+        if use_llm and (llm_client is None or llm_model is None):
+            raise ValueError('LLM client and model must be provided when use_llm is True.')
+
+        # If use_llm is False, set llm_client and llm_model to None even if they are provided.
+        if not use_llm:
+            llm_client = None
+            llm_model = None
+
+        # Only OpenAI is supported for now.
+        # TODO: Add support for other LLMs.
+        if llm_client is not None and not isinstance(llm_client, OpenAI):
+            raise ValueError('Only OpenAI models are supported at the moment.')
+
+        self.use_llm = use_llm
+        self.llm_client = llm_client
+        self.llm_model = llm_model
+
+    def call(self, file_path_or_url: str) -> str:
+        """
+        Converts a file to markdown.
+        """
+        file_extension = self._get_file_extension(file_path_or_url)
+        file = self._get_file_content(file_path_or_url)
+
+        if file_extension == '.pdf':
+            return self._pdf_to_markdown(file)
+        elif file_extension in ['.jpg', '.jpeg', '.png', '.gif']:
+            return self._image_to_markdown(file)
+        else:
+            return self._other_to_markdown(file)
+
+    def _get_file_content(self, file_path_or_url: str) -> str:
+        """
+        Retrieves the content of a file.
+        """
+        parsed_url = urlparse(file_path_or_url)
+        if parsed_url.scheme in ('http', 'https'):
+            response = requests.get(file_path_or_url)
+            if response.status_code == 200:
+                return response
+            else:
+                raise RuntimeError(f'Unable to retrieve file from URL: {file_path_or_url}')
+        else:
+            with open(file_path_or_url, 'rb') as file:
+                return BytesIO(file.read())
+
+    def _get_file_extension(self, file_path_or_url: str) -> str:
+        """
+        Retrieves the file extension from a file path or URL.
+        """
+        parsed_url = urlparse(file_path_or_url)
+        if parsed_url.scheme in ('http', 'https'):
+            try:
+                # Make a HEAD request to get headers without downloading the file.
+                response = requests.head(file_path_or_url, allow_redirects=True)
+                content_type = response.headers.get('Content-Type', '')
+                if content_type:
+                    ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
+                    if ext:
+                        return ext
+
+                # Fallback to extracting extension from the URL path
+                ext = os.path.splitext(parsed_url.path)[1]
+                if ext:
+                    return ext
+            except requests.RequestException:
+                raise RuntimeError(f'Unable to retrieve file extension from URL: {file_path_or_url}')
+        else:
+            return os.path.splitext(file_path_or_url)[1]
+
+    def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes]) -> str:
+        """
+        Converts a PDF file to markdown.
+        """
+        if self.llm_client is None:
+            return self._pdf_to_markdown_no_llm(file_content)
+        else:
+            return self._pdf_to_markdown_llm(file_content)
+
+    def _pdf_to_markdown_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts a PDF file to markdown using LLM.
+        The LLM is used mainly for the purpose of generating descriptions of any images in the PDF.
+        """
+        if isinstance(file_content, requests.Response):
+            file_content = BytesIO(file_content.content)
+
+        document = fitz.open(stream=file_content, filetype="pdf")
+
+        markdown_content = []
+        for page_num in range(len(document)):
+            page = document.load_page(page_num)
+
+            # Get text blocks with coordinates.
+            page_content = []
+            blocks = page.get_text("blocks")
+            for block in blocks:
+                x0, y0, x1, y1, text, _, _ = block
+                if text.strip():  # Skip empty or whitespace blocks.
+                    page_content.append((y0, text.strip()))
+
+            # Extract images from the page.
+            image_list = page.get_images(full=True)
+            for img_index, img in enumerate(image_list):
+                xref = img[0]
+                base_image = document.extract_image(xref)
+                image_bytes = base_image["image"]
+
+                # Use actual image y-coordinate if available.
+                y0 = float(base_image.get("y", 0))
+                image_description = self._generate_image_description(image_bytes)
+                page_content.append((y0, f"![{image_description}](image_{page_num + 1}_{img_index + 1}.png)"))
+
+            # Sort the content by y0 coordinate
+            page_content.sort(key=lambda x: x[0])
+
+            # Add sorted content to the markdown
+            for _, text in page_content:
+                markdown_content.append(text)
+            markdown_content.append("\n")
+
+        document.close()
+
+        return "\n".join(markdown_content)
+
+    def _generate_image_description(self, image_bytes: bytes) -> str:
+        """
+        Generates a description of the image using LLM.
+        """
+        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+
+        response = self.llm_client.chat.completions.create(
+            model=self.llm_model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Describe this image"},
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                    ],
+                }
+            ],
+        )
+        description = response.choices[0].message.content
+        return description
+
+    def _pdf_to_markdown_no_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts a PDF file to markdown without using LLM.
+        """
+        md = MarkItDown(enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
+
+    def _image_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts images to markdown.
+        """
+        if not self.use_llm or self.llm_client is None:
+            raise ValueError('LLM client must be enabled to convert images to markdown.')
+
+        md = MarkItDown(llm_client=self.llm_client, llm_model=self.llm_model, enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
+
+    def _other_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts other file formats to markdown.
+        """
+        md = MarkItDown(enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
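
A usage sketch for the new class (the paths and model name are illustrative; inside MindsDB it is surfaced as the `to_markdown()` function registered in `controller.py` above):

```python
from openai import OpenAI

from mindsdb.interfaces.functions.to_markdown import ToMarkdown

# Without an LLM: plain MarkItDown/PyMuPDF text extraction.
markdown = ToMarkdown(use_llm=False).call("report.pdf")

# With an LLM: images embedded in PDFs get LLM-generated descriptions as alt text.
client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
converter = ToMarkdown(use_llm=True, llm_client=client, llm_model="gpt-4o")
markdown = converter.call("https://example.com/report.pdf")
```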
--- a/mindsdb/interfaces/jobs/jobs_controller.py
+++ b/mindsdb/interfaces/jobs/jobs_controller.py
@@ -337,10 +337,10 @@ class JobsController:
                 BinaryOperation(op='=', args=[Identifier('project'), Constant(project_name)])
             ])
         )
-        data, columns = logs_db_controller.query(query)
+        response = logs_db_controller.query(query)
 
-        names = [i['name'] for i in columns]
-        return data[names].to_dict(orient='records')
+        names = [i['name'] for i in response.columns]
+        return response.data_frame[names].to_dict(orient='records')
 
 
 class JobsExecutor:
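
The same migration applies to any other caller of `LogDBController.query`: unpack the `DataHubResponse` fields instead of a `(DataFrame, columns)` tuple. Before and after, sketched:

```python
# Before (25.4.1.0): tuple unpacking
# data, columns = logs_db_controller.query(query)

# After (25.4.2.1): a single response object
response = logs_db_controller.query(query)
names = [col['name'] for col in response.columns]
records = response.data_frame[names].to_dict(orient='records')
```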