MindsDB 25.4.1.0 (py3-none-any.whl) → 25.4.2.1 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB has been flagged by the registry for review.
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/command_executor.py +91 -61
- mindsdb/api/executor/data_types/answer.py +9 -12
- mindsdb/api/executor/datahub/classes/response.py +11 -0
- mindsdb/api/executor/datahub/datanodes/datanode.py +4 -4
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +10 -11
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +22 -16
- mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +43 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +20 -20
- mindsdb/api/executor/planner/plan_join.py +2 -2
- mindsdb/api/executor/planner/query_plan.py +1 -0
- mindsdb/api/executor/planner/query_planner.py +86 -14
- mindsdb/api/executor/planner/steps.py +11 -2
- mindsdb/api/executor/sql_query/result_set.py +10 -7
- mindsdb/api/executor/sql_query/sql_query.py +69 -84
- mindsdb/api/executor/sql_query/steps/__init__.py +1 -0
- mindsdb/api/executor/sql_query/steps/delete_step.py +2 -3
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +5 -3
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +288 -0
- mindsdb/api/executor/sql_query/steps/insert_step.py +2 -2
- mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +20 -8
- mindsdb/api/executor/sql_query/steps/update_step.py +4 -6
- mindsdb/api/http/namespaces/sql.py +4 -1
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/ok_packet.py +1 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +4 -27
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +1 -0
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +38 -37
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -13
- mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +17 -16
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -0
- mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +1 -1
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -2
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +4 -4
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +26 -16
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +36 -7
- mindsdb/integrations/handlers/redshift_handler/redshift_handler.py +1 -1
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +18 -11
- mindsdb/integrations/libs/llm/config.py +11 -1
- mindsdb/integrations/libs/llm/utils.py +12 -0
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -2
- mindsdb/integrations/libs/response.py +9 -4
- mindsdb/integrations/libs/vectordatabase_handler.py +17 -5
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +8 -98
- mindsdb/interfaces/agents/constants.py +12 -1
- mindsdb/interfaces/agents/langchain_agent.py +6 -0
- mindsdb/interfaces/database/log.py +8 -9
- mindsdb/interfaces/database/projects.py +1 -5
- mindsdb/interfaces/functions/controller.py +59 -17
- mindsdb/interfaces/functions/to_markdown.py +194 -0
- mindsdb/interfaces/jobs/jobs_controller.py +3 -3
- mindsdb/interfaces/knowledge_base/controller.py +223 -97
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +3 -14
- mindsdb/interfaces/query_context/context_controller.py +224 -1
- mindsdb/interfaces/storage/db.py +23 -0
- mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py +45 -0
- mindsdb/utilities/context_executor.py +1 -1
- mindsdb/utilities/partitioning.py +35 -20
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/METADATA +227 -224
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/RECORD +63 -59
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/WHEEL +0 -0
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/top_level.txt +0 -0
mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py

@@ -54,7 +54,7 @@ class LLMReranker(BaseDocumentCompressor):
                 max_retries=2  # Client-level retries
             )
 
-    async def search_relevancy(self, query: str, document: str) -> Any:
+    async def search_relevancy(self, query: str, document: str, custom_event: bool = True) -> Any:
         await self._init_client()
 
         async with self._semaphore:
@@ -82,7 +82,8 @@ class LLMReranker(BaseDocumentCompressor):
                         }
 
                         # Stream reranking update.
-
+                        if custom_event:
+                            dispatch_custom_event("rerank", rerank_data)
                         return rerank_data
 
                     except Exception as e:
@@ -93,7 +94,7 @@ class LLMReranker(BaseDocumentCompressor):
                         retry_delay = self.retry_delay * (2 ** attempt) + random.uniform(0, 0.1)
                         await asyncio.sleep(retry_delay)
 
-    async def _rank(self, query_document_pairs: List[Tuple[str, str]]) -> List[Tuple[str, float]]:
+    async def _rank(self, query_document_pairs: List[Tuple[str, str]], custom_event: bool = True) -> List[Tuple[str, float]]:
         ranked_results = []
 
         # Process in larger batches for better throughput
@@ -102,7 +103,7 @@ class LLMReranker(BaseDocumentCompressor):
             batch = query_document_pairs[i:i + batch_size]
             try:
                 results = await asyncio.gather(
-                    *[self.search_relevancy(query=query, document=document) for (query, document) in batch],
+                    *[self.search_relevancy(query=query, document=document, custom_event=custom_event) for (query, document) in batch],
                     return_exceptions=True
                 )
 
@@ -227,16 +228,7 @@ class LLMReranker(BaseDocumentCompressor):
             "remove_irrelevant": self.remove_irrelevant,
         }
 
-    def get_scores(self, query: str, documents: list[str],
-        """
-        Get relevance scores for documents given a query.
-        Args:
-            query: The query text
-            documents: List of document texts to score
-            disable_events: Whether to disable event dispatching (default True)
-        Returns:
-            List of relevance scores
-        """
+    def get_scores(self, query: str, documents: list[str], custom_event: bool = False):
         query_document_pairs = [(query, doc) for doc in documents]
         # Create event loop and run async code
         import asyncio
@@ -246,89 +238,7 @@ class LLMReranker(BaseDocumentCompressor):
             # If no running loop exists, create a new one
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
-
-
-            # Create a wrapper function that doesn't dispatch events
-            async def _rank_without_events(query_document_pairs):
-                ranked_results = []
-                # Process in larger batches for better throughput
-                batch_size = min(self.max_concurrent_requests * 2, len(query_document_pairs))
-                for i in range(0, len(query_document_pairs), batch_size):
-                    batch = query_document_pairs[i:i + batch_size]
-                    try:
-                        # Define a no-events version of search_relevancy inside this closure
-                        async def search_relevancy_no_events(query, document):
-                            await self._init_client()
-                            async with self._semaphore:
-                                for attempt in range(self.max_retries):
-                                    try:
-                                        response = await self.client.chat.completions.create(
-                                            model=self.model,
-                                            messages=[
-                                                {"role": "system", "content": "Rate the relevance of the document to the query. Respond with 'yes' or 'no'."},
-                                                {"role": "user", "content": f"Query: {query}\nDocument: {document}\nIs this document relevant?"}
-                                            ],
-                                            temperature=self.temperature,
-                                            n=1,
-                                            logprobs=True,
-                                            max_tokens=1
-                                        )
-                                        # Extract response and confidence score
-                                        answer = response.choices[0].message.content
-                                        logprob = response.choices[0].logprobs.content[0].logprob
-                                        # No event dispatch here
-                                        return {"document": document, "answer": answer, "logprob": logprob}
-                                    except Exception as e:
-                                        if attempt == self.max_retries - 1:
-                                            log.error(f"Failed after {self.max_retries} attempts: {str(e)}")
-                                            raise
-                                        # Exponential backoff with jitter
-                                        retry_delay = self.retry_delay * (2 ** attempt) + random.uniform(0, 0.1)
-                                        await asyncio.sleep(retry_delay)
-                        # Use our no-events version for this batch
-                        results = await asyncio.gather(
-                            *[search_relevancy_no_events(query=query, document=document) for (query, document) in batch],
-                            return_exceptions=True
-                        )
-                        for idx, result in enumerate(results):
-                            if isinstance(result, Exception):
-                                log.error(f"Error processing document {i+idx}: {str(result)}")
-                                ranked_results.append((batch[idx][1], 0.0))
-                                continue
-                            answer = result["answer"]
-                            logprob = result["logprob"]
-                            prob = math.exp(logprob)
-                            # Convert answer to score using the model's confidence
-                            if answer.lower().strip() == "yes":
-                                score = prob  # If yes, use the model's confidence
-                            elif answer.lower().strip() == "no":
-                                score = 1 - prob  # If no, invert the confidence
-                            else:
-                                score = 0.5 * prob  # For unclear answers, reduce confidence
-                            ranked_results.append((batch[idx][1], score))
-                            # Check if we should stop early
-                            try:
-                                high_scoring_docs = [r for r in ranked_results if r[1] >= self.filtering_threshold]
-                                can_stop_early = (
-                                    self.early_stop  # Early stopping is enabled
-                                    and self.num_docs_to_keep  # We have a target number of docs
-                                    and len(high_scoring_docs) >= self.num_docs_to_keep  # Found enough good docs
-                                    and score >= self.early_stop_threshold  # Current doc is good enough
-                                )
-                                if can_stop_early:
-                                    log.info(f"Early stopping after finding {self.num_docs_to_keep} documents with high confidence")
-                                    return ranked_results
-                            except Exception as e:
-                                # Don't let early stopping errors stop the whole process
-                                log.warning(f"Error in early stopping check: {str(e)}")
-                    except Exception as e:
-                        log.error(f"Batch processing error: {str(e)}")
-                        continue
-                return ranked_results
-            # Use our no-events version
-            documents_and_scores = loop.run_until_complete(_rank_without_events(query_document_pairs))
-        else:
-            # Use the original _rank method
-            documents_and_scores = loop.run_until_complete(self._rank(query_document_pairs))
+
+        documents_and_scores = loop.run_until_complete(self._rank(query_document_pairs, custom_event=custom_event))
         scores = [score for _, score in documents_and_scores]
         return scores
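Note: instead of a duplicated "no events" copy of the ranking loop, the reranker now threads a single custom_event flag from get_scores() through _rank() to search_relevancy(). For orientation, here is a minimal standalone sketch of that pattern; it is not MindsDB code, and every name in it (score_one, rank, on_event) is illustrative.

```python
import asyncio
import random
from typing import Callable, List, Optional, Tuple


async def score_one(query: str, document: str,
                    on_event: Optional[Callable[[dict], None]] = None) -> float:
    # Stand-in for an LLM relevance call: returns a pseudo-random score.
    await asyncio.sleep(0.01)
    score = random.random()
    if on_event is not None:
        # Dispatch only when the caller asked for progress events.
        on_event({"document": document, "score": score})
    return score


async def rank(pairs: List[Tuple[str, str]], custom_event: bool = True,
               batch_size: int = 4) -> List[Tuple[str, float]]:
    on_event = (lambda data: print("rerank event:", data)) if custom_event else None
    results: List[Tuple[str, float]] = []
    for i in range(0, len(pairs), batch_size):
        batch = pairs[i:i + batch_size]
        scores = await asyncio.gather(
            *[score_one(q, d, on_event) for q, d in batch],
            return_exceptions=True,
        )
        for (_, doc), score in zip(batch, scores):
            results.append((doc, 0.0 if isinstance(score, Exception) else score))
    return results


if __name__ == "__main__":
    docs = [f"doc-{i}" for i in range(6)]
    # custom_event=False mirrors the new get_scores() default: score without events.
    print(asyncio.run(rank([("query", d) for d in docs], custom_event=False)))
```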
mindsdb/interfaces/agents/constants.py

@@ -15,7 +15,8 @@ SUPPORTED_PROVIDERS = {
     "litellm",
     "ollama",
     "nvidia_nim",
-    "vllm"
+    "vllm",
+    "google"
 }
 # Chat models
 ANTHROPIC_CHAT_MODELS = (
@@ -153,6 +154,15 @@ NVIDIA_NIM_CHAT_MODELS = (
     "ibm/granite-34b-code-instruct",
 )
 
+GOOGLE_GEMINI_CHAT_MODELS = (
+    "gemini-2.5-pro-preview-03-25",
+    "gemini-2.0-flash",
+    "gemini-2.0-flash-lite",
+    "gemini-1.5-flash",
+    "gemini-1.5-flash-8b",
+    "gemini-1.5-pro",
+)
+
 # Define a read-only dictionary mapping providers to their models
 PROVIDER_TO_MODELS = MappingProxyType(
     {
@@ -160,6 +170,7 @@ PROVIDER_TO_MODELS = MappingProxyType(
         "ollama": OLLAMA_CHAT_MODELS,
         "openai": OPEN_AI_CHAT_MODELS,
         "nvidia_nim": NVIDIA_NIM_CHAT_MODELS,
+        "google": GOOGLE_GEMINI_CHAT_MODELS,
     }
 )
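The new "google" entry slots into the existing read-only provider-to-models mapping, which is what lets a model name be resolved to a provider. A small self-contained sketch of that lookup pattern (model lists trimmed and hypothetical; the real ones live in mindsdb/interfaces/agents/constants.py):

```python
from types import MappingProxyType

# Trimmed, illustrative model lists.
GOOGLE_GEMINI_CHAT_MODELS = ("gemini-2.0-flash", "gemini-1.5-pro")
OPEN_AI_CHAT_MODELS = ("gpt-4o", "gpt-4o-mini")

# Read-only mapping, mirroring the MappingProxyType pattern in the diff.
PROVIDER_TO_MODELS = MappingProxyType({
    "openai": OPEN_AI_CHAT_MODELS,
    "google": GOOGLE_GEMINI_CHAT_MODELS,
})


def guess_provider(model_name: str) -> str:
    # Return the first provider whose model list contains the name.
    for provider, models in PROVIDER_TO_MODELS.items():
        if model_name in models:
            return provider
    raise ValueError("Invalid model name. Please define a supported llm provider")


print(guess_provider("gemini-2.0-flash"))  # -> "google"
```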
mindsdb/interfaces/agents/langchain_agent.py

@@ -15,6 +15,7 @@ from langchain_community.chat_models import (
     ChatAnyscale,
     ChatLiteLLM,
     ChatOllama)
+from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_core.agents import AgentAction, AgentStep
 from langchain_core.callbacks.base import BaseCallbackHandler
 
@@ -50,6 +51,7 @@ from .constants import (
     DEFAULT_TIKTOKEN_MODEL_NAME,
     SUPPORTED_PROVIDERS,
     ANTHROPIC_CHAT_MODELS,
+    GOOGLE_GEMINI_CHAT_MODELS,
     OLLAMA_CHAT_MODELS,
     NVIDIA_NIM_CHAT_MODELS,
     USER_COLUMN,
@@ -85,6 +87,8 @@ def get_llm_provider(args: Dict) -> str:
         return "ollama"
     if args["model_name"] in NVIDIA_NIM_CHAT_MODELS:
         return "nvidia_nim"
+    if args["model_name"] in GOOGLE_GEMINI_CHAT_MODELS:
+        return "google"
 
     # For vLLM, require explicit provider specification
     raise ValueError("Invalid model name. Please define a supported llm provider")
@@ -162,6 +166,8 @@ def create_chat_model(args: Dict):
         return ChatOllama(**model_kwargs)
     if args["provider"] == "nvidia_nim":
         return ChatNVIDIA(**model_kwargs)
+    if args["provider"] == "google":
+        return ChatGoogleGenerativeAI(**model_kwargs)
     if args["provider"] == "mindsdb":
         return ChatMindsdb(**model_kwargs)
     raise ValueError(f'Unknown provider: {args["provider"]}')
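The "google" branch delegates to langchain-google-genai's ChatGoogleGenerativeAI. A minimal direct-usage sketch, assuming that package is installed and GOOGLE_API_KEY is set in the environment (model name and prompt are illustrative, and the exact kwargs MindsDB forwards via model_kwargs may differ):

```python
from langchain_google_genai import ChatGoogleGenerativeAI

# Instantiate a Gemini chat model through the LangChain wrapper.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0)

# invoke() returns an AIMessage; .content holds the text reply.
print(llm.invoke("Summarize what a reranker does in one sentence.").content)
```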
mindsdb/interfaces/database/log.py

@@ -1,21 +1,21 @@
+from typing import List
 from copy import deepcopy
 from abc import ABC, abstractmethod
-from typing import List, Union, Tuple
 from collections import OrderedDict
 
 import pandas as pd
-
 from mindsdb_sql_parser import parse_sql
 from mindsdb_sql_parser.ast import Select, Identifier, Star, BinaryOperation, Constant, Join, Function
 from mindsdb_sql_parser.utils import JoinType
+
 from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
 from mindsdb.integrations.utilities.query_traversal import query_traversal
-
 from mindsdb.utilities.functions import resolve_table_identifier
 from mindsdb.api.executor.utilities.sql import get_query_tables
 from mindsdb.utilities.exception import EntityNotExistsError
 import mindsdb.interfaces.storage.db as db
 from mindsdb.utilities.context import context as ctx
+from mindsdb.api.executor.datahub.classes.response import DataHubResponse
 from mindsdb.api.executor.datahub.classes.tables_row import (
     TABLES_ROW_TYPE,
     TablesRow,
@@ -223,8 +223,7 @@ class LogDBController:
             for table_name in self._tables.keys()
         ]
 
-    def query(self, query: Select = None, native_query: str = None,
-              session=None, return_as: str = 'split') -> Union[pd.DataFrame, Tuple[pd.DataFrame, list]]:
+    def query(self, query: Select = None, native_query: str = None, session=None) -> DataHubResponse:
         if native_query is not None:
             if query is not None:
                 raise Exception("'query' and 'native_query' arguments can not be used together")
@@ -286,12 +285,12 @@ class LogDBController:
             df[df_column_name] = df[df_column_name].astype(column_type)
         # endregion
 
-        if return_as != 'split':
-            return df
-
         columns_info = [{
             'name': k,
             'type': v
         } for k, v in df.dtypes.items()]
 
-        return
+        return DataHubResponse(
+            data_frame=df,
+            columns=columns_info
+        )
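query() now always returns a DataHubResponse instead of switching on a return_as flag. A rough sketch of that shape, under the assumption that the real class (mindsdb/api/executor/datahub/classes/response.py) bundles a DataFrame with per-column metadata derived from dtypes; the class and function names below are illustrative:

```python
from dataclasses import dataclass, field
from typing import List, Optional

import pandas as pd


@dataclass
class DataHubResponseSketch:
    # Approximation of the new response container; the real class may differ.
    data_frame: Optional[pd.DataFrame] = None
    columns: List[dict] = field(default_factory=list)


def query_sketch(df: pd.DataFrame) -> DataHubResponseSketch:
    # Derive column metadata from the frame's dtypes, as the updated
    # LogDBController.query() does before returning.
    columns_info = [{'name': k, 'type': v} for k, v in df.dtypes.items()]
    return DataHubResponseSketch(data_frame=df, columns=columns_info)


resp = query_sketch(pd.DataFrame({'id': [1, 2], 'log': ['a', 'b']}))
print([c['name'] for c in resp.columns])  # ['id', 'log']
```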
mindsdb/interfaces/database/projects.py

@@ -137,14 +137,10 @@ class Project:
                 view_meta['query_ast'],
                 session=session
             )
-
-
+            df = sqlquery.fetched_data.to_df()
         finally:
             query_context_controller.release_context('view', view_meta['id'])
 
-        if result['success'] is False:
-            raise Exception(f"Cant execute view query: {view_meta['query_ast']}")
-        df = result['result']
         # remove duplicated columns
         df = df.loc[:, ~df.columns.duplicated()]
 
mindsdb/interfaces/functions/controller.py

@@ -1,6 +1,7 @@
 import os
 
 from duckdb.typing import BIGINT, DOUBLE, VARCHAR, BLOB, BOOLEAN
+from mindsdb.interfaces.functions.to_markdown import ToMarkdown
 from mindsdb.interfaces.storage.model_fs import HandlerStorage
 
 
@@ -121,32 +122,20 @@ class FunctionController(BYOMFunctionsController):
         if meta is not None:
             return meta
 
-        # builtin
+        # builtin functions
         if node.op.lower() == 'llm':
             return self.llm_call_function(node)
 
+        elif node.op.lower() == 'to_markdown':
+            return self.to_markdown_call_function(node)
+
     def llm_call_function(self, node):
         name = node.op.lower()
 
         if name in self.callbacks:
             return self.callbacks[name]
 
-
-        chat_model_params = {}
-        for k, v in os.environ.items():
-            if k.startswith(param_prefix):
-                param_name = k[len(param_prefix):]
-                if param_name == 'MODEL':
-                    chat_model_params['model_name'] = v
-                else:
-                    chat_model_params[param_name.lower()] = v
-
-        if 'provider' not in chat_model_params:
-            chat_model_params['provider'] = 'openai'
-
-        if 'api_key' in chat_model_params:
-            # move to api_keys dict
-            chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
+        chat_model_params = self._parse_chat_model_params()
 
         try:
             from langchain_core.messages import HumanMessage
@@ -168,6 +157,59 @@ class FunctionController(BYOMFunctionsController):
         self.callbacks[name] = meta
         return meta
 
+    def to_markdown_call_function(self, node):
+        name = node.op.lower()
+
+        if name in self.callbacks:
+            return self.callbacks[name]
+
+        def callback(file_path_or_url, use_llm):
+            chat_model_params = self._parse_chat_model_params()
+
+            llm_client = None
+            llm_model = None
+            try:
+                from mindsdb.interfaces.agents.langchain_agent import create_chat_model
+                llm = create_chat_model(chat_model_params)
+                llm_client = llm.root_client
+                llm_model = llm.model_name
+            except Exception:
+                pass
+
+            to_markdown = ToMarkdown(use_llm, llm_client, llm_model)
+            return to_markdown.call(file_path_or_url)
+
+        meta = {
+            'name': name,
+            'callback': callback,
+            'input_types': ['str', 'bool'],
+            'output_type': 'str'
+        }
+        self.callbacks[name] = meta
+        return meta
+
+    def _parse_chat_model_params(self, param_prefix: str = 'LLM_FUNCTION_'):
+        """
+        Parses the environment variables for chat model parameters.
+        """
+        chat_model_params = {}
+        for k, v in os.environ.items():
+            if k.startswith(param_prefix):
+                param_name = k[len(param_prefix):]
+                if param_name == 'MODEL':
+                    chat_model_params['model_name'] = v
+                else:
+                    chat_model_params[param_name.lower()] = v
+
+        if 'provider' not in chat_model_params:
+            chat_model_params['provider'] = 'openai'
+
+        if 'api_key' in chat_model_params:
+            # move to api_keys dict
+            chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
+
+        return chat_model_params
+
 
 class DuckDBFunctions:
     def __init__(self, controller):
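The inlined environment parsing moves into a shared _parse_chat_model_params() helper used by both the llm and to_markdown built-ins. A self-contained sketch of that parsing logic as shown in the hunk (the env values below are placeholders, and the dict-in/dict-out signature is a simplification for illustration):

```python
def parse_chat_model_params(environ: dict, param_prefix: str = 'LLM_FUNCTION_') -> dict:
    # Strip the prefix, map MODEL -> model_name, lower-case the rest,
    # default the provider, and nest the api_key under api_keys.
    params = {}
    for k, v in environ.items():
        if k.startswith(param_prefix):
            name = k[len(param_prefix):]
            params['model_name' if name == 'MODEL' else name.lower()] = v

    params.setdefault('provider', 'openai')
    if 'api_key' in params:
        params['api_keys'] = {params['provider']: params['api_key']}
    return params


# Example environment (values are placeholders, not real keys):
env = {'LLM_FUNCTION_MODEL': 'gpt-4o', 'LLM_FUNCTION_API_KEY': 'sk-...', 'PATH': '/usr/bin'}
print(parse_chat_model_params(env))
# {'model_name': 'gpt-4o', 'api_key': 'sk-...', 'provider': 'openai', 'api_keys': {'openai': 'sk-...'}}
```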
mindsdb/interfaces/functions/to_markdown.py (new file)

@@ -0,0 +1,194 @@
+import base64
+from io import BytesIO
+import os
+from typing import Union
+from urllib.parse import urlparse
+
+import fitz  # PyMuPDF
+from markitdown import MarkItDown
+import mimetypes
+from openai import OpenAI
+import requests
+
+
+class ToMarkdown:
+    """
+    Extracts the content of documents of various formats in markdown format.
+    """
+    def __init__(self, use_llm: bool, llm_client: OpenAI = None, llm_model: str = None):
+        """
+        Initializes the ToMarkdown class.
+        """
+        # If use_llm is True, llm_client and llm_model must be provided.
+        if use_llm and (llm_client is None or llm_model is None):
+            raise ValueError('LLM client and model must be provided when use_llm is True.')
+
+        # If use_llm is False, set llm_client and llm_model to None even if they are provided.
+        if not use_llm:
+            llm_client = None
+            llm_model = None
+
+        # Only OpenAI is supported for now.
+        # TODO: Add support for other LLMs.
+        if llm_client is not None and not isinstance(llm_client, OpenAI):
+            raise ValueError('Only OpenAI models are supported at the moment.')
+
+        self.use_llm = use_llm
+        self.llm_client = llm_client
+        self.llm_model = llm_model
+
+    def call(self, file_path_or_url: str) -> str:
+        """
+        Converts a file to markdown.
+        """
+        file_extension = self._get_file_extension(file_path_or_url)
+        file = self._get_file_content(file_path_or_url)
+
+        if file_extension == '.pdf':
+            return self._pdf_to_markdown(file)
+        elif file_extension in ['.jpg', '.jpeg', '.png', '.gif']:
+            return self._image_to_markdown(file)
+        else:
+            return self._other_to_markdown(file)
+
+    def _get_file_content(self, file_path_or_url: str) -> str:
+        """
+        Retrieves the content of a file.
+        """
+        parsed_url = urlparse(file_path_or_url)
+        if parsed_url.scheme in ('http', 'https'):
+            response = requests.get(file_path_or_url)
+            if response.status_code == 200:
+                return response
+            else:
+                raise RuntimeError(f'Unable to retrieve file from URL: {file_path_or_url}')
+        else:
+            with open(file_path_or_url, 'rb') as file:
+                return BytesIO(file.read())
+
+    def _get_file_extension(self, file_path_or_url: str) -> str:
+        """
+        Retrieves the file extension from a file path or URL.
+        """
+        parsed_url = urlparse(file_path_or_url)
+        if parsed_url.scheme in ('http', 'https'):
+            try:
+                # Make a HEAD request to get headers without downloading the file.
+                response = requests.head(file_path_or_url, allow_redirects=True)
+                content_type = response.headers.get('Content-Type', '')
+                if content_type:
+                    ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
+                    if ext:
+                        return ext
+
+                # Fallback to extracting extension from the URL path
+                ext = os.path.splitext(parsed_url.path)[1]
+                if ext:
+                    return ext
+            except requests.RequestException:
+                raise RuntimeError(f'Unable to retrieve file extension from URL: {file_path_or_url}')
+        else:
+            return os.path.splitext(file_path_or_url)[1]
+
+    def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes]) -> str:
+        """
+        Converts a PDF file to markdown.
+        """
+        if self.llm_client is None:
+            return self._pdf_to_markdown_no_llm(file_content)
+        else:
+            return self._pdf_to_markdown_llm(file_content)
+
+    def _pdf_to_markdown_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts a PDF file to markdown using LLM.
+        The LLM is used mainly for the purpose of generating descriptions of any images in the PDF.
+        """
+        if isinstance(file_content, requests.Response):
+            file_content = BytesIO(file_content.content)
+
+        document = fitz.open(stream=file_content, filetype="pdf")
+
+        markdown_content = []
+        for page_num in range(len(document)):
+            page = document.load_page(page_num)
+
+            # Get text blocks with coordinates.
+            page_content = []
+            blocks = page.get_text("blocks")
+            for block in blocks:
+                x0, y0, x1, y1, text, _, _ = block
+                if text.strip():  # Skip empty or whitespace blocks.
+                    page_content.append((y0, text.strip()))
+
+            # Extract images from the page.
+            image_list = page.get_images(full=True)
+            for img_index, img in enumerate(image_list):
+                xref = img[0]
+                base_image = document.extract_image(xref)
+                image_bytes = base_image["image"]
+
+                # Use actual image y-coordinate if available.
+                y0 = float(base_image.get("y", 0))
+                image_description = self._generate_image_description(image_bytes)
+                page_content.append((y0, f""))
+
+            # Sort the content by y0 coordinate
+            page_content.sort(key=lambda x: x[0])
+
+            # Add sorted content to the markdown
+            for _, text in page_content:
+                markdown_content.append(text)
+            markdown_content.append("\n")
+
+        document.close()
+
+        return "\n".join(markdown_content)
+
+    def _generate_image_description(self, image_bytes: bytes) -> str:
+        """
+        Generates a description of the image using LLM.
+        """
+        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+
+        response = self.llm_client.chat.completions.create(
+            model=self.llm_model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Describe this image"},
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                    ],
+                }
+            ],
+        )
+        description = response.choices[0].message.content
+        return description
+
+    def _pdf_to_markdown_no_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts a PDF file to markdown without using LLM.
+        """
+        md = MarkItDown(enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
+
+    def _image_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts images to markdown.
+        """
+        if not self.use_llm or self.llm_client is None:
+            raise ValueError('LLM client must be enabled to convert images to markdown.')
+
+        md = MarkItDown(llm_client=self.llm_client, llm_model=self.llm_model, enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
+
+    def _other_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts other file formats to markdown.
+        """
+        md = MarkItDown(enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
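The class backs the new to_markdown(file_path_or_url, use_llm) built-in registered in functions/controller.py. A hypothetical direct-usage sketch, assuming the markitdown, PyMuPDF (fitz), openai, and requests packages the module imports are installed; the file path is illustrative:

```python
from mindsdb.interfaces.functions.to_markdown import ToMarkdown

# use_llm=False: plain MarkItDown/PyMuPDF extraction, no OpenAI client needed.
converter = ToMarkdown(use_llm=False)
print(converter.call('/tmp/example_report.pdf'))
```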
mindsdb/interfaces/jobs/jobs_controller.py

@@ -337,10 +337,10 @@ class JobsController:
                 BinaryOperation(op='=', args=[Identifier('project'), Constant(project_name)])
             ])
         )
-
+        response = logs_db_controller.query(query)
 
-        names = [i['name'] for i in columns]
-        return
+        names = [i['name'] for i in response.columns]
+        return response.data_frame[names].to_dict(orient='records')
 
 
 class JobsExecutor:
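This caller is adapted to the DataHubResponse return type introduced in log.py: it reads the column names from response.columns and converts the frame to row dicts. A tiny pandas sketch of the same projection (the frame and column names here are made up for illustration):

```python
import pandas as pd

# Toy frame standing in for response.data_frame; 'names' stands in for the
# column names pulled from response.columns.
df = pd.DataFrame({'name': ['job1', 'job2'], 'project': ['mindsdb', 'demo'], 'extra': [1, 2]})
names = ['name', 'project']

# Select the declared columns and emit one dict per row, as the updated code does.
print(df[names].to_dict(orient='records'))
# [{'name': 'job1', 'project': 'mindsdb'}, {'name': 'job2', 'project': 'demo'}]
```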