MindsDB 25.1.2.1__py3-none-any.whl → 25.1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic.
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/METADATA +246 -255
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/RECORD +94 -83
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +5 -3
- mindsdb/api/executor/__init__.py +0 -1
- mindsdb/api/executor/command_executor.py +2 -1
- mindsdb/api/executor/data_types/answer.py +1 -1
- mindsdb/api/executor/datahub/datanodes/datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +8 -3
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +9 -26
- mindsdb/api/executor/sql_query/__init__.py +1 -0
- mindsdb/api/executor/sql_query/result_set.py +36 -21
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +1 -1
- mindsdb/api/executor/sql_query/steps/join_step.py +4 -4
- mindsdb/api/executor/sql_query/steps/map_reduce_step.py +6 -39
- mindsdb/api/executor/utilities/sql.py +2 -10
- mindsdb/api/http/namespaces/agents.py +3 -1
- mindsdb/api/http/namespaces/knowledge_bases.py +3 -3
- mindsdb/api/http/namespaces/sql.py +3 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +2 -1
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +7 -0
- mindsdb/api/postgres/postgres_proxy/executor/executor.py +2 -1
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -2
- mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/databricks_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/file_handler/file_handler.py +1 -1
- mindsdb/integrations/handlers/file_handler/requirements.txt +0 -4
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +17 -1
- mindsdb/integrations/handlers/jira_handler/jira_handler.py +15 -1
- mindsdb/integrations/handlers/jira_handler/jira_table.py +52 -31
- mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py +82 -0
- mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +8 -1
- mindsdb/integrations/handlers/langchain_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py +1 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +8 -0
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +50 -16
- mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py +123 -72
- mindsdb/integrations/handlers/pinecone_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +12 -6
- mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +5 -3
- mindsdb/integrations/handlers/slack_handler/slack_handler.py +13 -2
- mindsdb/integrations/handlers/slack_handler/slack_tables.py +21 -1
- mindsdb/integrations/handlers/web_handler/requirements.txt +0 -1
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +2 -2
- mindsdb/integrations/utilities/files/__init__.py +0 -0
- mindsdb/integrations/utilities/files/file_reader.py +258 -0
- mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +2 -1
- mindsdb/integrations/utilities/handlers/auth_utilities/microsoft/ms_graph_api_auth_utilities.py +8 -3
- mindsdb/integrations/utilities/rag/chains/map_reduce_summarizer_chain.py +5 -9
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py +76 -27
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py +18 -1
- mindsdb/integrations/utilities/rag/pipelines/rag.py +74 -21
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +166 -108
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +108 -78
- mindsdb/integrations/utilities/rag/settings.py +37 -16
- mindsdb/integrations/utilities/sql_utils.py +1 -1
- mindsdb/interfaces/agents/agents_controller.py +18 -8
- mindsdb/interfaces/agents/constants.py +1 -0
- mindsdb/interfaces/agents/langchain_agent.py +124 -157
- mindsdb/interfaces/agents/langfuse_callback_handler.py +4 -37
- mindsdb/interfaces/agents/mindsdb_database_agent.py +21 -13
- mindsdb/interfaces/chatbot/chatbot_controller.py +7 -11
- mindsdb/interfaces/chatbot/chatbot_task.py +16 -5
- mindsdb/interfaces/chatbot/memory.py +58 -13
- mindsdb/interfaces/database/integrations.py +5 -1
- mindsdb/interfaces/database/projects.py +55 -16
- mindsdb/interfaces/database/views.py +12 -25
- mindsdb/interfaces/knowledge_base/controller.py +39 -15
- mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +7 -26
- mindsdb/interfaces/model/functions.py +15 -4
- mindsdb/interfaces/model/model_controller.py +4 -7
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +51 -40
- mindsdb/interfaces/skills/retrieval_tool.py +10 -3
- mindsdb/interfaces/skills/skill_tool.py +97 -54
- mindsdb/interfaces/skills/skills_controller.py +7 -3
- mindsdb/interfaces/skills/sql_agent.py +127 -41
- mindsdb/interfaces/storage/db.py +1 -1
- mindsdb/migrations/versions/2025-01-15_c06c35f7e8e1_project_company.py +88 -0
- mindsdb/utilities/cache.py +7 -4
- mindsdb/utilities/context.py +11 -1
- mindsdb/utilities/langfuse.py +279 -0
- mindsdb/utilities/log.py +20 -2
- mindsdb/utilities/otel/__init__.py +206 -0
- mindsdb/utilities/otel/logger.py +25 -0
- mindsdb/utilities/otel/meter.py +19 -0
- mindsdb/utilities/otel/metric_handlers/__init__.py +25 -0
- mindsdb/utilities/otel/tracer.py +16 -0
- mindsdb/utilities/partitioning.py +52 -0
- mindsdb/utilities/render/sqlalchemy_render.py +7 -1
- mindsdb/utilities/utils.py +34 -0
- mindsdb/utilities/otel.py +0 -72
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/WHEEL +0 -0
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/slack_handler/slack_handler.py
CHANGED

@@ -231,6 +231,9 @@ class SlackHandler(APIChatHandler):
             'polling': {
                 'type': 'realtime',
             },
+            'memory': {
+                'type': 'handler',
+            },
             'tables': [
                 {
                     'chat_table': {
@@ -238,7 +241,7 @@ class SlackHandler(APIChatHandler):
                         'chat_id_col': 'channel_id',
                         'username_col': 'user',
                         'text_col': 'text',
-                        'time_col': '
+                        'time_col': 'created_at',
                     }
                 },
                 {
@@ -264,7 +267,7 @@ class SlackHandler(APIChatHandler):
         user_info = web_connection.auth_test().data
         return user_info['bot_id']
 
-    def subscribe(self, stop_event: threading.Event, callback: Callable, **kwargs: Any) -> None:
+    def subscribe(self, stop_event: threading.Event, callback: Callable, table_name: Text, columns: List = None, **kwargs: Any) -> None:
         """
         Subscribes to the Slack API using the Socket Mode for real-time responses to messages.
 
@@ -274,6 +277,14 @@ class SlackHandler(APIChatHandler):
             table_name (Text): The name of the table to subscribe to.
             kwargs: Arbitrary keyword arguments.
         """
+        if table_name not in ['messages', 'threads']:
+            raise RuntimeError(f'Table {table_name} is not supported for subscription.')
+
+        # Raise an error if columns are provided.
+        # Since Slack subscriptions depend on events and not changes to the virtual tables, columns are not supported.
+        if columns:
+            raise RuntimeError('Columns are not supported for Slack subscriptions.')
+
         self._socket_connection = SocketModeClient(
             # This app-level token will be used only for establishing a connection.
             app_token=self.connection_data['app_token'],  # xapp-A111-222-xyz
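For orientation, a hedged usage sketch of the new subscribe() signature. The handler variable is assumed to be an already-connected SlackHandler instance; only the argument shape comes from the diff above.

    import threading

    stop_event = threading.Event()

    def on_message(row: dict) -> None:
        # Called for each incoming Slack event row.
        print(row.get('text'))

    # Per the checks added above, any table other than 'messages' or 'threads',
    # or a non-empty columns list, now raises RuntimeError.
    handler.subscribe(stop_event, on_message, table_name='messages')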
mindsdb/integrations/handlers/slack_handler/slack_tables.py
CHANGED

@@ -6,7 +6,7 @@ import pandas as pd
 from slack_sdk.errors import SlackApiError
 
 from mindsdb.integrations.libs.api_handler import APIResource
-from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions, FilterCondition, FilterOperator
+from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions, FilterCondition, FilterOperator, SortColumn
 from mindsdb.utilities import log
 
 logger = log.getLogger(__name__)
@@ -203,6 +203,7 @@ class SlackMessagesTable(APIResource):
         self,
         conditions: List[FilterCondition] = None,
         limit: int = None,
+        sort: List[SortColumn] = None,
         **kwargs: Any
     ) -> pd.DataFrame:
         """
@@ -222,6 +223,7 @@ class SlackMessagesTable(APIResource):
         Args:
             conditions (List[FilterCondition]): The conditions to filter the messages.
             limit (int): The limit of the messages to return.
+            sort (List[SortColumn]): The columns to sort the messages by.
             kwargs (Any): Arbitrary keyword arguments.
 
         Raises:
@@ -306,6 +308,14 @@ class SlackMessagesTable(APIResource):
         # Translate the time stamp into a 'created_at' field.
         result['created_at'] = pd.to_datetime(result['ts'].astype(float), unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')
 
+        # Sort the messages by the specified columns.
+        if sort:
+            result.sort_values(
+                by=[col.column for col in sort],
+                ascending=[col.ascending for col in sort],
+                inplace=True
+            )
+
         return result
 
     def insert(self, query: Insert):
@@ -496,6 +506,7 @@ class SlackThreadsTable(APIResource):
         self,
         conditions: List[FilterCondition] = None,
         limit: int = None,
+        sort: List[SortColumn] = None,
         **kwargs: Any
     ) -> pd.DataFrame:
         """
@@ -514,6 +525,7 @@ class SlackThreadsTable(APIResource):
         Args:
             conditions (List[FilterCondition]): The conditions to filter the messages.
             limit (int): The limit of the messages to return.
+            sort (List[SortColumn]): The columns to sort the messages by.
             kwargs (Any): Arbitrary keyword arguments.
 
         Raises:
@@ -591,6 +603,14 @@ class SlackThreadsTable(APIResource):
         result['channel_id'] = params['channel']
         result['channel_name'] = channel['name'] if 'name' in channel else None
 
+        # Sort the messages by the specified columns.
+        if sort:
+            result.sort_values(
+                by=[col.column for col in sort],
+                ascending=[col.ascending for col in sort],
+                inplace=True
+            )
+
         return result
 
     def insert(self, query: Insert):
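To make the new ORDER BY handling concrete, a small self-contained illustration of the sort_values() call added above. The SortColumn class here is a hypothetical minimal stub that mirrors the two attributes the handler reads (column, ascending); the real class lives in mindsdb.integrations.utilities.sql_utils.

    import pandas as pd

    class SortColumn:
        # Minimal stand-in exposing the attributes used in the diff above.
        def __init__(self, column: str, ascending: bool = True):
            self.column = column
            self.ascending = ascending

    result = pd.DataFrame({
        'created_at': ['2025-01-01 10:00:00', '2025-01-02 09:30:00'],
        'text': ['first message', 'second message'],
    })
    sort = [SortColumn('created_at', ascending=False)]

    result.sort_values(
        by=[col.column for col in sort],
        ascending=[col.ascending for col in sort],
        inplace=True,
    )
    print(result)  # newest message first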
mindsdb/integrations/libs/ml_handler_process/learn_process.py
CHANGED

@@ -8,7 +8,7 @@ from sqlalchemy.orm.attributes import flag_modified
 from mindsdb_sql_parser import parse_sql
 from mindsdb_sql_parser.ast import Identifier, Select, Star, NativeQuery
 
-from mindsdb.api.executor import SQLQuery
+from mindsdb.api.executor.sql_query import SQLQuery
 import mindsdb.utilities.profiler as profiler
 from mindsdb.utilities.functions import mark_process
 from mindsdb.utilities.config import Config
@@ -72,7 +72,7 @@ def learn_process(data_integration_ref: dict, problem_definition: dict, fetch_data
     elif data_integration_ref['type'] == 'view':
         project = database_controller.get_project(project_name)
         query_ast = parse_sql(fetch_data_query)
-        view_meta = project.
+        view_meta = project.get_view_meta(query_ast)
         sqlquery = SQLQuery(view_meta['query_ast'], session=sql_session)
     elif data_integration_ref['type'] == 'project':
         query_ast = parse_sql(fetch_data_query)
mindsdb/integrations/utilities/files/__init__.py
File without changes

mindsdb/integrations/utilities/files/file_reader.py
ADDED

@@ -0,0 +1,258 @@
+import traceback
+import json
+import csv
+from io import BytesIO, StringIO
+from pathlib import Path
+import codecs
+
+import filetype
+import pandas as pd
+from charset_normalizer import from_bytes
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from mindsdb.utilities import log
+
+logger = log.getLogger(__name__)
+
+DEFAULT_CHUNK_SIZE = 500
+DEFAULT_CHUNK_OVERLAP = 250
+
+
+class FileDetectError(Exception):
+    ...
+
+
+def decode(file_obj: BytesIO) -> StringIO:
+    byte_str = file_obj.read()
+    # Move it to StringIO
+    try:
+        # Handle Microsoft's BOM "special" UTF-8 encoding
+        if byte_str.startswith(codecs.BOM_UTF8):
+            data_str = StringIO(byte_str.decode("utf-8-sig"))
+        else:
+            file_encoding_meta = from_bytes(
+                byte_str[: 32 * 1024],
+                steps=32,  # Number of steps/block to extract from my_byte_str
+                chunk_size=1024,  # Set block size of each extraction)
+                explain=False,
+            )
+            best_meta = file_encoding_meta.best()
+            errors = "strict"
+            if best_meta is not None:
+                encoding = file_encoding_meta.best().encoding
+
+                try:
+                    data_str = StringIO(byte_str.decode(encoding, errors))
+                except UnicodeDecodeError:
+                    encoding = "utf-8"
+                    errors = "replace"
+
+                    data_str = StringIO(byte_str.decode(encoding, errors))
+            else:
+                encoding = "utf-8"
+                errors = "replace"
+
+                data_str = StringIO(byte_str.decode(encoding, errors))
+    except Exception as e:
+        logger.error(traceback.format_exc())
+        raise FileDetectError("Could not load into string") from e
+
+    return data_str
+
+
+class FormatDetector:
+
+    def get(self, name, file_obj: BytesIO = None):
+        format = self.get_format_by_name(name)
+        if format is None and file_obj is not None:
+            format = self.get_format_by_content(file_obj)
+
+        if format is not None:
+            return format
+        raise FileDetectError(f'Unable to detect format: {name}')
+
+    def get_format_by_name(self, filename):
+        extension = Path(filename).suffix.strip(".").lower()
+        if extension == "tsv":
+            extension = "csv"
+        return extension or None
+
+    def get_format_by_content(self, file_obj):
+        if self.is_parquet(file_obj):
+            return "parquet"
+
+        file_type = filetype.guess(file_obj)
+        if file_type is None:
+            return
+
+        if file_type.mime in {
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            "application/vnd.ms-excel",
+        }:
+            return 'xlsx'
+
+        if file_type.mime == 'application/pdf':
+            return "pdf"
+
+        file_obj = decode(file_obj)
+
+        if self.is_json(file_obj):
+            return "json"
+
+        if self.is_csv(file_obj):
+            return "csv"
+
+    def is_json(self, data_obj: StringIO) -> bool:
+        # see if its JSON
+        text = data_obj.read(100).strip()
+        data_obj.seek(0)
+        if len(text) > 0:
+            # it looks like a json, then try to parse it
+            if text.startswith("{") or text.startswith("["):
+                try:
+                    json.loads(data_obj.read())
+                    return True
+                except Exception:
+                    return False
+                finally:
+                    data_obj.seek(0)
+        return False
+
+    def is_csv(self, data_obj: StringIO) -> bool:
+        sample = data_obj.readline()  # trying to get dialect from header
+        data_obj.seek(0)
+        try:
+            csv.Sniffer().sniff(sample)
+
+        except Exception:
+            return False
+
+    def is_parquet(self, data: BytesIO) -> bool:
+        # Check first and last 4 bytes equal to PAR1.
+        # Refer: https://parquet.apache.org/docs/file-format/
+        parquet_sig = b"PAR1"
+        data.seek(0, 0)
+        start_meta = data.read(4)
+        data.seek(-4, 2)
+        end_meta = data.read()
+        data.seek(0)
+        if start_meta == parquet_sig and end_meta == parquet_sig:
+            return True
+        return False
+
+
+class FileReader:
+
+    def _get_csv_dialect(self, buffer) -> csv.Dialect:
+        sample = buffer.readline()  # trying to get dialect from header
+        buffer.seek(0)
+        try:
+            if isinstance(sample, bytes):
+                sample = sample.decode()
+            accepted_csv_delimiters = [",", "\t", ";"]
+            try:
+                dialect = csv.Sniffer().sniff(
+                    sample, delimiters=accepted_csv_delimiters
+                )
+                dialect.doublequote = (
+                    True  # assume that all csvs have " as string escape
+                )
+            except Exception:
+                dialect = csv.reader(sample).dialect
+                if dialect.delimiter not in accepted_csv_delimiters:
+                    raise Exception(
+                        f"CSV delimeter '{dialect.delimiter}' is not supported"
+                    )
+
+        except csv.Error:
+            dialect = None
+        return dialect
+
+    def read(self, format, file_obj: BytesIO, **kwargs) -> pd.DataFrame:
+        func = {
+            'parquet': self.read_parquet,
+            'csv': self.read_csv,
+            'xlsx': self.read_excel,
+            'pdf': self.read_pdf,
+            'json': self.read_json,
+            'txt': self.read_txt,
+        }
+
+        if format not in func:
+            raise FileDetectError(f'Unsupported format: {format}')
+        func = func[format]
+
+        return func(file_obj, **kwargs)
+
+    def read_csv(self, file_obj: BytesIO, **kwargs):
+        file_obj = decode(file_obj)
+        dialect = self._get_csv_dialect(file_obj)
+
+        return pd.read_csv(file_obj, sep=dialect.delimiter, index_col=False)
+
+    def read_txt(self, file_obj: BytesIO, **kwargs):
+        file_obj = decode(file_obj)
+
+        try:
+            from langchain_core.documents import Document
+        except ImportError:
+            raise ImportError(
+                "To import TXT document please install 'langchain-community':\n"
+                "    pip install langchain-community"
+            )
+        text = file_obj.read()
+
+        file_name = None
+        if hasattr(file_obj, "name"):
+            file_name = file_obj.name
+        metadata = {"source": file_name}
+        documents = [Document(page_content=text, metadata=metadata)]
+
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
+        )
+
+        docs = text_splitter.split_documents(documents)
+        return pd.DataFrame(
+            [
+                {"content": doc.page_content, "metadata": doc.metadata}
+                for doc in docs
+            ]
+        )
+
+    def read_pdf(self, file_obj: BytesIO, **kwargs):
+        import fitz  # pymupdf
+
+        with fitz.open(stream=file_obj) as pdf:  # open pdf
+            text = chr(12).join([page.get_text() for page in pdf])
+
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
+        )
+
+        split_text = text_splitter.split_text(text)
+
+        return pd.DataFrame(
+            {"content": split_text, "metadata": [{}] * len(split_text)}
+        )
+
+    def read_json(self, file_obj: BytesIO, **kwargs):
+        file_obj = decode(file_obj)
+        file_obj.seek(0)
+        json_doc = json.loads(file_obj.read())
+        return pd.json_normalize(json_doc, max_level=0)
+
+    def read_parquet(self, file_obj: BytesIO, **kwargs):
+        return pd.read_parquet(file_obj)
+
+    def read_excel(self, file_obj: BytesIO, sheet_name=None, **kwargs) -> pd.DataFrame:
+
+        file_obj.seek(0)
+        with pd.ExcelFile(file_obj) as xls:
+            if sheet_name is None:
+                # No sheet specified: Return list of sheets
+                sheet_list = xls.sheet_names
+                return pd.DataFrame(sheet_list, columns=["Sheet_Name"])
+            else:
+                # Specific sheet requested: Load that sheet
+                return pd.read_excel(xls, sheet_name=sheet_name)
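A brief, hedged usage sketch of the new file utilities. The sample bytes and variable names are illustrative; the import path matches the file listed above.

    from io import BytesIO
    from mindsdb.integrations.utilities.files.file_reader import FileReader, FormatDetector

    # Detect the format of an in-memory upload, then read it into a DataFrame.
    raw = BytesIO(b"name,age\nalice,30\nbob,25\n")

    fmt = FormatDetector().get('people.csv', raw)   # -> 'csv' (resolved by extension)
    df = FileReader().read(fmt, raw)                # -> DataFrame with two rows
    print(df)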
mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py
CHANGED

@@ -131,7 +131,8 @@ class MSGraphAPIBaseClient:
         response = self._make_request(api_url, params)
 
         # If the response content is a binary file or a TSV file, return the raw content.
-        if response.headers["Content-Type"] in ("application/octet-stream", "text/
+        if response.headers["Content-Type"] in ("application/octet-stream", "text/plain",
+                                                "text/tab-separated-values", "application/pdf"):
             return response.content
         # Otherwise, return the JSON content.
         else:
mindsdb/integrations/utilities/handlers/auth_utilities/microsoft/ms_graph_api_auth_utilities.py
CHANGED
@@ -43,9 +43,14 @@ class MSGraphAPIDelegatedPermissionsManager:
         # Set the redirect URI based on the request origin.
         # If the request origin is 127.0.0.1 (localhost), replace it with localhost.
         # This is done because the only HTTP origin allowed in Microsoft Entra ID app registration is localhost.
-
-
-
+        try:
+            request_origin = request.headers.get('ORIGIN') or (request.scheme + '://' + request.host)
+            if not request_origin:
+                raise AuthException('Request origin could not be determined!')
+        except RuntimeError:
+            # if it is outside of request context (streaming in agent)
+            request_origin = ''
+
         request_origin = request_origin.replace('127.0.0.1', 'localhost') if 'http://127.0.0.1' in request_origin else request_origin
         self.redirect_uri = request_origin + '/verify-auth'
 
mindsdb/integrations/utilities/rag/chains/map_reduce_summarizer_chain.py
CHANGED

@@ -23,7 +23,7 @@ logger = log.getLogger(__name__)
 Summary = namedtuple('Summary', ['source_id', 'content'])
 
 
-def create_map_reduce_documents_chain(summarization_config: SummarizationConfig, input: str) ->
+def create_map_reduce_documents_chain(summarization_config: SummarizationConfig, input: str) -> ReduceDocumentsChain:
     '''Creats a chain that map reduces documents into a single consolidated summary
 
     Args:
@@ -43,7 +43,7 @@ def create_map_reduce_documents_chain(summarization_config: SummarizationConfig,
     if 'input' in map_prompt.input_variables:
         map_prompt = map_prompt.partial(input=input)
     # Handles summarization of individual chunks.
-    map_chain = LLMChain(llm=summarization_llm, prompt=map_prompt)
+    # map_chain = LLMChain(llm=summarization_llm, prompt=map_prompt)
 
     reduce_prompt_template = summarization_config.reduce_prompt_template
     reduce_prompt = PromptTemplate.from_template(reduce_prompt_template)
@@ -60,18 +60,12 @@ def create_map_reduce_documents_chain(summarization_config: SummarizationConfig,
     )
 
     # Combines & iteratively reduces mapped documents.
-
+    return ReduceDocumentsChain(
         combine_documents_chain=combine_documents_chain,
         collapse_documents_chain=combine_documents_chain,
         # Max number of tokens to group documents into.
         token_max=summarization_config.max_summarization_tokens
     )
-    return MapReduceDocumentsChain(
-        llm_chain=map_chain,
-        reduce_documents_chain=reduce_documents_chain,
-        document_variable_name='docs',
-        return_intermediate_steps=False
-    )
 
 
 class MapReduceSummarizerChain(Chain):
@@ -135,6 +129,8 @@ class MapReduceSummarizerChain(Chain):
         document_chunks = []
         for _, row in all_source_chunks.iterrows():
             metadata = row.get(self.metadata_column_name, {})
+            if row.get('chunk_id', None) is not None:
+                metadata['chunk_index'] = row.get('chunk_id', 0)
             document_chunks.append(Document(page_content=row[self.content_column_name], metadata=metadata))
         # Sort by chunk index if present in metadata so the full document is in its original order.
         document_chunks.sort(key=lambda doc: doc.metadata.get('chunk_index', 0) if doc.metadata else 0)
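A small, self-contained illustration of the chunk-ordering logic above, using langchain's Document type; the sample chunks are made up.

    from langchain_core.documents import Document

    # Chunks may arrive out of order; the chunk_index copied from chunk_id keeps
    # the reassembled document in its original order before summarization.
    document_chunks = [
        Document(page_content="part B", metadata={'chunk_index': 1}),
        Document(page_content="part A", metadata={'chunk_index': 0}),
    ]
    document_chunks.sort(key=lambda doc: doc.metadata.get('chunk_index', 0) if doc.metadata else 0)
    print([doc.page_content for doc in document_chunks])  # ['part A', 'part B']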
mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py
CHANGED

@@ -1,9 +1,9 @@
-from typing import Any, List, Optional, Dict
+from typing import Any, List, Union, Optional, Dict
 
 from langchain_community.vectorstores import PGVector
 from langchain_community.vectorstores.pgvector import Base
 
-from pgvector.sqlalchemy import Vector
+from pgvector.sqlalchemy import SPARSEVEC, Vector
 import sqlalchemy as sa
 from sqlalchemy.dialects.postgresql import JSON
 
@@ -15,9 +15,17 @@ _generated_sa_tables = {}
 
 class PGVectorMDB(PGVector):
     """
-
+    langchain_community.vectorstores.PGVector adapted for mindsdb vector store table structure
     """
 
+    def __init__(self, *args, is_sparse: bool = False, vector_size: Optional[int] = None, **kwargs):
+        # todo get is_sparse and vector_size from kb vector table
+        self.is_sparse = is_sparse
+        if is_sparse and vector_size is None:
+            raise ValueError("vector_size is required when is_sparse=True")
+        self.vector_size = vector_size
+        super().__init__(*args, **kwargs)
+
     def __post_init__(
         self,
     ) -> None:
@@ -32,53 +40,94 @@ class PGVectorMDB(PGVector):
             __tablename__ = collection_name
 
             id = sa.Column(sa.Integer, primary_key=True)
-            embedding
-
-
+            embedding = sa.Column(
+                "embeddings",
+                SPARSEVEC() if self.is_sparse else Vector() if self.vector_size is None else
+                SPARSEVEC(self.vector_size) if self.is_sparse else Vector(self.vector_size)
+            )
+            document = sa.Column("content", sa.String, nullable=True)
+            cmetadata = sa.Column("metadata", JSON, nullable=True)
 
         _generated_sa_tables[collection_name] = EmbeddingStore
 
         self.EmbeddingStore = _generated_sa_tables[collection_name]
 
     def __query_collection(
-
-
-
-
+        self,
+        embedding: Union[List[float], Dict[int, float], str],
+        k: int = 4,
+        filter: Optional[Dict[str, str]] = None,
     ) -> List[Any]:
         """Query the collection."""
         with Session(self._bind) as session:
-
-
-
-
-
-
-
-
-
+            if self.is_sparse:
+                # Sparse vectors: expect string in format "{key:value,...}/size" or dictionary
+                if isinstance(embedding, dict):
+                    from pgvector.utils import SparseVector
+                    embedding = SparseVector(embedding, self.vector_size)
+                    embedding_str = embedding.to_text()
+                elif isinstance(embedding, str):
+                    # Use string as is - it should already be in the correct format
+                    embedding_str = embedding
+                # Use inner product for sparse vectors
+                distance_op = "<#>"
+                # For inner product, larger values are better matches
+                order_direction = "ASC"
+            else:
+                # Dense vectors: expect string in JSON array format or list of floats
+                if isinstance(embedding, list):
+                    embedding_str = f"[{','.join(str(x) for x in embedding)}]"
+                elif isinstance(embedding, str):
+                    embedding_str = embedding
+                # Use cosine similarity for dense vectors
+                distance_op = "<=>"
+                # For cosine similarity, smaller values are better matches
+                order_direction = "ASC"
+
+            # Use SQL directly for vector comparison
+            query = sa.text(
+                f"""
+                SELECT t.*, t.embeddings {distance_op} '{embedding_str}' as distance
+                FROM {self.collection_name} t
+                ORDER BY distance {order_direction}
+                LIMIT {k}
+                """
             )
-
-
-
+            results = session.execute(query).all()
+
+            # Convert results to the expected format
+            formatted_results = []
+            for rec in results:
+                metadata = rec.metadata if bool(rec.metadata) else {0: 0}
+                embedding_store = self.EmbeddingStore()
+                embedding_store.document = rec.content
+                embedding_store.cmetadata = metadata
+                result = type(
+                    'Result', (), {
+                        'EmbeddingStore': embedding_store,
+                        'distance': rec.distance
+                    }
+                )
+                formatted_results.append(result)
 
-
+        return formatted_results
 
     # aliases for different langchain versions
     def _PGVector__query_collection(self, *args, **kwargs):
+
         return self.__query_collection(*args, **kwargs)
 
     def _query_collection(self, *args, **kwargs):
         return self.__query_collection(*args, **kwargs)
 
     def create_collection(self):
-        raise RuntimeError(
+        raise RuntimeError("Forbidden")
 
     def delete_collection(self):
-        raise RuntimeError(
+        raise RuntimeError("Forbidden")
 
     def delete(self, *args, **kwargs):
-        raise RuntimeError(
+        raise RuntimeError("Forbidden")
 
     def add_embeddings(self, *args, **kwargs):
-        raise RuntimeError(
+        raise RuntimeError("Forbidden")
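For clarity, a standalone sketch of the operator selection used in __query_collection above. The table name and embedding string are placeholders; the SQL shape mirrors the query in the diff.

    # Dense embeddings are compared with cosine distance (<=>); sparse ones with
    # inner product (<#>). Both orderings are ascending, as in the code above.
    def build_query(collection_name: str, embedding_str: str, is_sparse: bool, k: int = 4) -> str:
        distance_op = "<#>" if is_sparse else "<=>"
        return (
            f"SELECT t.*, t.embeddings {distance_op} '{embedding_str}' as distance "
            f"FROM {collection_name} t ORDER BY distance ASC LIMIT {k}"
        )

    print(build_query("my_kb_chunks", "[0.1,0.2,0.3]", is_sparse=False))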
mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py
CHANGED

@@ -7,6 +7,7 @@ from pydantic import BaseModel
 
 from mindsdb.integrations.utilities.rag.settings import VectorStoreType, VectorStoreConfig
 from mindsdb.integrations.utilities.rag.loaders.vector_store_loader.MDBVectorStore import MDBVectorStore
+from mindsdb.integrations.utilities.rag.loaders.vector_store_loader.pgvector import PGVectorMDB
 from mindsdb.utilities import log
 
 
@@ -28,6 +29,20 @@ class VectorStoreLoader(BaseModel):
         Loads the vector store based on the provided config and embeddings model
         :return:
         """
+        if self.config.is_sparse is not None and self.config.vector_size is not None and self.config.kb_table is not None:
+            # Only use PGVector store for sparse vectors.
+            db_handler = self.config.kb_table.get_vector_db()
+            db_args = db_handler.connection_args
+            # Assume we are always using PGVector & psycopg2.
+            connection_str = f"postgresql+psycopg2://{db_args.get('user')}:{db_args.get('password')}@{db_args.get('host')}:{db_args.get('port')}/{db_args.get('dbname', db_args.get('database'))}"
+
+            return PGVectorMDB(
+                connection_string=connection_str,
+                collection_name=self.config.kb_table._kb.vector_database_table,
+                embedding_function=self.embedding_model,
+                is_sparse=self.config.is_sparse,
+                vector_size=self.config.vector_size
+            )
         return MDBVectorStore(kb_table=self.config.kb_table)
 
 
@@ -56,5 +71,7 @@ class VectorStoreFactory:
         return PGVectorMDB(
             connection_string=settings.connection_string,
             collection_name=settings.collection_name,
-            embedding_function=embedding_model
+            embedding_function=embedding_model,
+            is_sparse=settings.is_sparse,
+            vector_size=settings.vector_size
         )
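To make the connection string assembly concrete, a short sketch with placeholder credentials; it mirrors the f-string used in VectorStoreLoader above.

    # Placeholder values; only the string shape comes from the diff.
    db_args = {'user': 'mdb', 'password': 'secret', 'host': 'localhost', 'port': 5432, 'dbname': 'vectors'}
    connection_str = (
        f"postgresql+psycopg2://{db_args.get('user')}:{db_args.get('password')}"
        f"@{db_args.get('host')}:{db_args.get('port')}"
        f"/{db_args.get('dbname', db_args.get('database'))}"
    )
    print(connection_str)  # postgresql+psycopg2://mdb:secret@localhost:5432/vectors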