MindsDB 25.1.2.1__py3-none-any.whl → 25.1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic.

Files changed (95)
  1. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/METADATA +246 -255
  2. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/RECORD +94 -83
  3. mindsdb/__about__.py +1 -1
  4. mindsdb/__main__.py +5 -3
  5. mindsdb/api/executor/__init__.py +0 -1
  6. mindsdb/api/executor/command_executor.py +2 -1
  7. mindsdb/api/executor/data_types/answer.py +1 -1
  8. mindsdb/api/executor/datahub/datanodes/datanode.py +1 -1
  9. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  10. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +8 -3
  11. mindsdb/api/executor/datahub/datanodes/project_datanode.py +9 -26
  12. mindsdb/api/executor/sql_query/__init__.py +1 -0
  13. mindsdb/api/executor/sql_query/result_set.py +36 -21
  14. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +1 -1
  15. mindsdb/api/executor/sql_query/steps/join_step.py +4 -4
  16. mindsdb/api/executor/sql_query/steps/map_reduce_step.py +6 -39
  17. mindsdb/api/executor/utilities/sql.py +2 -10
  18. mindsdb/api/http/namespaces/agents.py +3 -1
  19. mindsdb/api/http/namespaces/knowledge_bases.py +3 -3
  20. mindsdb/api/http/namespaces/sql.py +3 -1
  21. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +2 -1
  22. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +7 -0
  23. mindsdb/api/postgres/postgres_proxy/executor/executor.py +2 -1
  24. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -2
  25. mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -1
  26. mindsdb/integrations/handlers/databricks_handler/requirements.txt +1 -1
  27. mindsdb/integrations/handlers/file_handler/file_handler.py +1 -1
  28. mindsdb/integrations/handlers/file_handler/requirements.txt +0 -4
  29. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +17 -1
  30. mindsdb/integrations/handlers/jira_handler/jira_handler.py +15 -1
  31. mindsdb/integrations/handlers/jira_handler/jira_table.py +52 -31
  32. mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py +82 -0
  33. mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +8 -1
  34. mindsdb/integrations/handlers/langchain_handler/requirements.txt +1 -1
  35. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py +1 -1
  36. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +8 -0
  37. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +50 -16
  38. mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py +123 -72
  39. mindsdb/integrations/handlers/pinecone_handler/requirements.txt +1 -1
  40. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +12 -6
  41. mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +5 -3
  42. mindsdb/integrations/handlers/slack_handler/slack_handler.py +13 -2
  43. mindsdb/integrations/handlers/slack_handler/slack_tables.py +21 -1
  44. mindsdb/integrations/handlers/web_handler/requirements.txt +0 -1
  45. mindsdb/integrations/libs/ml_handler_process/learn_process.py +2 -2
  46. mindsdb/integrations/utilities/files/__init__.py +0 -0
  47. mindsdb/integrations/utilities/files/file_reader.py +258 -0
  48. mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +2 -1
  49. mindsdb/integrations/utilities/handlers/auth_utilities/microsoft/ms_graph_api_auth_utilities.py +8 -3
  50. mindsdb/integrations/utilities/rag/chains/map_reduce_summarizer_chain.py +5 -9
  51. mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py +76 -27
  52. mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py +18 -1
  53. mindsdb/integrations/utilities/rag/pipelines/rag.py +74 -21
  54. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +166 -108
  55. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +108 -78
  56. mindsdb/integrations/utilities/rag/settings.py +37 -16
  57. mindsdb/integrations/utilities/sql_utils.py +1 -1
  58. mindsdb/interfaces/agents/agents_controller.py +18 -8
  59. mindsdb/interfaces/agents/constants.py +1 -0
  60. mindsdb/interfaces/agents/langchain_agent.py +124 -157
  61. mindsdb/interfaces/agents/langfuse_callback_handler.py +4 -37
  62. mindsdb/interfaces/agents/mindsdb_database_agent.py +21 -13
  63. mindsdb/interfaces/chatbot/chatbot_controller.py +7 -11
  64. mindsdb/interfaces/chatbot/chatbot_task.py +16 -5
  65. mindsdb/interfaces/chatbot/memory.py +58 -13
  66. mindsdb/interfaces/database/integrations.py +5 -1
  67. mindsdb/interfaces/database/projects.py +55 -16
  68. mindsdb/interfaces/database/views.py +12 -25
  69. mindsdb/interfaces/knowledge_base/controller.py +39 -15
  70. mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +7 -26
  71. mindsdb/interfaces/model/functions.py +15 -4
  72. mindsdb/interfaces/model/model_controller.py +4 -7
  73. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +51 -40
  74. mindsdb/interfaces/skills/retrieval_tool.py +10 -3
  75. mindsdb/interfaces/skills/skill_tool.py +97 -54
  76. mindsdb/interfaces/skills/skills_controller.py +7 -3
  77. mindsdb/interfaces/skills/sql_agent.py +127 -41
  78. mindsdb/interfaces/storage/db.py +1 -1
  79. mindsdb/migrations/versions/2025-01-15_c06c35f7e8e1_project_company.py +88 -0
  80. mindsdb/utilities/cache.py +7 -4
  81. mindsdb/utilities/context.py +11 -1
  82. mindsdb/utilities/langfuse.py +279 -0
  83. mindsdb/utilities/log.py +20 -2
  84. mindsdb/utilities/otel/__init__.py +206 -0
  85. mindsdb/utilities/otel/logger.py +25 -0
  86. mindsdb/utilities/otel/meter.py +19 -0
  87. mindsdb/utilities/otel/metric_handlers/__init__.py +25 -0
  88. mindsdb/utilities/otel/tracer.py +16 -0
  89. mindsdb/utilities/partitioning.py +52 -0
  90. mindsdb/utilities/render/sqlalchemy_render.py +7 -1
  91. mindsdb/utilities/utils.py +34 -0
  92. mindsdb/utilities/otel.py +0 -72
  93. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/LICENSE +0 -0
  94. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/WHEEL +0 -0
  95. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/top_level.txt +0 -0
@@ -231,6 +231,9 @@ class SlackHandler(APIChatHandler):
             'polling': {
                 'type': 'realtime',
             },
+            'memory': {
+                'type': 'handler',
+            },
             'tables': [
                 {
                     'chat_table': {
@@ -238,7 +241,7 @@ class SlackHandler(APIChatHandler):
                         'chat_id_col': 'channel_id',
                         'username_col': 'user',
                         'text_col': 'text',
-                        'time_col': 'thread_ts',
+                        'time_col': 'created_at',
                     }
                 },
                 {
@@ -264,7 +267,7 @@ class SlackHandler(APIChatHandler):
         user_info = web_connection.auth_test().data
         return user_info['bot_id']

-    def subscribe(self, stop_event: threading.Event, callback: Callable, **kwargs: Any) -> None:
+    def subscribe(self, stop_event: threading.Event, callback: Callable, table_name: Text, columns: List = None, **kwargs: Any) -> None:
         """
         Subscribes to the Slack API using the Socket Mode for real-time responses to messages.

@@ -274,6 +277,14 @@ class SlackHandler(APIChatHandler):
             table_name (Text): The name of the table to subscribe to.
             kwargs: Arbitrary keyword arguments.
         """
+        if table_name not in ['messages', 'threads']:
+            raise RuntimeError(f'Table {table_name} is not supported for subscription.')
+
+        # Raise an error if columns are provided.
+        # Since Slack subscriptions depend on events and not changes to the virtual tables, columns are not supported.
+        if columns:
+            raise RuntimeError('Columns are not supported for Slack subscriptions.')
+
         self._socket_connection = SocketModeClient(
             # This app-level token will be used only for establishing a connection.
             app_token=self.connection_data['app_token'],  # xapp-A111-222-xyz
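A minimal sketch of the new subscribe() contract, assuming `handler` is an already-connected SlackHandler instance (construction and tokens are omitted; this is illustrative, not part of the diff):

    import threading

    stop_event = threading.Event()

    def on_message(row: dict):
        # Called for each incoming Slack event row.
        print(row.get('text'))

    handler.subscribe(stop_event, on_message, table_name='messages')   # supported table
    # handler.subscribe(stop_event, on_message, table_name='channels')                 -> RuntimeError (unsupported table)
    # handler.subscribe(stop_event, on_message, table_name='messages', columns=['text']) -> RuntimeError (columns not supported)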
@@ -6,7 +6,7 @@ import pandas as pd
 from slack_sdk.errors import SlackApiError

 from mindsdb.integrations.libs.api_handler import APIResource
-from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions, FilterCondition, FilterOperator
+from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions, FilterCondition, FilterOperator, SortColumn
 from mindsdb.utilities import log

 logger = log.getLogger(__name__)
@@ -203,6 +203,7 @@ class SlackMessagesTable(APIResource):
         self,
         conditions: List[FilterCondition] = None,
         limit: int = None,
+        sort: List[SortColumn] = None,
         **kwargs: Any
     ) -> pd.DataFrame:
         """
@@ -222,6 +223,7 @@ class SlackMessagesTable(APIResource):
         Args:
             conditions (List[FilterCondition]): The conditions to filter the messages.
             limit (int): The limit of the messages to return.
+            sort (List[SortColumn]): The columns to sort the messages by.
             kwargs (Any): Arbitrary keyword arguments.

         Raises:
@@ -306,6 +308,14 @@ class SlackMessagesTable(APIResource):
         # Translate the time stamp into a 'created_at' field.
         result['created_at'] = pd.to_datetime(result['ts'].astype(float), unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')

+        # Sort the messages by the specified columns.
+        if sort:
+            result.sort_values(
+                by=[col.column for col in sort],
+                ascending=[col.ascending for col in sort],
+                inplace=True
+            )
+
         return result

     def insert(self, query: Insert):
@@ -496,6 +506,7 @@ class SlackThreadsTable(APIResource):
         self,
         conditions: List[FilterCondition] = None,
         limit: int = None,
+        sort: List[SortColumn] = None,
         **kwargs: Any
     ) -> pd.DataFrame:
         """
@@ -514,6 +525,7 @@ class SlackThreadsTable(APIResource):
         Args:
             conditions (List[FilterCondition]): The conditions to filter the messages.
             limit (int): The limit of the messages to return.
+            sort (List[SortColumn]): The columns to sort the messages by.
             kwargs (Any): Arbitrary keyword arguments.

         Raises:
@@ -591,6 +603,14 @@ class SlackThreadsTable(APIResource):
         result['channel_id'] = params['channel']
         result['channel_name'] = channel['name'] if 'name' in channel else None

+        # Sort the messages by the specified columns.
+        if sort:
+            result.sort_values(
+                by=[col.column for col in sort],
+                ascending=[col.ascending for col in sort],
+                inplace=True
+            )
+
         return result

     def insert(self, query: Insert):
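The sort support added to both tables is plain pandas: each SortColumn carries a column name and an ascending flag, and the assembled result frame is sorted in place. A standalone sketch of the same logic, assuming SortColumn exposes `column` and `ascending` as used in the diff (sample data and constructor arguments are illustrative):

    import pandas as pd
    from mindsdb.integrations.utilities.sql_utils import SortColumn

    messages = pd.DataFrame([
        {'channel_id': 'C01', 'user': 'U1', 'text': 'hi', 'created_at': '2025-01-02 10:00:00'},
        {'channel_id': 'C01', 'user': 'U2', 'text': 'hello', 'created_at': '2025-01-01 09:00:00'},
    ])

    # Hypothetical ORDER BY created_at DESC pushed down from the SQL layer.
    sort = [SortColumn(column='created_at', ascending=False)]

    messages.sort_values(
        by=[col.column for col in sort],
        ascending=[col.ascending for col in sort],
        inplace=True
    )
    print(messages)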
@@ -1,3 +1,2 @@
-pymupdf
 html2text
 bs4
@@ -8,7 +8,7 @@ from sqlalchemy.orm.attributes import flag_modified
 from mindsdb_sql_parser import parse_sql
 from mindsdb_sql_parser.ast import Identifier, Select, Star, NativeQuery

-from mindsdb.api.executor import SQLQuery
+from mindsdb.api.executor.sql_query import SQLQuery
 import mindsdb.utilities.profiler as profiler
 from mindsdb.utilities.functions import mark_process
 from mindsdb.utilities.config import Config
@@ -72,7 +72,7 @@ def learn_process(data_integration_ref: dict, problem_definition: dict, fetch_data
     elif data_integration_ref['type'] == 'view':
         project = database_controller.get_project(project_name)
         query_ast = parse_sql(fetch_data_query)
-        view_meta = project.query_view(query_ast)
+        view_meta = project.get_view_meta(query_ast)
         sqlquery = SQLQuery(view_meta['query_ast'], session=sql_session)
     elif data_integration_ref['type'] == 'project':
         query_ast = parse_sql(fetch_data_query)
File without changes
@@ -0,0 +1,258 @@
+import traceback
+import json
+import csv
+from io import BytesIO, StringIO
+from pathlib import Path
+import codecs
+
+import filetype
+import pandas as pd
+from charset_normalizer import from_bytes
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from mindsdb.utilities import log
+
+logger = log.getLogger(__name__)
+
+DEFAULT_CHUNK_SIZE = 500
+DEFAULT_CHUNK_OVERLAP = 250
+
+
+class FileDetectError(Exception):
+    ...
+
+
+def decode(file_obj: BytesIO) -> StringIO:
+    byte_str = file_obj.read()
+    # Move it to StringIO
+    try:
+        # Handle Microsoft's BOM "special" UTF-8 encoding
+        if byte_str.startswith(codecs.BOM_UTF8):
+            data_str = StringIO(byte_str.decode("utf-8-sig"))
+        else:
+            file_encoding_meta = from_bytes(
+                byte_str[: 32 * 1024],
+                steps=32,  # Number of steps/block to extract from my_byte_str
+                chunk_size=1024,  # Set block size of each extraction)
+                explain=False,
+            )
+            best_meta = file_encoding_meta.best()
+            errors = "strict"
+            if best_meta is not None:
+                encoding = file_encoding_meta.best().encoding
+
+                try:
+                    data_str = StringIO(byte_str.decode(encoding, errors))
+                except UnicodeDecodeError:
+                    encoding = "utf-8"
+                    errors = "replace"
+
+                    data_str = StringIO(byte_str.decode(encoding, errors))
+            else:
+                encoding = "utf-8"
+                errors = "replace"
+
+                data_str = StringIO(byte_str.decode(encoding, errors))
+    except Exception as e:
+        logger.error(traceback.format_exc())
+        raise FileDetectError("Could not load into string") from e
+
+    return data_str
+
+
+class FormatDetector:
+
+    def get(self, name, file_obj: BytesIO = None):
+        format = self.get_format_by_name(name)
+        if format is None and file_obj is not None:
+            format = self.get_format_by_content(file_obj)
+
+        if format is not None:
+            return format
+        raise FileDetectError(f'Unable to detect format: {name}')
+
+    def get_format_by_name(self, filename):
+        extension = Path(filename).suffix.strip(".").lower()
+        if extension == "tsv":
+            extension = "csv"
+        return extension or None
+
+    def get_format_by_content(self, file_obj):
+        if self.is_parquet(file_obj):
+            return "parquet"
+
+        file_type = filetype.guess(file_obj)
+        if file_type is None:
+            return
+
+        if file_type.mime in {
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            "application/vnd.ms-excel",
+        }:
+            return 'xlsx'
+
+        if file_type.mime == 'application/pdf':
+            return "pdf"
+
+        file_obj = decode(file_obj)
+
+        if self.is_json(file_obj):
+            return "json"
+
+        if self.is_csv(file_obj):
+            return "csv"
+
+    def is_json(self, data_obj: StringIO) -> bool:
+        # see if its JSON
+        text = data_obj.read(100).strip()
+        data_obj.seek(0)
+        if len(text) > 0:
+            # it looks like a json, then try to parse it
+            if text.startswith("{") or text.startswith("["):
+                try:
+                    json.loads(data_obj.read())
+                    return True
+                except Exception:
+                    return False
+                finally:
+                    data_obj.seek(0)
+        return False
+
+    def is_csv(self, data_obj: StringIO) -> bool:
+        sample = data_obj.readline()  # trying to get dialect from header
+        data_obj.seek(0)
+        try:
+            csv.Sniffer().sniff(sample)
+
+        except Exception:
+            return False
+
+    def is_parquet(self, data: BytesIO) -> bool:
+        # Check first and last 4 bytes equal to PAR1.
+        # Refer: https://parquet.apache.org/docs/file-format/
+        parquet_sig = b"PAR1"
+        data.seek(0, 0)
+        start_meta = data.read(4)
+        data.seek(-4, 2)
+        end_meta = data.read()
+        data.seek(0)
+        if start_meta == parquet_sig and end_meta == parquet_sig:
+            return True
+        return False
+
+
+class FileReader:
+
+    def _get_csv_dialect(self, buffer) -> csv.Dialect:
+        sample = buffer.readline()  # trying to get dialect from header
+        buffer.seek(0)
+        try:
+            if isinstance(sample, bytes):
+                sample = sample.decode()
+            accepted_csv_delimiters = [",", "\t", ";"]
+            try:
+                dialect = csv.Sniffer().sniff(
+                    sample, delimiters=accepted_csv_delimiters
+                )
+                dialect.doublequote = (
+                    True  # assume that all csvs have " as string escape
+                )
+            except Exception:
+                dialect = csv.reader(sample).dialect
+                if dialect.delimiter not in accepted_csv_delimiters:
+                    raise Exception(
+                        f"CSV delimeter '{dialect.delimiter}' is not supported"
+                    )

+        except csv.Error:
+            dialect = None
+        return dialect
+
+    def read(self, format, file_obj: BytesIO, **kwargs) -> pd.DataFrame:
+        func = {
+            'parquet': self.read_parquet,
+            'csv': self.read_csv,
+            'xlsx': self.read_excel,
+            'pdf': self.read_pdf,
+            'json': self.read_json,
+            'txt': self.read_txt,
+        }
+
+        if format not in func:
+            raise FileDetectError(f'Unsupported format: {format}')
+        func = func[format]
+
+        return func(file_obj, **kwargs)
+
+    def read_csv(self, file_obj: BytesIO, **kwargs):
+        file_obj = decode(file_obj)
+        dialect = self._get_csv_dialect(file_obj)
+
+        return pd.read_csv(file_obj, sep=dialect.delimiter, index_col=False)
+
+    def read_txt(self, file_obj: BytesIO, **kwargs):
+        file_obj = decode(file_obj)
+
+        try:
+            from langchain_core.documents import Document
+        except ImportError:
+            raise ImportError(
+                "To import TXT document please install 'langchain-community':\n"
+                "    pip install langchain-community"
+            )
+        text = file_obj.read()
+
+        file_name = None
+        if hasattr(file_obj, "name"):
+            file_name = file_obj.name
+        metadata = {"source": file_name}
+        documents = [Document(page_content=text, metadata=metadata)]
+
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
+        )
+
+        docs = text_splitter.split_documents(documents)
+        return pd.DataFrame(
+            [
+                {"content": doc.page_content, "metadata": doc.metadata}
+                for doc in docs
+            ]
+        )
+
+    def read_pdf(self, file_obj: BytesIO, **kwargs):
+        import fitz  # pymupdf
+
+        with fitz.open(stream=file_obj) as pdf:  # open pdf
+            text = chr(12).join([page.get_text() for page in pdf])
+
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
+        )
+
+        split_text = text_splitter.split_text(text)
+
+        return pd.DataFrame(
+            {"content": split_text, "metadata": [{}] * len(split_text)}
+        )
+
+    def read_json(self, file_obj: BytesIO, **kwargs):
+        file_obj = decode(file_obj)
+        file_obj.seek(0)
+        json_doc = json.loads(file_obj.read())
+        return pd.json_normalize(json_doc, max_level=0)
+
+    def read_parquet(self, file_obj: BytesIO, **kwargs):
+        return pd.read_parquet(file_obj)
+
+    def read_excel(self, file_obj: BytesIO, sheet_name=None, **kwargs) -> pd.DataFrame:
+
+        file_obj.seek(0)
+        with pd.ExcelFile(file_obj) as xls:
+            if sheet_name is None:
+                # No sheet specified: Return list of sheets
+                sheet_list = xls.sheet_names
+                return pd.DataFrame(sheet_list, columns=["Sheet_Name"])
+            else:
+                # Specific sheet requested: Load that sheet
+                return pd.read_excel(xls, sheet_name=sheet_name)
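Taken together, the new FormatDetector and FileReader classes centralize format sniffing and parsing in one utility module. A rough usage sketch (the file name is hypothetical; content sniffing only kicks in when the extension is missing or unknown):

    from io import BytesIO
    from mindsdb.integrations.utilities.files.file_reader import FormatDetector, FileReader

    # Hypothetical local file; any bytes-like source works the same way.
    with open('example.csv', 'rb') as f:
        file_obj = BytesIO(f.read())

    fmt = FormatDetector().get('example.csv', file_obj)  # 'csv'
    df = FileReader().read(fmt, file_obj)                # pandas DataFrame; pdf/txt come back as chunked content/metadata rows
    print(fmt, df.shape)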
@@ -131,7 +131,8 @@ class MSGraphAPIBaseClient:
         response = self._make_request(api_url, params)

         # If the response content is a binary file or a TSV file, return the raw content.
-        if response.headers["Content-Type"] in ("application/octet-stream", "text/tab-separated-values"):
+        if response.headers["Content-Type"] in ("application/octet-stream", "text/plain",
+                                                "text/tab-separated-values", "application/pdf"):
             return response.content
         # Otherwise, return the JSON content.
         else:
@@ -43,9 +43,14 @@ class MSGraphAPIDelegatedPermissionsManager:
         # Set the redirect URI based on the request origin.
         # If the request origin is 127.0.0.1 (localhost), replace it with localhost.
         # This is done because the only HTTP origin allowed in Microsoft Entra ID app registration is localhost.
-        request_origin = request.headers.get('ORIGIN') or (request.scheme + '://' + request.host)
-        if not request_origin:
-            raise AuthException('Request origin could not be determined!')
+        try:
+            request_origin = request.headers.get('ORIGIN') or (request.scheme + '://' + request.host)
+            if not request_origin:
+                raise AuthException('Request origin could not be determined!')
+        except RuntimeError:
+            # if it is outside of request context (streaming in agent)
+            request_origin = ''
+
         request_origin = request_origin.replace('127.0.0.1', 'localhost') if 'http://127.0.0.1' in request_origin else request_origin
         self.redirect_uri = request_origin + '/verify-auth'

@@ -23,7 +23,7 @@ logger = log.getLogger(__name__)
 Summary = namedtuple('Summary', ['source_id', 'content'])


-def create_map_reduce_documents_chain(summarization_config: SummarizationConfig, input: str) -> MapReduceDocumentsChain:
+def create_map_reduce_documents_chain(summarization_config: SummarizationConfig, input: str) -> ReduceDocumentsChain:
     '''Creats a chain that map reduces documents into a single consolidated summary

     Args:
@@ -43,7 +43,7 @@ def create_map_reduce_documents_chain(summarization_config: SummarizationConfig,
     if 'input' in map_prompt.input_variables:
         map_prompt = map_prompt.partial(input=input)
     # Handles summarization of individual chunks.
-    map_chain = LLMChain(llm=summarization_llm, prompt=map_prompt)
+    # map_chain = LLMChain(llm=summarization_llm, prompt=map_prompt)

     reduce_prompt_template = summarization_config.reduce_prompt_template
     reduce_prompt = PromptTemplate.from_template(reduce_prompt_template)
@@ -60,18 +60,12 @@ def create_map_reduce_documents_chain(summarization_config: SummarizationConfig,
     )

     # Combines & iteratively reduces mapped documents.
-    reduce_documents_chain = ReduceDocumentsChain(
+    return ReduceDocumentsChain(
         combine_documents_chain=combine_documents_chain,
         collapse_documents_chain=combine_documents_chain,
         # Max number of tokens to group documents into.
         token_max=summarization_config.max_summarization_tokens
     )
-    return MapReduceDocumentsChain(
-        llm_chain=map_chain,
-        reduce_documents_chain=reduce_documents_chain,
-        document_variable_name='docs',
-        return_intermediate_steps=False
-    )


 class MapReduceSummarizerChain(Chain):
@@ -135,6 +129,8 @@ class MapReduceSummarizerChain(Chain):
         document_chunks = []
         for _, row in all_source_chunks.iterrows():
             metadata = row.get(self.metadata_column_name, {})
+            if row.get('chunk_id', None) is not None:
+                metadata['chunk_index'] = row.get('chunk_id', 0)
             document_chunks.append(Document(page_content=row[self.content_column_name], metadata=metadata))
         # Sort by chunk index if present in metadata so the full document is in its original order.
         document_chunks.sort(key=lambda doc: doc.metadata.get('chunk_index', 0) if doc.metadata else 0)
@@ -1,9 +1,9 @@
-from typing import Any, List, Optional, Dict
+from typing import Any, List, Union, Optional, Dict

 from langchain_community.vectorstores import PGVector
 from langchain_community.vectorstores.pgvector import Base

-from pgvector.sqlalchemy import Vector
+from pgvector.sqlalchemy import SPARSEVEC, Vector
 import sqlalchemy as sa
 from sqlalchemy.dialects.postgresql import JSON

@@ -15,9 +15,17 @@ _generated_sa_tables = {}

 class PGVectorMDB(PGVector):
     """
-     langchain_community.vectorstores.PGVector adapted for mindsdb vector store table structure
+    langchain_community.vectorstores.PGVector adapted for mindsdb vector store table structure
     """

+    def __init__(self, *args, is_sparse: bool = False, vector_size: Optional[int] = None, **kwargs):
+        # todo get is_sparse and vector_size from kb vector table
+        self.is_sparse = is_sparse
+        if is_sparse and vector_size is None:
+            raise ValueError("vector_size is required when is_sparse=True")
+        self.vector_size = vector_size
+        super().__init__(*args, **kwargs)
+
     def __post_init__(
         self,
     ) -> None:
@@ -32,53 +40,94 @@ class PGVectorMDB(PGVector):
                 __tablename__ = collection_name

                 id = sa.Column(sa.Integer, primary_key=True)
-                embedding: Vector = sa.Column('embeddings', Vector())
-                document = sa.Column('content', sa.String, nullable=True)
-                cmetadata = sa.Column('metadata', JSON, nullable=True)
+                embedding = sa.Column(
+                    "embeddings",
+                    SPARSEVEC() if self.is_sparse else Vector() if self.vector_size is None else
+                    SPARSEVEC(self.vector_size) if self.is_sparse else Vector(self.vector_size)
+                )
+                document = sa.Column("content", sa.String, nullable=True)
+                cmetadata = sa.Column("metadata", JSON, nullable=True)

             _generated_sa_tables[collection_name] = EmbeddingStore

         self.EmbeddingStore = _generated_sa_tables[collection_name]

     def __query_collection(
-            self,
-            embedding: List[float],
-            k: int = 4,
-            filter: Optional[Dict[str, str]] = None,
+        self,
+        embedding: Union[List[float], Dict[int, float], str],
+        k: int = 4,
+        filter: Optional[Dict[str, str]] = None,
     ) -> List[Any]:
         """Query the collection."""
         with Session(self._bind) as session:
-
-            results: List[Any] = (
-                session.query(
-                    self.EmbeddingStore,
-                    self.distance_strategy(embedding).label("distance"),
-                )
-                .order_by(sa.asc("distance"))
-                .limit(k)
-                .all()
+            if self.is_sparse:
+                # Sparse vectors: expect string in format "{key:value,...}/size" or dictionary
+                if isinstance(embedding, dict):
+                    from pgvector.utils import SparseVector
+                    embedding = SparseVector(embedding, self.vector_size)
+                    embedding_str = embedding.to_text()
+                elif isinstance(embedding, str):
+                    # Use string as is - it should already be in the correct format
+                    embedding_str = embedding
+                # Use inner product for sparse vectors
+                distance_op = "<#>"
+                # For inner product, larger values are better matches
+                order_direction = "ASC"
+            else:
+                # Dense vectors: expect string in JSON array format or list of floats
+                if isinstance(embedding, list):
+                    embedding_str = f"[{','.join(str(x) for x in embedding)}]"
+                elif isinstance(embedding, str):
+                    embedding_str = embedding
+                # Use cosine similarity for dense vectors
+                distance_op = "<=>"
+                # For cosine similarity, smaller values are better matches
+                order_direction = "ASC"
+
+            # Use SQL directly for vector comparison
+            query = sa.text(
+                f"""
+                SELECT t.*, t.embeddings {distance_op} '{embedding_str}' as distance
+                FROM {self.collection_name} t
+                ORDER BY distance {order_direction}
+                LIMIT {k}
+                """
             )
-            for rec, _ in results:
-                if not bool(rec.cmetadata):
-                    rec.cmetadata = {0: 0}
+            results = session.execute(query).all()
+
+            # Convert results to the expected format
+            formatted_results = []
+            for rec in results:
+                metadata = rec.metadata if bool(rec.metadata) else {0: 0}
+                embedding_store = self.EmbeddingStore()
+                embedding_store.document = rec.content
+                embedding_store.cmetadata = metadata
+                result = type(
+                    'Result', (), {
+                        'EmbeddingStore': embedding_store,
+                        'distance': rec.distance
+                    }
+                )
+                formatted_results.append(result)

-            return results
+            return formatted_results

     # aliases for different langchain versions
     def _PGVector__query_collection(self, *args, **kwargs):
+
         return self.__query_collection(*args, **kwargs)

     def _query_collection(self, *args, **kwargs):
         return self.__query_collection(*args, **kwargs)

     def create_collection(self):
-        raise RuntimeError('Forbidden')
+        raise RuntimeError("Forbidden")

     def delete_collection(self):
-        raise RuntimeError('Forbidden')
+        raise RuntimeError("Forbidden")

     def delete(self, *args, **kwargs):
-        raise RuntimeError('Forbidden')
+        raise RuntimeError("Forbidden")

     def add_embeddings(self, *args, **kwargs):
-        raise RuntimeError('Forbidden')
+        raise RuntimeError("Forbidden")
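The rewritten __query_collection bypasses the SQLAlchemy ORM query and interpolates a vector literal into raw SQL, using the pgvector inner-product operator <#> for sparse vectors and the cosine operator <=> for dense ones. A small sketch of the dense-path string building, with a made-up table name and embedding:

    # Dense path: a list of floats becomes a pgvector literal, compared with the cosine operator.
    embedding = [0.1, 0.2, 0.3]
    embedding_str = f"[{','.join(str(x) for x in embedding)}]"

    sql = f"""
    SELECT t.*, t.embeddings <=> '{embedding_str}' as distance
    FROM my_kb_chunks t
    ORDER BY distance ASC
    LIMIT 4
    """
    # The sparse path instead uses <#> and a literal like '{1:0.5,42:0.25}/30522'
    # built by pgvector.utils.SparseVector, per the comments in the diff above.
    print(sql)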
@@ -7,6 +7,7 @@ from pydantic import BaseModel

 from mindsdb.integrations.utilities.rag.settings import VectorStoreType, VectorStoreConfig
 from mindsdb.integrations.utilities.rag.loaders.vector_store_loader.MDBVectorStore import MDBVectorStore
+from mindsdb.integrations.utilities.rag.loaders.vector_store_loader.pgvector import PGVectorMDB
 from mindsdb.utilities import log


@@ -28,6 +29,20 @@ class VectorStoreLoader(BaseModel):
         Loads the vector store based on the provided config and embeddings model
         :return:
         """
+        if self.config.is_sparse is not None and self.config.vector_size is not None and self.config.kb_table is not None:
+            # Only use PGVector store for sparse vectors.
+            db_handler = self.config.kb_table.get_vector_db()
+            db_args = db_handler.connection_args
+            # Assume we are always using PGVector & psycopg2.
+            connection_str = f"postgresql+psycopg2://{db_args.get('user')}:{db_args.get('password')}@{db_args.get('host')}:{db_args.get('port')}/{db_args.get('dbname', db_args.get('database'))}"
+
+            return PGVectorMDB(
+                connection_string=connection_str,
+                collection_name=self.config.kb_table._kb.vector_database_table,
+                embedding_function=self.embedding_model,
+                is_sparse=self.config.is_sparse,
+                vector_size=self.config.vector_size
+            )
         return MDBVectorStore(kb_table=self.config.kb_table)


@@ -56,5 +71,7 @@ class VectorStoreFactory:
         return PGVectorMDB(
             connection_string=settings.connection_string,
             collection_name=settings.collection_name,
-            embedding_function=embedding_model
+            embedding_function=embedding_model,
+            is_sparse=settings.is_sparse,
+            vector_size=settings.vector_size
         )