MindsDB 25.2.1.2__py3-none-any.whl → 25.2.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic; see the advisory for more details.

Files changed (33)
  1. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/METADATA +234 -230
  2. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/RECORD +33 -33
  3. mindsdb/__about__.py +1 -1
  4. mindsdb/api/executor/command_executor.py +1 -57
  5. mindsdb/api/executor/datahub/datanodes/system_tables.py +34 -33
  6. mindsdb/api/executor/planner/query_planner.py +7 -2
  7. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +19 -11
  8. mindsdb/api/executor/sql_query/steps/subselect_step.py +44 -2
  9. mindsdb/integrations/handlers/byom_handler/byom_handler.py +1 -1
  10. mindsdb/integrations/handlers/byom_handler/requirements.txt +1 -1
  11. mindsdb/integrations/handlers/file_handler/file_handler.py +13 -320
  12. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +60 -156
  13. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +1 -1
  14. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +1 -1
  15. mindsdb/integrations/handlers/lancedb_handler/requirements.txt +1 -1
  16. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +3 -3
  17. mindsdb/integrations/handlers/ms_one_drive_handler/ms_graph_api_one_drive_client.py +3 -3
  18. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +2 -20
  19. mindsdb/integrations/handlers/salesforce_handler/connection_args.py +9 -1
  20. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +2 -1
  21. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +1 -1
  22. mindsdb/integrations/handlers/writer_handler/requirements.txt +1 -1
  23. mindsdb/integrations/utilities/files/file_reader.py +120 -61
  24. mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +1 -8
  25. mindsdb/integrations/utilities/query_traversal.py +42 -37
  26. mindsdb/interfaces/agents/langfuse_callback_handler.py +205 -27
  27. mindsdb/interfaces/file/file_controller.py +1 -1
  28. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +12 -2
  29. mindsdb/utilities/config.py +2 -2
  30. mindsdb/utilities/render/sqlalchemy_render.py +52 -19
  31. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/LICENSE +0 -0
  32. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/WHEEL +0 -0
  33. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
1
- lightwood~=24.12.3.0
2
- lightwood[extra]~=24.12.3.0
3
- lightwood[xai]~=24.12.3.0
1
+ lightwood>=25.2.2.0
2
+ lightwood[extra]>=25.2.2.0
3
+ lightwood[xai]>=25.2.2.0
4
4
  type_infer==0.0.20
@@ -76,15 +76,15 @@ class MSGraphAPIOneDriveClient(MSGraphAPIBaseClient):
76
76
  child_items = []
77
77
  for items in self.fetch_paginated_data(f"me/drive/items/{item_id}/children"):
78
78
  for item in items:
79
- path = f"{path}/{item['name']}"
79
+ child_path = f"{path}/{item['name']}"
80
80
  # If the item is a folder, get its child items.
81
81
  if "folder" in item:
82
82
  # Recursively get the child items of the folder.
83
- child_items.extend(self.get_child_items(item["id"], path))
83
+ child_items.extend(self.get_child_items(item["id"], child_path))
84
84
 
85
85
  else:
86
86
  # Add the path to the item.
87
- item["path"] = path
87
+ item["path"] = child_path
88
88
  child_items.append(item)
89
89
 
90
90
  return child_items
@@ -84,25 +84,7 @@ class FileTable(APIResource):
84
84
  client = self.handler.connect()
85
85
 
86
86
  file_content = client.get_item_content(table_name)
87
- file_extension = table_name.split(".")[-1]
88
87
 
89
- # Read the file content based and return a DataFrame based on the file extension.
90
- if file_extension == "csv":
91
- df = pd.read_csv(BytesIO(file_content))
88
+ reader = FileReader(file=BytesIO(file_content), name=table_name)
92
89
 
93
- elif file_extension == "tsv":
94
- df = pd.read_csv(BytesIO(file_content), sep="\t")
95
-
96
- elif file_extension == "json":
97
- df = pd.DataFrame(file_content)
98
-
99
- elif file_extension == "parquet":
100
- df = pd.read_parquet(BytesIO(file_content))
101
-
102
- elif file_extension == "pdf":
103
- df = FileReader().read_pdf(BytesIO(file_content))
104
-
105
- elif file_extension == "txt":
106
- df = FileReader().read_txt(BytesIO(file_content))
107
-
108
- return df
90
+ return reader.to_df()
@@ -28,6 +28,13 @@ connection_args = OrderedDict(
28
28
  'description': 'The client secret (consumer secret) from a connected app in Salesforce.',
29
29
  'required': True,
30
30
  'label': 'Client Secret (Consumer Secret)'
31
+ },
32
+ is_sandbox={
33
+ 'type': ARG_TYPE.BOOL,
34
+ 'description': 'Set this to True if you need to connect to a sandbox, False for production environments. '
35
+ 'If not provided defaults to False.',
36
+ 'required': False,
37
+ 'label': 'Is Sandbox'
31
38
  }
32
39
  )
33
40
 
@@ -35,5 +42,6 @@ connection_args_example = OrderedDict(
35
42
  username='demo@example.com',
36
43
  password='demo_password',
37
44
  client_id='3MVG9lKcPoNINVBIPJjdw1J9LLM82HnZz9Yh7ZJnY',
38
- client_secret='5A52C1A1E21DF9012IODC9ISNXXAADDA9'
45
+ client_secret='5A52C1A1E21DF9012IODC9ISNXXAADDA9',
46
+ is_sandbox=True
39
47
  )
@@ -88,7 +88,8 @@ class SalesforceHandler(APIHandler):
88
88
  username=self.connection_data['username'],
89
89
  password=self.connection_data['password'],
90
90
  client_id=self.connection_data['client_id'],
91
- client_secret=self.connection_data['client_secret']
91
+ client_secret=self.connection_data['client_secret'],
92
+ is_sandbox=self.connection_data.get('is_sandbox', False)
92
93
  )
93
94
  self.is_connected = True
94
95
  return self.connection
@@ -17,7 +17,7 @@ from mindsdb.integrations.libs.response import (
17
17
 
18
18
  try:
19
19
  import pyarrow as pa
20
- memory_pool = pa.memory_pool()
20
+ memory_pool = pa.default_memory_pool()
21
21
  except Exception:
22
22
  memory_pool = None
23
23
 
@@ -1,4 +1,4 @@
1
1
  -r mindsdb/integrations/handlers/rag_handler/requirements.txt
2
- nltk>=3.8.1
2
+ nltk>=3.9
3
3
  rouge-score>=0.1.2
4
4
  scipy
@@ -1,7 +1,7 @@
1
1
  import traceback
2
2
  import json
3
3
  import csv
4
- from io import BytesIO, StringIO
4
+ from io import BytesIO, StringIO, IOBase
5
5
  from pathlib import Path
6
6
  import codecs
7
7
 
@@ -9,6 +9,7 @@ import filetype
9
9
  import pandas as pd
10
10
  from charset_normalizer import from_bytes
11
11
  from langchain_text_splitters import RecursiveCharacterTextSplitter
12
+ import fitz # pymupdf
12
13
 
13
14
  from mindsdb.utilities import log
14
15
 
@@ -22,7 +23,8 @@ class FileDetectError(Exception):
22
23
  ...
23
24
 
24
25
 
25
- def decode(file_obj: BytesIO) -> StringIO:
26
+ def decode(file_obj: IOBase) -> StringIO:
27
+ file_obj.seek(0)
26
28
  byte_str = file_obj.read()
27
29
  # Move it to StringIO
28
30
  try:
@@ -62,39 +64,87 @@ def decode(file_obj: BytesIO) -> StringIO:
62
64
 
63
65
  class FormatDetector:
64
66
 
65
- def get(self, name, file_obj: BytesIO = None):
66
- format = self.get_format_by_name(name)
67
- if format is None and file_obj is not None:
68
- format = self.get_format_by_content(file_obj)
67
+ supported_formats = ['parquet', 'csv', 'xlsx', 'pdf', 'json', 'txt']
68
+
69
+ def __init__(
70
+ self,
71
+ path: str = None,
72
+ name: str = None,
73
+ file: IOBase = None
74
+ ):
75
+ """
76
+ File format detector
77
+ One of these arguments has to be passed: `path` or `file`
78
+
79
+ :param path: path to the file
80
+ :param name: name of the file
81
+ :param file: file descriptor (via open(...), or BytesIO(...))
82
+ """
83
+ if path is not None:
84
+ file = open(path, 'rb')
85
+
86
+ elif file is not None:
87
+ if name is None:
88
+ if hasattr(file, 'name'):
89
+ path = file.name
90
+ else:
91
+ path = 'file'
92
+ else:
93
+ raise FileDetectError('Wrong arguments: path or file is required')
94
+
95
+ if name is None:
96
+ name = Path(path).name
97
+
98
+ self.name = name
99
+ self.file_obj = file
100
+ self.format = None
69
101
 
102
+ self.parameters = {}
103
+
104
+ def get_format(self) -> str:
105
+ if self.format is not None:
106
+ return self.format
107
+
108
+ format = self.get_format_by_name()
70
109
  if format is not None:
71
- return format
72
- raise FileDetectError(f'Unable to detect format: {name}')
110
+ if format not in self.supported_formats:
111
+ raise FileDetectError(f'Not supported format: {format}')
112
+
113
+ if format is None and self.file_obj is not None:
114
+ format = self.get_format_by_content()
115
+ self.file_obj.seek(0)
73
116
 
74
- def get_format_by_name(self, filename):
75
- extension = Path(filename).suffix.strip(".").lower()
117
+ if format is None:
118
+ raise FileDetectError(f'Unable to detect format: {self.name}')
119
+
120
+ self.format = format
121
+ return format
122
+
123
+ def get_format_by_name(self):
124
+ extension = Path(self.name).suffix.strip(".").lower()
76
125
  if extension == "tsv":
77
126
  extension = "csv"
127
+ self.parameters['delimiter'] = '\t'
128
+
78
129
  return extension or None
79
130
 
80
- def get_format_by_content(self, file_obj):
81
- if self.is_parquet(file_obj):
131
+ def get_format_by_content(self):
132
+ if self.is_parquet(self.file_obj):
82
133
  return "parquet"
83
134
 
84
- file_type = filetype.guess(file_obj)
85
- if file_type is None:
86
- return
135
+ file_type = filetype.guess(self.file_obj)
136
+ if file_type is not None:
87
137
 
88
- if file_type.mime in {
89
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
90
- "application/vnd.ms-excel",
91
- }:
92
- return 'xlsx'
138
+ if file_type.mime in {
139
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
140
+ "application/vnd.ms-excel",
141
+ }:
142
+ return 'xlsx'
93
143
 
94
- if file_type.mime == 'application/pdf':
95
- return "pdf"
144
+ if file_type.mime == 'application/pdf':
145
+ return "pdf"
96
146
 
97
- file_obj = decode(file_obj)
147
+ file_obj = decode(self.file_obj)
98
148
 
99
149
  if self.is_json(file_obj):
100
150
  return "json"
@@ -102,8 +152,10 @@ class FormatDetector:
102
152
  if self.is_csv(file_obj):
103
153
  return "csv"
104
154
 
105
- def is_json(self, data_obj: StringIO) -> bool:
155
+ @staticmethod
156
+ def is_json(data_obj: StringIO) -> bool:
106
157
  # see if its JSON
158
+ data_obj.seek(0)
107
159
  text = data_obj.read(100).strip()
108
160
  data_obj.seek(0)
109
161
  if len(text) > 0:
@@ -114,20 +166,25 @@ class FormatDetector:
114
166
  return True
115
167
  except Exception:
116
168
  return False
117
- finally:
118
- data_obj.seek(0)
119
169
  return False
120
170
 
121
- def is_csv(self, data_obj: StringIO) -> bool:
122
- sample = data_obj.readline() # trying to get dialect from header
171
+ @classmethod
172
+ def is_csv(cls, data_obj: StringIO) -> bool:
123
173
  data_obj.seek(0)
174
+ sample = data_obj.readline() # trying to get dialect from header
124
175
  try:
176
+ data_obj.seek(0)
125
177
  csv.Sniffer().sniff(sample)
126
178
 
179
+ # Avoid a false-positive for json files
180
+ if cls.is_json(data_obj):
181
+ return False
182
+ return True
127
183
  except Exception:
128
184
  return False
129
185
 
130
- def is_parquet(self, data: BytesIO) -> bool:
186
+ @staticmethod
187
+ def is_parquet(data: IOBase) -> bool:
131
188
  # Check first and last 4 bytes equal to PAR1.
132
189
  # Refer: https://parquet.apache.org/docs/file-format/
133
190
  parquet_sig = b"PAR1"
@@ -141,15 +198,31 @@ class FormatDetector:
141
198
  return False
142
199
 
143
200
 
144
- class FileReader:
201
+ class FileReader(FormatDetector):
202
+
203
+ def to_df(self, **kwargs) -> pd.DataFrame:
204
+ format = self.get_format()
205
+
206
+ func = getattr(self, f'read_{format}', None)
207
+ if func is None:
208
+ raise FileDetectError(f'Unsupported format: {format}')
209
+
210
+ self.file_obj.seek(0)
211
+ kwargs.update(self.parameters)
212
+ return func(self.file_obj, name=self.name, **kwargs)
145
213
 
146
- def _get_csv_dialect(self, buffer) -> csv.Dialect:
214
+ @staticmethod
215
+ def _get_csv_dialect(buffer, delimiter=None) -> csv.Dialect:
147
216
  sample = buffer.readline() # trying to get dialect from header
148
217
  buffer.seek(0)
149
218
  try:
150
219
  if isinstance(sample, bytes):
151
220
  sample = sample.decode()
152
- accepted_csv_delimiters = [",", "\t", ";"]
221
+
222
+ if delimiter is not None:
223
+ accepted_csv_delimiters = [delimiter]
224
+ else:
225
+ accepted_csv_delimiters = [",", "\t", ";"]
153
226
  try:
154
227
  dialect = csv.Sniffer().sniff(
155
228
  sample, delimiters=accepted_csv_delimiters
@@ -168,29 +241,15 @@ class FileReader:
168
241
  dialect = None
169
242
  return dialect
170
243
 
171
- def read(self, format, file_obj: BytesIO, **kwargs) -> pd.DataFrame:
172
- func = {
173
- 'parquet': self.read_parquet,
174
- 'csv': self.read_csv,
175
- 'xlsx': self.read_excel,
176
- 'pdf': self.read_pdf,
177
- 'json': self.read_json,
178
- 'txt': self.read_txt,
179
- }
180
-
181
- if format not in func:
182
- raise FileDetectError(f'Unsupported format: {format}')
183
- func = func[format]
184
-
185
- return func(file_obj, **kwargs)
186
-
187
- def read_csv(self, file_obj: BytesIO, **kwargs):
244
+ @classmethod
245
+ def read_csv(cls, file_obj: BytesIO, delimiter=None, **kwargs):
188
246
  file_obj = decode(file_obj)
189
- dialect = self._get_csv_dialect(file_obj)
247
+ dialect = cls._get_csv_dialect(file_obj, delimiter=delimiter)
190
248
 
191
249
  return pd.read_csv(file_obj, sep=dialect.delimiter, index_col=False)
192
250
 
193
- def read_txt(self, file_obj: BytesIO, **kwargs):
251
+ @staticmethod
252
+ def read_txt(file_obj: BytesIO, name=None, **kwargs):
194
253
  file_obj = decode(file_obj)
195
254
 
196
255
  try:
@@ -202,10 +261,7 @@ class FileReader:
202
261
  )
203
262
  text = file_obj.read()
204
263
 
205
- file_name = None
206
- if hasattr(file_obj, "name"):
207
- file_name = file_obj.name
208
- metadata = {"source": file_name}
264
+ metadata = {"source": name}
209
265
  documents = [Document(page_content=text, metadata=metadata)]
210
266
 
211
267
  text_splitter = RecursiveCharacterTextSplitter(
@@ -220,10 +276,10 @@ class FileReader:
220
276
  ]
221
277
  )
222
278
 
223
- def read_pdf(self, file_obj: BytesIO, **kwargs):
224
- import fitz # pymupdf
279
+ @staticmethod
280
+ def read_pdf(file_obj: BytesIO, **kwargs):
225
281
 
226
- with fitz.open(stream=file_obj) as pdf: # open pdf
282
+ with fitz.open(stream=file_obj.read()) as pdf: # open pdf
227
283
  text = chr(12).join([page.get_text() for page in pdf])
228
284
 
229
285
  text_splitter = RecursiveCharacterTextSplitter(
@@ -236,16 +292,19 @@ class FileReader:
236
292
  {"content": split_text, "metadata": [{}] * len(split_text)}
237
293
  )
238
294
 
239
- def read_json(self, file_obj: BytesIO, **kwargs):
295
+ @staticmethod
296
+ def read_json(file_obj: BytesIO, **kwargs):
240
297
  file_obj = decode(file_obj)
241
298
  file_obj.seek(0)
242
299
  json_doc = json.loads(file_obj.read())
243
300
  return pd.json_normalize(json_doc, max_level=0)
244
301
 
245
- def read_parquet(self, file_obj: BytesIO, **kwargs):
302
+ @staticmethod
303
+ def read_parquet(file_obj: BytesIO, **kwargs):
246
304
  return pd.read_parquet(file_obj)
247
305
 
248
- def read_excel(self, file_obj: BytesIO, sheet_name=None, **kwargs) -> pd.DataFrame:
306
+ @staticmethod
307
+ def read_xlsx(file_obj: BytesIO, sheet_name=None, **kwargs) -> pd.DataFrame:
249
308
 
250
309
  file_obj.seek(0)
251
310
  with pd.ExcelFile(file_obj) as xls:
@@ -129,11 +129,4 @@ class MSGraphAPIBaseClient:
129
129
  api_url = self._get_api_url(endpoint)
130
130
 
131
131
  response = self._make_request(api_url, params)
132
-
133
- # If the response content is a binary file or a TSV file, return the raw content.
134
- if response.headers["Content-Type"] in ("application/octet-stream", "text/plain",
135
- "text/tab-separated-values", "application/pdf"):
136
- return response.content
137
- # Otherwise, return the JSON content.
138
- else:
139
- return response.json()
132
+ return response.content