MindsDB 25.2.1.2__py3-none-any.whl → 25.2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/METADATA +234 -230
- {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/RECORD +33 -33
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/command_executor.py +1 -57
- mindsdb/api/executor/datahub/datanodes/system_tables.py +34 -33
- mindsdb/api/executor/planner/query_planner.py +7 -2
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +19 -11
- mindsdb/api/executor/sql_query/steps/subselect_step.py +44 -2
- mindsdb/integrations/handlers/byom_handler/byom_handler.py +1 -1
- mindsdb/integrations/handlers/byom_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/file_handler/file_handler.py +13 -320
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +60 -156
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +1 -1
- mindsdb/integrations/handlers/lancedb_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/lightwood_handler/requirements.txt +3 -3
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_graph_api_one_drive_client.py +3 -3
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +2 -20
- mindsdb/integrations/handlers/salesforce_handler/connection_args.py +9 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +2 -1
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +1 -1
- mindsdb/integrations/handlers/writer_handler/requirements.txt +1 -1
- mindsdb/integrations/utilities/files/file_reader.py +120 -61
- mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +1 -8
- mindsdb/integrations/utilities/query_traversal.py +42 -37
- mindsdb/interfaces/agents/langfuse_callback_handler.py +205 -27
- mindsdb/interfaces/file/file_controller.py +1 -1
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +12 -2
- mindsdb/utilities/config.py +2 -2
- mindsdb/utilities/render/sqlalchemy_render.py +52 -19
- {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/LICENSE +0 -0
- {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/WHEEL +0 -0
- {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
lightwood
|
|
2
|
-
lightwood[extra]
|
|
3
|
-
lightwood[xai]
|
|
1
|
+
lightwood>=25.2.2.0
|
|
2
|
+
lightwood[extra]>=25.2.2.0
|
|
3
|
+
lightwood[xai]>=25.2.2.0
|
|
4
4
|
type_infer==0.0.20
|
|
@@ -76,15 +76,15 @@ class MSGraphAPIOneDriveClient(MSGraphAPIBaseClient):
|
|
|
76
76
|
child_items = []
|
|
77
77
|
for items in self.fetch_paginated_data(f"me/drive/items/{item_id}/children"):
|
|
78
78
|
for item in items:
|
|
79
|
-
|
|
79
|
+
child_path = f"{path}/{item['name']}"
|
|
80
80
|
# If the item is a folder, get its child items.
|
|
81
81
|
if "folder" in item:
|
|
82
82
|
# Recursively get the child items of the folder.
|
|
83
|
-
child_items.extend(self.get_child_items(item["id"],
|
|
83
|
+
child_items.extend(self.get_child_items(item["id"], child_path))
|
|
84
84
|
|
|
85
85
|
else:
|
|
86
86
|
# Add the path to the item.
|
|
87
|
-
item["path"] =
|
|
87
|
+
item["path"] = child_path
|
|
88
88
|
child_items.append(item)
|
|
89
89
|
|
|
90
90
|
return child_items
|
|
@@ -84,25 +84,7 @@ class FileTable(APIResource):
|
|
|
84
84
|
client = self.handler.connect()
|
|
85
85
|
|
|
86
86
|
file_content = client.get_item_content(table_name)
|
|
87
|
-
file_extension = table_name.split(".")[-1]
|
|
88
87
|
|
|
89
|
-
|
|
90
|
-
if file_extension == "csv":
|
|
91
|
-
df = pd.read_csv(BytesIO(file_content))
|
|
88
|
+
reader = FileReader(file=BytesIO(file_content), name=table_name)
|
|
92
89
|
|
|
93
|
-
|
|
94
|
-
df = pd.read_csv(BytesIO(file_content), sep="\t")
|
|
95
|
-
|
|
96
|
-
elif file_extension == "json":
|
|
97
|
-
df = pd.DataFrame(file_content)
|
|
98
|
-
|
|
99
|
-
elif file_extension == "parquet":
|
|
100
|
-
df = pd.read_parquet(BytesIO(file_content))
|
|
101
|
-
|
|
102
|
-
elif file_extension == "pdf":
|
|
103
|
-
df = FileReader().read_pdf(BytesIO(file_content))
|
|
104
|
-
|
|
105
|
-
elif file_extension == "txt":
|
|
106
|
-
df = FileReader().read_txt(BytesIO(file_content))
|
|
107
|
-
|
|
108
|
-
return df
|
|
90
|
+
return reader.to_df()
|
|
@@ -28,6 +28,13 @@ connection_args = OrderedDict(
|
|
|
28
28
|
'description': 'The client secret (consumer secret) from a connected app in Salesforce.',
|
|
29
29
|
'required': True,
|
|
30
30
|
'label': 'Client Secret (Consumer Secret)'
|
|
31
|
+
},
|
|
32
|
+
is_sandbox={
|
|
33
|
+
'type': ARG_TYPE.BOOL,
|
|
34
|
+
'description': 'Set this to True if you need to connect to a sandbox, False for production environments. '
|
|
35
|
+
'If not provided defaults to False.',
|
|
36
|
+
'required': False,
|
|
37
|
+
'label': 'Is Sandbox'
|
|
31
38
|
}
|
|
32
39
|
)
|
|
33
40
|
|
|
@@ -35,5 +42,6 @@ connection_args_example = OrderedDict(
|
|
|
35
42
|
username='demo@example.com',
|
|
36
43
|
password='demo_password',
|
|
37
44
|
client_id='3MVG9lKcPoNINVBIPJjdw1J9LLM82HnZz9Yh7ZJnY',
|
|
38
|
-
client_secret='5A52C1A1E21DF9012IODC9ISNXXAADDA9'
|
|
45
|
+
client_secret='5A52C1A1E21DF9012IODC9ISNXXAADDA9',
|
|
46
|
+
is_sandbox=True
|
|
39
47
|
)
|
|
@@ -88,7 +88,8 @@ class SalesforceHandler(APIHandler):
|
|
|
88
88
|
username=self.connection_data['username'],
|
|
89
89
|
password=self.connection_data['password'],
|
|
90
90
|
client_id=self.connection_data['client_id'],
|
|
91
|
-
client_secret=self.connection_data['client_secret']
|
|
91
|
+
client_secret=self.connection_data['client_secret'],
|
|
92
|
+
is_sandbox=self.connection_data.get('is_sandbox', False)
|
|
92
93
|
)
|
|
93
94
|
self.is_connected = True
|
|
94
95
|
return self.connection
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import traceback
|
|
2
2
|
import json
|
|
3
3
|
import csv
|
|
4
|
-
from io import BytesIO, StringIO
|
|
4
|
+
from io import BytesIO, StringIO, IOBase
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
import codecs
|
|
7
7
|
|
|
@@ -9,6 +9,7 @@ import filetype
|
|
|
9
9
|
import pandas as pd
|
|
10
10
|
from charset_normalizer import from_bytes
|
|
11
11
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
12
|
+
import fitz # pymupdf
|
|
12
13
|
|
|
13
14
|
from mindsdb.utilities import log
|
|
14
15
|
|
|
@@ -22,7 +23,8 @@ class FileDetectError(Exception):
|
|
|
22
23
|
...
|
|
23
24
|
|
|
24
25
|
|
|
25
|
-
def decode(file_obj:
|
|
26
|
+
def decode(file_obj: IOBase) -> StringIO:
|
|
27
|
+
file_obj.seek(0)
|
|
26
28
|
byte_str = file_obj.read()
|
|
27
29
|
# Move it to StringIO
|
|
28
30
|
try:
|
|
@@ -62,39 +64,87 @@ def decode(file_obj: BytesIO) -> StringIO:
|
|
|
62
64
|
|
|
63
65
|
class FormatDetector:
|
|
64
66
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
67
|
+
supported_formats = ['parquet', 'csv', 'xlsx', 'pdf', 'json', 'txt']
|
|
68
|
+
|
|
69
|
+
def __init__(
|
|
70
|
+
self,
|
|
71
|
+
path: str = None,
|
|
72
|
+
name: str = None,
|
|
73
|
+
file: IOBase = None
|
|
74
|
+
):
|
|
75
|
+
"""
|
|
76
|
+
File format detector
|
|
77
|
+
One of these arguments has to be passed: `path` or `file`
|
|
78
|
+
|
|
79
|
+
:param path: path to the file
|
|
80
|
+
:param name: name of the file
|
|
81
|
+
:param file: file descriptor (via open(...), or BytesIO(...))
|
|
82
|
+
"""
|
|
83
|
+
if path is not None:
|
|
84
|
+
file = open(path, 'rb')
|
|
85
|
+
|
|
86
|
+
elif file is not None:
|
|
87
|
+
if name is None:
|
|
88
|
+
if hasattr(file, 'name'):
|
|
89
|
+
path = file.name
|
|
90
|
+
else:
|
|
91
|
+
path = 'file'
|
|
92
|
+
else:
|
|
93
|
+
raise FileDetectError('Wrong arguments: path or file is required')
|
|
94
|
+
|
|
95
|
+
if name is None:
|
|
96
|
+
name = Path(path).name
|
|
97
|
+
|
|
98
|
+
self.name = name
|
|
99
|
+
self.file_obj = file
|
|
100
|
+
self.format = None
|
|
69
101
|
|
|
102
|
+
self.parameters = {}
|
|
103
|
+
|
|
104
|
+
def get_format(self) -> str:
|
|
105
|
+
if self.format is not None:
|
|
106
|
+
return self.format
|
|
107
|
+
|
|
108
|
+
format = self.get_format_by_name()
|
|
70
109
|
if format is not None:
|
|
71
|
-
|
|
72
|
-
|
|
110
|
+
if format not in self.supported_formats:
|
|
111
|
+
raise FileDetectError(f'Not supported format: {format}')
|
|
112
|
+
|
|
113
|
+
if format is None and self.file_obj is not None:
|
|
114
|
+
format = self.get_format_by_content()
|
|
115
|
+
self.file_obj.seek(0)
|
|
73
116
|
|
|
74
|
-
|
|
75
|
-
|
|
117
|
+
if format is None:
|
|
118
|
+
raise FileDetectError(f'Unable to detect format: {self.name}')
|
|
119
|
+
|
|
120
|
+
self.format = format
|
|
121
|
+
return format
|
|
122
|
+
|
|
123
|
+
def get_format_by_name(self):
|
|
124
|
+
extension = Path(self.name).suffix.strip(".").lower()
|
|
76
125
|
if extension == "tsv":
|
|
77
126
|
extension = "csv"
|
|
127
|
+
self.parameters['delimiter'] = '\t'
|
|
128
|
+
|
|
78
129
|
return extension or None
|
|
79
130
|
|
|
80
|
-
def get_format_by_content(self
|
|
81
|
-
if self.is_parquet(file_obj):
|
|
131
|
+
def get_format_by_content(self):
|
|
132
|
+
if self.is_parquet(self.file_obj):
|
|
82
133
|
return "parquet"
|
|
83
134
|
|
|
84
|
-
file_type = filetype.guess(file_obj)
|
|
85
|
-
if file_type is None:
|
|
86
|
-
return
|
|
135
|
+
file_type = filetype.guess(self.file_obj)
|
|
136
|
+
if file_type is not None:
|
|
87
137
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
138
|
+
if file_type.mime in {
|
|
139
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
140
|
+
"application/vnd.ms-excel",
|
|
141
|
+
}:
|
|
142
|
+
return 'xlsx'
|
|
93
143
|
|
|
94
|
-
|
|
95
|
-
|
|
144
|
+
if file_type.mime == 'application/pdf':
|
|
145
|
+
return "pdf"
|
|
96
146
|
|
|
97
|
-
file_obj = decode(file_obj)
|
|
147
|
+
file_obj = decode(self.file_obj)
|
|
98
148
|
|
|
99
149
|
if self.is_json(file_obj):
|
|
100
150
|
return "json"
|
|
@@ -102,8 +152,10 @@ class FormatDetector:
|
|
|
102
152
|
if self.is_csv(file_obj):
|
|
103
153
|
return "csv"
|
|
104
154
|
|
|
105
|
-
|
|
155
|
+
@staticmethod
|
|
156
|
+
def is_json(data_obj: StringIO) -> bool:
|
|
106
157
|
# see if its JSON
|
|
158
|
+
data_obj.seek(0)
|
|
107
159
|
text = data_obj.read(100).strip()
|
|
108
160
|
data_obj.seek(0)
|
|
109
161
|
if len(text) > 0:
|
|
@@ -114,20 +166,25 @@ class FormatDetector:
|
|
|
114
166
|
return True
|
|
115
167
|
except Exception:
|
|
116
168
|
return False
|
|
117
|
-
finally:
|
|
118
|
-
data_obj.seek(0)
|
|
119
169
|
return False
|
|
120
170
|
|
|
121
|
-
|
|
122
|
-
|
|
171
|
+
@classmethod
|
|
172
|
+
def is_csv(cls, data_obj: StringIO) -> bool:
|
|
123
173
|
data_obj.seek(0)
|
|
174
|
+
sample = data_obj.readline() # trying to get dialect from header
|
|
124
175
|
try:
|
|
176
|
+
data_obj.seek(0)
|
|
125
177
|
csv.Sniffer().sniff(sample)
|
|
126
178
|
|
|
179
|
+
# Avoid a false-positive for json files
|
|
180
|
+
if cls.is_json(data_obj):
|
|
181
|
+
return False
|
|
182
|
+
return True
|
|
127
183
|
except Exception:
|
|
128
184
|
return False
|
|
129
185
|
|
|
130
|
-
|
|
186
|
+
@staticmethod
|
|
187
|
+
def is_parquet(data: IOBase) -> bool:
|
|
131
188
|
# Check first and last 4 bytes equal to PAR1.
|
|
132
189
|
# Refer: https://parquet.apache.org/docs/file-format/
|
|
133
190
|
parquet_sig = b"PAR1"
|
|
@@ -141,15 +198,31 @@ class FormatDetector:
|
|
|
141
198
|
return False
|
|
142
199
|
|
|
143
200
|
|
|
144
|
-
class FileReader:
|
|
201
|
+
class FileReader(FormatDetector):
|
|
202
|
+
|
|
203
|
+
def to_df(self, **kwargs) -> pd.DataFrame:
|
|
204
|
+
format = self.get_format()
|
|
205
|
+
|
|
206
|
+
func = getattr(self, f'read_{format}', None)
|
|
207
|
+
if func is None:
|
|
208
|
+
raise FileDetectError(f'Unsupported format: {format}')
|
|
209
|
+
|
|
210
|
+
self.file_obj.seek(0)
|
|
211
|
+
kwargs.update(self.parameters)
|
|
212
|
+
return func(self.file_obj, name=self.name, **kwargs)
|
|
145
213
|
|
|
146
|
-
|
|
214
|
+
@staticmethod
|
|
215
|
+
def _get_csv_dialect(buffer, delimiter=None) -> csv.Dialect:
|
|
147
216
|
sample = buffer.readline() # trying to get dialect from header
|
|
148
217
|
buffer.seek(0)
|
|
149
218
|
try:
|
|
150
219
|
if isinstance(sample, bytes):
|
|
151
220
|
sample = sample.decode()
|
|
152
|
-
|
|
221
|
+
|
|
222
|
+
if delimiter is not None:
|
|
223
|
+
accepted_csv_delimiters = [delimiter]
|
|
224
|
+
else:
|
|
225
|
+
accepted_csv_delimiters = [",", "\t", ";"]
|
|
153
226
|
try:
|
|
154
227
|
dialect = csv.Sniffer().sniff(
|
|
155
228
|
sample, delimiters=accepted_csv_delimiters
|
|
@@ -168,29 +241,15 @@ class FileReader:
|
|
|
168
241
|
dialect = None
|
|
169
242
|
return dialect
|
|
170
243
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
'parquet': self.read_parquet,
|
|
174
|
-
'csv': self.read_csv,
|
|
175
|
-
'xlsx': self.read_excel,
|
|
176
|
-
'pdf': self.read_pdf,
|
|
177
|
-
'json': self.read_json,
|
|
178
|
-
'txt': self.read_txt,
|
|
179
|
-
}
|
|
180
|
-
|
|
181
|
-
if format not in func:
|
|
182
|
-
raise FileDetectError(f'Unsupported format: {format}')
|
|
183
|
-
func = func[format]
|
|
184
|
-
|
|
185
|
-
return func(file_obj, **kwargs)
|
|
186
|
-
|
|
187
|
-
def read_csv(self, file_obj: BytesIO, **kwargs):
|
|
244
|
+
@classmethod
|
|
245
|
+
def read_csv(cls, file_obj: BytesIO, delimiter=None, **kwargs):
|
|
188
246
|
file_obj = decode(file_obj)
|
|
189
|
-
dialect =
|
|
247
|
+
dialect = cls._get_csv_dialect(file_obj, delimiter=delimiter)
|
|
190
248
|
|
|
191
249
|
return pd.read_csv(file_obj, sep=dialect.delimiter, index_col=False)
|
|
192
250
|
|
|
193
|
-
|
|
251
|
+
@staticmethod
|
|
252
|
+
def read_txt(file_obj: BytesIO, name=None, **kwargs):
|
|
194
253
|
file_obj = decode(file_obj)
|
|
195
254
|
|
|
196
255
|
try:
|
|
@@ -202,10 +261,7 @@ class FileReader:
|
|
|
202
261
|
)
|
|
203
262
|
text = file_obj.read()
|
|
204
263
|
|
|
205
|
-
|
|
206
|
-
if hasattr(file_obj, "name"):
|
|
207
|
-
file_name = file_obj.name
|
|
208
|
-
metadata = {"source": file_name}
|
|
264
|
+
metadata = {"source": name}
|
|
209
265
|
documents = [Document(page_content=text, metadata=metadata)]
|
|
210
266
|
|
|
211
267
|
text_splitter = RecursiveCharacterTextSplitter(
|
|
@@ -220,10 +276,10 @@ class FileReader:
|
|
|
220
276
|
]
|
|
221
277
|
)
|
|
222
278
|
|
|
223
|
-
|
|
224
|
-
|
|
279
|
+
@staticmethod
|
|
280
|
+
def read_pdf(file_obj: BytesIO, **kwargs):
|
|
225
281
|
|
|
226
|
-
with fitz.open(stream=file_obj) as pdf: # open pdf
|
|
282
|
+
with fitz.open(stream=file_obj.read()) as pdf: # open pdf
|
|
227
283
|
text = chr(12).join([page.get_text() for page in pdf])
|
|
228
284
|
|
|
229
285
|
text_splitter = RecursiveCharacterTextSplitter(
|
|
@@ -236,16 +292,19 @@ class FileReader:
|
|
|
236
292
|
{"content": split_text, "metadata": [{}] * len(split_text)}
|
|
237
293
|
)
|
|
238
294
|
|
|
239
|
-
|
|
295
|
+
@staticmethod
|
|
296
|
+
def read_json(file_obj: BytesIO, **kwargs):
|
|
240
297
|
file_obj = decode(file_obj)
|
|
241
298
|
file_obj.seek(0)
|
|
242
299
|
json_doc = json.loads(file_obj.read())
|
|
243
300
|
return pd.json_normalize(json_doc, max_level=0)
|
|
244
301
|
|
|
245
|
-
|
|
302
|
+
@staticmethod
|
|
303
|
+
def read_parquet(file_obj: BytesIO, **kwargs):
|
|
246
304
|
return pd.read_parquet(file_obj)
|
|
247
305
|
|
|
248
|
-
|
|
306
|
+
@staticmethod
|
|
307
|
+
def read_xlsx(file_obj: BytesIO, sheet_name=None, **kwargs) -> pd.DataFrame:
|
|
249
308
|
|
|
250
309
|
file_obj.seek(0)
|
|
251
310
|
with pd.ExcelFile(file_obj) as xls:
|
|
@@ -129,11 +129,4 @@ class MSGraphAPIBaseClient:
|
|
|
129
129
|
api_url = self._get_api_url(endpoint)
|
|
130
130
|
|
|
131
131
|
response = self._make_request(api_url, params)
|
|
132
|
-
|
|
133
|
-
# If the response content is a binary file or a TSV file, return the raw content.
|
|
134
|
-
if response.headers["Content-Type"] in ("application/octet-stream", "text/plain",
|
|
135
|
-
"text/tab-separated-values", "application/pdf"):
|
|
136
|
-
return response.content
|
|
137
|
-
# Otherwise, return the JSON content.
|
|
138
|
-
else:
|
|
139
|
-
return response.json()
|
|
132
|
+
return response.content
|