MindsDB 25.2.1.2__py3-none-any.whl → 25.2.2.0__py3-none-any.whl
This diff covers publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of MindsDB might be problematic.
- {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.0.dist-info}/METADATA +223 -223
- {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.0.dist-info}/RECORD +23 -23
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/command_executor.py +0 -56
- mindsdb/api/executor/planner/query_planner.py +7 -2
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +19 -11
- mindsdb/api/executor/sql_query/steps/subselect_step.py +44 -2
- mindsdb/integrations/handlers/file_handler/file_handler.py +13 -320
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +60 -156
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_graph_api_one_drive_client.py +3 -3
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +2 -20
- mindsdb/integrations/handlers/salesforce_handler/connection_args.py +9 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +2 -1
- mindsdb/integrations/utilities/files/file_reader.py +120 -61
- mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +1 -8
- mindsdb/integrations/utilities/query_traversal.py +42 -37
- mindsdb/interfaces/agents/langfuse_callback_handler.py +205 -27
- mindsdb/interfaces/file/file_controller.py +1 -1
- mindsdb/utilities/config.py +2 -2
- mindsdb/utilities/render/sqlalchemy_render.py +52 -19
- {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.0.dist-info}/WHEEL +0 -0
- {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/file_handler/file_handler.py

```diff
@@ -1,18 +1,9 @@
-import codecs
-import csv
-import json
 import os
 import shutil
 import tempfile
-import traceback
-from io import BytesIO, StringIO
 from pathlib import Path
-from urllib.parse import urlparse

-import filetype
 import pandas as pd
-import requests
-from charset_normalizer import from_bytes
 from mindsdb_sql_parser import parse_sql
 from mindsdb_sql_parser.ast import CreateTable, DropTables, Insert, Select
 from mindsdb_sql_parser.ast.base import ASTNode
@@ -23,7 +14,9 @@ from mindsdb.integrations.libs.response import RESPONSE_TYPE
 from mindsdb.integrations.libs.response import HandlerResponse as Response
 from mindsdb.integrations.libs.response import HandlerStatusResponse as StatusResponse
 from mindsdb.utilities import log
-
+
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+

 logger = log.getLogger(__name__)

@@ -144,14 +137,9 @@ class FileHandler(DatabaseHandler):
             else:
                 sheet_name = None
             file_path = self.file_controller.get_file_path(table_name)
-
-
-
-                self.custom_parser,
-                self.chunk_size,
-                self.chunk_overlap,
-                sheet_name=sheet_name
-            )
+
+            df = self.handle_source(file_path, sheet_name=sheet_name)
+
             # Process the SELECT query
             result_df = query_df(df, query)
             return Response(RESPONSE_TYPE.TABLE, data_frame=result_df)
@@ -160,14 +148,9 @@
             table_name = query.table.parts[-1]
             file_path = self.file_controller.get_file_path(table_name)

-
-
-
-                self.clean_rows,
-                self.custom_parser,
-                self.chunk_size,
-                self.chunk_overlap,
-            )
+            file_reader = FileReader(path=file_path)
+
+            df = file_reader.to_df()

             # Create a new dataframe with the values from the query
             new_df = pd.DataFrame(query.values, columns=[col.name for col in query.columns])
@@ -193,306 +176,16 @@
         return self.query(ast)

     @staticmethod
-    def
-        file_path
-        clean_rows=True,
-        custom_parser=None,
-        chunk_size=DEFAULT_CHUNK_SIZE,
-        chunk_overlap=DEFAULT_CHUNK_OVERLAP,
-        sheet_name=None # for "xlsx", "xls" files
-    ):
-        """
-        This function takes a file path and returns a pandas dataframe
-        """
-        # get file data io, format and dialect
-        data, fmt, dialect = FileHandler._get_data_io(file_path)
-        data.seek(0) # make sure we are at 0 in file pointer
-
-        if custom_parser:
-            header, file_data = custom_parser(data, fmt)
-            df = pd.DataFrame(file_data, columns=header)
-
-        elif fmt == "parquet":
-            df = pd.read_parquet(data)
-
-        elif fmt == "csv":
-            df = pd.read_csv(data, sep=dialect.delimiter, index_col=False)
-
-        elif fmt in ["xlsx", "xls"]:
-            data.seek(0)
-            with pd.ExcelFile(data) as xls:
-                if sheet_name is None:
-                    # No sheet specified: Return list of sheets
-                    sheet_list = xls.sheet_names
-                    df = pd.DataFrame(sheet_list, columns=["Sheet_Name"])
-                else:
-                    # Specific sheet requested: Load that sheet
-                    df = pd.read_excel(xls, sheet_name=sheet_name)
-
-        elif fmt == "json":
-            data.seek(0)
-            json_doc = json.loads(data.read())
-            df = pd.json_normalize(json_doc, max_level=0)
-
-        elif fmt == "txt" or fmt == "pdf":
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=chunk_size, chunk_overlap=chunk_overlap
-            )
-
-            if fmt == "txt":
-                try:
-                    from langchain_community.document_loaders import TextLoader
-                except ImportError:
-                    raise ImportError(
-                        "To import TXT document please install 'langchain-community':\n"
-                        " pip install langchain-community"
-                    )
-                loader = TextLoader(file_path, encoding="utf8")
-                docs = text_splitter.split_documents(loader.load())
-                df = pd.DataFrame(
-                    [
-                        {"content": doc.page_content, "metadata": doc.metadata}
-                        for doc in docs
-                    ]
-                )
+    def handle_source(file_path, **kwargs):
+        file_reader = FileReader(path=file_path)

-
-
-            import fitz # pymupdf
-
-            with fitz.open(file_path) as pdf: # open pdf
-                text = chr(12).join([page.get_text() for page in pdf])
-
-            split_text = text_splitter.split_text(text)
-
-            df = pd.DataFrame(
-                {"content": split_text, "metadata": [{}] * len(split_text)}
-            )
-
-        else:
-            raise ValueError(
-                "Could not load file into any format, supported formats are csv, json, xls, xlsx, pdf, txt"
-            )
+        df = file_reader.to_df(**kwargs)

         header = df.columns.values.tolist()

         df.columns = [key.strip() for key in header]
         df = df.applymap(clean_cell)
-
-        header = [x.strip() for x in header]
-        col_map = dict((col, col) for col in header)
-        return df, col_map
-
-    @staticmethod
-    def is_it_parquet(data: BytesIO) -> bool:
-        # Check first and last 4 bytes equal to PAR1.
-        # Refer: https://parquet.apache.org/docs/file-format/
-        parquet_sig = b"PAR1"
-        data.seek(0, 0)
-        start_meta = data.read(4)
-        data.seek(-4, 2)
-        end_meta = data.read()
-        data.seek(0)
-        if start_meta == parquet_sig and end_meta == parquet_sig:
-            return True
-        return False
-
-    @staticmethod
-    def is_it_xlsx(file_path: str) -> bool:
-        file_type = filetype.guess(file_path)
-        if file_type and file_type.mime in {
-            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-            "application/vnd.ms-excel",
-        }:
-            return True
-        return False
-
-    @staticmethod
-    def is_it_json(data_str: StringIO) -> bool:
-        # see if its JSON
-        text = data_str.read(100).strip()
-        data_str.seek(0)
-        if len(text) > 0:
-            # it it looks like a json, then try to parse it
-            if text.startswith("{") or text.startswith("["):
-                try:
-                    json.loads(data_str.read())
-                    return True
-                except Exception:
-                    return False
-                finally:
-                    data_str.seek(0)
-        return False
-
-    @staticmethod
-    def is_it_csv(data_str: StringIO) -> bool:
-        sample = data_str.readline() # trying to get dialect from header
-        data_str.seek(0)
-        try:
-            csv.Sniffer().sniff(sample)
-            # Avoid a false-positive for json files
-            try:
-                json.loads(data_str.read())
-                data_str.seek(0)
-                return False
-            except json.decoder.JSONDecodeError:
-                data_str.seek(0)
-                return True
-        except Exception:
-            return False
-
-    @staticmethod
-    def _get_data_io(file_path):
-        """
-        @TODO: Use python-magic to simplify the function and detect the file types as the xlsx example
-        This gets a file either url or local file and defines what the format is as well as dialect
-        :param file: file path or url
-        :return: data_io, format, dialect
-        """
-
-        data = BytesIO()
-        data_str = None
-        dialect = None
-
-        try:
-            with open(file_path, "rb") as fp:
-                data = BytesIO(fp.read())
-        except Exception as e:
-            error = "Could not load file, possible exception : {exception}".format(
-                exception=e
-            )
-            logger.error(error)
-            raise ValueError(error)
-
-        suffix = Path(file_path).suffix.strip(".").lower()
-        if suffix not in ("csv", "json", "xlsx", "parquet"):
-            if FileHandler.is_it_parquet(data):
-                suffix = "parquet"
-            elif FileHandler.is_it_xlsx(file_path):
-                suffix = "xlsx"
-
-        if suffix == "parquet":
-            return data, "parquet", dialect
-
-        if suffix == "xlsx":
-            return data, "xlsx", dialect
-
-        if suffix == "txt":
-            return data, "txt", dialect
-
-        if suffix == "pdf":
-            return data, "pdf", dialect
-
-        byte_str = data.read()
-        # Move it to StringIO
-        try:
-            # Handle Microsoft's BOM "special" UTF-8 encoding
-            if byte_str.startswith(codecs.BOM_UTF8):
-                data_str = StringIO(byte_str.decode("utf-8-sig"))
-            else:
-                file_encoding_meta = from_bytes(
-                    byte_str[: 32 * 1024],
-                    steps=32, # Number of steps/block to extract from my_byte_str
-                    chunk_size=1024, # Set block size of each extraction)
-                    explain=False,
-                )
-                best_meta = file_encoding_meta.best()
-                errors = "strict"
-                if best_meta is not None:
-                    encoding = file_encoding_meta.best().encoding
-
-                    try:
-                        data_str = StringIO(byte_str.decode(encoding, errors))
-                    except UnicodeDecodeError:
-                        encoding = "utf-8"
-                        errors = "replace"
-
-                        data_str = StringIO(byte_str.decode(encoding, errors))
-                else:
-                    encoding = "utf-8"
-                    errors = "replace"
-
-                    data_str = StringIO(byte_str.decode(encoding, errors))
-        except Exception:
-            logger.error(traceback.format_exc())
-            logger.error("Could not load into string")
-
-        if suffix not in ("csv", "json"):
-            if FileHandler.is_it_json(data_str):
-                suffix = "json"
-            elif FileHandler.is_it_csv(data_str):
-                suffix = "csv"
-
-        if suffix == "json":
-            return data_str, suffix, dialect
-
-        if suffix == "csv":
-            try:
-                dialect = FileHandler._get_csv_dialect(data_str)
-                if dialect:
-                    return data_str, "csv", dialect
-            except Exception:
-                logger.error("Could not detect format for this file")
-                logger.error(traceback.format_exc())
-
-        data_str.seek(0)
-        data.seek(0)
-
-        # No file type identified
-        return data, None, dialect
-
-    @staticmethod
-    def _get_file_path(path) -> str:
-        try:
-            is_url = urlparse(path).scheme in ("http", "https")
-        except Exception:
-            is_url = False
-        if is_url:
-            path = FileHandler._fetch_url(path)
-        return path
-
-    @staticmethod
-    def _get_csv_dialect(buffer) -> csv.Dialect:
-        sample = buffer.readline() # trying to get dialect from header
-        buffer.seek(0)
-        try:
-            if isinstance(sample, bytes):
-                sample = sample.decode()
-            accepted_csv_delimiters = [",", "\t", ";"]
-            try:
-                dialect = csv.Sniffer().sniff(
-                    sample, delimiters=accepted_csv_delimiters
-                )
-                dialect.doublequote = (
-                    True # assume that all csvs have " as string escape
-                )
-            except Exception:
-                dialect = csv.reader(sample).dialect
-                if dialect.delimiter not in accepted_csv_delimiters:
-                    raise Exception(
-                        f"CSV delimeter '{dialect.delimiter}' is not supported"
-                    )
-
-        except csv.Error:
-            dialect = None
-        return dialect
-
-    @staticmethod
-    def _fetch_url(url: str) -> str:
-        temp_dir = tempfile.mkdtemp(prefix="mindsdb_file_url_")
-        try:
-            r = requests.get(url, stream=True)
-            if r.status_code == 200:
-                with open(os.path.join(temp_dir, "file"), "wb") as f:
-                    for chunk in r:
-                        f.write(chunk)
-            else:
-                raise Exception(f"Response status code is {r.status_code}")
-        except Exception as e:
-            logger.error(f"Error during getting {url}")
-            logger.error(e)
-            raise
-        return os.path.join(temp_dir, "file")
+        return df

     def get_tables(self) -> Response:
         """
```
mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py

```diff
@@ -1,20 +1,20 @@
-import json
 import os
 import shutil
 import tempfile
 from io import BytesIO, StringIO
-from
+from pathlib import Path

 import pandas
 import pytest
-import responses
 from mindsdb_sql_parser.exceptions import ParsingException
-from mindsdb_sql_parser.ast import CreateTable, DropTables, Identifier, Insert,
+from mindsdb_sql_parser.ast import CreateTable, DropTables, Identifier, Insert, TableColumn, Update
 from pytest_lazyfixture import lazy_fixture

 from mindsdb.integrations.handlers.file_handler.file_handler import FileHandler
 from mindsdb.integrations.libs.response import RESPONSE_TYPE
-
+
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+

 # Define a table to use as content for all of the file types
 # This data needs to match that saved in the files in the ./data/ dir (except pdf and txt files)
@@ -110,21 +110,21 @@ class TestIsItX:
     )
     def test_is_it_csv(self, file_path, result):
         with open(file_path, "r") as fh:
-            assert
+            assert FileReader.is_csv(StringIO(fh.read())) is result

     @pytest.mark.parametrize(
         "file_path,result",
         [
-            (lazy_fixture("csv_file"),
-            (lazy_fixture("xlsx_file"),
-            (lazy_fixture("json_file"),
-            (lazy_fixture("parquet_file"),
-            (lazy_fixture("txt_file"),
-            (lazy_fixture("pdf_file"),
+            (lazy_fixture("csv_file"), 'csv'),
+            (lazy_fixture("xlsx_file"), 'xlsx'),
+            (lazy_fixture("json_file"), 'json'),
+            (lazy_fixture("parquet_file"), 'parquet'),
+            (lazy_fixture("txt_file"), 'txt'),
+            (lazy_fixture("pdf_file"), 'pdf'),
         ],
     )
-    def
-        assert
+    def test_format(self, file_path, result):
+        assert FileReader(path=file_path).get_format() == result

     # We can't test xlsx or parquet here because they're binary files
     @pytest.mark.parametrize(
@@ -137,7 +137,7 @@ class TestIsItX:
     )
     def test_is_it_json(self, file_path, result):
         with open(file_path, "r") as fh:
-            assert
+            assert FileReader.is_json(StringIO(fh.read())) is result

     @pytest.mark.parametrize(
         "file_path,result",
@@ -152,7 +152,7 @@ class TestIsItX:
     )
     def test_is_it_parquet(self, file_path, result):
         with open(file_path, "rb") as fh:
-            assert
+            assert FileReader.is_parquet(BytesIO(fh.read())) is result


 class TestQuery:
@@ -182,64 +182,6 @@ class TestQuery:

         assert response.type == RESPONSE_TYPE.ERROR

-    def test_query_select(self, csv_file):
-        """Test a valid select query"""
-        expected_df = pandas.read_csv(csv_file)
-
-        # This is temporary because the file controller currently absconds with our file when we save it:
-        # https://github.com/mindsdb/mindsdb/issues/8141
-        csv_tmp = os.path.join(tempfile.gettempdir(), "test.csv")
-        if os.path.exists(csv_tmp):
-            os.remove(csv_tmp)
-        shutil.copy(csv_file, csv_tmp)
-
-        # Configure mindsdb and set up the file controller
-        # Ideally this would be a lot simpler..
-        db_file = tempfile.mkstemp(prefix="mindsdb_db_")[1]
-        config = {"storage_db": "sqlite:///" + db_file}
-        fdi, cfg_file = tempfile.mkstemp(prefix="mindsdb_conf_")
-        with os.fdopen(fdi, "w") as fd:
-            json.dump(config, fd)
-        os.environ["MINDSDB_CONFIG_PATH"] = cfg_file
-
-        from mindsdb.utilities.config import Config
-
-        Config()
-        from mindsdb.interfaces.storage import db
-
-        db.init()
-        db.session.rollback()
-        db.Base.metadata.drop_all(db.engine)
-
-        # create
-        db.Base.metadata.create_all(db.engine)
-
-        # fill with data
-        r = db.Integration(name="files", data={}, engine="files")
-        db.session.add(r)
-        db.session.flush()
-        # Config #
-
-        file_controller = FileController()
-        file_controller.save_file(
-            os.path.splitext(os.path.basename(csv_file))[0], csv_tmp
-        )
-
-        file_handler = FileHandler(file_controller=file_controller)
-        response = file_handler.query(
-            Select(
-                targets=[Star()],
-                from_table=Identifier(
-                    parts=[os.path.splitext(os.path.basename(csv_file))[0]]
-                ),
-            )
-        )
-
-        assert response.type == RESPONSE_TYPE.TABLE
-        assert response.error_code == 0
-        assert response.error_message is None
-        assert expected_df.equals(response.data_frame)
-
     def test_query_insert(self, csv_file, monkeypatch):
         """Test an invalid insert query"""
         # Create a temporary file to save the csv file to.
@@ -322,26 +264,6 @@
         file_handler.native_query("INVALID QUERY")


-def test_get_file_path_with_file_path():
-    """Test an valid native table query"""
-    file_path = "example.txt"
-    result = FileHandler._get_file_path(file_path)
-    assert result == file_path
-
-
-@patch("mindsdb.integrations.handlers.file_handler.file_handler.FileHandler._fetch_url")
-def test_get_file_path_with_url(mock_fetch_url):
-    url = "http://example.com/file.txt"
-    expected_result = "some_file_path"
-    # we test _fetch_url separately below. Mock it for this test
-    mock_fetch_url.return_value = expected_result
-
-    result = FileHandler._get_file_path(url)
-
-    assert result == expected_result
-    mock_fetch_url.assert_called_with(url)
-
-
 @pytest.mark.parametrize(
     "file_path,expected_columns",
     [
@@ -354,48 +276,44 @@ def test_get_file_path_with_url(mock_fetch_url):
     ],
 )
 def test_handle_source(file_path, expected_columns):
-
-
-
-
+
+    def get_reader(file_path):
+        # using path
+        reader = FileReader(path=file_path)
+        yield reader
+
+        # using file descriptor
+        with open(file_path, 'rb') as fd:
+            reader = FileReader(file=fd)
+            yield reader
+            fd.seek(0)
+            content = fd.read()
+
+        # using bytesio
+        fd = BytesIO(content)
+        reader = FileReader(file=fd, name=Path(file_path).name)
+        yield reader
+
+    # using different methods to create reader
+    for reader in get_reader(file_path):
+        df = reader.to_df()
         assert isinstance(df, pandas.DataFrame)

-
-        assert len(df) == len(test_excel_sheet_content) - 1
-        assert df.values.tolist() == test_excel_sheet_content[1:]
-        sheet_name = test_excel_sheet_content[1][0]
+        if reader.get_format() == 'xlsx':

-
-
-
+            assert df.columns.tolist() == test_excel_sheet_content[0]
+            assert len(df) == len(test_excel_sheet_content) - 1
+            assert df.values.tolist() == test_excel_sheet_content[1:]
+            sheet_name = test_excel_sheet_content[1][0]

-
-    if not file_path.endswith(".pdf") and not file_path.endswith(".txt"):
-        assert len(df) == len(test_file_content) - 1
-        assert df.values.tolist() == test_file_content[1:]
+            df = reader.to_df(sheet_name=sheet_name)

+        assert df.columns.tolist() == expected_columns

-
-
-
-
-        (lazy_fixture("xlsx_file"), "xlsx", None, BytesIO),
-        (lazy_fixture("json_file"), "json", None, StringIO),
-        (lazy_fixture("parquet_file"), "parquet", None, BytesIO),
-        (lazy_fixture("pdf_file"), "pdf", None, BytesIO),
-        (lazy_fixture("txt_file"), "txt", None, BytesIO),
-    ],
-)
-def test_get_data_io(
-    file_path, expected_file_type, expected_delimiter, expected_data_type
-):
-    data_io, file_type, file_dialect = FileHandler._get_data_io(file_path)
-    assert file_type == expected_file_type
-    assert type(data_io) == expected_data_type
-    if expected_delimiter is None:
-        assert file_dialect is None
-    else:
-        assert file_dialect.delimiter == expected_delimiter
+        # The pdf and txt files have some different content
+        if reader.get_format() not in ("pdf", "txt"):
+            assert len(df) == len(test_file_content) - 1
+            assert df.values.tolist() == test_file_content[1:]


 @pytest.mark.parametrize(
@@ -407,10 +325,21 @@ def test_get_data_io(
     ],
 )
 def test_check_valid_dialects(csv_string, delimiter):
-    dialect =
+    dialect = FileReader._get_csv_dialect(csv_string)
     assert dialect.delimiter == delimiter


+def test_tsv():
+    file = BytesIO(b"example;csv;file\tname")
+
+    reader = FileReader(file=file, name='test.tsv')
+    assert reader.get_format() == 'csv'
+    assert reader.parameters['delimiter'] == '\t'
+
+    df = reader.to_df()
+    assert len(df.columns) == 2
+
+
 def test_check_invalid_dialects():
     with pytest.raises(Exception):
         FileHandler._get_csv_dialect("example csv file")
@@ -420,31 +349,6 @@ def test_check_invalid_dialects():
         FileHandler._get_csv_dialect("example|csv|file")


-@responses.activate
-def test_fetch_url():
-    file_content = "Fake File Content 1234567890"
-    file_url = "https://test.fake/robots.txt"
-    responses.add(
-        responses.GET, file_url, body=file_content, status=200
-    )  # mock the response
-
-    file_path = FileHandler._fetch_url(file_url)
-    with open(file_path, "r") as fh:
-        saved_file_content = fh.read()
-
-    assert saved_file_content == file_content
-
-
-@responses.activate
-def test_fetch_url_raises():
-    responses.add(responses.GET, "https://google.com", status=404)
-
-    with pytest.raises(Exception):
-        FileHandler._fetch_url("obvious_broken_url")
-    with pytest.raises(Exception):
-        FileHandler._fetch_url("https://google.com")  # will get 404 response
-
-
 def test_get_tables():
     file_handler = FileHandler(file_controller=MockFileController())
     response = file_handler.get_tables()
```
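On the test side, the cases covering the deleted FileHandler helpers (test_query_select, test_get_file_path_*, test_get_data_io, test_fetch_url*) are dropped, and the remaining detection tests call FileReader's static sniffers directly. The sketch below exercises those sniffers with made-up in-memory samples rather than the repository's ./data fixtures; the results noted in the comments are what the old is_it_* implementations returned and what the renamed FileReader helpers are expected to preserve.

```python
from io import BytesIO, StringIO

from mindsdb.integrations.utilities.files.file_reader import FileReader

# JSON detection: the text starts with '{' or '[' and parses as JSON.
print(FileReader.is_json(StringIO('{"rental_price": 4500}')))    # expected: True

# Parquet detection keys on the b"PAR1" magic bytes at both ends of the
# payload, so plain text is rejected.
print(FileReader.is_parquet(BytesIO(b"not a parquet payload")))  # expected: False

# CSV detection sniffs a delimiter from the sample and rules out JSON.
print(FileReader.is_csv(StringIO("name,price\nhouse,4500\n")))   # expected: True
```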
mindsdb/integrations/handlers/ms_one_drive_handler/ms_graph_api_one_drive_client.py

```diff
@@ -76,15 +76,15 @@ class MSGraphAPIOneDriveClient(MSGraphAPIBaseClient):
         child_items = []
         for items in self.fetch_paginated_data(f"me/drive/items/{item_id}/children"):
             for item in items:
-
+                child_path = f"{path}/{item['name']}"
                 # If the item is a folder, get its child items.
                 if "folder" in item:
                     # Recursively get the child items of the folder.
-                    child_items.extend(self.get_child_items(item["id"],
+                    child_items.extend(self.get_child_items(item["id"], child_path))

                 else:
                     # Add the path to the item.
-                    item["path"] =
+                    item["path"] = child_path
                     child_items.append(item)

         return child_items
```