MindsDB-25.2.1.2-py3-none-any.whl → MindsDB-25.2.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (33)
  1. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/METADATA +234 -230
  2. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/RECORD +33 -33
  3. mindsdb/__about__.py +1 -1
  4. mindsdb/api/executor/command_executor.py +1 -57
  5. mindsdb/api/executor/datahub/datanodes/system_tables.py +34 -33
  6. mindsdb/api/executor/planner/query_planner.py +7 -2
  7. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +19 -11
  8. mindsdb/api/executor/sql_query/steps/subselect_step.py +44 -2
  9. mindsdb/integrations/handlers/byom_handler/byom_handler.py +1 -1
  10. mindsdb/integrations/handlers/byom_handler/requirements.txt +1 -1
  11. mindsdb/integrations/handlers/file_handler/file_handler.py +13 -320
  12. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +60 -156
  13. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +1 -1
  14. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +1 -1
  15. mindsdb/integrations/handlers/lancedb_handler/requirements.txt +1 -1
  16. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +3 -3
  17. mindsdb/integrations/handlers/ms_one_drive_handler/ms_graph_api_one_drive_client.py +3 -3
  18. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +2 -20
  19. mindsdb/integrations/handlers/salesforce_handler/connection_args.py +9 -1
  20. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +2 -1
  21. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +1 -1
  22. mindsdb/integrations/handlers/writer_handler/requirements.txt +1 -1
  23. mindsdb/integrations/utilities/files/file_reader.py +120 -61
  24. mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +1 -8
  25. mindsdb/integrations/utilities/query_traversal.py +42 -37
  26. mindsdb/interfaces/agents/langfuse_callback_handler.py +205 -27
  27. mindsdb/interfaces/file/file_controller.py +1 -1
  28. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +12 -2
  29. mindsdb/utilities/config.py +2 -2
  30. mindsdb/utilities/render/sqlalchemy_render.py +52 -19
  31. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/LICENSE +0 -0
  32. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/WHEEL +0 -0
  33. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/file_handler/file_handler.py

@@ -1,18 +1,9 @@
-import codecs
-import csv
-import json
 import os
 import shutil
 import tempfile
-import traceback
-from io import BytesIO, StringIO
 from pathlib import Path
-from urllib.parse import urlparse
 
-import filetype
 import pandas as pd
-import requests
-from charset_normalizer import from_bytes
 from mindsdb_sql_parser import parse_sql
 from mindsdb_sql_parser.ast import CreateTable, DropTables, Insert, Select
 from mindsdb_sql_parser.ast.base import ASTNode
@@ -23,7 +14,9 @@ from mindsdb.integrations.libs.response import RESPONSE_TYPE
 from mindsdb.integrations.libs.response import HandlerResponse as Response
 from mindsdb.integrations.libs.response import HandlerStatusResponse as StatusResponse
 from mindsdb.utilities import log
-from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+
 
 logger = log.getLogger(__name__)
 
@@ -144,14 +137,9 @@ class FileHandler(DatabaseHandler):
         else:
             sheet_name = None
         file_path = self.file_controller.get_file_path(table_name)
-        df, _columns = self._handle_source(
-            file_path,
-            self.clean_rows,
-            self.custom_parser,
-            self.chunk_size,
-            self.chunk_overlap,
-            sheet_name=sheet_name
-        )
+
+        df = self.handle_source(file_path, sheet_name=sheet_name)
+
         # Process the SELECT query
         result_df = query_df(df, query)
         return Response(RESPONSE_TYPE.TABLE, data_frame=result_df)
@@ -160,14 +148,9 @@
         table_name = query.table.parts[-1]
         file_path = self.file_controller.get_file_path(table_name)
 
-        # Load the existing data from the file
-        df, _ = self._handle_source(
-            file_path,
-            self.clean_rows,
-            self.custom_parser,
-            self.chunk_size,
-            self.chunk_overlap,
-        )
+        file_reader = FileReader(path=file_path)
+
+        df = file_reader.to_df()
 
         # Create a new dataframe with the values from the query
         new_df = pd.DataFrame(query.values, columns=[col.name for col in query.columns])
@@ -193,306 +176,16 @@
         return self.query(ast)
 
     @staticmethod
-    def _handle_source(
-        file_path,
-        clean_rows=True,
-        custom_parser=None,
-        chunk_size=DEFAULT_CHUNK_SIZE,
-        chunk_overlap=DEFAULT_CHUNK_OVERLAP,
-        sheet_name=None  # for "xlsx", "xls" files
-    ):
-        """
-        This function takes a file path and returns a pandas dataframe
-        """
-        # get file data io, format and dialect
-        data, fmt, dialect = FileHandler._get_data_io(file_path)
-        data.seek(0)  # make sure we are at 0 in file pointer
-
-        if custom_parser:
-            header, file_data = custom_parser(data, fmt)
-            df = pd.DataFrame(file_data, columns=header)
-
-        elif fmt == "parquet":
-            df = pd.read_parquet(data)
-
-        elif fmt == "csv":
-            df = pd.read_csv(data, sep=dialect.delimiter, index_col=False)
-
-        elif fmt in ["xlsx", "xls"]:
-            data.seek(0)
-            with pd.ExcelFile(data) as xls:
-                if sheet_name is None:
-                    # No sheet specified: Return list of sheets
-                    sheet_list = xls.sheet_names
-                    df = pd.DataFrame(sheet_list, columns=["Sheet_Name"])
-                else:
-                    # Specific sheet requested: Load that sheet
-                    df = pd.read_excel(xls, sheet_name=sheet_name)
-
-        elif fmt == "json":
-            data.seek(0)
-            json_doc = json.loads(data.read())
-            df = pd.json_normalize(json_doc, max_level=0)
-
-        elif fmt == "txt" or fmt == "pdf":
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=chunk_size, chunk_overlap=chunk_overlap
-            )
-
-            if fmt == "txt":
-                try:
-                    from langchain_community.document_loaders import TextLoader
-                except ImportError:
-                    raise ImportError(
-                        "To import TXT document please install 'langchain-community':\n"
-                        "    pip install langchain-community"
-                    )
-                loader = TextLoader(file_path, encoding="utf8")
-                docs = text_splitter.split_documents(loader.load())
-                df = pd.DataFrame(
-                    [
-                        {"content": doc.page_content, "metadata": doc.metadata}
-                        for doc in docs
-                    ]
-                )
+    def handle_source(file_path, **kwargs):
+        file_reader = FileReader(path=file_path)
 
-            elif fmt == "pdf":
-
-                import fitz  # pymupdf
-
-                with fitz.open(file_path) as pdf:  # open pdf
-                    text = chr(12).join([page.get_text() for page in pdf])
-
-                split_text = text_splitter.split_text(text)
-
-                df = pd.DataFrame(
-                    {"content": split_text, "metadata": [{}] * len(split_text)}
-                )
-
-        else:
-            raise ValueError(
-                "Could not load file into any format, supported formats are csv, json, xls, xlsx, pdf, txt"
-            )
+        df = file_reader.to_df(**kwargs)
 
         header = df.columns.values.tolist()
 
         df.columns = [key.strip() for key in header]
         df = df.applymap(clean_cell)
-
-        header = [x.strip() for x in header]
-        col_map = dict((col, col) for col in header)
-        return df, col_map
-
-    @staticmethod
-    def is_it_parquet(data: BytesIO) -> bool:
-        # Check first and last 4 bytes equal to PAR1.
-        # Refer: https://parquet.apache.org/docs/file-format/
-        parquet_sig = b"PAR1"
-        data.seek(0, 0)
-        start_meta = data.read(4)
-        data.seek(-4, 2)
-        end_meta = data.read()
-        data.seek(0)
-        if start_meta == parquet_sig and end_meta == parquet_sig:
-            return True
-        return False
-
-    @staticmethod
-    def is_it_xlsx(file_path: str) -> bool:
-        file_type = filetype.guess(file_path)
-        if file_type and file_type.mime in {
-            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-            "application/vnd.ms-excel",
-        }:
-            return True
-        return False
-
-    @staticmethod
-    def is_it_json(data_str: StringIO) -> bool:
-        # see if its JSON
-        text = data_str.read(100).strip()
-        data_str.seek(0)
-        if len(text) > 0:
-            # it it looks like a json, then try to parse it
-            if text.startswith("{") or text.startswith("["):
-                try:
-                    json.loads(data_str.read())
-                    return True
-                except Exception:
-                    return False
-                finally:
-                    data_str.seek(0)
-        return False
-
-    @staticmethod
-    def is_it_csv(data_str: StringIO) -> bool:
-        sample = data_str.readline()  # trying to get dialect from header
-        data_str.seek(0)
-        try:
-            csv.Sniffer().sniff(sample)
-            # Avoid a false-positive for json files
-            try:
-                json.loads(data_str.read())
-                data_str.seek(0)
-                return False
-            except json.decoder.JSONDecodeError:
-                data_str.seek(0)
-                return True
-        except Exception:
-            return False
-
-    @staticmethod
-    def _get_data_io(file_path):
-        """
-        @TODO: Use python-magic to simplify the function and detect the file types as the xlsx example
-        This gets a file either url or local file and defines what the format is as well as dialect
-        :param file: file path or url
-        :return: data_io, format, dialect
-        """
-
-        data = BytesIO()
-        data_str = None
-        dialect = None
-
-        try:
-            with open(file_path, "rb") as fp:
-                data = BytesIO(fp.read())
-        except Exception as e:
-            error = "Could not load file, possible exception : {exception}".format(
-                exception=e
-            )
-            logger.error(error)
-            raise ValueError(error)
-
-        suffix = Path(file_path).suffix.strip(".").lower()
-        if suffix not in ("csv", "json", "xlsx", "parquet"):
-            if FileHandler.is_it_parquet(data):
-                suffix = "parquet"
-            elif FileHandler.is_it_xlsx(file_path):
-                suffix = "xlsx"
-
-        if suffix == "parquet":
-            return data, "parquet", dialect
-
-        if suffix == "xlsx":
-            return data, "xlsx", dialect
-
-        if suffix == "txt":
-            return data, "txt", dialect
-
-        if suffix == "pdf":
-            return data, "pdf", dialect
-
-        byte_str = data.read()
-        # Move it to StringIO
-        try:
-            # Handle Microsoft's BOM "special" UTF-8 encoding
-            if byte_str.startswith(codecs.BOM_UTF8):
-                data_str = StringIO(byte_str.decode("utf-8-sig"))
-            else:
-                file_encoding_meta = from_bytes(
-                    byte_str[: 32 * 1024],
-                    steps=32,  # Number of steps/block to extract from my_byte_str
-                    chunk_size=1024,  # Set block size of each extraction)
-                    explain=False,
-                )
-                best_meta = file_encoding_meta.best()
-                errors = "strict"
-                if best_meta is not None:
-                    encoding = file_encoding_meta.best().encoding
-
-                    try:
-                        data_str = StringIO(byte_str.decode(encoding, errors))
-                    except UnicodeDecodeError:
-                        encoding = "utf-8"
-                        errors = "replace"
-
-                        data_str = StringIO(byte_str.decode(encoding, errors))
-                else:
-                    encoding = "utf-8"
-                    errors = "replace"
-
-                    data_str = StringIO(byte_str.decode(encoding, errors))
-        except Exception:
-            logger.error(traceback.format_exc())
-            logger.error("Could not load into string")
-
-        if suffix not in ("csv", "json"):
-            if FileHandler.is_it_json(data_str):
-                suffix = "json"
-            elif FileHandler.is_it_csv(data_str):
-                suffix = "csv"
-
-        if suffix == "json":
-            return data_str, suffix, dialect
-
-        if suffix == "csv":
-            try:
-                dialect = FileHandler._get_csv_dialect(data_str)
-                if dialect:
-                    return data_str, "csv", dialect
-            except Exception:
-                logger.error("Could not detect format for this file")
-                logger.error(traceback.format_exc())
-
-        data_str.seek(0)
-        data.seek(0)
-
-        # No file type identified
-        return data, None, dialect
-
-    @staticmethod
-    def _get_file_path(path) -> str:
-        try:
-            is_url = urlparse(path).scheme in ("http", "https")
-        except Exception:
-            is_url = False
-        if is_url:
-            path = FileHandler._fetch_url(path)
-        return path
-
-    @staticmethod
-    def _get_csv_dialect(buffer) -> csv.Dialect:
-        sample = buffer.readline()  # trying to get dialect from header
-        buffer.seek(0)
-        try:
-            if isinstance(sample, bytes):
-                sample = sample.decode()
-            accepted_csv_delimiters = [",", "\t", ";"]
-            try:
-                dialect = csv.Sniffer().sniff(
-                    sample, delimiters=accepted_csv_delimiters
-                )
-                dialect.doublequote = (
-                    True  # assume that all csvs have " as string escape
-                )
-            except Exception:
-                dialect = csv.reader(sample).dialect
-                if dialect.delimiter not in accepted_csv_delimiters:
-                    raise Exception(
-                        f"CSV delimeter '{dialect.delimiter}' is not supported"
-                    )
-
-        except csv.Error:
-            dialect = None
-        return dialect
-
-    @staticmethod
-    def _fetch_url(url: str) -> str:
-        temp_dir = tempfile.mkdtemp(prefix="mindsdb_file_url_")
-        try:
-            r = requests.get(url, stream=True)
-            if r.status_code == 200:
-                with open(os.path.join(temp_dir, "file"), "wb") as f:
-                    for chunk in r:
-                        f.write(chunk)
-            else:
-                raise Exception(f"Response status code is {r.status_code}")
-        except Exception as e:
-            logger.error(f"Error during getting {url}")
-            logger.error(e)
-            raise
-        return os.path.join(temp_dir, "file")
+        return df
 
     def get_tables(self) -> Response:
         """
mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py

@@ -1,20 +1,20 @@
-import json
 import os
 import shutil
 import tempfile
 from io import BytesIO, StringIO
-from unittest.mock import patch
+from pathlib import Path
 
 import pandas
 import pytest
-import responses
 from mindsdb_sql_parser.exceptions import ParsingException
-from mindsdb_sql_parser.ast import CreateTable, DropTables, Identifier, Insert, Select, Star, TableColumn, Update
+from mindsdb_sql_parser.ast import CreateTable, DropTables, Identifier, Insert, TableColumn, Update
 from pytest_lazyfixture import lazy_fixture
 
 from mindsdb.integrations.handlers.file_handler.file_handler import FileHandler
 from mindsdb.integrations.libs.response import RESPONSE_TYPE
-from mindsdb.interfaces.file.file_controller import FileController
+
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+
 
 # Define a table to use as content for all of the file types
 # This data needs to match that saved in the files in the ./data/ dir (except pdf and txt files)
@@ -110,21 +110,21 @@ class TestIsItX:
     )
     def test_is_it_csv(self, file_path, result):
         with open(file_path, "r") as fh:
-            assert FileHandler.is_it_csv(StringIO(fh.read())) is result
+            assert FileReader.is_csv(StringIO(fh.read())) is result
 
     @pytest.mark.parametrize(
         "file_path,result",
         [
-            (lazy_fixture("csv_file"), False),
-            (lazy_fixture("xlsx_file"), True),
-            (lazy_fixture("json_file"), False),
-            (lazy_fixture("parquet_file"), False),
-            (lazy_fixture("txt_file"), False),
-            (lazy_fixture("pdf_file"), False),
+            (lazy_fixture("csv_file"), 'csv'),
+            (lazy_fixture("xlsx_file"), 'xlsx'),
+            (lazy_fixture("json_file"), 'json'),
+            (lazy_fixture("parquet_file"), 'parquet'),
+            (lazy_fixture("txt_file"), 'txt'),
+            (lazy_fixture("pdf_file"), 'pdf'),
         ],
     )
-    def test_is_it_xlsx(self, file_path, result):
-        assert FileHandler.is_it_xlsx(file_path) is result
+    def test_format(self, file_path, result):
+        assert FileReader(path=file_path).get_format() == result
 
     # We can't test xlsx or parquet here because they're binary files
     @pytest.mark.parametrize(
@@ -137,7 +137,7 @@ class TestIsItX:
     )
     def test_is_it_json(self, file_path, result):
         with open(file_path, "r") as fh:
-            assert FileHandler.is_it_json(StringIO(fh.read())) is result
+            assert FileReader.is_json(StringIO(fh.read())) is result
 
     @pytest.mark.parametrize(
         "file_path,result",
@@ -152,7 +152,7 @@ class TestIsItX:
     )
     def test_is_it_parquet(self, file_path, result):
         with open(file_path, "rb") as fh:
-            assert FileHandler.is_it_parquet(BytesIO(fh.read())) is result
+            assert FileReader.is_parquet(BytesIO(fh.read())) is result
 
 
 class TestQuery:
@@ -182,64 +182,6 @@ class TestQuery:
 
         assert response.type == RESPONSE_TYPE.ERROR
 
-    def test_query_select(self, csv_file):
-        """Test a valid select query"""
-        expected_df = pandas.read_csv(csv_file)
-
-        # This is temporary because the file controller currently absconds with our file when we save it:
-        # https://github.com/mindsdb/mindsdb/issues/8141
-        csv_tmp = os.path.join(tempfile.gettempdir(), "test.csv")
-        if os.path.exists(csv_tmp):
-            os.remove(csv_tmp)
-        shutil.copy(csv_file, csv_tmp)
-
-        # Configure mindsdb and set up the file controller
-        # Ideally this would be a lot simpler..
-        db_file = tempfile.mkstemp(prefix="mindsdb_db_")[1]
-        config = {"storage_db": "sqlite:///" + db_file}
-        fdi, cfg_file = tempfile.mkstemp(prefix="mindsdb_conf_")
-        with os.fdopen(fdi, "w") as fd:
-            json.dump(config, fd)
-        os.environ["MINDSDB_CONFIG_PATH"] = cfg_file
-
-        from mindsdb.utilities.config import Config
-
-        Config()
-        from mindsdb.interfaces.storage import db
-
-        db.init()
-        db.session.rollback()
-        db.Base.metadata.drop_all(db.engine)
-
-        # create
-        db.Base.metadata.create_all(db.engine)
-
-        # fill with data
-        r = db.Integration(name="files", data={}, engine="files")
-        db.session.add(r)
-        db.session.flush()
-        # Config #
-
-        file_controller = FileController()
-        file_controller.save_file(
-            os.path.splitext(os.path.basename(csv_file))[0], csv_tmp
-        )
-
-        file_handler = FileHandler(file_controller=file_controller)
-        response = file_handler.query(
-            Select(
-                targets=[Star()],
-                from_table=Identifier(
-                    parts=[os.path.splitext(os.path.basename(csv_file))[0]]
-                ),
-            )
-        )
-
-        assert response.type == RESPONSE_TYPE.TABLE
-        assert response.error_code == 0
-        assert response.error_message is None
-        assert expected_df.equals(response.data_frame)
-
     def test_query_insert(self, csv_file, monkeypatch):
         """Test an invalid insert query"""
         # Create a temporary file to save the csv file to.
@@ -322,26 +264,6 @@
         file_handler.native_query("INVALID QUERY")
 
 
-def test_get_file_path_with_file_path():
-    """Test an valid native table query"""
-    file_path = "example.txt"
-    result = FileHandler._get_file_path(file_path)
-    assert result == file_path
-
-
-@patch("mindsdb.integrations.handlers.file_handler.file_handler.FileHandler._fetch_url")
-def test_get_file_path_with_url(mock_fetch_url):
-    url = "http://example.com/file.txt"
-    expected_result = "some_file_path"
-    # we test _fetch_url separately below. Mock it for this test
-    mock_fetch_url.return_value = expected_result
-
-    result = FileHandler._get_file_path(url)
-
-    assert result == expected_result
-    mock_fetch_url.assert_called_with(url)
-
-
 @pytest.mark.parametrize(
     "file_path,expected_columns",
     [
@@ -354,48 +276,44 @@ def test_get_file_path_with_url(mock_fetch_url):
     ],
 )
 def test_handle_source(file_path, expected_columns):
-    sheet_name = None
-    # Excel files return a list of sheets when queried without a sheet name
-    if file_path.endswith(".xlsx"):
-        df, _ = FileHandler._handle_source(file_path)
+
+    def get_reader(file_path):
+        # using path
+        reader = FileReader(path=file_path)
+        yield reader
+
+        # using file descriptor
+        with open(file_path, 'rb') as fd:
+            reader = FileReader(file=fd)
+            yield reader
+            fd.seek(0)
+            content = fd.read()
+
+        # using bytesio
+        fd = BytesIO(content)
+        reader = FileReader(file=fd, name=Path(file_path).name)
+        yield reader
+
+    # using different methods to create reader
+    for reader in get_reader(file_path):
+        df = reader.to_df()
        assert isinstance(df, pandas.DataFrame)
 
-        assert df.columns.tolist() == test_excel_sheet_content[0]
-        assert len(df) == len(test_excel_sheet_content) - 1
-        assert df.values.tolist() == test_excel_sheet_content[1:]
-        sheet_name = test_excel_sheet_content[1][0]
+        if reader.get_format() == 'xlsx':
 
-        df, _ = FileHandler._handle_source(file_path, sheet_name=sheet_name)
-        assert isinstance(df, pandas.DataFrame)
-        assert df.columns.tolist() == expected_columns
+            assert df.columns.tolist() == test_excel_sheet_content[0]
+            assert len(df) == len(test_excel_sheet_content) - 1
+            assert df.values.tolist() == test_excel_sheet_content[1:]
+            sheet_name = test_excel_sheet_content[1][0]
 
-    # The pdf and txt files have some different content
-    if not file_path.endswith(".pdf") and not file_path.endswith(".txt"):
-        assert len(df) == len(test_file_content) - 1
-        assert df.values.tolist() == test_file_content[1:]
+            df = reader.to_df(sheet_name=sheet_name)
 
+            assert df.columns.tolist() == expected_columns
 
-@pytest.mark.parametrize(
-    "file_path,expected_file_type,expected_delimiter,expected_data_type",
-    [
-        (lazy_fixture("csv_file"), "csv", ",", StringIO),
-        (lazy_fixture("xlsx_file"), "xlsx", None, BytesIO),
-        (lazy_fixture("json_file"), "json", None, StringIO),
-        (lazy_fixture("parquet_file"), "parquet", None, BytesIO),
-        (lazy_fixture("pdf_file"), "pdf", None, BytesIO),
-        (lazy_fixture("txt_file"), "txt", None, BytesIO),
-    ],
-)
-def test_get_data_io(
-    file_path, expected_file_type, expected_delimiter, expected_data_type
-):
-    data_io, file_type, file_dialect = FileHandler._get_data_io(file_path)
-    assert file_type == expected_file_type
-    assert type(data_io) == expected_data_type
-    if expected_delimiter is None:
-        assert file_dialect is None
-    else:
-        assert file_dialect.delimiter == expected_delimiter
+        # The pdf and txt files have some different content
+        if reader.get_format() not in ("pdf", "txt"):
+            assert len(df) == len(test_file_content) - 1
+            assert df.values.tolist() == test_file_content[1:]
 
 
 @pytest.mark.parametrize(
@@ -407,10 +325,21 @@ def test_get_data_io(
     ],
 )
 def test_check_valid_dialects(csv_string, delimiter):
-    dialect = FileHandler._get_csv_dialect(csv_string)
+    dialect = FileReader._get_csv_dialect(csv_string)
     assert dialect.delimiter == delimiter
 
 
+def test_tsv():
+    file = BytesIO(b"example;csv;file\tname")
+
+    reader = FileReader(file=file, name='test.tsv')
+    assert reader.get_format() == 'csv'
+    assert reader.parameters['delimiter'] == '\t'
+
+    df = reader.to_df()
+    assert len(df.columns) == 2
+
+
 def test_check_invalid_dialects():
     with pytest.raises(Exception):
         FileHandler._get_csv_dialect("example csv file")
@@ -420,31 +349,6 @@ def test_check_invalid_dialects():
         FileHandler._get_csv_dialect("example|csv|file")
 
 
-@responses.activate
-def test_fetch_url():
-    file_content = "Fake File Content 1234567890"
-    file_url = "https://test.fake/robots.txt"
-    responses.add(
-        responses.GET, file_url, body=file_content, status=200
-    )  # mock the response
-
-    file_path = FileHandler._fetch_url(file_url)
-    with open(file_path, "r") as fh:
-        saved_file_content = fh.read()
-
-    assert saved_file_content == file_content
-
-
-@responses.activate
-def test_fetch_url_raises():
-    responses.add(responses.GET, "https://google.com", status=404)
-
-    with pytest.raises(Exception):
-        FileHandler._fetch_url("obvious_broken_url")
-    with pytest.raises(Exception):
-        FileHandler._fetch_url("https://google.com")  # will get 404 response
-
-
 def test_get_tables():
     file_handler = FileHandler(file_controller=MockFileController())
     response = file_handler.get_tables()
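
The tests above also pin down FileReader's class-level sniffers and name-based detection for callers that only hold bytes. A hedged sketch (buffer contents invented; the ordering of the checks mirrors the old _get_data_io fallback rather than any documented contract):

from io import BytesIO, StringIO

from mindsdb.integrations.utilities.files.file_reader import FileReader

payload = b"a,b\n1,2\n"  # invented sample

# Class-level sniffers, as exercised in TestIsItX above.
assert not FileReader.is_parquet(BytesIO(payload))
assert not FileReader.is_json(StringIO(payload.decode()))
assert FileReader.is_csv(StringIO(payload.decode()))

# Name-based detection on an in-memory buffer; per test_tsv, a .tsv name maps
# to format 'csv' and the sniffed delimiter lands in reader.parameters.
reader = FileReader(file=BytesIO(payload), name="data.csv")
assert reader.get_format() == "csv"
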
mindsdb/integrations/handlers/huggingface_handler/requirements.txt

@@ -1,5 +1,5 @@
 datasets==2.16.1
 evaluate
 torch
-nltk
+nltk>=3.9
 huggingface-hub

mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt

@@ -1,6 +1,6 @@
 datasets==2.16.1
 evaluate
-nltk
+nltk>=3.9
 huggingface-hub
 # Needs to be installed with `pip install --extra-index-url https://download.pytorch.org/whl/ .[huggingface_cpu]`
 torch==2.2.0+cpu

mindsdb/integrations/handlers/lancedb_handler/requirements.txt

@@ -1,3 +1,3 @@
 lancedb~=0.3.1
 lance
-pyarrow~=14.0.1
+pyarrow~=19.0.0