deltacat-2.0-py3-none-any.whl → deltacat-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/tests/utils/test_numpy.py
@@ -0,0 +1,1193 @@
from unittest import TestCase
import numpy as np
import tempfile
import fsspec
import gzip
import polars as pl
from deltacat.types.media import ContentType, ContentEncoding
from deltacat.utils.numpy import (
    file_to_ndarray,
    slice_ndarray,
    ndarray_size,
    ndarray_to_file,
)
from deltacat.utils.pandas import ReadKwargsProviderPandasCsvPureUtf8


class TestNumpyReaders(TestCase):
    def setUp(self):
        # Create test data files for reading
        self.fs = fsspec.filesystem("file")
        self.base_path = tempfile.mkdtemp()
        self.fs.makedirs(self.base_path, exist_ok=True)

        # Create test data as 2D array (3 rows, 3 columns)
        self.expected_data = np.array(
            [["a,b\tc|d", "1", "1.1"], ["e,f\tg|h", "2", "2.2"], ["test", "3", "3.3"]]
        )

        # Write test files in different formats
        self._create_test_files()

    def tearDown(self):
        self.fs.rm(self.base_path, recursive=True)

    def _create_test_files(self):
        # Create CSV file (GZIP compressed)
        csv_path = f"{self.base_path}/test.csv"
        with self.fs.open(csv_path, "wb") as f:
            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
                content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
                gz.write(content.encode("utf-8"))

        # Create TSV file (GZIP compressed)
        tsv_path = f"{self.base_path}/test.tsv"
        with self.fs.open(tsv_path, "wb") as f:
            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
                content = '"a,b\tc|d"\t1\t1.1\n"e,f\tg|h"\t2\t2.2\ntest\t3\t3.3\n'
                gz.write(content.encode("utf-8"))

        # Create PSV file (GZIP compressed)
        psv_path = f"{self.base_path}/test.psv"
        with self.fs.open(psv_path, "wb") as f:
            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
                content = '"a,b\tc|d"|1|1.1\n"e,f\tg|h"|2|2.2\ntest|3|3.3\n'
                gz.write(content.encode("utf-8"))

        # Create unescaped TSV file (GZIP compressed)
        unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
        with self.fs.open(unescaped_tsv_path, "wb") as f:
            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
                content = "abc\t1\t1.1\ndef\t2\t2.2\nghi\t3\t3.3\n"
                gz.write(content.encode("utf-8"))

        # Create Parquet file
        parquet_path = f"{self.base_path}/test.parquet"
        import pandas as pd

        df = pd.DataFrame(
            {
                "col1": ["a,b\tc|d", "e,f\tg|h", "test"],
                "col2": [1, 2, 3],
                "col3": [1.1, 2.2, 3.3],
            }
        )
        df.to_parquet(parquet_path, index=False)

        # Create Feather file
        feather_path = f"{self.base_path}/test.feather"
        df.to_feather(feather_path)

        # Create JSON file (GZIP compressed, NDJSON format)
        json_path = f"{self.base_path}/test.json"
        with self.fs.open(json_path, "wb") as f:
            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
                json_str = df.to_json(orient="records", lines=True)
                gz.write(json_str.encode("utf-8"))

        # Create Avro file using polars
        avro_path = f"{self.base_path}/test.avro"
        pl_df = pl.from_pandas(df)
        pl_df.write_avro(avro_path)

        # Create ORC file
        orc_path = f"{self.base_path}/test.orc"
        df.to_orc(orc_path, index=False)

    def test_file_to_ndarray_csv(self):
        # Test reading CSV with file_to_ndarray
        csv_path = f"{self.base_path}/test.csv"

        result = file_to_ndarray(
            csv_path,
            ContentType.CSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
            column_names=["col1", "col2", "col3"],
        )

        assert result.shape == (3, 3)
        assert result[0, 0] == "a,b\tc|d"
        assert result[1, 0] == "e,f\tg|h"
        assert result[2, 0] == "test"

    def test_file_to_ndarray_tsv(self):
        # Test reading TSV with file_to_ndarray
        tsv_path = f"{self.base_path}/test.tsv"

        result = file_to_ndarray(
            tsv_path,
            ContentType.TSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
            column_names=["col1", "col2", "col3"],
        )

        assert result.shape == (3, 3)
        assert result[0, 0] == "a,b\tc|d"
        assert result[1, 0] == "e,f\tg|h"
        assert result[2, 0] == "test"

    def test_file_to_ndarray_psv(self):
        # Test reading PSV with file_to_ndarray
        psv_path = f"{self.base_path}/test.psv"

        result = file_to_ndarray(
            psv_path,
            ContentType.PSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
            column_names=["col1", "col2", "col3"],
        )

        assert result.shape == (3, 3)
        assert result[0, 0] == "a,b\tc|d"
        assert result[1, 0] == "e,f\tg|h"
        assert result[2, 0] == "test"

    def test_file_to_ndarray_unescaped_tsv(self):
        # Test reading unescaped TSV with file_to_ndarray
        unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"

        result = file_to_ndarray(
            unescaped_tsv_path,
            ContentType.UNESCAPED_TSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
            column_names=["col1", "col2", "col3"],
        )

        assert result.shape == (3, 3)
        assert result[0, 0] == "abc"
        assert result[1, 0] == "def"
        assert result[2, 0] == "ghi"

    def test_file_to_ndarray_parquet(self):
        # Test reading Parquet with file_to_ndarray
        parquet_path = f"{self.base_path}/test.parquet"

        result = file_to_ndarray(
            parquet_path, ContentType.PARQUET.value, filesystem=self.fs
        )

        assert result.shape == (3, 3)
        assert result[0, 0] == "a,b\tc|d"
        assert result[1, 0] == "e,f\tg|h"
        assert result[2, 0] == "test"

    def test_file_to_ndarray_feather(self):
        # Test reading Feather with file_to_ndarray
        feather_path = f"{self.base_path}/test.feather"

        result = file_to_ndarray(
            feather_path, ContentType.FEATHER.value, filesystem=self.fs
        )

        assert result.shape == (3, 3)
        assert result[0, 0] == "a,b\tc|d"
        assert result[1, 0] == "e,f\tg|h"
        assert result[2, 0] == "test"

    def test_file_to_ndarray_json(self):
        # Test reading JSON with file_to_ndarray
        json_path = f"{self.base_path}/test.json"

        result = file_to_ndarray(
            json_path,
            ContentType.JSON.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
        )

        assert result.shape == (3, 3)
        # Note: JSON column order might differ, so check by value presence
        assert "a,b\tc|d" in result.flatten()
        assert "e,f\tg|h" in result.flatten()
        assert "test" in result.flatten()

    def test_file_to_ndarray_avro(self):
        # Test reading Avro with file_to_ndarray
        avro_path = f"{self.base_path}/test.avro"

        result = file_to_ndarray(avro_path, ContentType.AVRO.value, filesystem=self.fs)

        assert result.shape == (3, 3)
        assert result[0, 0] == "a,b\tc|d"
        assert result[1, 0] == "e,f\tg|h"
        assert result[2, 0] == "test"

    def test_file_to_ndarray_orc(self):
        # Test reading ORC with file_to_ndarray
        orc_path = f"{self.base_path}/test.orc"

        result = file_to_ndarray(orc_path, ContentType.ORC.value, filesystem=self.fs)

        assert result.shape == (3, 3)
        assert result[0, 0] == "a,b\tc|d"
        assert result[1, 0] == "e,f\tg|h"
        assert result[2, 0] == "test"

    def test_file_to_ndarray_with_column_selection(self):
        # Test reading with column selection
        csv_path = f"{self.base_path}/test.csv"

        result = file_to_ndarray(
            csv_path,
            ContentType.CSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
            column_names=["col1", "col2", "col3"],
            include_columns=["col1", "col2"],
        )

        assert result.shape == (3, 2)  # Should only have 2 columns
        assert result[0, 0] == "a,b\tc|d"
        assert result[1, 0] == "e,f\tg|h"
        assert result[2, 0] == "test"

    def test_file_to_ndarray_with_kwargs_provider(self):
        # Test reading with kwargs provider (forces string types)
        csv_path = f"{self.base_path}/test.csv"
        provider = ReadKwargsProviderPandasCsvPureUtf8(
            include_columns=["col1", "col2", "col3"]
        )

        result = file_to_ndarray(
            csv_path,
            ContentType.CSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
            column_names=["col1", "col2", "col3"],
            pd_read_func_kwargs_provider=provider,
        )

        assert result.shape == (3, 3)
        assert result[0, 0] == "a,b\tc|d"
        # With string types provider, numbers should also be strings
        assert result[0, 1] == "1"
        assert result[0, 2] == "1.1"

    def test_file_to_ndarray_filesystem_inference(self):
        # Test filesystem inference when no filesystem is provided
        json_path = f"{self.base_path}/test.json"

        result = file_to_ndarray(
            json_path,
            ContentType.JSON.value,
            ContentEncoding.GZIP.value
            # No filesystem provided - should be inferred
        )

        assert result.shape == (3, 3)
        # JSON might have different column ordering
        assert "a,b\tc|d" in result.flatten()
        assert "e,f\tg|h" in result.flatten()
        assert "test" in result.flatten()

    def test_file_to_ndarray_bzip2_compression(self):
        # Test BZIP2 compression handling
        import bz2

        # Create a BZIP2 compressed CSV file
        csv_content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
        compressed_content = bz2.compress(csv_content.encode("utf-8"))

        bz2_path = f"{self.base_path}/test.csv.bz2"
        with self.fs.open(bz2_path, "wb") as f:
            f.write(compressed_content)

        result = file_to_ndarray(
            bz2_path,
            ContentType.CSV.value,
            ContentEncoding.BZIP2.value,
            filesystem=self.fs,
            column_names=["col1", "col2", "col3"],
        )

        assert result.shape == (3, 3)
        assert result[0, 0] == "a,b\tc|d"
        assert result[1, 0] == "e,f\tg|h"
        assert result[2, 0] == "test"

    def test_slice_ndarray(self):
        # Test slicing functionality
        arr = np.arange(10).reshape(10, 1)

        # Test without max_len (should return original array)
        result = slice_ndarray(arr, None)
        assert len(result) == 1
        np.testing.assert_array_equal(result[0], arr)

        # Test with max_len
        result = slice_ndarray(arr, 3)
        assert len(result) == 4  # 10 rows / 3 = 3 full slices + 1 remainder
        assert result[0].shape == (3, 1)
        assert result[1].shape == (3, 1)
        assert result[2].shape == (3, 1)
        assert result[3].shape == (1, 1)  # remainder

        # Verify data integrity
        np.testing.assert_array_equal(result[0], arr[:3])
        np.testing.assert_array_equal(result[1], arr[3:6])
        np.testing.assert_array_equal(result[2], arr[6:9])
        np.testing.assert_array_equal(result[3], arr[9:])

    def test_ndarray_size(self):
        # Test size calculation
        arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)
        size = ndarray_size(arr)
        expected_size = arr.nbytes
        assert size == expected_size

    def test_ndarray_to_file(self):
        # Test writing ndarray to file
        arr = np.array([1, 2, 3, 4, 5])
        path = f"{self.base_path}/test_output.parquet"

        ndarray_to_file(
            arr, path, self.fs, lambda x: path, content_type=ContentType.PARQUET.value
        )

        assert self.fs.exists(path), "file was not written"

        # Verify we can read it back (though this tests the write functionality)
        import pandas as pd

        result_df = pd.read_parquet(path)
        assert len(result_df) == 5
        assert "0" in result_df.columns

    def test_ndarray_to_file_different_content_types(self):
        # Test writing ndarray to different file formats
        arr = np.array([1, 2, 3, 4, 5])

        # Test Parquet
        parquet_path = f"{self.base_path}/test_output.parquet"
        ndarray_to_file(
            arr,
            parquet_path,
            self.fs,
            lambda x: parquet_path,
            content_type=ContentType.PARQUET.value,
        )
        assert self.fs.exists(parquet_path)

        # Test Feather
        feather_path = f"{self.base_path}/test_output.feather"
        ndarray_to_file(
            arr,
            feather_path,
            self.fs,
            lambda x: feather_path,
            content_type=ContentType.FEATHER.value,
        )
        assert self.fs.exists(feather_path)

        # Test CSV (compressed)
        csv_path = f"{self.base_path}/test_output.csv"
        ndarray_to_file(
            arr,
            csv_path,
            self.fs,
            lambda x: csv_path,
            content_type=ContentType.CSV.value,
        )
        assert self.fs.exists(csv_path)

        # Test JSON (compressed)
        json_path = f"{self.base_path}/test_output.json"
        ndarray_to_file(
            arr,
            json_path,
            self.fs,
            lambda x: json_path,
            content_type=ContentType.JSON.value,
        )
        assert self.fs.exists(json_path)

    def test_ndarray_to_file_different_dtypes(self):
        # Test writing arrays with different data types

        # Integer array
        int_arr = np.array([1, 2, 3, 4, 5], dtype=np.int64)
        int_path = f"{self.base_path}/test_int.parquet"
        ndarray_to_file(
            int_arr,
            int_path,
            self.fs,
            lambda x: int_path,
            content_type=ContentType.PARQUET.value,
        )
        assert self.fs.exists(int_path)

        # Float array
        float_arr = np.array([1.1, 2.2, 3.3, 4.4, 5.5], dtype=np.float64)
        float_path = f"{self.base_path}/test_float.parquet"
        ndarray_to_file(
            float_arr,
            float_path,
            self.fs,
            lambda x: float_path,
            content_type=ContentType.PARQUET.value,
        )
        assert self.fs.exists(float_path)

        # String array (object dtype)
        str_arr = np.array(["a", "b", "c", "d", "e"], dtype=object)
        str_path = f"{self.base_path}/test_str.parquet"
        ndarray_to_file(
            str_arr,
            str_path,
            self.fs,
            lambda x: str_path,
            content_type=ContentType.PARQUET.value,
        )
        assert self.fs.exists(str_path)

    def test_ndarray_to_file_2d_array(self):
        # Test writing 2D arrays
        arr_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        path = f"{self.base_path}/test_2d.parquet"

        ndarray_to_file(
            arr_2d,
            path,
            self.fs,
            lambda x: path,
            content_type=ContentType.PARQUET.value,
        )

        assert self.fs.exists(path)

        # Verify the file structure
        import pandas as pd

        result_df = pd.read_parquet(path)
        assert len(result_df) == 3  # 3 rows (first dimension)
        # 2D array should have columns "0", "1", "2"
        assert list(result_df.columns) == ["0", "1", "2"]
        # Verify the data values are correct (convert back to numpy for comparison)
        result_array = result_df.to_numpy()
        np.testing.assert_array_equal(result_array, arr_2d)

    def test_ndarray_to_file_empty_array(self):
        # Test writing empty arrays
        empty_arr = np.array([])
        path = f"{self.base_path}/test_empty.parquet"

        ndarray_to_file(
            empty_arr,
            path,
            self.fs,
            lambda x: path,
            content_type=ContentType.PARQUET.value,
        )

        assert self.fs.exists(path)

        # Verify the file structure
        import pandas as pd

        result_df = pd.read_parquet(path)
        assert len(result_df) == 0  # Empty DataFrame
        assert "0" in result_df.columns

    def test_ndarray_to_file_large_array(self):
        # Test writing larger arrays
        large_arr = np.arange(1000)
        path = f"{self.base_path}/test_large.parquet"

        ndarray_to_file(
            large_arr,
            path,
            self.fs,
            lambda x: path,
            content_type=ContentType.PARQUET.value,
        )

        assert self.fs.exists(path)

        # Verify the file can be read and has correct size
        import pandas as pd

        result_df = pd.read_parquet(path)
        assert len(result_df) == 1000
        assert "0" in result_df.columns

    def test_ndarray_to_file_with_custom_kwargs(self):
        # Test writing with custom kwargs
        arr = np.array([1, 2, 3, 4, 5])
        path = f"{self.base_path}/test_kwargs.parquet"

        # Add some custom write kwargs (these will be passed to PyArrow)
        ndarray_to_file(
            arr,
            path,
            self.fs,
            lambda x: path,
            content_type=ContentType.PARQUET.value,
            compression="snappy",  # Custom compression
        )

        assert self.fs.exists(path)

        # Verify the file was written (basic check)
        import pandas as pd

        result_df = pd.read_parquet(path)
        assert len(result_df) == 5
        assert "0" in result_df.columns

    def test_ndarray_to_file_readback_verification(self):
        # Test that we can read back the exact data we wrote
        original_arr = np.array([1.1, 2.2, 3.3, 4.4, 5.5])
        path = f"{self.base_path}/test_readback.parquet"

        # Write the array
        ndarray_to_file(
            original_arr,
            path,
            self.fs,
            lambda x: path,
            content_type=ContentType.PARQUET.value,
        )

        # Read it back using pandas and verify content
        import pandas as pd

        result_df = pd.read_parquet(path)
        readback_arr = np.array(result_df["0"].tolist())

        # Check that the data matches
        np.testing.assert_array_almost_equal(original_arr, readback_arr)

    def test_ndarray_to_file_different_filesystems(self):
        # Test with different filesystem implementations
        arr = np.array([1, 2, 3, 4, 5])

        # Test with fsspec filesystem (already used in other tests)
        fsspec_path = f"{self.base_path}/test_fsspec.parquet"
        ndarray_to_file(
            arr,
            fsspec_path,
            self.fs,
            lambda x: fsspec_path,
            content_type=ContentType.PARQUET.value,
        )
        assert self.fs.exists(fsspec_path)

        # Test with None filesystem (should infer local filesystem)
        local_path = f"{self.base_path}/test_local.parquet"
        ndarray_to_file(
            arr,
            local_path,
            None,  # No filesystem specified
            lambda x: local_path,
            content_type=ContentType.PARQUET.value,
        )
        # Check if file exists using the fsspec filesystem
        assert self.fs.exists(local_path)

    def test_ndarray_to_file_boolean_array(self):
        # Test writing boolean arrays
        bool_arr = np.array([True, False, True, False, True])
        path = f"{self.base_path}/test_bool.parquet"

        ndarray_to_file(
            bool_arr,
            path,
            self.fs,
            lambda x: path,
            content_type=ContentType.PARQUET.value,
        )

        assert self.fs.exists(path)

        # Verify the file structure and content
        import pandas as pd

        result_df = pd.read_parquet(path)
        assert len(result_df) == 5
        assert "0" in result_df.columns

        # Check that boolean values are preserved
        readback_arr = np.array(result_df["0"].tolist())
        np.testing.assert_array_equal(bool_arr, readback_arr)

    def test_ndarray_to_file_complex_dtypes(self):
        # Test writing arrays with complex dtypes
        complex_arr = np.array([1 + 2j, 3 + 4j, 5 + 6j])
        path = f"{self.base_path}/test_complex.parquet"

        # Note: Complex numbers might not be directly supported by all formats
        # This test may need to handle conversion or errors gracefully
        try:
            ndarray_to_file(
                complex_arr,
                path,
                self.fs,
                lambda x: path,
                content_type=ContentType.PARQUET.value,
            )
            assert self.fs.exists(path)
        except (TypeError, ValueError, NotImplementedError):
            # Complex dtypes might not be supported by PyArrow/Parquet
            # This is acceptable behavior
            pass


class TestNumpyFileSystemSupport(TestCase):
    """
    Comprehensive tests for numpy file operations with different filesystem types.
    Tests fsspec AbstractFileSystem, PyArrow FileSystem, and auto-inferred filesystem.
    """

    def setUp(self):
        import pyarrow.fs as pafs

        # Create test data as numpy array
        # All formats preserve mixed types when converted to numpy, so use object dtype for all
        self.test_data = np.array(
            [["value1", 1, 1.1], ["value2", 2, 2.2], ["value3", 3, 3.3]], dtype=object
        )

        # Set up temporary directory
        self.temp_dir = tempfile.mkdtemp()

        # Set up different filesystem types
        self.fsspec_fs = fsspec.filesystem("file")
        self.pyarrow_fs = pafs.LocalFileSystem()

        # Create test files for each content type
        self._create_test_files()

    def tearDown(self):
        import shutil

        shutil.rmtree(self.temp_dir)

    def _create_test_files(self):
        """Create test files in different formats with different compression types."""
        import gzip
        import bz2
        import pandas as pd

        # Create pandas DataFrame for file creation
        df = pd.DataFrame(
            {
                "col1": ["value1", "value2", "value3"],
                "col2": [1, 2, 3],
                "col3": [1.1, 2.2, 3.3],
            }
        )

        # CSV files without headers to match test data structure
        csv_data = "value1,1,1.1\nvalue2,2,2.2\nvalue3,3,3.3\n"

        # Create uncompressed CSV
        with open(f"{self.temp_dir}/test.csv", "w") as f:
            f.write(csv_data)

        # Create GZIP compressed CSV
        with gzip.open(f"{self.temp_dir}/test_gzip.csv.gz", "wt") as f:
            f.write(csv_data)

        # Create BZIP2 compressed CSV
        with bz2.open(f"{self.temp_dir}/test_bzip2.csv.bz2", "wt") as f:
            f.write(csv_data)

        # Parquet file
        df.to_parquet(f"{self.temp_dir}/test.parquet", index=False)

        # Feather file
        df.to_feather(f"{self.temp_dir}/test.feather")

        # JSON file (NDJSON format)
        json_str = df.to_json(orient="records", lines=True)
        with open(f"{self.temp_dir}/test.json", "w") as f:
            f.write(json_str)

        # AVRO file (using polars since pandas delegates to polars for AVRO)
        import polars as pl

        pl_df = pl.from_pandas(df)
        pl_df.write_avro(f"{self.temp_dir}/test.avro")

        # ORC file
        df.to_orc(f"{self.temp_dir}/test.orc")

    def _assert_arrays_equal(self, result, expected):
        """Helper to assert numpy arrays are equal."""
        assert (
            result.shape == expected.shape
        ), f"Shape mismatch: {result.shape} vs {expected.shape}"
        np.testing.assert_array_equal(result, expected)

    def test_csv_with_fsspec_filesystem(self):
        """Test CSV reading with fsspec AbstractFileSystem."""
        # Test uncompressed CSV
        result = file_to_ndarray(
            f"{self.temp_dir}/test.csv",
            ContentType.CSV.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.fsspec_fs,
            column_names=["col1", "col2", "col3"],
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test GZIP compressed CSV
        result = file_to_ndarray(
            f"{self.temp_dir}/test_gzip.csv.gz",
            ContentType.CSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fsspec_fs,
            column_names=["col1", "col2", "col3"],
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test BZIP2 compressed CSV
        result = file_to_ndarray(
            f"{self.temp_dir}/test_bzip2.csv.bz2",
            ContentType.CSV.value,
            ContentEncoding.BZIP2.value,
            filesystem=self.fsspec_fs,
            column_names=["col1", "col2", "col3"],
        )
        self._assert_arrays_equal(result, self.test_data)

    def test_csv_with_pyarrow_filesystem(self):
        """Test CSV reading with PyArrow FileSystem."""
        # Test uncompressed CSV
        result = file_to_ndarray(
            f"{self.temp_dir}/test.csv",
            ContentType.CSV.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.pyarrow_fs,
            column_names=["col1", "col2", "col3"],
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test GZIP compressed CSV
        result = file_to_ndarray(
            f"{self.temp_dir}/test_gzip.csv.gz",
            ContentType.CSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.pyarrow_fs,
            column_names=["col1", "col2", "col3"],
        )
        self._assert_arrays_equal(result, self.test_data)

    def test_csv_with_auto_inferred_filesystem(self):
        """Test CSV reading with automatically inferred filesystem."""
        # Test uncompressed CSV (filesystem=None, should auto-infer)
        result = file_to_ndarray(
            f"{self.temp_dir}/test.csv",
            ContentType.CSV.value,
            ContentEncoding.IDENTITY.value,
            filesystem=None,
            column_names=["col1", "col2", "col3"],
        )
        self._assert_arrays_equal(result, self.test_data)

    def test_parquet_with_different_filesystems(self):
        """Test Parquet reading with different filesystem types."""
        # Test with fsspec
        result = file_to_ndarray(
            f"{self.temp_dir}/test.parquet",
            ContentType.PARQUET.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.fsspec_fs,
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test with PyArrow
        result = file_to_ndarray(
            f"{self.temp_dir}/test.parquet",
            ContentType.PARQUET.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.pyarrow_fs,
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test with auto-inferred
        result = file_to_ndarray(
            f"{self.temp_dir}/test.parquet",
            ContentType.PARQUET.value,
            ContentEncoding.IDENTITY.value,
            filesystem=None,
        )
        self._assert_arrays_equal(result, self.test_data)

    def test_feather_with_different_filesystems(self):
        """Test Feather reading with different filesystem types."""
        # Test with fsspec
        result = file_to_ndarray(
            f"{self.temp_dir}/test.feather",
            ContentType.FEATHER.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.fsspec_fs,
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test with PyArrow
        result = file_to_ndarray(
            f"{self.temp_dir}/test.feather",
            ContentType.FEATHER.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.pyarrow_fs,
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test with auto-inferred
        result = file_to_ndarray(
            f"{self.temp_dir}/test.feather",
            ContentType.FEATHER.value,
            ContentEncoding.IDENTITY.value,
            filesystem=None,
        )
        self._assert_arrays_equal(result, self.test_data)

    def test_json_with_different_filesystems(self):
        """Test JSON reading with different filesystem types."""
        # Test with fsspec
        result = file_to_ndarray(
            f"{self.temp_dir}/test.json",
            ContentType.JSON.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.fsspec_fs,
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test with PyArrow
        result = file_to_ndarray(
            f"{self.temp_dir}/test.json",
            ContentType.JSON.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.pyarrow_fs,
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test with auto-inferred
        result = file_to_ndarray(
            f"{self.temp_dir}/test.json",
            ContentType.JSON.value,
            ContentEncoding.IDENTITY.value,
            filesystem=None,
        )
        self._assert_arrays_equal(result, self.test_data)

    def test_avro_with_different_filesystems(self):
        """Test AVRO reading with different filesystem types."""
        # Test with fsspec
        result = file_to_ndarray(
            f"{self.temp_dir}/test.avro",
            ContentType.AVRO.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.fsspec_fs,
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test with PyArrow
        result = file_to_ndarray(
            f"{self.temp_dir}/test.avro",
            ContentType.AVRO.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.pyarrow_fs,
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test with auto-inferred
        result = file_to_ndarray(
            f"{self.temp_dir}/test.avro",
            ContentType.AVRO.value,
            ContentEncoding.IDENTITY.value,
            filesystem=None,
        )
        self._assert_arrays_equal(result, self.test_data)

    def test_orc_with_different_filesystems(self):
        """Test ORC reading with different filesystem types."""
        # Test with fsspec
        result = file_to_ndarray(
            f"{self.temp_dir}/test.orc",
            ContentType.ORC.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.fsspec_fs,
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test with PyArrow
        result = file_to_ndarray(
            f"{self.temp_dir}/test.orc",
            ContentType.ORC.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.pyarrow_fs,
        )
        self._assert_arrays_equal(result, self.test_data)

        # Test with auto-inferred
        result = file_to_ndarray(
            f"{self.temp_dir}/test.orc",
            ContentType.ORC.value,
            ContentEncoding.IDENTITY.value,
            filesystem=None,
        )
        self._assert_arrays_equal(result, self.test_data)

    def test_file_to_ndarray_with_different_filesystems(self):
        """Test file_to_ndarray with different filesystem types for all content types."""
        test_cases = [
            (
                f"{self.temp_dir}/test.csv",
                ContentType.CSV.value,
                ContentEncoding.IDENTITY.value,
                {"column_names": ["col1", "col2", "col3"]},
                self.test_data,
            ),
            (
                f"{self.temp_dir}/test_gzip.csv.gz",
                ContentType.CSV.value,
                ContentEncoding.GZIP.value,
                {"column_names": ["col1", "col2", "col3"]},
                self.test_data,
            ),
            (
                f"{self.temp_dir}/test.parquet",
                ContentType.PARQUET.value,
                ContentEncoding.IDENTITY.value,
                {},
                self.test_data,
            ),
            (
                f"{self.temp_dir}/test.feather",
                ContentType.FEATHER.value,
                ContentEncoding.IDENTITY.value,
                {},
                self.test_data,
            ),
            (
                f"{self.temp_dir}/test.json",
                ContentType.JSON.value,
                ContentEncoding.IDENTITY.value,
                {},
                self.test_data,
            ),
            (
                f"{self.temp_dir}/test.avro",
                ContentType.AVRO.value,
                ContentEncoding.IDENTITY.value,
                {},
                self.test_data,
            ),
            (
                f"{self.temp_dir}/test.orc",
                ContentType.ORC.value,
                ContentEncoding.IDENTITY.value,
                {},
                self.test_data,
            ),
        ]

        filesystems = [
            ("fsspec", self.fsspec_fs),
            ("pyarrow", self.pyarrow_fs),
            ("auto-inferred", None),
        ]

        for (
            path,
            content_type,
            content_encoding,
            extra_kwargs,
            expected_data,
        ) in test_cases:
            for fs_name, filesystem in filesystems:
                with self.subTest(
                    content_type=content_type,
                    filesystem=fs_name,
                    encoding=content_encoding,
                ):
                    result = file_to_ndarray(
                        path=path,
                        content_type=content_type,
                        content_encoding=content_encoding,
                        filesystem=filesystem,
                        **extra_kwargs,
                    )
                    self._assert_arrays_equal(result, expected_data)

    def test_compression_encoding_with_different_filesystems(self):
        """Test that compression encoding works correctly with different filesystem types."""
        test_cases = [
            (f"{self.temp_dir}/test.csv", ContentEncoding.IDENTITY.value),
            (f"{self.temp_dir}/test_gzip.csv.gz", ContentEncoding.GZIP.value),
            (f"{self.temp_dir}/test_bzip2.csv.bz2", ContentEncoding.BZIP2.value),
        ]

        filesystems = [
            ("fsspec", self.fsspec_fs),
            ("pyarrow", self.pyarrow_fs),
            ("auto-inferred", None),
        ]

        for path, content_encoding in test_cases:
            for fs_name, filesystem in filesystems:
                with self.subTest(encoding=content_encoding, filesystem=fs_name):
                    result = file_to_ndarray(
                        path=path,
                        content_type=ContentType.CSV.value,
                        content_encoding=content_encoding,
                        filesystem=filesystem,
                        column_names=["col1", "col2", "col3"],
                    )
                    self._assert_arrays_equal(result, self.test_data)

    def test_filesystem_open_kwargs(self):
        """Test that filesystem open kwargs are properly passed through."""
        # Test with custom fs_open_kwargs
        result = file_to_ndarray(
            f"{self.temp_dir}/test.csv",
            ContentType.CSV.value,
            ContentEncoding.IDENTITY.value,
            filesystem=self.fsspec_fs,
            fs_open_kwargs={
                "encoding": "utf-8"
            },  # This should be passed to filesystem.open()
            column_names=["col1", "col2", "col3"],
        )
        self._assert_arrays_equal(result, self.test_data)

    def test_delimited_formats_with_different_filesystems(self):
        """Test delimited formats (TSV, PSV, etc.) with different filesystem types."""
        # Create TSV test file without headers to match test data structure
        tsv_data = "value1\t1\t1.1\nvalue2\t2\t2.2\nvalue3\t3\t3.3\n"
        with open(f"{self.temp_dir}/test.tsv", "w") as f:
            f.write(tsv_data)

        # Create PSV test file without headers to match test data structure
        psv_data = "value1|1|1.1\nvalue2|2|2.2\nvalue3|3|3.3\n"
        with open(f"{self.temp_dir}/test.psv", "w") as f:
            f.write(psv_data)

        delimited_test_cases = [
            (
                f"{self.temp_dir}/test.tsv",
                ContentType.TSV.value,
                {"sep": "\t", "column_names": ["col1", "col2", "col3"]},
            ),
            (
                f"{self.temp_dir}/test.psv",
                ContentType.PSV.value,
                {"sep": "|", "column_names": ["col1", "col2", "col3"]},
            ),
        ]

        filesystems = [
            ("fsspec", self.fsspec_fs),
            ("pyarrow", self.pyarrow_fs),
            ("auto-inferred", None),
        ]

        for path, content_type, extra_kwargs in delimited_test_cases:
            for fs_name, filesystem in filesystems:
                with self.subTest(content_type=content_type, filesystem=fs_name):
                    result = file_to_ndarray(
                        path=path,
                        content_type=content_type,
                        content_encoding=ContentEncoding.IDENTITY.value,
                        filesystem=filesystem,
                        **extra_kwargs,
                    )
                    self._assert_arrays_equal(result, self.test_data)

    def test_numpy_array_conversion_consistency(self):
        """Test that numpy array conversion is consistent across filesystem types."""
        # Test that the same data produces the same numpy array regardless of filesystem
        filesystems = [
            ("fsspec", self.fsspec_fs),
            ("pyarrow", self.pyarrow_fs),
            ("auto-inferred", None),
        ]

        # Use Parquet as it preserves data types well
        parquet_path = f"{self.temp_dir}/test.parquet"

        results = []
        for fs_name, filesystem in filesystems:
            result = file_to_ndarray(
                parquet_path,
                ContentType.PARQUET.value,
                ContentEncoding.IDENTITY.value,
                filesystem=filesystem,
            )
            results.append((fs_name, result))

        # All results should be identical
        reference_result = results[0][1]
        for fs_name, result in results[1:]:
            with self.subTest(filesystem=fs_name):
                self._assert_arrays_equal(result, reference_result)

    def test_dtype_preservation_across_filesystems(self):
        """Test that data types are preserved across different filesystem types."""
        import pandas as pd

        # Create a DataFrame with mixed data types
        df = pd.DataFrame(
            {
                "int_col": [1, 2, 3],
                "float_col": [1.1, 2.2, 3.3],
                "str_col": ["a", "b", "c"],
            }
        )

        # Save as Parquet (preserves types best)
        parquet_path = f"{self.temp_dir}/test_dtypes.parquet"
        df.to_parquet(parquet_path, index=False)

        filesystems = [
            ("fsspec", self.fsspec_fs),
            ("pyarrow", self.pyarrow_fs),
            ("auto-inferred", None),
        ]

        # Test that data types are consistent across filesystems
        dtypes = []
        for fs_name, filesystem in filesystems:
            result = file_to_ndarray(
                parquet_path,
                ContentType.PARQUET.value,
                ContentEncoding.IDENTITY.value,
                filesystem=filesystem,
            )
            dtypes.append((fs_name, result.dtype))

        # All dtypes should be the same (object type for mixed data)
        reference_dtype = dtypes[0][1]
        for fs_name, dtype in dtypes[1:]:
            with self.subTest(filesystem=fs_name):
                assert (
                    dtype == reference_dtype
                ), f"Dtype mismatch for {fs_name}: {dtype} vs {reference_dtype}"

    def test_error_handling_across_filesystems(self):
        """Test that error handling is consistent across filesystem types."""
        filesystems = [
            ("fsspec", self.fsspec_fs),
            ("pyarrow", self.pyarrow_fs),
            ("auto-inferred", None),
        ]

        # Test with non-existent file
        for fs_name, filesystem in filesystems:
            with self.subTest(filesystem=fs_name):
                with self.assertRaises(
                    Exception
                ):  # Should raise some kind of file not found error
                    file_to_ndarray(
                        f"{self.temp_dir}/nonexistent.csv",
                        ContentType.CSV.value,
                        ContentEncoding.IDENTITY.value,
                        filesystem=filesystem,
                        column_names=["col1", "col2", "col3"],
                    )