deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/tests/utils/test_polars.py
@@ -0,0 +1,1040 @@
from unittest import TestCase
import polars as pl
import pandas as pd
import tempfile
import fsspec
import gzip
import json
import io
from deltacat.types.media import ContentType, ContentEncoding
from deltacat.utils.polars import (
    dataframe_to_file,
    file_to_dataframe,
    content_type_to_reader_kwargs,
    _add_column_kwargs,
    ReadKwargsProviderPolarsStringTypes,
    concat_dataframes,
)


class TestPolarsWriters(TestCase):
    def setUp(self):
        # Create a test DataFrame with data that includes delimiters
        self.df = pl.DataFrame({"col1": ["a,b\tc|d", "e,f\tg|h"], "col2": [1, 2]})
        self.fs = fsspec.filesystem("file")
        self.base_path = tempfile.mkdtemp()
        self.fs.makedirs(self.base_path, exist_ok=True)

    def tearDown(self):
        self.fs.rm(self.base_path, recursive=True)

    def test_write_feather(self):
        path = f"{self.base_path}/test.feather"

        dataframe_to_file(
            self.df,
            path,
            self.fs,
            lambda x: path,
            content_type=ContentType.FEATHER.value,
        )
        assert self.fs.exists(path), "file was not written"

        # Verify content
        result = pl.read_ipc(path)
        assert result.equals(self.df)

    def test_write_csv(self):
        path = f"{self.base_path}/test.csv.gz"

        dataframe_to_file(
            self.df, path, self.fs, lambda x: path, content_type=ContentType.CSV.value
        )
        assert self.fs.exists(path), "file was not written"

        # Verify content (should be GZIP compressed)
        with self.fs.open(path, "rb") as f:
            with gzip.GzipFile(fileobj=f) as gz:
                content = gz.read().decode("utf-8")
                # Should be quoted due to commas in data
                assert '"a,b\tc|d",1' in content
                assert '"e,f\tg|h",2' in content

    def test_write_tsv(self):
        path = f"{self.base_path}/test.tsv.gz"

        dataframe_to_file(
            self.df,
            path,
            self.fs,
            lambda x: path,
            content_type=ContentType.TSV.value,
        )
        assert self.fs.exists(path), "file was not written"

        # Verify content (should be GZIP compressed)
        with self.fs.open(path, "rb") as f:
            with gzip.GzipFile(fileobj=f) as gz:
                content = gz.read().decode("utf-8")
                # Polars writes TSV with tab separators
                assert '"a,b\tc|d"\t1' in content
                assert '"e,f\tg|h"\t2' in content

    def test_write_psv(self):
        path = f"{self.base_path}/test.psv.gz"

        dataframe_to_file(
            self.df,
            path,
            self.fs,
            lambda x: path,
            content_type=ContentType.PSV.value,
        )
        assert self.fs.exists(path), "file was not written"

        # Verify content (should be GZIP compressed)
        with self.fs.open(path, "rb") as f:
            with gzip.GzipFile(fileobj=f) as gz:
                content = gz.read().decode("utf-8")
                # Polars writes PSV with pipe separators
                assert '"a,b\tc|d"|1' in content
                assert '"e,f\tg|h"|2' in content

    def test_write_unescaped_tsv(self):
        # Create DataFrame without delimiters for unescaped TSV
        df = pl.DataFrame({"col1": ["abc", "def"], "col2": [1, 2]})
        path = f"{self.base_path}/test.tsv.gz"

        dataframe_to_file(
            df,
            path,
            self.fs,
            lambda x: path,
            content_type=ContentType.UNESCAPED_TSV.value,
        )
        assert self.fs.exists(path), "file was not written"

        # Verify content (should be GZIP compressed)
        with self.fs.open(path, "rb") as f:
            with gzip.GzipFile(fileobj=f) as gz:
                content = gz.read().decode("utf-8")
                # With quote_char=None for unescaped TSV, should use tab separators
                assert "abc\t1" in content
                assert "def\t2" in content

    def test_write_orc(self):
        path = f"{self.base_path}/test.orc"

        dataframe_to_file(
            self.df, path, self.fs, lambda x: path, content_type=ContentType.ORC.value
        )
        assert self.fs.exists(path), "file was not written"

        # Verify content by reading with pandas (since polars delegates to PyArrow)
        result = pd.read_orc(path)
        expected = self.df.to_pandas()
        pd.testing.assert_frame_equal(result, expected)

    def test_write_parquet(self):
        path = f"{self.base_path}/test.parquet"

        dataframe_to_file(
            self.df,
            path,
            self.fs,
            lambda x: path,
            content_type=ContentType.PARQUET.value,
        )
        assert self.fs.exists(path), "file was not written"

        # Verify content
        result = pl.read_parquet(path)
        assert result.equals(self.df)

    def test_write_json(self):
        path = f"{self.base_path}/test.json.gz"

        dataframe_to_file(
            self.df, path, self.fs, lambda x: path, content_type=ContentType.JSON.value
        )
        assert self.fs.exists(path), "file was not written"

        # Verify content (should be GZIP compressed, newline-delimited JSON)
        with self.fs.open(path, "rb") as f:
            with gzip.GzipFile(fileobj=f) as gz:
                content = gz.read().decode("utf-8")
                # Each line should be a valid JSON object
                lines = [
                    line for line in content.split("\n") if line
                ]  # Skip empty lines
                assert len(lines) == 2  # 2 records
                assert json.loads(lines[0]) == {"col1": "a,b\tc|d", "col2": 1}
                assert json.loads(lines[1]) == {"col1": "e,f\tg|h", "col2": 2}

    def test_write_avro(self):
        path = f"{self.base_path}/test.avro"

        dataframe_to_file(
            self.df, path, self.fs, lambda x: path, content_type=ContentType.AVRO.value
        )
        assert self.fs.exists(path), "file was not written"

        # Verify content by reading with polars
        result = pl.read_avro(path)
        assert result.equals(self.df)


class TestPolarsReaders(TestCase):
    def setUp(self):
        # Create test data files for reading
        self.fs = fsspec.filesystem("file")
        self.base_path = tempfile.mkdtemp()
        self.fs.makedirs(self.base_path, exist_ok=True)

        # Create test DataFrame
        self.df = pl.DataFrame(
            {
                "col1": ["a,b\tc|d", "e,f\tg|h", "test"],
                "col2": [1, 2, 3],
                "col3": [1.1, 2.2, 3.3],
            }
        )

        # Write test files in different formats
        self._create_test_files()

    def tearDown(self):
        self.fs.rm(self.base_path, recursive=True)

    def _create_test_files(self):
        """Create test files for reading tests with the original test data structure."""
        import gzip
        import bz2

        # Create CSV file (GZIP compressed) with the original test data
        csv_path = f"{self.base_path}/test.csv"
        with self.fs.open(csv_path, "wb") as f:
            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
                content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
                gz.write(content.encode("utf-8"))

        # Create TSV file (GZIP compressed)
        tsv_path = f"{self.base_path}/test.tsv"
        with self.fs.open(tsv_path, "wb") as f:
            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
                content = '"a,b\tc|d"\t1\t1.1\n"e,f\tg|h"\t2\t2.2\ntest\t3\t3.3\n'
                gz.write(content.encode("utf-8"))

        # Create PSV file (GZIP compressed)
        psv_path = f"{self.base_path}/test.psv"
        with self.fs.open(psv_path, "wb") as f:
            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
                content = '"a,b\tc|d"|1|1.1\n"e,f\tg|h"|2|2.2\ntest|3|3.3\n'
                gz.write(content.encode("utf-8"))

        # Create unescaped TSV file (GZIP compressed)
        unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
        with self.fs.open(unescaped_tsv_path, "wb") as f:
            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
                content = "abc\t1\t1.1\ndef\t2\t2.2\nghi\t3\t3.3\n"
                gz.write(content.encode("utf-8"))

        # Create Parquet file
        parquet_path = f"{self.base_path}/test.parquet"
        self.df.write_parquet(parquet_path)

        # Create Feather file
        feather_path = f"{self.base_path}/test.feather"
        self.df.write_ipc(feather_path)

        # Create JSON file (GZIP compressed, NDJSON format)
        json_path = f"{self.base_path}/test.json"
        with self.fs.open(json_path, "wb") as f:
            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
                # Use proper NDJSON format - one JSON object per line
                lines = []
                for i in range(len(self.df)):
                    row = self.df.row(i)
                    json_obj = {"col1": row[0], "col2": row[1], "col3": row[2]}
                    lines.append(json.dumps(json_obj))
                content = "\n".join(lines) + "\n"
                gz.write(content.encode("utf-8"))

        # Create Avro file
        avro_path = f"{self.base_path}/test.avro"
        self.df.write_avro(avro_path)

        # Create ORC file using pandas (since polars delegates to pandas for ORC)
        orc_path = f"{self.base_path}/test.orc"
        self.df.to_pandas().to_orc(orc_path)

        # Create BZIP2 compressed CSV for compression tests
        bzip2_path = f"{self.base_path}/test_bzip2.csv.bz2"
        with bz2.open(bzip2_path, "wt") as f:
            f.write('"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n')

    def test_content_type_to_reader_kwargs(self):
        # Test CSV kwargs
        csv_kwargs = content_type_to_reader_kwargs(ContentType.CSV.value)
        expected_csv = {"separator": ",", "has_header": False}
        assert csv_kwargs == expected_csv

        # Test TSV kwargs
        tsv_kwargs = content_type_to_reader_kwargs(ContentType.TSV.value)
        expected_tsv = {"separator": "\t", "has_header": False}
        assert tsv_kwargs == expected_tsv

        # Test PSV kwargs
        psv_kwargs = content_type_to_reader_kwargs(ContentType.PSV.value)
        expected_psv = {"separator": "|", "has_header": False}
        assert psv_kwargs == expected_psv

        # Test unescaped TSV kwargs
        unescaped_kwargs = content_type_to_reader_kwargs(
            ContentType.UNESCAPED_TSV.value
        )
        expected_unescaped = {
            "separator": "\t",
            "has_header": False,
            "null_values": [""],
            "quote_char": None,
        }
        assert unescaped_kwargs == expected_unescaped

        # Test Parquet kwargs (should be empty)
        parquet_kwargs = content_type_to_reader_kwargs(ContentType.PARQUET.value)
        assert parquet_kwargs == {}

    def test_add_column_kwargs(self):
        kwargs = {}
        column_names = ["col1", "col2", "col3"]
        include_columns = ["col1", "col2"]

        # Test CSV column kwargs
        _add_column_kwargs(ContentType.CSV.value, column_names, include_columns, kwargs)
        assert kwargs["new_columns"] == column_names
        assert kwargs["columns"] == include_columns

        # Test Parquet column kwargs
        kwargs = {}
        _add_column_kwargs(
            ContentType.PARQUET.value, column_names, include_columns, kwargs
        )
        assert kwargs["columns"] == include_columns
        assert "new_columns" not in kwargs

    def test_read_csv_from_file(self):
        csv_path = f"{self.base_path}/test.csv"

        # Read using polars directly to test our reader logic
        with self.fs.open(csv_path, "rb") as f:
            with gzip.GzipFile(fileobj=f) as gz:
                source = io.BytesIO(gz.read())

        kwargs = content_type_to_reader_kwargs(ContentType.CSV.value)
        kwargs["new_columns"] = ["col1", "col2", "col3"]

        result = pl.read_csv(source, **kwargs)

        # Verify basic structure
        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]

    def test_read_tsv_from_file(self):
        tsv_path = f"{self.base_path}/test.tsv"

        with self.fs.open(tsv_path, "rb") as f:
            with gzip.GzipFile(fileobj=f) as gz:
                source = io.BytesIO(gz.read())

        kwargs = content_type_to_reader_kwargs(ContentType.TSV.value)
        kwargs["new_columns"] = ["col1", "col2", "col3"]

        result = pl.read_csv(source, **kwargs)

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]

    def test_read_psv_from_file(self):
        psv_path = f"{self.base_path}/test.psv"

        with self.fs.open(psv_path, "rb") as f:
            with gzip.GzipFile(fileobj=f) as gz:
                source = io.BytesIO(gz.read())

        kwargs = content_type_to_reader_kwargs(ContentType.PSV.value)
        kwargs["new_columns"] = ["col1", "col2", "col3"]

        result = pl.read_csv(source, **kwargs)

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]

    def test_read_parquet_from_file(self):
        parquet_path = f"{self.base_path}/test.parquet"
        result = pl.read_parquet(parquet_path)

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result.equals(self.df)

    def test_read_feather_from_file(self):
        feather_path = f"{self.base_path}/test.feather"
        result = pl.read_ipc(feather_path)

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result.equals(self.df)

    def test_read_json_from_file(self):
        json_path = f"{self.base_path}/test.json"

        with self.fs.open(json_path, "rb") as f:
            with gzip.GzipFile(fileobj=f) as gz:
                source = io.BytesIO(gz.read())

        result = pl.read_ndjson(source)

        assert len(result) == 3
        assert set(result.columns) == {"col1", "col2", "col3"}
        assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]

    def test_read_avro_from_file(self):
        avro_path = f"{self.base_path}/test.avro"
        result = pl.read_avro(avro_path)

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result.equals(self.df)

    def test_read_orc_from_file(self):
        # Test ORC reading via pandas conversion
        orc_path = f"{self.base_path}/test.orc"

        # Read with pandas and convert to polars (mimicking our ORC handling)
        import pandas as pd

        pd_df = pd.read_orc(orc_path)
        result = pl.from_pandas(pd_df)

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        # Convert both to pandas for comparison due to potential type differences
        pd.testing.assert_frame_equal(result.to_pandas(), self.df.to_pandas())

    def test_read_kwargs_provider_string_types(self):
        # Test the string types provider
        provider = ReadKwargsProviderPolarsStringTypes()
        kwargs = {"separator": ",", "has_header": False}

        # Apply string types
        result_kwargs = provider._get_kwargs(ContentType.CSV.value, kwargs)

        # Should add infer_schema=False for string type inference
        assert "infer_schema" in result_kwargs
        assert result_kwargs["infer_schema"] is False

    def test_concat_dataframes(self):
        # Test concatenation of multiple dataframes
        df1 = pl.DataFrame({"col1": ["a"], "col2": [1]})
        df2 = pl.DataFrame({"col1": ["b"], "col2": [2]})
        df3 = pl.DataFrame({"col1": ["c"], "col2": [3]})

        # Test normal concatenation
        result = concat_dataframes([df1, df2, df3])
        assert len(result) == 3
        assert result["col1"].to_list() == ["a", "b", "c"]

        # Test single dataframe
        result = concat_dataframes([df1])
        assert result.equals(df1)

        # Test empty list
        result = concat_dataframes([])
        assert result is None

        # Test None input
        result = concat_dataframes(None)
        assert result is None

    def test_file_to_dataframe_csv(self):
        # Test reading CSV with file_to_dataframe
        csv_path = f"{self.base_path}/test.csv"

        result = file_to_dataframe(
            csv_path,
            ContentType.CSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
            column_names=["col1", "col2", "col3"],
        )

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]

    def test_file_to_dataframe_tsv(self):
        # Test reading TSV with file_to_dataframe
        tsv_path = f"{self.base_path}/test.tsv"

        result = file_to_dataframe(
            tsv_path,
            ContentType.TSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
            column_names=["col1", "col2", "col3"],
        )

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]

    def test_file_to_dataframe_parquet(self):
        # Test reading Parquet with file_to_dataframe
        parquet_path = f"{self.base_path}/test.parquet"

        result = file_to_dataframe(
            parquet_path, ContentType.PARQUET.value, filesystem=self.fs
        )

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result.equals(self.df)

    def test_file_to_dataframe_feather(self):
        # Test reading Feather with file_to_dataframe
        feather_path = f"{self.base_path}/test.feather"

        result = file_to_dataframe(
            feather_path, ContentType.FEATHER.value, filesystem=self.fs
        )

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result.equals(self.df)

    def test_file_to_dataframe_json(self):
        # Test reading JSON with file_to_dataframe
        json_path = f"{self.base_path}/test.json"

        result = file_to_dataframe(
            json_path,
            ContentType.JSON.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
        )

        assert len(result) == 3
        assert set(result.columns) == {"col1", "col2", "col3"}
        assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]

    def test_file_to_dataframe_avro(self):
        # Test reading Avro with file_to_dataframe
        avro_path = f"{self.base_path}/test.avro"

        result = file_to_dataframe(
            avro_path, ContentType.AVRO.value, filesystem=self.fs
        )

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result.equals(self.df)

    def test_file_to_dataframe_orc(self):
        # Test reading ORC with file_to_dataframe
        orc_path = f"{self.base_path}/test.orc"

        result = file_to_dataframe(orc_path, ContentType.ORC.value, filesystem=self.fs)

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        # Convert both to pandas for comparison due to potential type differences
        pd.testing.assert_frame_equal(result.to_pandas(), self.df.to_pandas())

    def test_file_to_dataframe_with_column_selection(self):
        # Test reading with column selection
        csv_path = f"{self.base_path}/test.csv"

        # When has_header=False and we specify columns, we need to use column indices or
        # not provide new_columns. Let's test by just specifying the first 2 columns by index
        result = file_to_dataframe(
            csv_path,
            ContentType.CSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
            include_columns=[0, 1],  # Select first two columns by index
        )

        assert len(result) == 3
        assert len(result.columns) == 2  # Should only have 2 columns
        # With auto-generated column names when has_header=False
        assert list(result.columns) == ["column_1", "column_2"]

    def test_file_to_dataframe_with_kwargs_provider(self):
        # Test reading with kwargs provider
        csv_path = f"{self.base_path}/test.csv"
        provider = ReadKwargsProviderPolarsStringTypes(
            include_columns=["column_1", "column_2", "column_3"]
        )

        result = file_to_dataframe(
            csv_path,
            ContentType.CSV.value,
            ContentEncoding.GZIP.value,
            filesystem=self.fs,
            pl_read_func_kwargs_provider=provider,
        )

        assert len(result) == 3
        assert list(result.columns) == ["column_1", "column_2", "column_3"]
        # With string types provider, all columns should be strings
        assert all(result[col].dtype == pl.Utf8 for col in result.columns)

    def test_file_to_dataframe_filesystem_inference(self):
        # Test filesystem inference when no filesystem is provided
        parquet_path = f"{self.base_path}/test.parquet"

        result = file_to_dataframe(
            parquet_path,
            ContentType.PARQUET.value
            # No filesystem provided - should be inferred
        )

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result.equals(self.df)

    def test_file_to_dataframe_unsupported_content_type(self):
        # Test error handling for unsupported content type
        parquet_path = f"{self.base_path}/test.parquet"

        with self.assertRaises(NotImplementedError) as context:
            file_to_dataframe(
                parquet_path, "unsupported/content-type", filesystem=self.fs
            )

        assert "not implemented" in str(context.exception)

    def test_file_to_dataframe_bzip2_compression(self):
        # Test BZIP2 compression handling
        import bz2

        # Create a BZIP2 compressed CSV file
        csv_content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
        compressed_content = bz2.compress(csv_content.encode("utf-8"))

        bz2_path = f"{self.base_path}/test.csv.bz2"
        with self.fs.open(bz2_path, "wb") as f:
            f.write(compressed_content)

        result = file_to_dataframe(
            bz2_path,
            ContentType.CSV.value,
            ContentEncoding.BZIP2.value,
            filesystem=self.fs,
            column_names=["col1", "col2", "col3"],
        )

        assert len(result) == 3
        assert list(result.columns) == ["col1", "col2", "col3"]
        assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]


class TestPolarsFileSystemSupport(TestCase):
    """
    Comprehensive tests for encoding-aware reader functions with different filesystem types.
    Tests fsspec AbstractFileSystem, PyArrow FileSystem, and auto-inferred filesystem.
    """

    def setUp(self):
        import pyarrow.fs as pafs

        # Create test data
        self.test_data = pl.DataFrame(
            {
                "col1": ["value1", "value2", "value3"],
                "col2": [1, 2, 3],
                "col3": [1.1, 2.2, 3.3],
            }
        )

        # Set up temporary directory
        self.temp_dir = tempfile.mkdtemp()

        # Set up different filesystem types
        self.fsspec_fs = fsspec.filesystem("file")
        self.pyarrow_fs = pafs.LocalFileSystem()

        # Create test files for each content type
        self._create_test_files()

    def tearDown(self):
        import shutil

        shutil.rmtree(self.temp_dir)

    def _create_test_files(self):
        """Create test files in different formats with different compression types."""
        import gzip
        import bz2

        # CSV files
        csv_data = "col1,col2,col3\nvalue1,1,1.1\nvalue2,2,2.2\nvalue3,3,3.3\n"

        # Create uncompressed CSV
        with open(f"{self.temp_dir}/test.csv", "w") as f:
            f.write(csv_data)

        # Create GZIP compressed CSV (fix: properly close the file)
        with gzip.open(f"{self.temp_dir}/test_gzip.csv.gz", "wt") as f:
            f.write(csv_data)

        # Create BZIP2 compressed CSV (fix: properly close the file)
        with bz2.open(f"{self.temp_dir}/test_bzip2.csv.bz2", "wt") as f:
            f.write(csv_data)

        # Parquet file
        self.test_data.write_parquet(f"{self.temp_dir}/test.parquet")

        # Feather/IPC file
        self.test_data.write_ipc(f"{self.temp_dir}/test.feather")

        # JSON file (NDJSON)
        json_data = '{"col1":"value1","col2":1,"col3":1.1}\n{"col1":"value2","col2":2,"col3":2.2}\n{"col1":"value3","col2":3,"col3":3.3}\n'
        with open(f"{self.temp_dir}/test.json", "w") as f:
            f.write(json_data)

        # AVRO file
        self.test_data.write_avro(f"{self.temp_dir}/test.avro")

        # ORC file (via pandas since polars delegates to pandas for ORC)
        self.test_data.to_pandas().to_orc(f"{self.temp_dir}/test.orc")

    def _assert_dataframes_equal(self, result, expected):
        """Helper to assert polars dataframes are equal."""
        # Convert to pandas for comparison since polars equality can be tricky with floating point
        pd.testing.assert_frame_equal(
            result.to_pandas().reset_index(drop=True),
            expected.to_pandas().reset_index(drop=True),
            check_dtype=False,  # Allow minor type differences
        )

    def test_csv_with_fsspec_filesystem(self):
        """Test CSV reading with fsspec AbstractFileSystem."""
        from deltacat.utils.polars import read_csv

        # Test uncompressed CSV
        result = read_csv(
            f"{self.temp_dir}/test.csv",
            filesystem=self.fsspec_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
            has_header=True,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test GZIP compressed CSV
        result = read_csv(
            f"{self.temp_dir}/test_gzip.csv.gz",
            filesystem=self.fsspec_fs,
            content_encoding=ContentEncoding.GZIP.value,
            has_header=True,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test BZIP2 compressed CSV
        result = read_csv(
            f"{self.temp_dir}/test_bzip2.csv.bz2",
            filesystem=self.fsspec_fs,
            content_encoding=ContentEncoding.BZIP2.value,
            has_header=True,
        )
        self._assert_dataframes_equal(result, self.test_data)

    def test_csv_with_pyarrow_filesystem(self):
        """Test CSV reading with PyArrow FileSystem."""
        from deltacat.utils.polars import read_csv

        # Test uncompressed CSV
        result = read_csv(
            f"{self.temp_dir}/test.csv",
            filesystem=self.pyarrow_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
            has_header=True,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test GZIP compressed CSV
        result = read_csv(
            f"{self.temp_dir}/test_gzip.csv.gz",
            filesystem=self.pyarrow_fs,
            content_encoding=ContentEncoding.GZIP.value,
            has_header=True,
        )
        self._assert_dataframes_equal(result, self.test_data)

    def test_csv_with_auto_inferred_filesystem(self):
        """Test CSV reading with automatically inferred filesystem."""
        from deltacat.utils.polars import read_csv

        # Test uncompressed CSV (filesystem=None, should auto-infer)
        result = read_csv(
            f"{self.temp_dir}/test.csv",
            filesystem=None,
            content_encoding=ContentEncoding.IDENTITY.value,
            has_header=True,
        )
        self._assert_dataframes_equal(result, self.test_data)

    def test_parquet_with_different_filesystems(self):
        """Test Parquet reading with different filesystem types."""
        from deltacat.utils.polars import read_parquet

        # Test with fsspec
        result = read_parquet(
            f"{self.temp_dir}/test.parquet",
            filesystem=self.fsspec_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test with PyArrow
        result = read_parquet(
            f"{self.temp_dir}/test.parquet",
            filesystem=self.pyarrow_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test with auto-inferred
        result = read_parquet(
            f"{self.temp_dir}/test.parquet",
            filesystem=None,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

    def test_feather_with_different_filesystems(self):
        """Test Feather/IPC reading with different filesystem types."""
        from deltacat.utils.polars import read_ipc

        # Test with fsspec
        result = read_ipc(
            f"{self.temp_dir}/test.feather",
            filesystem=self.fsspec_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test with PyArrow
        result = read_ipc(
            f"{self.temp_dir}/test.feather",
            filesystem=self.pyarrow_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test with auto-inferred
        result = read_ipc(
            f"{self.temp_dir}/test.feather",
            filesystem=None,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

    def test_json_with_different_filesystems(self):
        """Test JSON reading with different filesystem types."""
        from deltacat.utils.polars import read_ndjson

        # Test with fsspec
        result = read_ndjson(
            f"{self.temp_dir}/test.json",
            filesystem=self.fsspec_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test with PyArrow
        result = read_ndjson(
            f"{self.temp_dir}/test.json",
            filesystem=self.pyarrow_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test with auto-inferred
        result = read_ndjson(
            f"{self.temp_dir}/test.json",
            filesystem=None,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

    def test_avro_with_different_filesystems(self):
        """Test AVRO reading with different filesystem types."""
        from deltacat.utils.polars import read_avro

        # Test with fsspec
        result = read_avro(
            f"{self.temp_dir}/test.avro",
            filesystem=self.fsspec_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test with PyArrow
        result = read_avro(
            f"{self.temp_dir}/test.avro",
            filesystem=self.pyarrow_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test with auto-inferred
        result = read_avro(
            f"{self.temp_dir}/test.avro",
            filesystem=None,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

    def test_orc_with_different_filesystems(self):
        """Test ORC reading with different filesystem types."""
        from deltacat.utils.polars import read_orc

        # Test with fsspec
        result = read_orc(
            f"{self.temp_dir}/test.orc",
            filesystem=self.fsspec_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test with PyArrow
        result = read_orc(
            f"{self.temp_dir}/test.orc",
            filesystem=self.pyarrow_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

        # Test with auto-inferred
        result = read_orc(
            f"{self.temp_dir}/test.orc",
            filesystem=None,
            content_encoding=ContentEncoding.IDENTITY.value,
        )
        self._assert_dataframes_equal(result, self.test_data)

    def test_file_to_dataframe_with_different_filesystems(self):
        """Test file_to_dataframe with different filesystem types for all content types."""
        test_cases = [
            (
                f"{self.temp_dir}/test.csv",
                ContentType.CSV.value,
                ContentEncoding.IDENTITY.value,
                {"has_header": True},
            ),
            (
                f"{self.temp_dir}/test_gzip.csv.gz",
                ContentType.CSV.value,
                ContentEncoding.GZIP.value,
                {"has_header": True},
            ),
            (
                f"{self.temp_dir}/test.parquet",
                ContentType.PARQUET.value,
                ContentEncoding.IDENTITY.value,
                {},
            ),
            (
                f"{self.temp_dir}/test.feather",
                ContentType.FEATHER.value,
                ContentEncoding.IDENTITY.value,
                {},
            ),
            (
                f"{self.temp_dir}/test.json",
                ContentType.JSON.value,
                ContentEncoding.IDENTITY.value,
                {},
            ),
            (
                f"{self.temp_dir}/test.avro",
                ContentType.AVRO.value,
                ContentEncoding.IDENTITY.value,
                {},
            ),
            (
                f"{self.temp_dir}/test.orc",
                ContentType.ORC.value,
                ContentEncoding.IDENTITY.value,
                {},
            ),
        ]

        filesystems = [
            ("fsspec", self.fsspec_fs),
            ("pyarrow", self.pyarrow_fs),
            ("auto-inferred", None),
        ]

        for path, content_type, content_encoding, extra_kwargs in test_cases:
            for fs_name, filesystem in filesystems:
                with self.subTest(
                    content_type=content_type,
                    filesystem=fs_name,
                    encoding=content_encoding,
                ):
                    result = file_to_dataframe(
                        path=path,
                        content_type=content_type,
                        content_encoding=content_encoding,
                        filesystem=filesystem,
                        **extra_kwargs,
                    )
                    self._assert_dataframes_equal(result, self.test_data)

    def test_compression_encoding_with_different_filesystems(self):
        """Test that compression encoding works correctly with different filesystem types."""
        test_cases = [
            (f"{self.temp_dir}/test.csv", ContentEncoding.IDENTITY.value),
            (f"{self.temp_dir}/test_gzip.csv.gz", ContentEncoding.GZIP.value),
            (f"{self.temp_dir}/test_bzip2.csv.bz2", ContentEncoding.BZIP2.value),
        ]

        filesystems = [
            ("fsspec", self.fsspec_fs),
            ("pyarrow", self.pyarrow_fs),
            ("auto-inferred", None),
        ]

        for path, content_encoding in test_cases:
            for fs_name, filesystem in filesystems:
                with self.subTest(encoding=content_encoding, filesystem=fs_name):
                    result = file_to_dataframe(
                        path=path,
                        content_type=ContentType.CSV.value,
                        content_encoding=content_encoding,
                        filesystem=filesystem,
                        has_header=True,
                    )
                    self._assert_dataframes_equal(result, self.test_data)

    def test_filesystem_open_kwargs(self):
        """Test that filesystem open kwargs are properly passed through."""
        from deltacat.utils.polars import read_csv

        # Test with custom fs_open_kwargs
        result = read_csv(
            f"{self.temp_dir}/test.csv",
            filesystem=self.fsspec_fs,
            content_encoding=ContentEncoding.IDENTITY.value,
            fs_open_kwargs={
                "encoding": "utf-8"
            },  # This should be passed to filesystem.open()
            has_header=True,
        )
        self._assert_dataframes_equal(result, self.test_data)