deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/tests/utils/test_pandas.py
@@ -0,0 +1,1106 @@
+from unittest import TestCase
+import csv
+import pandas as pd
+import tempfile
+import fsspec
+import gzip
+import json
+import polars as pl
+from deltacat.types.media import ContentType, ContentEncoding
+from deltacat.utils.pandas import (
+    dataframe_to_file,
+    file_to_dataframe,
+    content_type_to_reader_kwargs,
+    _add_column_kwargs,
+    ReadKwargsProviderPandasCsvPureUtf8,
+    concat_dataframes,
+)
+
+
+class TestPandasWriters(TestCase):
+    def setUp(self):
+        # Create a test DataFrame with data that includes delimiters
+        self.df = pd.DataFrame({"col1": ["a,b\tc|d", "e,f\tg|h"], "col2": [1, 2]})
+        self.fs = fsspec.filesystem("file")
+        self.base_path = tempfile.mkdtemp()
+        self.fs.makedirs(self.base_path, exist_ok=True)
+
+    def tearDown(self):
+        self.fs.rm(self.base_path, recursive=True)
+
+    def test_write_feather(self):
+        path = f"{self.base_path}/test.feather"
+
+        dataframe_to_file(
+            self.df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.FEATHER.value,
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content
+        result = pd.read_feather(path)
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_write_csv(self):
+        path = f"{self.base_path}/test.csv"
+
+        dataframe_to_file(
+            self.df, path, self.fs, lambda x: path, content_type=ContentType.CSV.value
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content (should be GZIP compressed)
+        with self.fs.open(path, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to commas in data
+                assert '"a,b\tc|d",1' in content
+                assert '"e,f\tg|h",2' in content
+
+    def test_write_tsv(self):
+        path = f"{self.base_path}/test.tsv"
+
+        dataframe_to_file(
+            self.df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.TSV.value,
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content (should be GZIP compressed)
+        with self.fs.open(path, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to tabs in data
+                assert '"a,b\tc|d"\t1' in content
+                assert '"e,f\tg|h"\t2' in content
+
+    def test_write_psv(self):
+        path = f"{self.base_path}/test.psv"
+
+        dataframe_to_file(
+            self.df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.PSV.value,
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content (should be GZIP compressed)
+        with self.fs.open(path, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to pipes in data
+                assert '"a,b\tc|d"|1' in content
+                assert '"e,f\tg|h"|2' in content
+
+    def test_write_unescaped_tsv(self):
+        # Create DataFrame without delimiters for unescaped TSV
+        df = pd.DataFrame({"col1": ["abc", "def"], "col2": [1, 2]})
+        path = f"{self.base_path}/test.tsv"
+
+        dataframe_to_file(
+            df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.UNESCAPED_TSV.value,
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content (should be GZIP compressed)
+        with self.fs.open(path, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # With quoting_style="none", strings should not be quoted
+                assert "abc\t1" in content
+                assert "def\t2" in content
+
+    def test_write_orc(self):
+        path = f"{self.base_path}/test.orc"
+
+        dataframe_to_file(
+            self.df, path, self.fs, lambda x: path, content_type=ContentType.ORC.value
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content
+        result = pd.read_orc(path)
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_write_parquet(self):
+        path = f"{self.base_path}/test.parquet"
+
+        dataframe_to_file(
+            self.df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.PARQUET.value,
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content
+        result = pd.read_parquet(path)
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_write_json(self):
+        path = f"{self.base_path}/test.json"
+
+        dataframe_to_file(
+            self.df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.JSON.value,
+            orient="records",  # Write each record as a separate JSON object
+            lines=True,  # This should create NDJSON format
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content (should be GZIP compressed NDJSON format)
+        with self.fs.open(path, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8").strip()
+                # Content should be NDJSON format: each line is a separate JSON object
+                lines = content.split("\n")
+                assert len(lines) == 2  # 2 records
+
+                # Parse each line as a separate JSON object
+                data = [json.loads(line) for line in lines]
+                assert data[0] == {"col1": "a,b\tc|d", "col2": 1}
+                assert data[1] == {"col1": "e,f\tg|h", "col2": 2}
+
+    def test_write_avro(self):
+        path = f"{self.base_path}/test.avro"
+
+        dataframe_to_file(
+            self.df, path, self.fs, lambda x: path, content_type=ContentType.AVRO.value
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content by reading with polars
+        result = pl.read_avro(path).to_pandas()
+        pd.testing.assert_frame_equal(result, self.df)
+
+
+class TestPandasReaders(TestCase):
+    def setUp(self):
+        # Create test data files for reading
+        self.fs = fsspec.filesystem("file")
+        self.base_path = tempfile.mkdtemp()
+        self.fs.makedirs(self.base_path, exist_ok=True)
+
+        # Create test DataFrame
+        self.df = pd.DataFrame(
+            {
+                "col1": ["a,b\tc|d", "e,f\tg|h", "test"],
+                "col2": [1, 2, 3],
+                "col3": [1.1, 2.2, 3.3],
+            }
+        )
+
+        # Write test files in different formats
+        self._create_test_files()
+
+    def tearDown(self):
+        self.fs.rm(self.base_path, recursive=True)
+
+    def _create_test_files(self):
+        """Create test files in different formats with different compression types."""
+        import gzip
+
+        # Create CSV file (GZIP compressed)
+        csv_path = f"{self.base_path}/test.csv"
+        with self.fs.open(csv_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
+                gz.write(content.encode("utf-8"))
+
+        # Create TSV file (GZIP compressed)
+        tsv_path = f"{self.base_path}/test.tsv"
+        with self.fs.open(tsv_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                content = '"a,b\tc|d"\t1\t1.1\n"e,f\tg|h"\t2\t2.2\ntest\t3\t3.3\n'
+                gz.write(content.encode("utf-8"))
+
+        # Create PSV file (GZIP compressed)
+        psv_path = f"{self.base_path}/test.psv"
+        with self.fs.open(psv_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                content = '"a,b\tc|d"|1|1.1\n"e,f\tg|h"|2|2.2\ntest|3|3.3\n'
+                gz.write(content.encode("utf-8"))
+
+        # Create unescaped TSV file (GZIP compressed)
+        unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
+        pd.DataFrame(
+            {"col1": ["abc", "def", "ghi"], "col2": [1, 2, 3], "col3": [1.1, 2.2, 3.3]}
+        )
+        with self.fs.open(unescaped_tsv_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                content = "abc\t1\t1.1\ndef\t2\t2.2\nghi\t3\t3.3\n"
+                gz.write(content.encode("utf-8"))
+
+        # Create Parquet file
+        parquet_path = f"{self.base_path}/test.parquet"
+        self.df.to_parquet(parquet_path, index=False)
+
+        # Create Feather file
+        feather_path = f"{self.base_path}/test.feather"
+        self.df.to_feather(feather_path)
+
+        # Create JSON file (GZIP compressed, NDJSON format)
+        json_path = f"{self.base_path}/test.json"
+        with self.fs.open(json_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                json_str = self.df.to_json(orient="records", lines=True)
+                gz.write(json_str.encode("utf-8"))
+
+        # Create Avro file using polars (since pandas delegates to polars for Avro)
+        avro_path = f"{self.base_path}/test.avro"
+        pl_df = pl.from_pandas(self.df)
+        pl_df.write_avro(avro_path)
+
+        # Create ORC file
+        orc_path = f"{self.base_path}/test.orc"
+        self.df.to_orc(orc_path, index=False)
+
+    def test_content_type_to_reader_kwargs(self):
+        # Test CSV kwargs
+        csv_kwargs = content_type_to_reader_kwargs(ContentType.CSV.value)
+        expected_csv = {"sep": ",", "header": None}
+        assert csv_kwargs == expected_csv
+
+        # Test TSV kwargs
+        tsv_kwargs = content_type_to_reader_kwargs(ContentType.TSV.value)
+        expected_tsv = {"sep": "\t", "header": None}
+        assert tsv_kwargs == expected_tsv
+
+        # Test PSV kwargs
+        psv_kwargs = content_type_to_reader_kwargs(ContentType.PSV.value)
+        expected_psv = {"sep": "|", "header": None}
+        assert psv_kwargs == expected_psv
+
+        # Test unescaped TSV kwargs
+        unescaped_kwargs = content_type_to_reader_kwargs(
+            ContentType.UNESCAPED_TSV.value
+        )
+        expected_unescaped = {
+            "sep": "\t",
+            "header": None,
+            "na_values": [""],
+            "keep_default_na": False,
+            "quoting": csv.QUOTE_NONE,
+        }
+        assert unescaped_kwargs == expected_unescaped
+
+        # Test Parquet kwargs (should be empty)
+        parquet_kwargs = content_type_to_reader_kwargs(ContentType.PARQUET.value)
+        assert parquet_kwargs == {}
+
+        # Test Avro kwargs (should be empty)
+        avro_kwargs = content_type_to_reader_kwargs(ContentType.AVRO.value)
+        assert avro_kwargs == {}
+
+    def test_add_column_kwargs(self):
+        kwargs = {}
+        column_names = ["col1", "col2", "col3"]
+        include_columns = ["col1", "col2"]
+
+        # Test CSV column kwargs
+        _add_column_kwargs(ContentType.CSV.value, column_names, include_columns, kwargs)
+        assert kwargs["names"] == column_names
+        assert kwargs["usecols"] == include_columns
+
+        # Test Parquet column kwargs
+        kwargs = {}
+        _add_column_kwargs(
+            ContentType.PARQUET.value, column_names, include_columns, kwargs
+        )
+        assert kwargs["columns"] == include_columns
+        assert "names" not in kwargs
+
+    def test_file_to_dataframe_csv(self):
+        # Test reading CSV with file_to_dataframe
+        csv_path = f"{self.base_path}/test.csv"
+
+        result = file_to_dataframe(
+            csv_path,
+            ContentType.CSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_tsv(self):
+        # Test reading TSV with file_to_dataframe
+        tsv_path = f"{self.base_path}/test.tsv"
+
+        result = file_to_dataframe(
+            tsv_path,
+            ContentType.TSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_psv(self):
+        # Test reading PSV with file_to_dataframe
+        psv_path = f"{self.base_path}/test.psv"
+
+        result = file_to_dataframe(
+            psv_path,
+            ContentType.PSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_unescaped_tsv(self):
+        # Test reading unescaped TSV with file_to_dataframe
+        unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
+
+        result = file_to_dataframe(
+            unescaped_tsv_path,
+            ContentType.UNESCAPED_TSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        assert result["col1"].tolist() == ["abc", "def", "ghi"]
+
+    def test_file_to_dataframe_parquet(self):
+        # Test reading Parquet with file_to_dataframe
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_dataframe(
+            parquet_path, ContentType.PARQUET.value, filesystem=self.fs
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_file_to_dataframe_feather(self):
+        # Test reading Feather with file_to_dataframe
+        feather_path = f"{self.base_path}/test.feather"
+
+        result = file_to_dataframe(
+            feather_path, ContentType.FEATHER.value, filesystem=self.fs
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_file_to_dataframe_json(self):
+        # Test reading JSON with file_to_dataframe
+        json_path = f"{self.base_path}/test.json"
+
+        result = file_to_dataframe(
+            json_path,
+            ContentType.JSON.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+        )
+
+        assert len(result) == 3
+        assert set(result.columns) == {"col1", "col2", "col3"}
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_avro(self):
+        # Test reading Avro with file_to_dataframe
+        avro_path = f"{self.base_path}/test.avro"
+
+        result = file_to_dataframe(
+            avro_path, ContentType.AVRO.value, filesystem=self.fs
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        # Avro may have different dtypes, so compare values
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_orc(self):
+        # Test reading ORC with file_to_dataframe
+        orc_path = f"{self.base_path}/test.orc"
+
+        result = file_to_dataframe(orc_path, ContentType.ORC.value, filesystem=self.fs)
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_file_to_dataframe_with_column_selection(self):
+        # Test reading with column selection
+        csv_path = f"{self.base_path}/test.csv"
+
+        result = file_to_dataframe(
+            csv_path,
+            ContentType.CSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+            include_columns=["col1", "col2"],
+        )
+
+        assert len(result) == 3
+        assert len(result.columns) == 2  # Should only have 2 columns
+        assert list(result.columns) == ["col1", "col2"]
+
+    def test_file_to_dataframe_with_kwargs_provider(self):
+        # Test reading with kwargs provider
+        csv_path = f"{self.base_path}/test.csv"
+        provider = ReadKwargsProviderPandasCsvPureUtf8(
+            include_columns=["col1", "col2", "col3"]
+        )
+
+        result = file_to_dataframe(
+            csv_path,
+            ContentType.CSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+            pd_read_func_kwargs_provider=provider,
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        # With string types provider, all columns should be strings
+        assert all(result[col].dtype == "object" for col in result.columns)
+
+    def test_file_to_dataframe_filesystem_inference(self):
+        # Test filesystem inference when no filesystem is provided
+        # Use JSON file since Parquet requires seekable files
+        json_path = f"{self.base_path}/test.json"
+
+        result = file_to_dataframe(
+            json_path,
+            ContentType.JSON.value,
+            ContentEncoding.GZIP.value
+            # No filesystem provided - should be inferred
+        )
+
+        assert len(result) == 3
+        assert set(result.columns) == {"col1", "col2", "col3"}
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_unsupported_content_type(self):
+        # Test error handling for unsupported content type
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(NotImplementedError) as context:
+            file_to_dataframe(
+                parquet_path, "unsupported/content-type", filesystem=self.fs
+            )
+
+        assert "not implemented" in str(context.exception)
+
+    def test_file_to_dataframe_bzip2_compression(self):
+        # Test BZIP2 compression handling
+        import bz2
+
+        # Create a BZIP2 compressed CSV file
+        csv_content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
+        compressed_content = bz2.compress(csv_content.encode("utf-8"))
+
+        bz2_path = f"{self.base_path}/test.csv.bz2"
+        with self.fs.open(bz2_path, "wb") as f:
+            f.write(compressed_content)
+
+        result = file_to_dataframe(
+            bz2_path,
+            ContentType.CSV.value,
+            ContentEncoding.BZIP2.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_concat_dataframes(self):
+        # Test concatenation of multiple dataframes
+        df1 = pd.DataFrame({"col1": ["a"], "col2": [1]})
+        df2 = pd.DataFrame({"col1": ["b"], "col2": [2]})
+        df3 = pd.DataFrame({"col1": ["c"], "col2": [3]})
+
+        # Test normal concatenation
+        result = concat_dataframes([df1, df2, df3])
+        assert len(result) == 3
+        assert result["col1"].tolist() == ["a", "b", "c"]
+
+        # Test single dataframe
+        result = concat_dataframes([df1])
+        pd.testing.assert_frame_equal(result, df1)
+
+        # Test empty list
+        result = concat_dataframes([])
+        assert result is None
+
+        # Test None input
+        result = concat_dataframes(None)
+        assert result is None
+
+
+class TestPandasFileSystemSupport(TestCase):
+    """
+    Comprehensive tests for encoding-aware reader functions with different filesystem types.
+    Tests fsspec AbstractFileSystem, PyArrow FileSystem, and auto-inferred filesystem.
+    """
+
+    def setUp(self):
+        import pyarrow.fs as pafs
+
+        # Create test data
+        self.test_data = pd.DataFrame(
+            {
+                "col1": ["value1", "value2", "value3"],
+                "col2": [1, 2, 3],
+                "col3": [1.1, 2.2, 3.3],
+            }
+        )
+
+        # Set up temporary directory
+        self.temp_dir = tempfile.mkdtemp()
+
+        # Set up different filesystem types
+        self.fsspec_fs = fsspec.filesystem("file")
+        self.pyarrow_fs = pafs.LocalFileSystem()
+
+        # Create test files for each content type
+        self._create_test_files()
+
+    def tearDown(self):
+        import shutil
+
+        shutil.rmtree(self.temp_dir)
+
+    def _create_test_files(self):
+        """Create test files in different formats with different compression types."""
+        import gzip
+        import bz2
+
+        # CSV files without headers to match test data structure
+        csv_data = "value1,1,1.1\nvalue2,2,2.2\nvalue3,3,3.3\n"
+
+        # Create uncompressed CSV
+        with open(f"{self.temp_dir}/test.csv", "w") as f:
+            f.write(csv_data)
+
+        # Create GZIP compressed CSV
+        with gzip.open(f"{self.temp_dir}/test_gzip.csv.gz", "wt") as f:
+            f.write(csv_data)
+
+        # Create BZIP2 compressed CSV
+        with bz2.open(f"{self.temp_dir}/test_bzip2.csv.bz2", "wt") as f:
+            f.write(csv_data)
+
+        # Parquet file
+        self.test_data.to_parquet(f"{self.temp_dir}/test.parquet", index=False)
+
+        # Feather file
+        self.test_data.to_feather(f"{self.temp_dir}/test.feather")
+
+        # JSON file (GZIP compressed, NDJSON format)
+        json_path = f"{self.temp_dir}/test.json"
+        with self.fsspec_fs.open(json_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                json_str = self.test_data.to_json(orient="records", lines=True)
+                gz.write(json_str.encode("utf-8"))
+
+        # AVRO file (using polars since pandas delegates to polars for AVRO)
+        import polars as pl
+
+        pl_df = pl.from_pandas(self.test_data)
+        pl_df.write_avro(f"{self.temp_dir}/test.avro")
+
+        # ORC file
+        self.test_data.to_orc(f"{self.temp_dir}/test.orc")
+
+    def _assert_dataframes_equal(self, result, expected):
+        """Helper to assert pandas dataframes are equal."""
+        pd.testing.assert_frame_equal(
+            result.reset_index(drop=True),
+            expected.reset_index(drop=True),
+            check_dtype=False,  # Allow minor type differences
+        )
+
+    def test_csv_with_fsspec_filesystem(self):
+        """Test CSV reading with fsspec AbstractFileSystem."""
+        from deltacat.utils.pandas import read_csv
+
+        # Test uncompressed CSV
+        result = read_csv(
+            f"{self.temp_dir}/test.csv",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test GZIP compressed CSV
+        result = read_csv(
+            f"{self.temp_dir}/test_gzip.csv.gz",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.GZIP.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test BZIP2 compressed CSV
+        result = read_csv(
+            f"{self.temp_dir}/test_bzip2.csv.bz2",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.BZIP2.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_csv_with_pyarrow_filesystem(self):
+        """Test CSV reading with PyArrow FileSystem."""
+        from deltacat.utils.pandas import read_csv
+
+        # Test uncompressed CSV
+        result = read_csv(
+            f"{self.temp_dir}/test.csv",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test GZIP compressed CSV
+        result = read_csv(
+            f"{self.temp_dir}/test_gzip.csv.gz",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.GZIP.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_csv_with_auto_inferred_filesystem(self):
+        """Test CSV reading with automatically inferred filesystem."""
+        from deltacat.utils.pandas import read_csv
+
+        # Test uncompressed CSV (filesystem=None, should auto-infer)
+        result = read_csv(
+            f"{self.temp_dir}/test.csv",
+            filesystem=None,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_parquet_with_different_filesystems(self):
+        """Test Parquet reading with different filesystem types."""
+        from deltacat.utils.pandas import read_parquet
+
+        # Test with fsspec
+        result = read_parquet(
+            f"{self.temp_dir}/test.parquet",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with PyArrow
+        result = read_parquet(
+            f"{self.temp_dir}/test.parquet",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with auto-inferred
+        result = read_parquet(
+            f"{self.temp_dir}/test.parquet",
+            filesystem=None,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_feather_with_different_filesystems(self):
+        """Test Feather reading with different filesystem types."""
+        from deltacat.utils.pandas import read_feather
+
+        # Test with fsspec
+        result = read_feather(
+            f"{self.temp_dir}/test.feather",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with PyArrow
+        result = read_feather(
+            f"{self.temp_dir}/test.feather",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with auto-inferred
+        result = read_feather(
+            f"{self.temp_dir}/test.feather",
+            filesystem=None,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_json_with_different_filesystems(self):
+        """Test JSON reading with different filesystem types."""
+        from deltacat.utils.pandas import read_json
+
+        # Test with fsspec
+        result = read_json(
+            f"{self.temp_dir}/test.json",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.GZIP.value,
+            lines=True,  # Required for NDJSON format
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with PyArrow
+        result = read_json(
+            f"{self.temp_dir}/test.json",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.GZIP.value,
+            lines=True,  # Required for NDJSON format
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with auto-inferred
+        result = read_json(
+            f"{self.temp_dir}/test.json",
+            filesystem=None,
+            content_encoding=ContentEncoding.GZIP.value,
+            lines=True,  # Required for NDJSON format
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_avro_with_different_filesystems(self):
+        """Test AVRO reading with different filesystem types."""
+        from deltacat.utils.pandas import read_avro
+
+        # Test with fsspec
+        result = read_avro(
+            f"{self.temp_dir}/test.avro",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with PyArrow
+        result = read_avro(
+            f"{self.temp_dir}/test.avro",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with auto-inferred
+        result = read_avro(
+            f"{self.temp_dir}/test.avro",
+            filesystem=None,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_orc_with_different_filesystems(self):
+        """Test ORC reading with different filesystem types."""
+        from deltacat.utils.pandas import read_orc
+
+        # Test with fsspec
+        result = read_orc(
+            f"{self.temp_dir}/test.orc",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with PyArrow
+        result = read_orc(
+            f"{self.temp_dir}/test.orc",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with auto-inferred
+        result = read_orc(
+            f"{self.temp_dir}/test.orc",
+            filesystem=None,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_file_to_dataframe_with_different_filesystems(self):
+        """Test file_to_dataframe with different filesystem types for all content types."""
+        test_cases = [
+            (
+                f"{self.temp_dir}/test.csv",
+                ContentType.CSV.value,
+                ContentEncoding.IDENTITY.value,
+                {"column_names": ["col1", "col2", "col3"]},
+            ),
+            (
+                f"{self.temp_dir}/test_gzip.csv.gz",
+                ContentType.CSV.value,
+                ContentEncoding.GZIP.value,
+                {"column_names": ["col1", "col2", "col3"]},
+            ),
+            (
+                f"{self.temp_dir}/test.parquet",
+                ContentType.PARQUET.value,
+                ContentEncoding.IDENTITY.value,
+                {},
+            ),
+            (
+                f"{self.temp_dir}/test.feather",
+                ContentType.FEATHER.value,
+                ContentEncoding.IDENTITY.value,
+                {},
+            ),
+            (
+                f"{self.temp_dir}/test.json",
+                ContentType.JSON.value,
+                ContentEncoding.GZIP.value,
+                {},
+            ),
+            (
+                f"{self.temp_dir}/test.avro",
+                ContentType.AVRO.value,
+                ContentEncoding.IDENTITY.value,
+                {},
+            ),
+            (
+                f"{self.temp_dir}/test.orc",
+                ContentType.ORC.value,
+                ContentEncoding.IDENTITY.value,
+                {},
+            ),
+        ]
+
+        filesystems = [
+            ("fsspec", self.fsspec_fs),
+            ("pyarrow", self.pyarrow_fs),
+            ("auto-inferred", None),
+        ]
+
+        for path, content_type, content_encoding, extra_kwargs in test_cases:
+            for fs_name, filesystem in filesystems:
+                with self.subTest(
+                    content_type=content_type,
+                    filesystem=fs_name,
+                    encoding=content_encoding,
+                ):
+                    result = file_to_dataframe(
+                        path=path,
+                        content_type=content_type,
+                        content_encoding=content_encoding,
+                        filesystem=filesystem,
+                        **extra_kwargs,
+                    )
+                    self._assert_dataframes_equal(result, self.test_data)
+
+    def test_compression_encoding_with_different_filesystems(self):
+        """Test that compression encoding works correctly with different filesystem types."""
+        test_cases = [
+            (f"{self.temp_dir}/test.csv", ContentEncoding.IDENTITY.value),
+            (f"{self.temp_dir}/test_gzip.csv.gz", ContentEncoding.GZIP.value),
+            (f"{self.temp_dir}/test_bzip2.csv.bz2", ContentEncoding.BZIP2.value),
+        ]
+
+        filesystems = [
+            ("fsspec", self.fsspec_fs),
+            ("pyarrow", self.pyarrow_fs),
+            ("auto-inferred", None),
+        ]
+
+        for path, content_encoding in test_cases:
+            for fs_name, filesystem in filesystems:
+                with self.subTest(encoding=content_encoding, filesystem=fs_name):
+                    result = file_to_dataframe(
+                        path=path,
+                        content_type=ContentType.CSV.value,
+                        content_encoding=content_encoding,
+                        filesystem=filesystem,
+                        column_names=["col1", "col2", "col3"],
+                    )
+                    self._assert_dataframes_equal(result, self.test_data)
+
+    def test_filesystem_open_kwargs(self):
+        """Test that filesystem open kwargs are properly passed through."""
+        from deltacat.utils.pandas import read_csv
+
+        # Test with custom fs_open_kwargs
+        result = read_csv(
+            f"{self.temp_dir}/test.csv",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            fs_open_kwargs={
+                "encoding": "utf-8"
+            },  # This should be passed to filesystem.open()
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_delimited_formats_with_different_filesystems(self):
+        """Test delimited formats (TSV, PSV, etc.) with different filesystem types."""
+        # Create TSV test file without headers to match test data structure
+        tsv_data = "value1\t1\t1.1\nvalue2\t2\t2.2\nvalue3\t3\t3.3\n"
+        with open(f"{self.temp_dir}/test.tsv", "w") as f:
+            f.write(tsv_data)
+
+        # Create PSV test file without headers to match test data structure
+        psv_data = "value1|1|1.1\nvalue2|2|2.2\nvalue3|3|3.3\n"
+        with open(f"{self.temp_dir}/test.psv", "w") as f:
+            f.write(psv_data)
+
+        delimited_test_cases = [
+            (
+                f"{self.temp_dir}/test.tsv",
+                ContentType.TSV.value,
+                {"sep": "\t", "column_names": ["col1", "col2", "col3"]},
+            ),
+            (
+                f"{self.temp_dir}/test.psv",
+                ContentType.PSV.value,
+                {"sep": "|", "column_names": ["col1", "col2", "col3"]},
+            ),
+        ]
+
+        filesystems = [
+            ("fsspec", self.fsspec_fs),
+            ("pyarrow", self.pyarrow_fs),
+            ("auto-inferred", None),
+        ]
+
+        for path, content_type, extra_kwargs in delimited_test_cases:
+            for fs_name, filesystem in filesystems:
+                with self.subTest(content_type=content_type, filesystem=fs_name):
+                    result = file_to_dataframe(
+                        path=path,
+                        content_type=content_type,
+                        content_encoding=ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                        **extra_kwargs,
+                    )
+                    self._assert_dataframes_equal(result, self.test_data)
+
+    def test_end_to_end_round_trip_all_formats(self):
+        """Test end-to-end round trip with write and read for all supported formats."""
+        from deltacat.utils.pandas import (
+            write_csv,
+            write_parquet,
+            write_feather,
+            write_json,
+            write_avro,
+            write_orc,
+            read_csv,
+            read_parquet,
+            read_feather,
+            read_json,
+            read_avro,
+            read_orc,
+        )
+
+        # Test cases with writer/reader pairs
+        # Note: CSV and JSON writers automatically apply GZIP compression
+        round_trip_cases = [
+            (
+                "test_roundtrip.csv",
+                write_csv,
+                read_csv,
+                {
+                    "content_encoding": ContentEncoding.GZIP.value,
+                    "names": ["col1", "col2", "col3"],
+                },
+                {"index": False},
+            ),
+            (
+                "test_roundtrip.parquet",
+                write_parquet,
+                read_parquet,
+                {"content_encoding": ContentEncoding.IDENTITY.value},
+                {},
+            ),
+            (
+                "test_roundtrip.feather",
+                write_feather,
+                read_feather,
+                {"content_encoding": ContentEncoding.IDENTITY.value},
+                {},
+            ),
+            (
+                "test_roundtrip.json",
+                write_json,
+                read_json,
+                {"content_encoding": ContentEncoding.GZIP.value, "orient": "records"},
+                {"orient": "records"},
+            ),
+            (
+                "test_roundtrip.avro",
+                write_avro,
+                read_avro,
+                {"content_encoding": ContentEncoding.IDENTITY.value},
+                {},
+            ),
+            (
+                "test_roundtrip.orc",
+                write_orc,
+                read_orc,
+                {"content_encoding": ContentEncoding.IDENTITY.value},
+                {},
+            ),
+        ]
+
+        filesystems = [
+            ("fsspec", self.fsspec_fs),
+            ("pyarrow", self.pyarrow_fs),
+        ]
+
+        for (
+            filename,
+            write_func,
+            read_func,
+            read_kwargs,
+            write_kwargs,
+        ) in round_trip_cases:
+            for fs_name, filesystem in filesystems:
+                with self.subTest(format=filename, filesystem=fs_name):
+                    file_path = f"{self.temp_dir}/{filename}"
+
+                    # Write the file
+                    write_func(
+                        self.test_data, file_path, filesystem=filesystem, **write_kwargs
+                    )
+
+                    # Read it back
+                    result = read_func(file_path, filesystem=filesystem, **read_kwargs)
+
+                    # Verify it matches
+                    self._assert_dataframes_equal(result, self.test_data)