deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/tests/utils/test_pyarrow.py
(Removed lines whose text was not captured in this rendering appear as bare "-" markers; others are shown truncated.)

@@ -1,23 +1,33 @@
 from unittest import TestCase
 from deltacat.utils.pyarrow import (
-
+    partial_parquet_file_to_table,
     pyarrow_read_csv,
     ContentTypeValidationError,
     content_type_to_reader_kwargs,
     _add_column_kwargs,
-
-
-
+    file_to_table,
+    file_to_parquet,
+    table_to_file,
     ReadKwargsProviderPyArrowSchemaOverride,
-
+    ReadKwargsProviderPyArrowCsvPureUtf8,
     RAISE_ON_DECIMAL_OVERFLOW,
-
+    RAISE_ON_EMPTY_CSV_KWARG,
 )
 import decimal
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from pyarrow.parquet import ParquetFile
+import tempfile
 import pyarrow as pa
+from pyarrow import csv as pacsv
+import fsspec
+import gzip
+import json
+from pyarrow import (
+    feather as paf,
+    parquet as papq,
+    orc as paorc,
+)

 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
 PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
@@ -33,8 +43,8 @@ GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed
 BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"


-class
-def
+class TestPartialParquetFileToTable(TestCase):
+    def test_partial_parquet_file_to_table_sanity(self):

         pq_file = ParquetFile(PARQUET_FILE_PATH)
         partial_parquet_params = PartialParquetParameters.of(
@@ -48,7 +58,7 @@ class TestS3PartialParquetFileToTable(TestCase):
         # only first row group to be downloaded
         partial_parquet_params.row_groups_to_download.pop()

-result =
+        result = partial_parquet_file_to_table(
            PARQUET_FILE_PATH,
            include_columns=["n_legs"],
            content_encoding=ContentEncoding.IDENTITY.value,
@@ -59,7 +69,7 @@ class TestS3PartialParquetFileToTable(TestCase):
         self.assertEqual(len(result), 3)
         self.assertEqual(len(result.columns), 1)

-def
+    def test_partial_parquet_file_to_table_when_schema_passed(self):

         pq_file = ParquetFile(PARQUET_FILE_PATH)
         partial_parquet_params = PartialParquetParameters.of(
@@ -79,7 +89,7 @@ class TestS3PartialParquetFileToTable(TestCase):

         pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}

-result =
+        result = partial_parquet_file_to_table(
            PARQUET_FILE_PATH,
            ContentType.PARQUET.value,
            ContentEncoding.IDENTITY.value,
@@ -98,7 +108,7 @@ class TestS3PartialParquetFileToTable(TestCase):
         self.assertEqual(result_schema.field(2).type, "int64")
         self.assertEqual(result_schema.field(2).name, "MISSING")

-def
+    def test_partial_parquet_file_to_table_when_schema_missing_columns(self):

         pq_file = ParquetFile(PARQUET_FILE_PATH)
         partial_parquet_params = PartialParquetParameters.of(
@@ -118,7 +128,7 @@ class TestS3PartialParquetFileToTable(TestCase):

         pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}

-result =
+        result = partial_parquet_file_to_table(
            PARQUET_FILE_PATH,
            ContentType.PARQUET.value,
            ContentEncoding.IDENTITY.value,
@@ -135,7 +145,7 @@ class TestS3PartialParquetFileToTable(TestCase):
         self.assertEqual(result_schema.field(0).type, "int64")
         self.assertEqual(result_schema.field(0).name, "MISSING")

-def
+    def test_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
         self,
     ):

@@ -152,11 +162,11 @@ class TestS3PartialParquetFileToTable(TestCase):

         pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}

-result =
+        result = partial_parquet_file_to_table(
            PARQUET_FILE_PATH,
            ContentType.PARQUET.value,
            ContentEncoding.IDENTITY.value,
-["n_legs", "animal"],
+            column_names=["n_legs", "animal"],
            pa_read_func_kwargs_provider=pa_kwargs_provider,
            partial_file_download_params=partial_parquet_params,
        )
@@ -168,7 +178,7 @@ class TestS3PartialParquetFileToTable(TestCase):
         self.assertEqual(result_schema.field(0).type, "string")
         self.assertEqual(result_schema.field(0).name, "n_legs")  # order doesn't change

-def
+    def test_partial_parquet_file_to_table_when_multiple_row_groups(self):

         pq_file = ParquetFile(PARQUET_FILE_PATH)
         partial_parquet_params = PartialParquetParameters.of(
@@ -179,7 +189,7 @@ class TestS3PartialParquetFileToTable(TestCase):
             partial_parquet_params.num_row_groups, 2, "test_file.parquet has changed."
         )

-result =
+        result = partial_parquet_file_to_table(
            PARQUET_FILE_PATH,
            content_encoding=ContentEncoding.IDENTITY.value,
            content_type=ContentType.PARQUET.value,
@@ -668,301 +678,1140 @@ class TestReadCSV(TestCase):
|
|
668
678
|
self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
|
669
679
|
|
670
680
|
|
671
|
-
class
|
672
|
-
def
|
681
|
+
class TestWriters(TestCase):
|
682
|
+
def setUp(self):
|
683
|
+
self.table = pa.table({"col1": ["a,b\tc|d", "e,f\tg|h"], "col2": [1, 2]})
|
684
|
+
self.fs = fsspec.filesystem("file")
|
685
|
+
self.base_path = tempfile.mkdtemp()
|
686
|
+
self.fs.makedirs(self.base_path, exist_ok=True)
|
687
|
+
|
688
|
+
def tearDown(self):
|
689
|
+
self.fs.rm(self.base_path, recursive=True)
|
690
|
+
|
691
|
+
def test_write_feather(self):
|
692
|
+
path = f"{self.base_path}/test.feather"
|
693
|
+
|
694
|
+
table_to_file(
|
695
|
+
self.table,
|
696
|
+
path,
|
697
|
+
self.fs,
|
698
|
+
lambda x: path,
|
699
|
+
content_type=ContentType.FEATHER.value,
|
700
|
+
)
|
701
|
+
assert self.fs.exists(path), "file was not written"
|
702
|
+
|
703
|
+
# Verify content
|
704
|
+
result = paf.read_table(path)
|
705
|
+
assert result.equals(self.table)
|
706
|
+
|
707
|
+
def test_write_csv(self):
|
708
|
+
path = f"{self.base_path}/test.csv.gz"
|
709
|
+
|
710
|
+
table_to_file(
|
711
|
+
self.table,
|
712
|
+
path,
|
713
|
+
self.fs,
|
714
|
+
lambda x: path,
|
715
|
+
content_type=ContentType.CSV.value,
|
716
|
+
)
|
717
|
+
assert self.fs.exists(path), "file was not written"
|
718
|
+
|
719
|
+
# Verify content (should be GZIP compressed)
|
720
|
+
with self.fs.open(path, "rb") as f:
|
721
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
722
|
+
content = gz.read().decode("utf-8")
|
723
|
+
# Should be quoted due to commas in data
|
724
|
+
assert '"a,b\tc|d",1' in content
|
725
|
+
assert '"e,f\tg|h",2' in content
|
726
|
+
|
727
|
+
def test_write_tsv(self):
|
728
|
+
path = f"{self.base_path}/test.tsv.gz"
|
729
|
+
|
730
|
+
table_to_file(
|
731
|
+
self.table,
|
732
|
+
path,
|
733
|
+
self.fs,
|
734
|
+
lambda x: path,
|
735
|
+
content_type=ContentType.TSV.value,
|
736
|
+
)
|
737
|
+
assert self.fs.exists(path), "file was not written"
|
738
|
+
|
739
|
+
# Verify content (should be GZIP compressed)
|
740
|
+
with self.fs.open(path, "rb") as f:
|
741
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
742
|
+
content = gz.read().decode("utf-8")
|
743
|
+
# Should be quoted due to tabs in data
|
744
|
+
assert '"a,b\tc|d"\t1' in content
|
745
|
+
assert '"e,f\tg|h"\t2' in content
|
746
|
+
|
747
|
+
def test_write_psv(self):
|
748
|
+
path = f"{self.base_path}/test.psv.gz"
|
749
|
+
|
750
|
+
table_to_file(
|
751
|
+
self.table,
|
752
|
+
path,
|
753
|
+
self.fs,
|
754
|
+
lambda x: path,
|
755
|
+
content_type=ContentType.PSV.value,
|
756
|
+
)
|
757
|
+
assert self.fs.exists(path), "file was not written"
|
758
|
+
|
759
|
+
# Verify content (should be GZIP compressed)
|
760
|
+
with self.fs.open(path, "rb") as f:
|
761
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
762
|
+
content = gz.read().decode("utf-8")
|
763
|
+
# Should be quoted due to pipes in data
|
764
|
+
assert '"a,b\tc|d"|1' in content
|
765
|
+
assert '"e,f\tg|h"|2' in content
|
766
|
+
|
767
|
+
def test_write_unescaped_tsv(self):
|
768
|
+
# Create table without delimiters for unescaped TSV
|
769
|
+
table = pa.table({"col1": ["abc", "def"], "col2": [1, 2]})
|
770
|
+
path = f"{self.base_path}/test.tsv.gz"
|
771
|
+
|
772
|
+
table_to_file(
|
773
|
+
table,
|
774
|
+
path,
|
775
|
+
self.fs,
|
776
|
+
lambda x: path,
|
777
|
+
content_type=ContentType.UNESCAPED_TSV.value,
|
778
|
+
)
|
779
|
+
assert self.fs.exists(path), "file was not written"
|
780
|
+
|
781
|
+
# Verify content (should be GZIP compressed)
|
782
|
+
with self.fs.open(path, "rb") as f:
|
783
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
784
|
+
content = gz.read().decode("utf-8")
|
785
|
+
# With quoting_style="none", strings should not be quoted
|
786
|
+
assert "abc\t1" in content
|
787
|
+
assert "def\t2" in content
|
788
|
+
|
789
|
+
def test_write_orc(self):
|
790
|
+
path = f"{self.base_path}/test.orc"
|
791
|
+
|
792
|
+
table_to_file(
|
793
|
+
self.table,
|
794
|
+
path,
|
795
|
+
self.fs,
|
796
|
+
lambda x: path,
|
797
|
+
content_type=ContentType.ORC.value,
|
798
|
+
)
|
799
|
+
assert self.fs.exists(path), "file was not written"
|
800
|
+
|
801
|
+
# Verify content
|
802
|
+
result = paorc.read_table(path)
|
803
|
+
assert result.equals(self.table)
|
804
|
+
|
805
|
+
def test_write_parquet(self):
|
806
|
+
path = f"{self.base_path}/test.parquet"
|
807
|
+
|
808
|
+
table_to_file(
|
809
|
+
self.table,
|
810
|
+
path,
|
811
|
+
self.fs,
|
812
|
+
lambda x: path,
|
813
|
+
content_type=ContentType.PARQUET.value,
|
814
|
+
)
|
815
|
+
assert self.fs.exists(path), "file was not written"
|
816
|
+
|
817
|
+
# Verify content
|
818
|
+
result = papq.read_table(path)
|
819
|
+
assert result.equals(self.table)
|
820
|
+
|
821
|
+
def test_write_json(self):
|
822
|
+
path = f"{self.base_path}/test.json.gz"
|
823
|
+
|
824
|
+
table_to_file(
|
825
|
+
self.table,
|
826
|
+
path,
|
827
|
+
self.fs,
|
828
|
+
lambda x: path,
|
829
|
+
content_type=ContentType.JSON.value,
|
830
|
+
)
|
831
|
+
assert self.fs.exists(path), "file was not written"
|
832
|
+
|
833
|
+
# Verify content (should be GZIP compressed)
|
834
|
+
with self.fs.open(path, "rb") as f:
|
835
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
836
|
+
content = gz.read().decode("utf-8")
|
837
|
+
# Each line should be a valid JSON object
|
838
|
+
lines = [
|
839
|
+
line for line in content.split("\n") if line
|
840
|
+
] # Skip empty lines
|
841
|
+
assert len(lines) == 2 # 2 records
|
842
|
+
assert json.loads(lines[0]) == {"col1": "a,b\tc|d", "col2": 1}
|
843
|
+
assert json.loads(lines[1]) == {"col1": "e,f\tg|h", "col2": 2}
|
844
|
+
|
845
|
+
def test_write_avro(self):
|
846
|
+
import polars as pl
|
847
|
+
|
848
|
+
path = f"{self.base_path}/test.avro"
|
849
|
+
|
850
|
+
table_to_file(
|
851
|
+
self.table,
|
852
|
+
path,
|
853
|
+
self.fs,
|
854
|
+
lambda x: path,
|
855
|
+
content_type=ContentType.AVRO.value,
|
856
|
+
)
|
857
|
+
assert self.fs.exists(path), "file was not written"
|
858
|
+
|
859
|
+
# Verify content by reading with polars
|
860
|
+
result = pl.read_avro(path).to_arrow()
|
861
|
+
# Cast the result to match the original table's schema
|
862
|
+
# (the round-trip from arrow->polars->arrow casts string to large string)
|
863
|
+
result = result.cast(self.table.schema)
|
864
|
+
assert result.equals(self.table)
|
865
|
+
|
866
|
+
|
867
|
+
class TestPyArrowReaders(TestCase):
|
868
|
+
def setUp(self):
|
869
|
+
# Create test data files for reading
|
870
|
+
self.fs = fsspec.filesystem("file")
|
871
|
+
self.base_path = tempfile.mkdtemp()
|
872
|
+
self.fs.makedirs(self.base_path, exist_ok=True)
|
873
|
+
|
874
|
+
# Create test Table
|
875
|
+
self.table = pa.Table.from_pylist(
|
876
|
+
[
|
877
|
+
{"col1": "a,b\tc|d", "col2": 1, "col3": 1.1},
|
878
|
+
{"col1": "e,f\tg|h", "col2": 2, "col3": 2.2},
|
879
|
+
{"col1": "test", "col2": 3, "col3": 3.3},
|
880
|
+
]
|
881
|
+
)
|
673
882
|
|
674
|
-
|
675
|
-
|
883
|
+
# Write test files in different formats
|
884
|
+
self._create_test_files()
|
885
|
+
|
886
|
+
def tearDown(self):
|
887
|
+
self.fs.rm(self.base_path, recursive=True)
|
888
|
+
|
889
|
+
def _create_test_files(self):
|
890
|
+
# Create CSV file (GZIP compressed)
|
891
|
+
csv_path = f"{self.base_path}/test.csv"
|
892
|
+
with self.fs.open(csv_path, "wb") as f:
|
893
|
+
with gzip.GzipFile(fileobj=f, mode="wb") as gz:
|
894
|
+
content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
|
895
|
+
gz.write(content.encode("utf-8"))
|
896
|
+
|
897
|
+
# Create TSV file (GZIP compressed)
|
898
|
+
tsv_path = f"{self.base_path}/test.tsv"
|
899
|
+
with self.fs.open(tsv_path, "wb") as f:
|
900
|
+
with gzip.GzipFile(fileobj=f, mode="wb") as gz:
|
901
|
+
content = '"a,b\tc|d"\t1\t1.1\n"e,f\tg|h"\t2\t2.2\ntest\t3\t3.3\n'
|
902
|
+
gz.write(content.encode("utf-8"))
|
903
|
+
|
904
|
+
# Create PSV file (GZIP compressed)
|
905
|
+
psv_path = f"{self.base_path}/test.psv"
|
906
|
+
with self.fs.open(psv_path, "wb") as f:
|
907
|
+
with gzip.GzipFile(fileobj=f, mode="wb") as gz:
|
908
|
+
content = '"a,b\tc|d"|1|1.1\n"e,f\tg|h"|2|2.2\ntest|3|3.3\n'
|
909
|
+
gz.write(content.encode("utf-8"))
|
910
|
+
|
911
|
+
# Create unescaped TSV file (GZIP compressed)
|
912
|
+
unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
|
913
|
+
pa.Table.from_pylist(
|
914
|
+
[
|
915
|
+
{"col1": "abc", "col2": 1, "col3": 1.1},
|
916
|
+
{"col1": "def", "col2": 2, "col3": 2.2},
|
917
|
+
{"col1": "ghi", "col2": 3, "col3": 3.3},
|
918
|
+
]
|
676
919
|
)
|
920
|
+
with self.fs.open(unescaped_tsv_path, "wb") as f:
|
921
|
+
with gzip.GzipFile(fileobj=f, mode="wb") as gz:
|
922
|
+
content = "abc\t1\t1.1\ndef\t2\t2.2\nghi\t3\t3.3\n"
|
923
|
+
gz.write(content.encode("utf-8"))
|
924
|
+
|
925
|
+
# Create Parquet file
|
926
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
927
|
+
with self.fs.open(parquet_path, "wb") as f:
|
928
|
+
papq.write_table(self.table, f)
|
929
|
+
|
930
|
+
# Create Feather file
|
931
|
+
feather_path = f"{self.base_path}/test.feather"
|
932
|
+
with self.fs.open(feather_path, "wb") as f:
|
933
|
+
paf.write_feather(self.table, f)
|
934
|
+
|
935
|
+
# Create JSON file (GZIP compressed)
|
936
|
+
json_path = f"{self.base_path}/test.json"
|
937
|
+
with self.fs.open(json_path, "wb") as f:
|
938
|
+
with gzip.GzipFile(fileobj=f, mode="wb") as gz:
|
939
|
+
# Create NDJSON format - one JSON object per line
|
940
|
+
lines = []
|
941
|
+
for row in self.table.to_pylist():
|
942
|
+
lines.append(json.dumps(row))
|
943
|
+
content = "\n".join(lines) + "\n"
|
944
|
+
gz.write(content.encode("utf-8"))
|
945
|
+
|
946
|
+
# Create Avro file using polars (since pyarrow delegates to polars for Avro)
|
947
|
+
avro_path = f"{self.base_path}/test.avro"
|
948
|
+
import polars as pl
|
949
|
+
|
950
|
+
pl_df = pl.from_arrow(self.table)
|
951
|
+
pl_df.write_avro(avro_path)
|
952
|
+
|
953
|
+
# Create ORC file
|
954
|
+
orc_path = f"{self.base_path}/test.orc"
|
955
|
+
with self.fs.open(orc_path, "wb") as f:
|
956
|
+
paorc.write_table(self.table, f)
|
957
|
+
|
958
|
+
def test_content_type_to_reader_kwargs(self):
|
959
|
+
# Test CSV kwargs
|
960
|
+
csv_kwargs = content_type_to_reader_kwargs(ContentType.CSV.value)
|
961
|
+
expected_csv = {"parse_options": pacsv.ParseOptions(delimiter=",")}
|
962
|
+
assert (
|
963
|
+
csv_kwargs["parse_options"].delimiter
|
964
|
+
== expected_csv["parse_options"].delimiter
|
965
|
+
)
|
966
|
+
|
967
|
+
# Test TSV kwargs
|
968
|
+
tsv_kwargs = content_type_to_reader_kwargs(ContentType.TSV.value)
|
969
|
+
expected_tsv = {"parse_options": pacsv.ParseOptions(delimiter="\t")}
|
970
|
+
assert (
|
971
|
+
tsv_kwargs["parse_options"].delimiter
|
972
|
+
== expected_tsv["parse_options"].delimiter
|
973
|
+
)
|
974
|
+
|
975
|
+
# Test PSV kwargs
|
976
|
+
psv_kwargs = content_type_to_reader_kwargs(ContentType.PSV.value)
|
977
|
+
expected_psv = {"parse_options": pacsv.ParseOptions(delimiter="|")}
|
978
|
+
assert (
|
979
|
+
psv_kwargs["parse_options"].delimiter
|
980
|
+
== expected_psv["parse_options"].delimiter
|
981
|
+
)
|
982
|
+
|
983
|
+
# Test unescaped TSV kwargs
|
984
|
+
unescaped_kwargs = content_type_to_reader_kwargs(
|
985
|
+
ContentType.UNESCAPED_TSV.value
|
986
|
+
)
|
987
|
+
assert unescaped_kwargs["parse_options"].delimiter == "\t"
|
988
|
+
assert unescaped_kwargs["parse_options"].quote_char is False
|
989
|
+
assert unescaped_kwargs["convert_options"].null_values == [""]
|
990
|
+
|
991
|
+
# Test Parquet kwargs (should be empty)
|
992
|
+
parquet_kwargs = content_type_to_reader_kwargs(ContentType.PARQUET.value)
|
993
|
+
assert parquet_kwargs == {}
|
994
|
+
|
995
|
+
# Test ORC kwargs (should be empty)
|
996
|
+
orc_kwargs = content_type_to_reader_kwargs(ContentType.ORC.value)
|
997
|
+
assert orc_kwargs == {}
|
998
|
+
|
999
|
+
# Test Avro kwargs (should be empty)
|
1000
|
+
avro_kwargs = content_type_to_reader_kwargs(ContentType.AVRO.value)
|
1001
|
+
assert avro_kwargs == {}
|
1002
|
+
|
1003
|
+
def test_add_column_kwargs(self):
|
1004
|
+
kwargs = {}
|
1005
|
+
column_names = ["col1", "col2", "col3"]
|
1006
|
+
include_columns = ["col1", "col2"]
|
1007
|
+
|
1008
|
+
# Test CSV column kwargs
|
1009
|
+
_add_column_kwargs(ContentType.CSV.value, column_names, include_columns, kwargs)
|
1010
|
+
assert kwargs["read_options"].column_names == column_names
|
1011
|
+
assert kwargs["convert_options"].include_columns == include_columns
|
1012
|
+
|
1013
|
+
# Test Parquet column kwargs
|
1014
|
+
kwargs = {}
|
1015
|
+
_add_column_kwargs(
|
1016
|
+
ContentType.PARQUET.value, column_names, include_columns, kwargs
|
1017
|
+
)
|
1018
|
+
assert kwargs["columns"] == include_columns
|
677
1019
|
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
1020
|
+
def test_file_to_table_csv(self):
|
1021
|
+
# Test reading CSV with file_to_table
|
1022
|
+
csv_path = f"{self.base_path}/test.csv"
|
1023
|
+
|
1024
|
+
result = file_to_table(
|
1025
|
+
csv_path,
|
1026
|
+
ContentType.CSV.value,
|
1027
|
+
ContentEncoding.GZIP.value,
|
1028
|
+
filesystem=self.fs,
|
1029
|
+
column_names=["col1", "col2", "col3"],
|
687
1030
|
)
|
688
1031
|
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
for index, field in enumerate(result_schema):
|
693
|
-
self.assertEqual(field.name, schema.field(index).name)
|
1032
|
+
assert len(result) == 3
|
1033
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1034
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
694
1035
|
|
695
|
-
|
1036
|
+
def test_file_to_table_tsv(self):
|
1037
|
+
# Test reading TSV with file_to_table
|
1038
|
+
tsv_path = f"{self.base_path}/test.tsv"
|
696
1039
|
|
697
|
-
|
1040
|
+
result = file_to_table(
|
1041
|
+
tsv_path,
|
1042
|
+
ContentType.TSV.value,
|
1043
|
+
ContentEncoding.GZIP.value,
|
1044
|
+
filesystem=self.fs,
|
1045
|
+
column_names=["col1", "col2", "col3"],
|
1046
|
+
)
|
698
1047
|
|
699
|
-
|
700
|
-
|
1048
|
+
assert len(result) == 3
|
1049
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1050
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
1051
|
+
|
1052
|
+
def test_file_to_table_psv(self):
|
1053
|
+
# Test reading PSV with file_to_table
|
1054
|
+
psv_path = f"{self.base_path}/test.psv"
|
1055
|
+
|
1056
|
+
result = file_to_table(
|
1057
|
+
psv_path,
|
1058
|
+
ContentType.PSV.value,
|
1059
|
+
ContentEncoding.GZIP.value,
|
1060
|
+
filesystem=self.fs,
|
1061
|
+
column_names=["col1", "col2", "col3"],
|
701
1062
|
)
|
702
1063
|
|
703
|
-
result
|
704
|
-
|
1064
|
+
assert len(result) == 3
|
1065
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1066
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
1067
|
+
|
1068
|
+
def test_file_to_table_unescaped_tsv(self):
|
1069
|
+
# Test reading unescaped TSV with file_to_table
|
1070
|
+
unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
|
1071
|
+
|
1072
|
+
result = file_to_table(
|
1073
|
+
unescaped_tsv_path,
|
705
1074
|
ContentType.UNESCAPED_TSV.value,
|
706
1075
|
ContentEncoding.GZIP.value,
|
707
|
-
|
708
|
-
|
709
|
-
pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
|
710
|
-
schema=schema
|
711
|
-
),
|
1076
|
+
filesystem=self.fs,
|
1077
|
+
column_names=["col1", "col2", "col3"],
|
712
1078
|
)
|
713
1079
|
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
for index, field in enumerate(result_schema):
|
718
|
-
self.assertEqual(field.name, schema.field(index).name)
|
1080
|
+
assert len(result) == 3
|
1081
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1082
|
+
assert result.column("col1").to_pylist() == ["abc", "def", "ghi"]
|
719
1083
|
|
720
|
-
|
1084
|
+
def test_file_to_table_parquet(self):
|
1085
|
+
# Test reading Parquet with file_to_table
|
1086
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
721
1087
|
|
722
|
-
|
1088
|
+
result = file_to_table(
|
1089
|
+
parquet_path, ContentType.PARQUET.value, filesystem=self.fs
|
1090
|
+
)
|
723
1091
|
|
724
|
-
|
725
|
-
|
1092
|
+
assert len(result) == 3
|
1093
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1094
|
+
assert result.equals(self.table)
|
1095
|
+
|
1096
|
+
def test_file_to_table_feather(self):
|
1097
|
+
# Test reading Feather with file_to_table
|
1098
|
+
feather_path = f"{self.base_path}/test.feather"
|
1099
|
+
|
1100
|
+
result = file_to_table(
|
1101
|
+
feather_path, ContentType.FEATHER.value, filesystem=self.fs
|
726
1102
|
)
|
727
1103
|
|
728
|
-
result
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
1104
|
+
assert len(result) == 3
|
1105
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1106
|
+
assert result.equals(self.table)
|
1107
|
+
|
1108
|
+
def test_file_to_table_json(self):
|
1109
|
+
# Test reading JSON with file_to_table
|
1110
|
+
json_path = f"{self.base_path}/test.json"
|
1111
|
+
|
1112
|
+
result = file_to_table(
|
1113
|
+
json_path,
|
1114
|
+
ContentType.JSON.value,
|
1115
|
+
ContentEncoding.GZIP.value,
|
1116
|
+
filesystem=self.fs,
|
737
1117
|
)
|
738
1118
|
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
for index, field in enumerate(result_schema):
|
743
|
-
self.assertEqual(field.name, schema.field(index).name)
|
1119
|
+
assert len(result) == 3
|
1120
|
+
assert set(result.column_names) == {"col1", "col2", "col3"}
|
1121
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
744
1122
|
|
745
|
-
|
1123
|
+
def test_file_to_table_avro(self):
|
1124
|
+
# Test reading Avro with file_to_table
|
1125
|
+
avro_path = f"{self.base_path}/test.avro"
|
746
1126
|
|
747
|
-
|
1127
|
+
result = file_to_table(avro_path, ContentType.AVRO.value, filesystem=self.fs)
|
748
1128
|
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
1129
|
+
assert len(result) == 3
|
1130
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1131
|
+
# Avro may have different dtypes, so compare values
|
1132
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
753
1133
|
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
1134
|
+
def test_file_to_table_orc(self):
|
1135
|
+
# Test reading ORC with file_to_table
|
1136
|
+
orc_path = f"{self.base_path}/test.orc"
|
1137
|
+
|
1138
|
+
result = file_to_table(orc_path, ContentType.ORC.value, filesystem=self.fs)
|
1139
|
+
|
1140
|
+
assert len(result) == 3
|
1141
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1142
|
+
assert result.equals(self.table)
|
1143
|
+
|
1144
|
+
def test_file_to_table_with_column_selection(self):
|
1145
|
+
# Test reading with column selection
|
1146
|
+
csv_path = f"{self.base_path}/test.csv"
|
1147
|
+
|
1148
|
+
result = file_to_table(
|
1149
|
+
csv_path,
|
1150
|
+
ContentType.CSV.value,
|
1151
|
+
ContentEncoding.GZIP.value,
|
1152
|
+
filesystem=self.fs,
|
1153
|
+
column_names=["col1", "col2", "col3"],
|
1154
|
+
include_columns=["col1", "col2"],
|
761
1155
|
)
|
762
1156
|
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
schema_index = schema.get_field_index("n_legs")
|
767
|
-
self.assertEqual(schema.field(schema_index).type, "int64")
|
1157
|
+
assert len(result) == 3
|
1158
|
+
assert len(result.column_names) == 2 # Should only have 2 columns
|
1159
|
+
assert result.column_names == ["col1", "col2"]
|
768
1160
|
|
769
|
-
def
|
1161
|
+
def test_file_to_table_with_kwargs_provider(self):
|
1162
|
+
# Test reading with kwargs provider
|
1163
|
+
csv_path = f"{self.base_path}/test.csv"
|
1164
|
+
provider = ReadKwargsProviderPyArrowCsvPureUtf8(
|
1165
|
+
include_columns=["col1", "col2", "col3"]
|
1166
|
+
)
|
770
1167
|
|
771
|
-
|
772
|
-
|
1168
|
+
result = file_to_table(
|
1169
|
+
csv_path,
|
1170
|
+
ContentType.CSV.value,
|
1171
|
+
ContentEncoding.GZIP.value,
|
1172
|
+
filesystem=self.fs,
|
1173
|
+
column_names=["col1", "col2", "col3"],
|
1174
|
+
pa_read_func_kwargs_provider=provider,
|
773
1175
|
)
|
774
1176
|
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
1177
|
+
assert len(result) == 3
|
1178
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1179
|
+
# With string types provider, all columns should be strings
|
1180
|
+
for col_name in result.column_names:
|
1181
|
+
assert result.schema.field(col_name).type == pa.string()
|
780
1182
|
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
1183
|
+
def test_file_to_table_filesystem_inference(self):
|
1184
|
+
# Test filesystem inference when no filesystem is provided
|
1185
|
+
# Use JSON file since it should work well with inference
|
1186
|
+
json_path = f"{self.base_path}/test.json"
|
1187
|
+
|
1188
|
+
result = file_to_table(
|
1189
|
+
json_path,
|
1190
|
+
ContentType.JSON.value,
|
1191
|
+
ContentEncoding.GZIP.value
|
1192
|
+
# No filesystem provided - should be inferred
|
787
1193
|
)
|
788
1194
|
|
789
|
-
|
790
|
-
|
1195
|
+
assert len(result) == 3
|
1196
|
+
assert set(result.column_names) == {"col1", "col2", "col3"}
|
1197
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
791
1198
|
|
792
|
-
|
793
|
-
for
|
794
|
-
|
1199
|
+
def test_file_to_table_unsupported_content_type(self):
|
1200
|
+
# Test error handling for unsupported content type
|
1201
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
795
1202
|
|
796
|
-
self.
|
1203
|
+
with self.assertRaises(NotImplementedError) as context:
|
1204
|
+
file_to_table(parquet_path, "unsupported/content-type", filesystem=self.fs)
|
797
1205
|
|
798
|
-
|
1206
|
+
assert "not implemented" in str(context.exception)
|
799
1207
|
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
}
|
1208
|
+
def test_file_to_table_bzip2_compression(self):
|
1209
|
+
# Test BZIP2 compression handling
|
1210
|
+
import bz2
|
804
1211
|
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
1212
|
+
# Create a BZIP2 compressed CSV file
|
1213
|
+
csv_content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
|
1214
|
+
compressed_content = bz2.compress(csv_content.encode("utf-8"))
|
1215
|
+
|
1216
|
+
bz2_path = f"{self.base_path}/test.csv.bz2"
|
1217
|
+
with self.fs.open(bz2_path, "wb") as f:
|
1218
|
+
f.write(compressed_content)
|
1219
|
+
|
1220
|
+
result = file_to_table(
|
1221
|
+
bz2_path,
|
1222
|
+
ContentType.CSV.value,
|
1223
|
+
ContentEncoding.BZIP2.value,
|
1224
|
+
filesystem=self.fs,
|
1225
|
+
column_names=["col1", "col2", "col3"],
|
812
1226
|
)
|
813
1227
|
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
1228
|
+
assert len(result) == 3
|
1229
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1230
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
1231
|
+
|
1232
|
+
|
1233
|
+
class TestFileToParquet(TestCase):
|
1234
|
+
def setUp(self):
|
1235
|
+
# Create test data files for reading
|
1236
|
+
self.fs = fsspec.filesystem("file")
|
1237
|
+
self.base_path = tempfile.mkdtemp()
|
1238
|
+
self.fs.makedirs(self.base_path, exist_ok=True)
|
1239
|
+
|
1240
|
+
# Create test Table
|
1241
|
+
self.table = pa.Table.from_pylist(
|
1242
|
+
[
|
1243
|
+
{"col1": "a,b\tc|d", "col2": 1, "col3": 1.1},
|
1244
|
+
{"col1": "e,f\tg|h", "col2": 2, "col3": 2.2},
|
1245
|
+
{"col1": "test", "col2": 3, "col3": 3.3},
|
1246
|
+
]
|
1247
|
+
)
|
1248
|
+
|
1249
|
+
# Write test parquet files
|
1250
|
+
self._create_test_files()
|
1251
|
+
|
1252
|
+
def tearDown(self):
|
1253
|
+
self.fs.rm(self.base_path, recursive=True)
|
1254
|
+
|
1255
|
+
def _create_test_files(self):
|
1256
|
+
# Create basic Parquet file
|
1257
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
1258
|
+
with self.fs.open(parquet_path, "wb") as f:
|
1259
|
+
papq.write_table(self.table, f)
|
1260
|
+
|
1261
|
+
# Create larger Parquet file with multiple row groups
|
1262
|
+
large_table = pa.Table.from_pylist(
|
1263
|
+
[{"col1": f"row_{i}", "col2": i, "col3": float(i)} for i in range(1000)]
|
1264
|
+
)
|
1265
|
+
large_parquet_path = f"{self.base_path}/test_large.parquet"
|
1266
|
+
with self.fs.open(large_parquet_path, "wb") as f:
|
1267
|
+
papq.write_table(
|
1268
|
+
large_table, f, row_group_size=100
|
1269
|
+
) # Create multiple row groups
|
1270
|
+
|
1271
|
+
def test_file_to_parquet_basic(self):
|
1272
|
+
# Test basic parquet file reading
|
1273
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
1274
|
+
|
1275
|
+
result = file_to_parquet(parquet_path, filesystem=self.fs)
|
1276
|
+
|
1277
|
+
assert isinstance(result, papq.ParquetFile)
|
1278
|
+
assert result.num_row_groups > 0
|
1279
|
+
assert result.metadata.num_rows == 3
|
1280
|
+
assert result.metadata.num_columns == 3
|
1281
|
+
|
1282
|
+
# Verify we can read the data
|
1283
|
+
table = result.read()
|
1284
|
+
assert len(table) == 3
|
1285
|
+
assert table.column_names == ["col1", "col2", "col3"]
|
1286
|
+
|
1287
|
+
def test_file_to_parquet_with_schema_provider(self):
|
1288
|
+
# Test with schema override provider
|
1289
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
819
1290
|
|
820
|
-
def test_s3_file_to_table_when_utsv_gzip_and_content_type_overridden(self):
|
821
1291
|
schema = pa.schema(
|
822
|
-
[
|
1292
|
+
[
|
1293
|
+
pa.field("col1", pa.string()),
|
1294
|
+
pa.field("col2", pa.string()), # Override to string
|
1295
|
+
pa.field("col3", pa.string()), # Override to string
|
1296
|
+
]
|
823
1297
|
)
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
pa_kwargs_provider = lambda content_type, kwargs: {
|
830
|
-
"reader_type": "pyarrow",
|
831
|
-
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
|
832
|
-
**kwargs,
|
833
|
-
}
|
834
|
-
|
835
|
-
result = s3_file_to_table(
|
836
|
-
GZIP_COMPRESSED_FILE_UTSV_PATH,
|
837
|
-
ContentType.UNESCAPED_TSV.value,
|
838
|
-
ContentEncoding.GZIP.value,
|
839
|
-
["is_active", "ship_datetime_utc"],
|
840
|
-
None,
|
841
|
-
pa_read_func_kwargs_provider=pa_kwargs_provider,
|
1298
|
+
|
1299
|
+
provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
1300
|
+
|
1301
|
+
result = file_to_parquet(
|
1302
|
+
parquet_path, filesystem=self.fs, pa_read_func_kwargs_provider=provider
|
842
1303
|
)
|
843
1304
|
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
1305
|
+
assert isinstance(result, papq.ParquetFile)
|
1306
|
+
# Note: schema override might not affect ParquetFile metadata,
|
1307
|
+
# but should work when reading the table
|
1308
|
+
table = result.read()
|
1309
|
+
assert len(table) == 3
|
849
1310
|
|
850
|
-
|
1311
|
+
def test_file_to_parquet_with_custom_kwargs(self):
|
1312
|
+
# Test with custom ParquetFile kwargs
|
1313
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
851
1314
|
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
1315
|
+
result = file_to_parquet(
|
1316
|
+
parquet_path,
|
1317
|
+
filesystem=self.fs,
|
1318
|
+
validate_schema=True, # Custom kwarg for ParquetFile
|
1319
|
+
memory_map=True, # Another custom kwarg
|
1320
|
+
)
|
858
1321
|
|
859
|
-
result
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
1322
|
+
assert isinstance(result, papq.ParquetFile)
|
1323
|
+
assert result.metadata.num_rows == 3
|
1324
|
+
|
1325
|
+
def test_file_to_parquet_filesystem_inference(self):
|
1326
|
+
# Test filesystem inference when no filesystem is provided
|
1327
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
1328
|
+
|
1329
|
+
result = file_to_parquet(
|
1330
|
+
parquet_path
|
1331
|
+
# No filesystem provided - should be inferred
|
866
1332
|
)
|
867
1333
|
|
868
|
-
|
869
|
-
|
1334
|
+
assert isinstance(result, papq.ParquetFile)
|
1335
|
+
assert result.metadata.num_rows == 3
|
1336
|
+
assert result.metadata.num_columns == 3
|
1337
|
+
|
1338
|
+
def test_file_to_parquet_large_file(self):
|
1339
|
+
# Test with larger parquet file (multiple row groups)
|
1340
|
+
large_parquet_path = f"{self.base_path}/test_large.parquet"
|
1341
|
+
|
1342
|
+
result = file_to_parquet(large_parquet_path, filesystem=self.fs)
|
1343
|
+
|
1344
|
+
assert isinstance(result, papq.ParquetFile)
|
1345
|
+
assert result.metadata.num_rows == 1000
|
1346
|
+
assert result.num_row_groups > 1 # Should have multiple row groups
|
1347
|
+
|
1348
|
+
# Test reading specific row groups
|
1349
|
+
first_row_group = result.read_row_group(0)
|
1350
|
+
assert len(first_row_group) <= 100 # Based on row_group_size=100
|
1351
|
+
|
1352
|
+
def test_file_to_parquet_metadata_access(self):
|
1353
|
+
# Test accessing various metadata properties
|
1354
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
1355
|
+
|
1356
|
+
result = file_to_parquet(parquet_path, filesystem=self.fs)
|
1357
|
+
|
1358
|
+
# Test metadata access
|
1359
|
+
metadata = result.metadata
|
1360
|
+
assert metadata.num_rows == 3
|
1361
|
+
assert metadata.num_columns == 3
|
1362
|
+
assert metadata.num_row_groups >= 1
|
1363
|
+
|
1364
|
+
# Test schema access
|
870
1365
|
schema = result.schema
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
}
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
self.
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
field.name, result_parquet_file.schema_arrow.field(index).name
|
1366
|
+
assert len(schema) == 3
|
1367
|
+
assert "col1" in schema.names
|
1368
|
+
assert "col2" in schema.names
|
1369
|
+
assert "col3" in schema.names
|
1370
|
+
|
1371
|
+
# Test schema_arrow property
|
1372
|
+
schema_arrow = result.schema_arrow
|
1373
|
+
assert isinstance(schema_arrow, pa.Schema)
|
1374
|
+
assert len(schema_arrow) == 3
|
1375
|
+
|
+    def test_file_to_parquet_column_selection(self):
+        # Test reading specific columns
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        # Read only specific columns
+        table = result.read(columns=["col1", "col2"])
+        assert len(table.column_names) == 2
+        assert table.column_names == ["col1", "col2"]
+        assert len(table) == 3
+
+    def test_file_to_parquet_invalid_content_type(self):
+        # Test error handling for invalid content type
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(ContentTypeValidationError) as context:
+            file_to_parquet(
+                parquet_path,
+                content_type=ContentType.CSV.value,  # Invalid content type
+                filesystem=self.fs,
             )
-        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")

-                test_s3_url,
-                test_content_type,
-                test_content_encoding,
-                ["n_legs", "animal"],
-                ["n_legs"],
-                pa_read_func_kwargs_provider=pa_kwargs_provider,
-            )
-        log_message_log_args = cm.records[0].getMessage()
-        log_message_log_new_content_encoding = cm.records[1].getMessage()
-        log_message_presanitize_kwargs = cm.records[2].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
-        )
-        self.assertIn(
-            f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
-            log_message_log_new_content_encoding,
-        )
-        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
-        for index, field in enumerate(result_parquet_file.schema_arrow):
-            self.assertEqual(
-                field.name, result_parquet_file.schema_arrow.field(index).name
+        assert "cannot be read into pyarrow.parquet.ParquetFile" in str(
+            context.exception
+        )
+
+    def test_file_to_parquet_invalid_content_encoding(self):
+        # Test error handling for invalid content encoding
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(ContentTypeValidationError) as context:
+            file_to_parquet(
+                parquet_path,
+                content_encoding=ContentEncoding.GZIP.value,  # Invalid encoding
+                filesystem=self.fs,
             )
-        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")

+        assert "cannot be read into pyarrow.parquet.ParquetFile" in str(
+            context.exception
+        )
+
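Note: the two negative tests above pin down the validation behavior: content types other than Parquet, or non-identity encodings, raise ContentTypeValidationError before any read is attempted. A hedged caller-side sketch; the import paths for file_to_parquet, ContentType, and ContentTypeValidationError are not visible in this hunk, so the names are assumed to be in scope as they are in the test module:

    # Names assumed to be imported as in the test module above.
    try:
        pf = file_to_parquet(
            "some_file.csv",
            content_type=ContentType.CSV.value,  # anything non-Parquet is rejected
        )
    except ContentTypeValidationError as e:
        print(f"rejected before reading: {e}")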
+    def test_file_to_parquet_different_filesystems(self):
+        # Test with different filesystem implementations
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        # Test with fsspec filesystem
+        result_fsspec = file_to_parquet(parquet_path, filesystem=self.fs)
+        assert isinstance(result_fsspec, papq.ParquetFile)
+        assert result_fsspec.metadata.num_rows == 3
+
+        # Test with None filesystem (inferred)
+        result_inferred = file_to_parquet(parquet_path, filesystem=None)
+        assert isinstance(result_inferred, papq.ParquetFile)
+        assert result_inferred.metadata.num_rows == 3
+
+    def test_file_to_parquet_lazy_loading(self):
+        # Test that ParquetFile provides lazy loading capabilities
+        large_parquet_path = f"{self.base_path}/test_large.parquet"
+
+        result = file_to_parquet(large_parquet_path, filesystem=self.fs)
+
+        # ParquetFile should be created without loading all data
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 1000
+
+        # Test reading only specific columns (lazy loading)
+        partial_table = result.read(columns=["col1", "col2"])
+        assert len(partial_table) == 1000  # All rows but only 2 columns
+        assert partial_table.column_names == ["col1", "col2"]
+
+        # Test reading specific row group (lazy loading)
+        row_group_table = result.read_row_group(0)
+        assert len(row_group_table) <= 100  # Based on row_group_size
+
+    def test_file_to_parquet_performance_timing(self):
+        # Test that performance timing is logged (basic functionality test)
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        # This should complete without error and log timing
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 3
+
+
class TestFileToTableFilesystems(TestCase):
|
1463
|
+
"""Test file_to_table with different filesystem implementations across all content types."""
|
1464
|
+
|
1465
|
+
def setUp(self):
|
1466
|
+
self.tmpdir = tempfile.mkdtemp()
|
1467
|
+
self._create_test_files()
|
1468
|
+
|
1469
|
+
def tearDown(self):
|
1470
|
+
import shutil
|
1471
|
+
|
1472
|
+
shutil.rmtree(self.tmpdir)
|
1473
|
+
|
1474
|
+
def _create_test_files(self):
|
1475
|
+
"""Create test files for all supported content types."""
|
1476
|
+
# Test data
|
1477
|
+
test_data = pa.table(
|
1478
|
+
{
|
1479
|
+
"id": [1, 2, 3, 4, 5],
|
1480
|
+
"name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
|
1481
|
+
"age": [25, 30, 35, 28, 32],
|
1482
|
+
"score": [85.5, 92.0, 78.5, 88.0, 95.5],
|
1483
|
+
}
|
1484
|
+
)
|
1485
|
+
|
1486
|
+
# File paths
|
1487
|
+
self.csv_file = f"{self.tmpdir}/test.csv"
|
1488
|
+
self.tsv_file = f"{self.tmpdir}/test.tsv"
|
1489
|
+
self.psv_file = f"{self.tmpdir}/test.psv"
|
1490
|
+
self.unescaped_tsv_file = f"{self.tmpdir}/test_unescaped.tsv"
|
1491
|
+
self.parquet_file = f"{self.tmpdir}/test.parquet"
|
1492
|
+
self.feather_file = f"{self.tmpdir}/test.feather"
|
1493
|
+
self.json_file = f"{self.tmpdir}/test.json"
|
1494
|
+
self.orc_file = f"{self.tmpdir}/test.orc"
|
1495
|
+
self.avro_file = f"{self.tmpdir}/test.avro"
|
1496
|
+
|
1497
|
+
# Create CSV file
|
1498
|
+
pacsv.write_csv(
|
1499
|
+
test_data,
|
1500
|
+
self.csv_file,
|
1501
|
+
write_options=pacsv.WriteOptions(delimiter=",", include_header=False),
|
1502
|
+
)
|
1503
|
+
|
1504
|
+
# Create TSV file
|
1505
|
+
pacsv.write_csv(
|
1506
|
+
test_data,
|
1507
|
+
self.tsv_file,
|
1508
|
+
write_options=pacsv.WriteOptions(delimiter="\t", include_header=False),
|
1509
|
+
)
|
1510
|
+
|
1511
|
+
# Create PSV file
|
1512
|
+
pacsv.write_csv(
|
1513
|
+
test_data,
|
1514
|
+
self.psv_file,
|
1515
|
+
write_options=pacsv.WriteOptions(delimiter="|", include_header=False),
|
1516
|
+
)
|
1517
|
+
|
1518
|
+
# Create unescaped TSV file
|
1519
|
+
pacsv.write_csv(
|
1520
|
+
test_data,
|
1521
|
+
self.unescaped_tsv_file,
|
1522
|
+
write_options=pacsv.WriteOptions(
|
1523
|
+
delimiter="\t", include_header=False, quoting_style="none"
|
1524
|
+
),
|
1525
|
+
)
|
1526
|
+
|
1527
|
+
# Create Parquet file
|
1528
|
+
papq.write_table(test_data, self.parquet_file)
|
1529
|
+
|
1530
|
+
# Create Feather file
|
1531
|
+
paf.write_feather(test_data, self.feather_file)
|
1532
|
+
|
1533
|
+
# Create JSON file (write as JSONL format)
|
1534
|
+
df = test_data.to_pandas()
|
1535
|
+
with open(self.json_file, "w") as f:
|
1536
|
+
for _, row in df.iterrows():
|
1537
|
+
json.dump(row.to_dict(), f)
|
1538
|
+
f.write("\n")
|
1539
|
+
|
1540
|
+
# Create ORC file
|
1541
|
+
paorc.write_table(test_data, self.orc_file)
|
1542
|
+
|
1543
|
+
# Create Avro file
|
1544
|
+
try:
|
1545
|
+
import polars as pl
|
1546
|
+
|
1547
|
+
pl_df = pl.from_arrow(test_data)
|
1548
|
+
pl_df.write_avro(self.avro_file)
|
1549
|
+
except ImportError:
|
1550
|
+
# Skip Avro file creation if polars is not available
|
1551
|
+
self.avro_file = None
|
1552
|
+
|
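Note: _create_test_files writes the JSON fixture as line-delimited JSON (JSONL) via an explicit pandas row loop. For this flat schema an equivalent, shorter option would be pandas' own writer; shown only as an aside, not as a change to the test:

    import pandas as pd
    import pyarrow as pa

    test_data = pa.table({"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]})
    # orient="records", lines=True produces one JSON object per line (JSONL).
    test_data.to_pandas().to_json("/tmp/test.json", orient="records", lines=True)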
+    def _get_filesystems(self, file_path):
+        """Get different filesystem implementations for testing."""
+        # fsspec AbstractFileSystem
+        fsspec_fs = fsspec.filesystem("file")
+
+        # PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        pyarrow_fs = pafs.LocalFileSystem()
+
+        # None for automatic inference
+        auto_infer_fs = None
+
+        return [
+            ("fsspec", fsspec_fs),
+            ("pyarrow", pyarrow_fs),
+            ("auto_infer", auto_infer_fs),
+        ]
+
+    def _assert_table_content(self, table, content_type):
+        """Assert that the loaded table has expected content."""
+        self.assertEqual(len(table), 5, f"Expected 5 rows for {content_type}")
+        self.assertEqual(
+            len(table.columns), 4, f"Expected 4 columns for {content_type}"
+        )
+
+        # Check column names exist (order might vary for some formats)
+        column_names = set(table.column_names)
+        expected_columns = {"id", "name", "age", "score"}
+        self.assertEqual(
+            column_names, expected_columns, f"Column names mismatch for {content_type}"
+        )
+
+    def test_csv_all_filesystems(self):
+        """Test CSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.csv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.csv_file,
+                    ContentType.CSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"CSV with {fs_name}")
+
+    def test_tsv_all_filesystems(self):
+        """Test TSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.tsv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.tsv_file,
+                    ContentType.TSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"TSV with {fs_name}")
+
+    def test_psv_all_filesystems(self):
+        """Test PSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.psv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.psv_file,
+                    ContentType.PSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"PSV with {fs_name}")
+
+    def test_unescaped_tsv_all_filesystems(self):
+        """Test unescaped TSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.unescaped_tsv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.unescaped_tsv_file,
+                    ContentType.UNESCAPED_TSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"UNESCAPED_TSV with {fs_name}")
+
+    def test_parquet_all_filesystems(self):
+        """Test Parquet reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.parquet_file,
+                    ContentType.PARQUET.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"PARQUET with {fs_name}")
+
+    def test_feather_all_filesystems(self):
+        """Test Feather reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.feather_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.feather_file,
+                    ContentType.FEATHER.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"FEATHER with {fs_name}")
+
+    def test_json_all_filesystems(self):
+        """Test JSON reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.json_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.json_file,
+                    ContentType.JSON.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"JSON with {fs_name}")
+
+    def test_orc_all_filesystems(self):
+        """Test ORC reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.orc_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.orc_file,
+                    ContentType.ORC.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"ORC with {fs_name}")
+
+    def test_avro_all_filesystems(self):
+        """Test Avro reading with all filesystem types."""
+        if self.avro_file is None:
+            self.skipTest("Avro file creation skipped (polars not available)")
+
+        for fs_name, filesystem in self._get_filesystems(self.avro_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.avro_file,
+                    ContentType.AVRO.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"AVRO with {fs_name}")
+
+    def test_column_selection_all_filesystems(self):
+        """Test column selection works with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.parquet_file,
+                    ContentType.PARQUET.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    include_columns=["name", "age"],
+                )
+                self.assertEqual(
+                    len(table.columns), 2, f"Expected 2 columns with {fs_name}"
                 )
+                self.assertEqual(
+                    set(table.column_names),
+                    {"name", "age"},
+                    f"Column selection failed with {fs_name}",
+                )
+
+    def test_kwargs_provider_all_filesystems(self):
+        """Test that kwargs providers work with all filesystem types."""
+
+        def schema_provider(content_type, kwargs):
+            if content_type == ContentType.CSV.value:
+                # Force all columns to be strings
+                kwargs["convert_options"] = pacsv.ConvertOptions(
+                    column_types={
+                        "id": pa.string(),
+                        "name": pa.string(),
+                        "age": pa.string(),
+                        "score": pa.string(),
+                    }
+                )
+            return kwargs
+
+        for fs_name, filesystem in self._get_filesystems(self.csv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.csv_file,
+                    ContentType.CSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                    pa_read_func_kwargs_provider=schema_provider,
+                )
+                # Check that all columns are strings
+                for field in table.schema:
+                    self.assertEqual(
+                        field.type,
+                        pa.string(),
+                        f"Column {field.name} should be string with {fs_name}",
+                    )
+
+    def test_filesystem_auto_inference_consistency(self):
+        """Test that auto-inferred filesystem produces same results as explicit filesystems."""
+        # Use Parquet as it's most reliable across filesystem types
+
+        # Read with auto-inference
+        auto_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=None,  # Auto-infer
+        )
+
+        # Read with explicit fsspec filesystem
+        fsspec_fs = fsspec.filesystem("file")
+        fsspec_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=fsspec_fs,
         )
+
+        # Read with explicit PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        pyarrow_fs = pafs.LocalFileSystem()
+        pyarrow_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=pyarrow_fs,
+        )
+
+        # All should produce equivalent results
+        self.assertTrue(
+            auto_table.equals(fsspec_table),
+            "Auto-inferred result should match fsspec result",
+        )
+        self.assertTrue(
+            auto_table.equals(pyarrow_table),
+            "Auto-inferred result should match PyArrow result",
+        )
+
+    def test_error_handling_all_filesystems(self):
+        """Test error handling works consistently across filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                # Test unsupported content type
+                with self.assertRaises(NotImplementedError):
+                    file_to_table(
+                        self.parquet_file,
+                        "UNSUPPORTED_TYPE",
+                        ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                    )
+
+                # Test non-existent file
+                with self.assertRaises((FileNotFoundError, OSError)):
+                    file_to_table(
+                        f"{self.tmpdir}/non_existent.parquet",
+                        ContentType.PARQUET.value,
+                        ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                    )