deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/types/tables.py
CHANGED
@@ -1,11 +1,36 @@
|
|
1
|
+
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import logging
|
5
|
+
import multiprocessing
|
1
6
|
from enum import Enum
|
2
|
-
from
|
7
|
+
from functools import partial
|
8
|
+
from typing import (
|
9
|
+
Callable,
|
10
|
+
Dict,
|
11
|
+
Type,
|
12
|
+
Union,
|
13
|
+
Optional,
|
14
|
+
Any,
|
15
|
+
List,
|
16
|
+
Tuple,
|
17
|
+
TYPE_CHECKING,
|
18
|
+
)
|
19
|
+
from uuid import uuid4
|
3
20
|
|
21
|
+
import daft
|
4
22
|
import numpy as np
|
5
23
|
import pandas as pd
|
24
|
+
import polars as pl
|
6
25
|
import pyarrow as pa
|
26
|
+
import pyarrow.dataset as ds
|
27
|
+
import pyarrow.fs
|
7
28
|
import pyarrow.parquet as papq
|
8
|
-
|
29
|
+
import ray
|
30
|
+
from ray.data.block import Block, BlockMetadata, BlockAccessor
|
31
|
+
from ray.data._internal.pandas_block import PandasBlockSchema
|
32
|
+
from ray.data.dataset import Dataset as RayDataset, MaterializedDataset
|
33
|
+
from ray.data.datasource import FilenameProvider
|
9
34
|
from ray.data.read_api import (
|
10
35
|
from_arrow,
|
11
36
|
from_arrow_refs,
|
@@ -13,74 +38,506 @@ from ray.data.read_api import (
|
|
13
38
|
from_pandas,
|
14
39
|
from_pandas_refs,
|
15
40
|
)
|
41
|
+
from tenacity import (
|
42
|
+
Retrying,
|
43
|
+
wait_random_exponential,
|
44
|
+
stop_after_delay,
|
45
|
+
retry_if_exception_type,
|
46
|
+
)
|
16
47
|
|
17
|
-
|
18
|
-
from deltacat
|
48
|
+
from deltacat.compute.compactor_v2.constants import MAX_RECORDS_PER_COMPACTED_FILE
|
49
|
+
from deltacat import logs
|
50
|
+
from deltacat.constants import (
|
51
|
+
UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY,
|
52
|
+
RETRYABLE_TRANSIENT_ERRORS,
|
53
|
+
DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY,
|
54
|
+
)
|
55
|
+
from deltacat.storage.model.types import (
|
56
|
+
Dataset,
|
57
|
+
LocalTable,
|
58
|
+
DistributedDataset,
|
59
|
+
LocalDataset,
|
60
|
+
)
|
61
|
+
from deltacat.storage.model.schema import SchemaConsistencyType
|
62
|
+
from deltacat.types.media import (
|
63
|
+
DatasetType,
|
64
|
+
DistributedDatasetType,
|
65
|
+
ContentType,
|
66
|
+
EXPLICIT_COMPRESSION_CONTENT_TYPES,
|
67
|
+
ContentEncoding,
|
68
|
+
CONTENT_TYPE_TO_EXT,
|
69
|
+
CONTENT_ENCODING_TO_EXT,
|
70
|
+
)
|
19
71
|
from deltacat.utils import numpy as np_utils
|
20
72
|
from deltacat.utils import pandas as pd_utils
|
73
|
+
from deltacat.utils import polars as pl_utils
|
21
74
|
from deltacat.utils import pyarrow as pa_utils
|
22
|
-
from deltacat.utils import daft as daft_utils
|
23
75
|
from deltacat.utils.ray_utils import dataset as ds_utils
|
76
|
+
from deltacat.storage.model.manifest import (
|
77
|
+
ManifestEntryList,
|
78
|
+
ManifestEntry,
|
79
|
+
ManifestMeta,
|
80
|
+
EntryParams,
|
81
|
+
EntryType,
|
82
|
+
Manifest,
|
83
|
+
)
|
84
|
+
from deltacat.exceptions import (
|
85
|
+
RetryableError,
|
86
|
+
RetryableUploadTableError,
|
87
|
+
NonRetryableUploadTableError,
|
88
|
+
categorize_errors,
|
89
|
+
RetryableDownloadTableError,
|
90
|
+
NonRetryableDownloadTableError,
|
91
|
+
)
|
92
|
+
from deltacat.utils.common import ReadKwargsProvider
|
93
|
+
from deltacat.types.partial_download import PartialFileDownloadParams
|
94
|
+
from deltacat.utils.ray_utils.concurrency import invoke_parallel
|
24
95
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
96
|
+
if TYPE_CHECKING:
|
97
|
+
from deltacat.storage.model.schema import Schema
|
98
|
+
from deltacat.storage.model.table import Table
|
99
|
+
from deltacat.storage.model.table_version import TableVersion
|
100
|
+
|
101
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
102
|
+
|
103
|
+
|
104
|
+
TABLE_TYPE_TO_READER_FUNC: Dict[str, Callable] = {
|
105
|
+
DatasetType.PYARROW_PARQUET.value: pa_utils.file_to_parquet,
|
106
|
+
DatasetType.PYARROW.value: pa_utils.file_to_table,
|
107
|
+
DatasetType.PANDAS.value: pd_utils.file_to_dataframe,
|
108
|
+
DatasetType.NUMPY.value: np_utils.file_to_ndarray,
|
109
|
+
DatasetType.POLARS.value: pl_utils.file_to_dataframe,
|
30
110
|
}
|
31
111
|
|
112
|
+
|
32
113
|
TABLE_CLASS_TO_WRITER_FUNC: Dict[
|
33
|
-
Type[Union[
|
114
|
+
Type[Union[LocalTable, DistributedDataset]], Callable
|
34
115
|
] = {
|
35
116
|
pa.Table: pa_utils.table_to_file,
|
36
117
|
pd.DataFrame: pd_utils.dataframe_to_file,
|
118
|
+
pl.DataFrame: pl_utils.dataframe_to_file,
|
37
119
|
np.ndarray: np_utils.ndarray_to_file,
|
38
|
-
|
120
|
+
RayDataset: ds_utils.dataset_to_file,
|
121
|
+
MaterializedDataset: ds_utils.dataset_to_file,
|
39
122
|
}
|
40
123
|
|
41
124
|
TABLE_CLASS_TO_SLICER_FUNC: Dict[
|
42
|
-
Type[Union[
|
125
|
+
Type[Union[LocalTable, DistributedDataset]], Callable
|
43
126
|
] = {
|
44
127
|
pa.Table: pa_utils.slice_table,
|
45
128
|
pd.DataFrame: pd_utils.slice_dataframe,
|
129
|
+
pl.DataFrame: pl_utils.slice_table,
|
46
130
|
np.ndarray: np_utils.slice_ndarray,
|
47
|
-
|
131
|
+
RayDataset: ds_utils.slice_dataset,
|
132
|
+
MaterializedDataset: ds_utils.slice_dataset,
|
48
133
|
}
|
49
134
|
|
50
135
|
TABLE_CLASS_TO_SIZE_FUNC: Dict[
|
51
|
-
Type[Union[
|
136
|
+
Type[Union[LocalTable, DistributedDataset]], Callable
|
52
137
|
] = {
|
53
138
|
pa.Table: pa_utils.table_size,
|
54
139
|
papq.ParquetFile: pa_utils.parquet_file_size,
|
55
140
|
pd.DataFrame: pd_utils.dataframe_size,
|
141
|
+
pl.DataFrame: pl_utils.dataframe_size,
|
56
142
|
np.ndarray: np_utils.ndarray_size,
|
57
|
-
|
143
|
+
RayDataset: ds_utils.dataset_size,
|
144
|
+
MaterializedDataset: ds_utils.dataset_size,
|
145
|
+
}
|
146
|
+
|
147
|
+
TABLE_CLASS_TO_COLUMN_NAMES_FUNC: Dict[
|
148
|
+
Type[Union[LocalTable, DistributedDataset]], Callable
|
149
|
+
] = {
|
150
|
+
pa.Table: lambda table: table.schema.names,
|
151
|
+
papq.ParquetFile: lambda table: table.schema.names,
|
152
|
+
pd.DataFrame: lambda table: table.columns.tolist(),
|
153
|
+
pl.DataFrame: lambda table: table.columns,
|
154
|
+
np.ndarray: lambda table: [f"{i}" for i in range(table.shape[1])],
|
155
|
+
daft.DataFrame: lambda table: table.column_names,
|
156
|
+
RayDataset: lambda table: table.schema().names,
|
157
|
+
MaterializedDataset: lambda table: table.schema().names,
|
158
|
+
}
|
159
|
+
|
160
|
+
TABLE_CLASS_TO_SCHEMA_FUNC: Dict[
|
161
|
+
Type[Union[LocalTable, DistributedDataset]], Callable
|
162
|
+
] = {
|
163
|
+
pa.Table: lambda table: table.schema,
|
164
|
+
papq.ParquetFile: lambda table: table.schema_arrow,
|
165
|
+
pd.DataFrame: lambda table: pa.Schema.from_pandas(table),
|
166
|
+
pl.DataFrame: lambda table: table.to_arrow().schema,
|
167
|
+
np.ndarray: lambda table: pa.Schema.from_pandas(pd.DataFrame(table)),
|
168
|
+
daft.DataFrame: lambda table: table.schema().to_pyarrow_schema(),
|
169
|
+
RayDataset: lambda table: table.schema().base_schema,
|
170
|
+
MaterializedDataset: lambda table: table.schema().base_schema,
|
171
|
+
}
|
172
|
+
|
173
|
+
TABLE_TYPE_TO_EMPTY_TABLE_FUNC: Dict[str, Callable] = {
|
174
|
+
DatasetType.PYARROW.value: lambda: pa.Table.from_pydict({}),
|
175
|
+
DatasetType.PANDAS.value: lambda: pd.DataFrame(),
|
176
|
+
DatasetType.POLARS.value: lambda: pl.DataFrame(),
|
177
|
+
DatasetType.NUMPY.value: lambda: np.array([]),
|
178
|
+
DatasetType.DAFT.value: lambda: daft.DataFrame(),
|
179
|
+
DatasetType.RAY_DATASET.value: lambda: ray.data.from_items([]),
|
180
|
+
MaterializedDataset: lambda: ray.data.from_items([]),
|
58
181
|
}
|
59
182
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
183
|
+
|
184
|
+
def _numpy_array_to_pyarrow(table: np.ndarray, schema: pa.Schema) -> pa.Table:
|
185
|
+
"""Convert NumPy array to PyArrow Table via Pandas for complex type support."""
|
186
|
+
# Convert NumPy -> Pandas -> PyArrow to handle complex types like structs
|
187
|
+
# This follows the same path as Pandas conversion which handles all complex types properly
|
188
|
+
pandas_df = _numpy_array_to_pandas(table, schema=schema)
|
189
|
+
return pa.Table.from_pandas(pandas_df, schema=schema)
|
190
|
+
|
191
|
+
|
192
|
+
def _numpy_array_to_pandas(
|
193
|
+
table: np.ndarray, *, schema: Optional[pa.Schema] = None, **kwargs
|
194
|
+
) -> pd.DataFrame:
|
195
|
+
"""Convert NumPy array to pandas DataFrame."""
|
196
|
+
if schema and isinstance(schema, pa.Schema):
|
197
|
+
if table.ndim == 1:
|
198
|
+
# 1D array: single column
|
199
|
+
column_names = [schema.names[0]] if schema.names else ["0"]
|
200
|
+
return pd.DataFrame({column_names[0]: table}, **kwargs)
|
201
|
+
elif table.ndim == 2:
|
202
|
+
# 2D array: multiple columns
|
203
|
+
column_names = (
|
204
|
+
schema.names
|
205
|
+
if len(schema.names) == table.shape[1]
|
206
|
+
else [f"{i}" for i in range(table.shape[1])]
|
207
|
+
)
|
208
|
+
return pd.DataFrame(table, columns=column_names, **kwargs)
|
209
|
+
else:
|
210
|
+
raise ValueError(
|
211
|
+
f"NumPy arrays with {table.ndim} dimensions are not supported. "
|
212
|
+
f"Only 1D and 2D arrays are supported."
|
213
|
+
)
|
214
|
+
|
215
|
+
# Fallback to generic column names
|
216
|
+
return pd.DataFrame(table, **kwargs)
|
217
|
+
|
218
|
+
|
219
|
+
def _ray_dataset_to_pyarrow(table, *, schema, **kwargs):
|
220
|
+
"""Convert Ray Dataset to PyArrow tables and concatenate."""
|
221
|
+
arrow_refs = table.to_arrow_refs(**kwargs)
|
222
|
+
arrow_tables = ray.get(arrow_refs)
|
223
|
+
if len(arrow_tables) == 1:
|
224
|
+
return arrow_tables[0]
|
225
|
+
# Unify schemas to support schema evolution across blocks/files
|
226
|
+
try:
|
227
|
+
return pa.concat_tables(
|
228
|
+
arrow_tables, promote_options="permissive", unify_schemas=True
|
229
|
+
)
|
230
|
+
except TypeError:
|
231
|
+
return pa.concat_tables(arrow_tables, promote_options="permissive")
|
232
|
+
|
233
|
+
|
234
|
+
TABLE_CLASS_TO_PYARROW_FUNC: Dict[
|
235
|
+
Type[Union[LocalTable, DistributedDataset]], Callable
|
236
|
+
] = {
|
237
|
+
pa.Table: lambda table, *, schema, **kwargs: table,
|
238
|
+
papq.ParquetFile: lambda table, *, schema, **kwargs: table.read(**kwargs),
|
239
|
+
pd.DataFrame: lambda table, *, schema, **kwargs: pa.Table.from_pandas(
|
240
|
+
table, schema=schema, preserve_index=False, **kwargs
|
241
|
+
),
|
242
|
+
pl.DataFrame: lambda table, *, schema, **kwargs: pl.DataFrame.to_arrow(
|
243
|
+
table, **kwargs
|
244
|
+
),
|
245
|
+
np.ndarray: lambda table, *, schema, **kwargs: _numpy_array_to_pyarrow(
|
246
|
+
table, schema, **kwargs
|
247
|
+
),
|
248
|
+
RayDataset: _ray_dataset_to_pyarrow,
|
249
|
+
MaterializedDataset: _ray_dataset_to_pyarrow,
|
250
|
+
daft.DataFrame: lambda table, *, schema, **kwargs: table.to_arrow(**kwargs),
|
251
|
+
}
|
252
|
+
|
253
|
+
TABLE_CLASS_TO_PANDAS_FUNC: Dict[
|
254
|
+
Type[Union[LocalTable, DistributedDataset]], Callable
|
255
|
+
] = {
|
256
|
+
pa.Table: lambda table, *, schema=None, **kwargs: table.to_pandas(**kwargs),
|
257
|
+
papq.ParquetFile: lambda table, *, schema=None, **kwargs: table.read(
|
258
|
+
**kwargs
|
259
|
+
).to_pandas(**kwargs),
|
260
|
+
pd.DataFrame: lambda table, *, schema=None, **kwargs: table,
|
261
|
+
pl.DataFrame: lambda table, *, schema=None, **kwargs: table.to_pandas(**kwargs),
|
262
|
+
np.ndarray: lambda table, *, schema=None, **kwargs: _numpy_array_to_pandas(
|
263
|
+
table, schema=schema, **kwargs
|
264
|
+
),
|
265
|
+
RayDataset: lambda table, *, schema=None, **kwargs: table.to_pandas(**kwargs),
|
266
|
+
MaterializedDataset: lambda table, *, schema=None, **kwargs: table.to_pandas(
|
267
|
+
**kwargs
|
268
|
+
),
|
269
|
+
daft.DataFrame: lambda table, *, schema=None, **kwargs: table.to_pandas(**kwargs),
|
270
|
+
}
|
271
|
+
|
272
|
+
|
273
|
+
def _pyarrow_to_polars(pa_table: pa.Table, **kwargs) -> pl.DataFrame:
|
274
|
+
"""Convert PyArrow table to Polars DataFrame with clean schema."""
|
275
|
+
# PyArrow metadata can contain invalid UTF-8 sequences that cause Polars to raise an error
|
276
|
+
# Create a new table without metadata that might contain invalid UTF-8
|
277
|
+
clean_schema = pa.schema(
|
278
|
+
[
|
279
|
+
pa.field(field.name, field.type, nullable=field.nullable)
|
280
|
+
for field in pa_table.schema
|
281
|
+
]
|
282
|
+
)
|
283
|
+
clean_table = pa.Table.from_arrays(pa_table.columns, schema=clean_schema)
|
284
|
+
return pl.from_arrow(clean_table, **kwargs)
|
285
|
+
|
286
|
+
|
287
|
+
def _pyarrow_to_numpy(pa_table: pa.Table, **kwargs) -> np.ndarray:
|
288
|
+
"""Convert PyArrow table to numpy array."""
|
289
|
+
if pa_table.num_columns == 1:
|
290
|
+
return pa_table.column(0).to_numpy(**kwargs)
|
291
|
+
else:
|
292
|
+
return pa_table.to_pandas().values
|
293
|
+
|
294
|
+
|
295
|
+
def _pandas_to_numpy(pd_df: pd.DataFrame, **kwargs) -> np.ndarray:
|
296
|
+
"""Convert Pandas DataFrame to numpy array."""
|
297
|
+
if len(pd_df.columns) == 1:
|
298
|
+
return pd_df.iloc[:, 0].to_numpy(**kwargs)
|
299
|
+
else:
|
300
|
+
return pd_df.values
|
301
|
+
|
302
|
+
|
303
|
+
DATASET_TYPE_FROM_PYARROW: Dict[DatasetType, Callable[[pa.Table, Dataset], Any]] = {
|
304
|
+
DatasetType.PYARROW: lambda pa_table, **kwargs: pa_table,
|
305
|
+
DatasetType.PANDAS: lambda pa_table, **kwargs: pa_table.to_pandas(**kwargs),
|
306
|
+
DatasetType.POLARS: lambda pa_table, **kwargs: _pyarrow_to_polars(
|
307
|
+
pa_table, **kwargs
|
308
|
+
),
|
309
|
+
DatasetType.DAFT: lambda pa_table, **kwargs: daft.from_arrow(pa_table, **kwargs),
|
310
|
+
DatasetType.NUMPY: lambda pa_table, **kwargs: _pyarrow_to_numpy(pa_table, **kwargs),
|
311
|
+
DatasetType.RAY_DATASET: lambda pa_table, **kwargs: ray.data.from_arrow(pa_table),
|
312
|
+
DatasetType.PYARROW_PARQUET: lambda pa_table, **kwargs: pa_table, # ParquetFile is read as PyArrow Table
|
313
|
+
}
|
314
|
+
|
315
|
+
|
316
|
+
DATASET_TYPE_FROM_PANDAS: Dict[DatasetType, Callable[[pd.DataFrame, Dataset], Any]] = {
|
317
|
+
DatasetType.PANDAS: lambda pd_df, **kwargs: pd_df,
|
318
|
+
DatasetType.PYARROW: lambda pd_df, **kwargs: pa.Table.from_pandas(pd_df, **kwargs),
|
319
|
+
DatasetType.POLARS: lambda pd_df, **kwargs: pl.from_pandas(pd_df, **kwargs),
|
320
|
+
DatasetType.DAFT: lambda pd_df, **kwargs: daft.from_pandas(pd_df, **kwargs),
|
321
|
+
DatasetType.NUMPY: lambda pd_df, **kwargs: _pandas_to_numpy(pd_df, **kwargs),
|
322
|
+
DatasetType.RAY_DATASET: lambda pd_df, **kwargs: ray.data.from_pandas(
|
323
|
+
pd_df, **kwargs
|
324
|
+
),
|
325
|
+
}
|
326
|
+
|
327
|
+
|
328
|
+
def append_column_to_parquet_file(
|
329
|
+
parquet_file: papq.ParquetFile,
|
330
|
+
column_name: str,
|
331
|
+
column_value: Any,
|
332
|
+
) -> pa.Table:
|
333
|
+
"""
|
334
|
+
Append a column to a ParquetFile by converting to PyArrow Table first.
|
335
|
+
|
336
|
+
Args:
|
337
|
+
parquet_file: The ParquetFile to add column to
|
338
|
+
column_name: Name of the new column
|
339
|
+
column_value: Value to populate in all rows of the new column
|
340
|
+
|
341
|
+
Returns:
|
342
|
+
PyArrow Table with the new column
|
343
|
+
"""
|
344
|
+
# Convert ParquetFile to Table
|
345
|
+
table = parquet_file.read()
|
346
|
+
|
347
|
+
# Use the existing PyArrow append column function
|
348
|
+
num_rows = table.num_rows
|
349
|
+
column_array = pa.array([column_value] * num_rows)
|
350
|
+
return table.append_column(column_name, column_array)
|
351
|
+
|
352
|
+
|
353
|
+
TABLE_CLASS_TO_APPEND_COLUMN_FUNC: Dict[
|
354
|
+
Type[Union[LocalTable, DistributedDataset]], Callable
|
355
|
+
] = {
|
356
|
+
pa.Table: pa_utils.append_column_to_table,
|
357
|
+
papq.ParquetFile: append_column_to_parquet_file,
|
358
|
+
pd.DataFrame: pd_utils.append_column_to_dataframe,
|
359
|
+
pl.DataFrame: pl_utils.append_column_to_table,
|
360
|
+
np.ndarray: np_utils.append_column_to_ndarray,
|
361
|
+
}
|
362
|
+
|
363
|
+
TABLE_CLASS_TO_SELECT_COLUMNS_FUNC: Dict[
|
364
|
+
Type[Union[LocalTable, DistributedDataset]], Callable
|
365
|
+
] = {
|
366
|
+
pa.Table: pa_utils.select_columns,
|
367
|
+
pd.DataFrame: pd_utils.select_columns,
|
368
|
+
pl.DataFrame: pl_utils.select_columns,
|
369
|
+
}
|
370
|
+
|
371
|
+
TABLE_CLASS_TO_TABLE_TYPE: Dict[Union[LocalTable, DistributedDataset], str] = {
|
372
|
+
pa.Table: DatasetType.PYARROW.value,
|
373
|
+
papq.ParquetFile: DatasetType.PYARROW_PARQUET.value,
|
374
|
+
pl.DataFrame: DatasetType.POLARS.value,
|
375
|
+
pd.DataFrame: DatasetType.PANDAS.value,
|
376
|
+
np.ndarray: DatasetType.NUMPY.value,
|
377
|
+
daft.DataFrame: DatasetType.DAFT.value,
|
378
|
+
RayDataset: DatasetType.RAY_DATASET.value,
|
379
|
+
MaterializedDataset: DatasetType.RAY_DATASET.value,
|
65
380
|
}
|
66
381
|
|
67
382
|
TABLE_TYPE_TO_DATASET_CREATE_FUNC: Dict[str, Callable] = {
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
383
|
+
DatasetType.PYARROW.value: from_arrow,
|
384
|
+
DatasetType.PYARROW_PARQUET.value: from_arrow,
|
385
|
+
DatasetType.NUMPY.value: from_numpy,
|
386
|
+
DatasetType.PANDAS.value: from_pandas,
|
72
387
|
}
|
73
388
|
|
74
389
|
TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
390
|
+
DatasetType.PYARROW.value: from_arrow_refs,
|
391
|
+
DatasetType.PYARROW_PARQUET.value: from_arrow_refs,
|
392
|
+
DatasetType.NUMPY.value: from_numpy,
|
393
|
+
DatasetType.PANDAS.value: from_pandas_refs,
|
394
|
+
DatasetType.POLARS.value: from_arrow_refs, # We cast Polars to Arrow for Ray Datasets
|
395
|
+
DatasetType.RAY_DATASET.value: from_arrow_refs, # Ray Datasets are created from Arrow refs
|
396
|
+
}
|
397
|
+
|
398
|
+
TABLE_TYPE_TO_CONCAT_FUNC: Dict[str, Callable] = {
|
399
|
+
DatasetType.PYARROW_PARQUET.value: pa_utils.concat_tables,
|
400
|
+
DatasetType.PYARROW.value: pa_utils.concat_tables,
|
401
|
+
DatasetType.PANDAS.value: pd_utils.concat_dataframes,
|
402
|
+
DatasetType.NUMPY.value: np_utils.concat_ndarrays,
|
403
|
+
DatasetType.POLARS.value: pl_utils.concat_dataframes,
|
79
404
|
}
|
80
405
|
|
81
406
|
|
407
|
+
def _infer_schema_from_numpy_array(data: np.ndarray) -> Schema:
|
408
|
+
"""Infer schema from NumPy array."""
|
409
|
+
if data.ndim > 2:
|
410
|
+
raise ValueError(
|
411
|
+
f"NumPy arrays with {data.ndim} dimensions are not supported. "
|
412
|
+
f"Only 1D and 2D arrays are supported."
|
413
|
+
)
|
414
|
+
# Handle object dtype by converting to pandas first
|
415
|
+
df = pd.DataFrame(data)
|
416
|
+
arrow_schema = pa.Schema.from_pandas(df)
|
417
|
+
|
418
|
+
from deltacat.storage.model.schema import Schema
|
419
|
+
|
420
|
+
return Schema.of(schema=arrow_schema)
|
421
|
+
|
422
|
+
|
423
|
+
def _infer_schema_from_ray_dataset(data: RayDataset) -> Schema:
|
424
|
+
"""Infer schema from Ray Dataset."""
|
425
|
+
ray_schema = data.schema()
|
426
|
+
base_schema = ray_schema.base_schema
|
427
|
+
|
428
|
+
if isinstance(base_schema, pa.Schema):
|
429
|
+
arrow_schema = base_schema
|
430
|
+
elif isinstance(base_schema, PandasBlockSchema):
|
431
|
+
try:
|
432
|
+
dtype_dict = {
|
433
|
+
name: dtype for name, dtype in zip(base_schema.names, base_schema.types)
|
434
|
+
}
|
435
|
+
empty_df = pd.DataFrame(columns=base_schema.names).astype(dtype_dict)
|
436
|
+
arrow_schema = pa.Schema.from_pandas(empty_df)
|
437
|
+
except Exception as e:
|
438
|
+
raise ValueError(
|
439
|
+
f"Failed to convert Ray Dataset PandasBlockSchema to PyArrow schema: {e}"
|
440
|
+
)
|
441
|
+
else:
|
442
|
+
raise ValueError(
|
443
|
+
f"Unsupported Ray Dataset schema type: {type(base_schema)}. "
|
444
|
+
f"Expected PyArrow Schema or PandasBlockSchema, got {base_schema}"
|
445
|
+
)
|
446
|
+
|
447
|
+
from deltacat.storage.model.schema import Schema
|
448
|
+
|
449
|
+
return Schema.of(schema=arrow_schema)
|
450
|
+
|
451
|
+
|
452
|
+
def _infer_schema_from_pandas_dataframe(data: pd.DataFrame) -> Schema:
|
453
|
+
"""Infer schema from Pandas DataFrame."""
|
454
|
+
from deltacat.storage.model.schema import Schema
|
455
|
+
|
456
|
+
arrow_schema = pa.Schema.from_pandas(data)
|
457
|
+
return Schema.of(schema=arrow_schema)
|
458
|
+
|
459
|
+
|
460
|
+
def _infer_schema_from_polars_dataframe(data: pl.DataFrame) -> Schema:
|
461
|
+
"""Infer schema from Polars DataFrame."""
|
462
|
+
from deltacat.storage.model.schema import Schema
|
463
|
+
|
464
|
+
arrow_table = data.to_arrow()
|
465
|
+
return Schema.of(schema=arrow_table.schema)
|
466
|
+
|
467
|
+
|
468
|
+
def _infer_schema_from_pyarrow(
|
469
|
+
data: Union[pa.Table, pa.RecordBatch, ds.Dataset]
|
470
|
+
) -> Schema:
|
471
|
+
"""Infer schema from PyArrow Table, RecordBatch, or Dataset."""
|
472
|
+
from deltacat.storage.model.schema import Schema
|
473
|
+
|
474
|
+
return Schema.of(schema=data.schema)
|
475
|
+
|
476
|
+
|
477
|
+
def _infer_schema_from_daft_dataframe(data: daft.DataFrame) -> Schema:
|
478
|
+
"""Infer schema from Daft DataFrame."""
|
479
|
+
from deltacat.storage.model.schema import Schema
|
480
|
+
|
481
|
+
daft_schema = data.schema()
|
482
|
+
arrow_schema = daft_schema.to_pyarrow_schema()
|
483
|
+
return Schema.of(schema=arrow_schema)
|
484
|
+
|
485
|
+
|
486
|
+
TABLE_CLASS_TO_SCHEMA_INFERENCE_FUNC: Dict[
|
487
|
+
Type[Union[LocalTable, DistributedDataset], Callable]
|
488
|
+
] = {
|
489
|
+
pd.DataFrame: _infer_schema_from_pandas_dataframe,
|
490
|
+
pl.DataFrame: _infer_schema_from_polars_dataframe,
|
491
|
+
pa.Table: _infer_schema_from_pyarrow,
|
492
|
+
pa.RecordBatch: _infer_schema_from_pyarrow,
|
493
|
+
ds.Dataset: _infer_schema_from_pyarrow,
|
494
|
+
RayDataset: _infer_schema_from_ray_dataset,
|
495
|
+
MaterializedDataset: _infer_schema_from_ray_dataset, # MaterializedDataset uses same schema inference as RayDataset
|
496
|
+
daft.DataFrame: _infer_schema_from_daft_dataframe,
|
497
|
+
np.ndarray: _infer_schema_from_numpy_array,
|
498
|
+
}
|
499
|
+
|
500
|
+
|
501
|
+
def infer_table_schema(data: Union[LocalTable, DistributedDataset]) -> Schema:
|
502
|
+
"""Infer schema from a table or dataset."""
|
503
|
+
infer_schema_func = _get_table_function(
|
504
|
+
data,
|
505
|
+
TABLE_CLASS_TO_SCHEMA_INFERENCE_FUNC,
|
506
|
+
"schema inference",
|
507
|
+
)
|
508
|
+
return infer_schema_func(data)
|
509
|
+
|
510
|
+
|
511
|
+
def concat_tables(tables: List[LocalTable], table_type: DatasetType) -> LocalTable:
|
512
|
+
"""
|
513
|
+
Concatenate a list of tables into a single table using the appropriate
|
514
|
+
concatenation function for the given table type.
|
515
|
+
|
516
|
+
Args:
|
517
|
+
tables: List of tables to concatenate
|
518
|
+
table_type: The DatasetType indicating which concatenation function to use
|
519
|
+
|
520
|
+
Returns:
|
521
|
+
Single concatenated table of the appropriate type
|
522
|
+
|
523
|
+
Raises:
|
524
|
+
ValueError: If no concatenation function is found for the table type
|
525
|
+
"""
|
526
|
+
concat_func = _get_table_type_function(
|
527
|
+
table_type, TABLE_TYPE_TO_CONCAT_FUNC, "concatenation"
|
528
|
+
)
|
529
|
+
return concat_func(tables)
|
530
|
+
|
531
|
+
|
532
|
+
def _daft_reader_wrapper(*args, **kwargs):
|
533
|
+
"""Wrapper for daft reader with lazy import to avoid circular import."""
|
534
|
+
from deltacat.utils.daft import files_to_dataframe
|
535
|
+
|
536
|
+
return files_to_dataframe(*args, **kwargs)
|
537
|
+
|
538
|
+
|
82
539
|
DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
|
83
|
-
DistributedDatasetType.DAFT.value:
|
540
|
+
DistributedDatasetType.DAFT.value: _daft_reader_wrapper,
|
84
541
|
}
|
85
542
|
|
86
543
|
|
@@ -89,13 +546,14 @@ class TableWriteMode(str, Enum):
|
|
89
546
|
Enum controlling how a given dataset will be written to a table.
|
90
547
|
|
91
548
|
AUTO: CREATE if the table doesn't exist, APPEND if the table exists
|
92
|
-
without
|
549
|
+
without merge keys, and MERGE if the table exists with merge keys.
|
93
550
|
CREATE: Create the table if it doesn't exist, throw an error if it does.
|
94
551
|
APPEND: Append to the table if it exists, throw an error if it doesn't.
|
95
552
|
REPLACE: Replace existing table contents with the data to write.
|
96
|
-
MERGE: Insert
|
97
|
-
Updates or inserts records based on the table's
|
553
|
+
MERGE: Insert or update records matching table merge keys.
|
554
|
+
Updates or inserts records based on the table's merge and sort keys by
|
98
555
|
default.
|
556
|
+
DELETE: Delete records matching table merge keys.
|
99
557
|
"""
|
100
558
|
|
101
559
|
AUTO = "auto"
|
@@ -103,29 +561,1869 @@ class TableWriteMode(str, Enum):
|
|
103
561
|
APPEND = "append"
|
104
562
|
REPLACE = "replace"
|
105
563
|
MERGE = "merge"
|
564
|
+
DELETE = "delete"
|
106
565
|
|
107
566
|
|
108
|
-
|
109
|
-
|
567
|
+
class SchemaEvolutionMode(str, Enum):
|
568
|
+
"""
|
569
|
+
Enum controlling how schema changes are handled when writing to a table.
|
570
|
+
|
571
|
+
MANUAL: Schema changes must be explicitly handled by the user. New fields
|
572
|
+
not in the existing schema will cause an error.
|
573
|
+
AUTO: Schema changes are automatically handled. New fields are added to
|
574
|
+
the schema using the table's default_schema_consistency_type.
|
575
|
+
DISABLED: Schema changes are disabled. The schema that the table was
|
576
|
+
created with is immutable.
|
577
|
+
"""
|
578
|
+
|
579
|
+
MANUAL = "manual"
|
580
|
+
AUTO = "auto"
|
581
|
+
DISABLED = "disabled"
|
582
|
+
|
583
|
+
|
584
|
+
class TableProperty(str, Enum):
|
585
|
+
"""
|
586
|
+
Enum defining known table property key names.
|
587
|
+
"""
|
588
|
+
|
589
|
+
READ_OPTIMIZATION_LEVEL = "read_optimization_level"
|
590
|
+
RECORDS_PER_COMPACTED_FILE = "records_per_compacted_file"
|
591
|
+
APPENDED_RECORD_COUNT_COMPACTION_TRIGGER = (
|
592
|
+
"appended_record_count_compaction_trigger"
|
593
|
+
)
|
594
|
+
APPENDED_FILE_COUNT_COMPACTION_TRIGGER = "appended_file_count_compaction_trigger"
|
595
|
+
APPENDED_DELTA_COUNT_COMPACTION_TRIGGER = "appended_delta_count_compaction_trigger"
|
596
|
+
DEFAULT_COMPACTION_HASH_BUCKET_COUNT = "default_compaction_hash_bucket_count"
|
597
|
+
SCHEMA_EVOLUTION_MODE = "schema_evolution_mode"
|
598
|
+
DEFAULT_SCHEMA_CONSISTENCY_TYPE = "default_schema_consistency_type"
|
599
|
+
SUPPORTED_READER_TYPES = "supported_reader_types"
|
600
|
+
|
601
|
+
def read_table_property(
|
602
|
+
table_or_table_version: Union[Table, TableVersion], property: TableProperty
|
603
|
+
) -> Any:
|
604
|
+
properties = table_or_table_version.properties or {}
|
605
|
+
value = properties.get(property.value, TablePropertyDefaultValues[property])
|
606
|
+
|
607
|
+
# Handle property type conversion
|
608
|
+
if property == TableProperty.SUPPORTED_READER_TYPES and isinstance(value, list):
|
609
|
+
# Convert string values back to DatasetType enums
|
610
|
+
return [DatasetType(v) for v in value]
|
611
|
+
if property == TableProperty.SCHEMA_EVOLUTION_MODE:
|
612
|
+
return SchemaEvolutionMode(value)
|
613
|
+
if property == TableProperty.DEFAULT_SCHEMA_CONSISTENCY_TYPE:
|
614
|
+
return SchemaConsistencyType(value)
|
615
|
+
if property == TableProperty.READ_OPTIMIZATION_LEVEL:
|
616
|
+
return TableReadOptimizationLevel(value)
|
617
|
+
return value
|
618
|
+
|
619
|
+
|
620
|
+
class TableReadOptimizationLevel(str, Enum):
|
621
|
+
"""
|
622
|
+
Enum controlling the how much to optimize reads when writing to a table. Different levels
|
623
|
+
here correspond to different tradeoffs between write and read performance.
|
624
|
+
|
625
|
+
NONE: No read optimization. Deletes and updates are resolved by finding the values
|
626
|
+
that match merge key predicates by running compaction at read time. Provides the
|
627
|
+
fastest/cheapest writes but slow/expensive reads. Resilient to conflicts with concurrent
|
628
|
+
writes, including table management jobs like compaction.
|
629
|
+
|
630
|
+
MODERATE: Discover record indexes that match merge key predicates at write time and record
|
631
|
+
those values as logically deleted (e.g., using a bitmask). Provides faster/cheaper reads but
|
632
|
+
slower/more-expensive writes. May conflict with concurrent writes that remove/replace data
|
633
|
+
files like compaction.
|
634
|
+
|
635
|
+
MAX: Materialize all deletes and updates at write time by running compaction during
|
636
|
+
every write. Provides fast/cheap reads but slow/expensive writes. May conflict with
|
637
|
+
concurrent writes, including table management jobs like compaction.
|
638
|
+
"""
|
639
|
+
|
640
|
+
NONE = "none"
|
641
|
+
MODERATE = "moderate"
|
642
|
+
MAX = "max"
|
643
|
+
|
644
|
+
|
645
|
+
TablePropertyDefaultValues: Dict[TableProperty, Any] = {
|
646
|
+
TableProperty.READ_OPTIMIZATION_LEVEL: TableReadOptimizationLevel.MAX,
|
647
|
+
TableProperty.RECORDS_PER_COMPACTED_FILE: MAX_RECORDS_PER_COMPACTED_FILE,
|
648
|
+
TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER: MAX_RECORDS_PER_COMPACTED_FILE
|
649
|
+
* 2,
|
650
|
+
TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER: 1000,
|
651
|
+
TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER: 100,
|
652
|
+
TableProperty.DEFAULT_COMPACTION_HASH_BUCKET_COUNT: 8,
|
653
|
+
TableProperty.SCHEMA_EVOLUTION_MODE: SchemaEvolutionMode.AUTO,
|
654
|
+
TableProperty.DEFAULT_SCHEMA_CONSISTENCY_TYPE: SchemaConsistencyType.NONE,
|
655
|
+
TableProperty.SUPPORTED_READER_TYPES: [d for d in DatasetType],
|
656
|
+
}
|
110
657
|
|
111
658
|
|
112
|
-
def
|
113
|
-
|
114
|
-
|
659
|
+
def _get_table_function(
|
660
|
+
table: Union[LocalTable, DistributedDataset],
|
661
|
+
function_map: Dict[Type, Callable],
|
662
|
+
operation_name: str,
|
663
|
+
) -> Callable:
|
664
|
+
"""Generic helper to look up table-type-specific functions."""
|
665
|
+
table_func = function_map.get(type(table))
|
666
|
+
if table_func is None:
|
115
667
|
msg = (
|
116
|
-
f"No
|
117
|
-
f"Known table types: {
|
668
|
+
f"No {operation_name} function found for table type: {type(table)}.\n"
|
669
|
+
f"Known table types: {list(function_map.keys())}"
|
118
670
|
)
|
119
671
|
raise ValueError(msg)
|
120
|
-
return
|
672
|
+
return table_func
|
121
673
|
|
122
674
|
|
123
|
-
def
|
124
|
-
|
125
|
-
|
675
|
+
def _get_table_type_function(
|
676
|
+
table_type: DatasetType, function_map: Dict[str, Callable], operation_name: str
|
677
|
+
) -> Callable:
|
678
|
+
"""Generic helper to look up DatasetType-specific functions."""
|
679
|
+
table_func = function_map.get(table_type.value)
|
680
|
+
if table_func is None:
|
126
681
|
msg = (
|
127
|
-
f"No
|
128
|
-
f"Known table types: {
|
682
|
+
f"No {operation_name} function found for table type: {table_type}.\n"
|
683
|
+
f"Known table types: {list(function_map.keys())}"
|
129
684
|
)
|
130
685
|
raise ValueError(msg)
|
131
|
-
return
|
686
|
+
return table_func
|
687
|
+
|
688
|
+
|
689
|
+
def _convert_all(tables: List[LocalTable], conversion_fn: Callable, **kwargs):
|
690
|
+
if not tables: # Empty list
|
691
|
+
return pd.DataFrame()
|
692
|
+
|
693
|
+
# Convert list elements
|
694
|
+
all_tables = []
|
695
|
+
for i, table in enumerate(tables):
|
696
|
+
try:
|
697
|
+
converted_table = conversion_fn(table, **kwargs)
|
698
|
+
all_tables.append(converted_table)
|
699
|
+
except Exception as e:
|
700
|
+
raise ValueError(f"Failed to convert list element {i}: {e}") from e
|
701
|
+
|
702
|
+
# Concatenate with error handling - handle different table types
|
703
|
+
try:
|
704
|
+
# Check if we have PyArrow tables
|
705
|
+
if all(isinstance(table, pa.Table) for table in all_tables):
|
706
|
+
# Use PyArrow concatenation for PyArrow tables
|
707
|
+
return pa.concat_tables(all_tables, promote_options="permissive")
|
708
|
+
else:
|
709
|
+
# Use pandas concatenation for other types
|
710
|
+
return pd.concat(all_tables, ignore_index=True, sort=False)
|
711
|
+
except Exception as e:
|
712
|
+
raise ValueError(f"Failed to concatenate {len(all_tables)} tables: {e}") from e
|
713
|
+
|
714
|
+
|
715
|
+
def get_table_length(
|
716
|
+
table: Union[LocalTable, DistributedDataset, BlockAccessor]
|
717
|
+
) -> int:
|
718
|
+
"""
|
719
|
+
Generic function to get the length of a table or distributed dataset.
|
720
|
+
|
721
|
+
Args:
|
722
|
+
table: The local table or distributed dataset to get the length of
|
723
|
+
|
724
|
+
Returns:
|
725
|
+
Length of the table or distributed dataset in rows
|
726
|
+
"""
|
727
|
+
# Handle DAFT DataFrames dynamically
|
728
|
+
if hasattr(table, "count_rows") and str(type(table).__module__).startswith("daft"):
|
729
|
+
return table.count_rows()
|
730
|
+
elif isinstance(table, RayDataset):
|
731
|
+
return table.count()
|
732
|
+
elif isinstance(table, papq.ParquetFile):
|
733
|
+
return table.metadata.num_rows
|
734
|
+
else:
|
735
|
+
return len(table)
|
736
|
+
|
737
|
+
|
738
|
+
def dataset_length(table: Dataset) -> int:
|
739
|
+
"""
|
740
|
+
Generic function to get the length of a dataset in records.
|
741
|
+
If the input is a list of tables, the length is the sum of the
|
742
|
+
lengths of the tables.
|
743
|
+
|
744
|
+
Args:
|
745
|
+
table: The dataset to get the length of
|
746
|
+
|
747
|
+
Returns:
|
748
|
+
Length of the dataset in records
|
749
|
+
"""
|
750
|
+
if isinstance(table, list):
|
751
|
+
return sum(get_table_length(t) for t in table)
|
752
|
+
return get_table_length(table)
|
753
|
+
|
754
|
+
|
755
|
+
def get_table_size(table: Union[LocalTable, DistributedDataset]) -> int:
    """
    Generic function to get the size of a table or distributed dataset.

    Args:
        table: The local table or distributed dataset to get the size of

    Returns:
        Size of the table or distributed dataset in bytes
    """
    table_size_func = _get_table_function(table, TABLE_CLASS_TO_SIZE_FUNC, "size")
    return table_size_func(table)


def dataset_size(table: Dataset) -> int:
    """
    Generic function to get the size of a dataset in bytes.
    If the input is a list of tables, the size is the sum of the
    sizes of the tables.

    Args:
        table: The dataset to get the size of

    Returns:
        Size of the dataset in bytes
    """
    if isinstance(table, list):
        return sum(get_table_size(t) for t in table)
    return get_table_size(table)


def get_table_column_names(table: Union[LocalTable, DistributedDataset]) -> List[str]:
    """
    Generic function to get the column names of a table or distributed dataset.

    Args:
        table: The local table or distributed dataset to get the column names of

    Returns:
        List of column names
    """
    column_names_func = _get_table_function(
        table, TABLE_CLASS_TO_COLUMN_NAMES_FUNC, "column names"
    )
    return column_names_func(table)


def dataset_column_names(table: Dataset) -> List[str]:
    """
    Generic function to get the column names of a dataset.
    If the input is a list of tables, unique column names are
    returned in the order they are first seen in the list.

    Args:
        table: The dataset to get the column names of

    Returns:
        List of column names
    """
    if isinstance(table, list):
        # use dictionary keys as an ordered set
        column_names = {}
        for t in table:
            for column_name in get_table_column_names(t):
                column_names[column_name] = None
        return list(column_names.keys())
    return get_table_column_names(table)


def get_table_schema(table: Union[LocalTable, DistributedDataset]) -> pa.Schema:
    """
    Generic function to get the PyArrow schema of a table or distributed dataset.

    Args:
        table: The local table or distributed dataset to get the schema of

    Returns:
        PyArrow Schema object
    """
    schema_func = _get_table_function(table, TABLE_CLASS_TO_SCHEMA_FUNC, "schema")
    return schema_func(table)


def dataset_schema(table: Dataset) -> pa.Schema:
    """
    Generic function to get the PyArrow schema of a dataset. If the input is a list of
    tables, uses pyarrow.unify_schemas(schemas, promote_options="permissive").

    Args:
        table: The dataset to get the schema of

    Returns:
        PyArrow Schema object
    """
    if isinstance(table, list):
        return pa.unify_schemas(
            [get_table_schema(t) for t in table], promote_options="permissive"
        )
    return get_table_schema(table)

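
# Editor's note: illustrative sketch, not part of this module. It demonstrates the
# permissive schema unification that dataset_schema above relies on; promote_options
# is assumed to be available (PyArrow >= 14).
import pyarrow as pa

_s1 = pa.schema([("id", pa.int32()), ("name", pa.string())])
_s2 = pa.schema([("id", pa.int64()), ("score", pa.float64())])
_unified = pa.unify_schemas([_s1, _s2], promote_options="permissive")
# "id" is promoted to int64; "name" and "score" are both retained.
assert _unified.field("id").type == pa.int64()
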
def get_table_writer(table: Union[LocalTable, DistributedDataset]) -> Callable:
    """
    Generic function to get a table writer function for a given dataset type.

    Args:
        table: The local table or distributed dataset to get the writer function for

    Returns:
        Writer function for the given dataset type
    """
    return _get_table_function(table, TABLE_CLASS_TO_WRITER_FUNC, "writer")


def get_table_slicer(table: Union[LocalTable, DistributedDataset]) -> Callable:
    """
    Generic function to get a table slicer function for a given dataset type.

    Args:
        table: The local table or distributed dataset to get the slicer function for

    Returns:
        Slicer function for the given dataset type
    """
    return _get_table_function(table, TABLE_CLASS_TO_SLICER_FUNC, "slicer")


def get_dataset_type(dataset: Dataset) -> DatasetType:
    """Get the DatasetType enum value for a given dataset object.

    Args:
        dataset: The dataset object to identify

    Returns:
        DatasetType enum value corresponding to the dataset type

    Raises:
        ValueError: If the dataset type is not supported
    """
    dataset_type_str = _get_table_function(
        dataset, TABLE_CLASS_TO_TABLE_TYPE, "dataset type identification"
    )
    return DatasetType(dataset_type_str)


def table_to_pyarrow(
    table: Union[LocalTable, DistributedDataset],
    *,
    schema: Optional[pa.Schema] = None,
    **kwargs,
) -> pa.Table:
    """
    Convert a single table or distributed dataset to PyArrow Table format.

    Args:
        table: The local table or distributed dataset to convert
        schema: Optional schema to use for the conversion
        **kwargs: Additional arguments passed to the conversion function

    Returns:
        PyArrow Table created from the provided dataset
    """
    to_pyarrow_func = _get_table_function(
        table, TABLE_CLASS_TO_PYARROW_FUNC, "pyarrow conversion"
    )
    return to_pyarrow_func(table, schema=schema, **kwargs)


def table_to_pandas(
    table: Union[LocalTable, DistributedDataset],
    *,
    schema: Optional[pa.Schema] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Convert a single table or distributed dataset to pandas DataFrame format.

    Args:
        table: The local table or distributed dataset to convert
        schema: Optional schema to use for the conversion
        **kwargs: Additional arguments passed to the conversion function

    Returns:
        pandas DataFrame created from the provided dataset
    """
    to_pandas_func = _get_table_function(
        table, TABLE_CLASS_TO_PANDAS_FUNC, "pandas conversion"
    )
    return to_pandas_func(table, schema=schema, **kwargs)


def to_pyarrow(
    table: Dataset, *, schema: Optional[pa.Schema] = None, **kwargs
) -> pa.Table:
    """
    Convert any supported dataset type to PyArrow Table format.

    Args:
        table: The table/dataset to convert
        schema: Optional schema to use for the conversion
        **kwargs: Additional arguments passed to the conversion function

    Returns:
        PyArrow Table created from the provided dataset
    """
    if isinstance(table, list):
        return _convert_all(table, table_to_pyarrow, schema=schema, **kwargs)
    return table_to_pyarrow(table, schema=schema, **kwargs)


def to_pandas(
    table: Dataset, *, schema: Optional[pa.Schema] = None, **kwargs
) -> pd.DataFrame:
    """
    Convert any supported dataset type to pandas DataFrame format.

    Args:
        table: The table/dataset to convert
        schema: Optional schema to use for the conversion
        **kwargs: Additional arguments passed to the conversion function

    Returns:
        pandas DataFrame created from the provided dataset
    """
    if isinstance(table, list):
        return _convert_all(table, table_to_pandas, schema=schema, **kwargs)
    return table_to_pandas(table, schema=schema, **kwargs)


def from_pyarrow(pa_table: pa.Table, target_type: DatasetType, **kwargs) -> Dataset:
    """Convert PyArrow Table to the specified dataset type.

    Args:
        pa_table: PyArrow Table to convert
        target_type: Target DatasetType to convert to
        **kwargs: Additional arguments passed to the conversion function

    Returns:
        Dataset converted to the target type

    Raises:
        ValueError: If target_type is not supported
    """
    conversion_func = _get_table_type_function(
        target_type,
        DATASET_TYPE_FROM_PYARROW,
        f"{target_type} conversion",
    )
    return conversion_func(pa_table, **kwargs)


def from_pandas(pd_df: pd.DataFrame, target_type: DatasetType, **kwargs) -> Dataset:
    """Convert Pandas DataFrame to the specified dataset type.

    Args:
        pd_df: Pandas DataFrame to convert
        target_type: Target DatasetType to convert to
        **kwargs: Additional arguments passed to the conversion function

    Returns:
        Dataset converted to the target type

    Raises:
        ValueError: If target_type is not supported
    """
    conversion_func = _get_table_type_function(
        target_type,
        DATASET_TYPE_FROM_PANDAS,
        f"{target_type} conversion",
    )
    return conversion_func(pd_df, **kwargs)

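
# Editor's note: illustrative sketch, not part of this module. The from_pandas /
# to_pandas helpers above ultimately dispatch to conversions equivalent to this
# direct PyArrow round trip.
import pandas as pd
import pyarrow as pa

_df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
_pa_table = pa.Table.from_pandas(_df, preserve_index=False)
assert _pa_table.to_pandas().equals(_df)
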
def empty_table(table_type: DatasetType) -> Dataset:
    """
    Create an empty table of the given type.
    """
    empty_table_func = _get_table_type_function(
        table_type, TABLE_TYPE_TO_EMPTY_TABLE_FUNC, "empty table"
    )
    return empty_table_func()


def append_column_to_table(
    table: LocalTable,
    column_name: str,
    column_value: Any,
) -> LocalTable:
    """
    Generic function to append a column with a specified value to any supported dataset type.

    Args:
        table: The table/dataset to add column to
        column_name: Name of the new column
        column_value: Value to populate in all rows of the new column

    Returns:
        Updated table with the new column
    """
    append_column_to_table_func = _get_table_function(
        table, TABLE_CLASS_TO_APPEND_COLUMN_FUNC, "append column"
    )
    return append_column_to_table_func(table, column_name, column_value)


def select_columns_from_table(
    table: LocalTable,
    column_names: List[str],
) -> LocalTable:
    """
    Generic function to select columns from any supported dataset type.

    Args:
        table: The table/dataset to select columns from
        column_names: List of column names to select

    Returns:
        Updated table with the selected columns
    """
    select_columns_func = _get_table_function(
        table, TABLE_CLASS_TO_SELECT_COLUMNS_FUNC, "select columns"
    )
    return select_columns_func(table, column_names)

def write_sliced_table(
    table: Union[LocalTable, DistributedDataset],
    base_path: str,
    filesystem: Optional[pa.fs.FileSystem],
    max_records_per_entry: Optional[int],
    table_writer_fn: Callable,
    table_slicer_fn: Callable,
    table_writer_kwargs: Optional[Dict[str, Any]] = None,
    content_type: ContentType = ContentType.PARQUET,
    entry_params: Optional[EntryParams] = None,
    entry_type: Optional[EntryType] = EntryType.DATA,
) -> ManifestEntryList:
    """
    Writes table slices to 1 or more files and returns
    manifest entries describing the uploaded files.

    Args:
        table: The local table or distributed dataset to write
        base_path: The base path to write the table to
        filesystem: The filesystem to write the table to
        max_records_per_entry: Maximum number of records to write per file
        table_writer_fn: The function used to write each table slice
        table_slicer_fn: The function used to slice the table into multiple files
        table_writer_kwargs: Additional arguments to pass to the table writer
        content_type: The content type of the files to write
        entry_params: Manifest entry parameters
        entry_type: The manifest entry type to write

    Returns:
        Manifest entries describing the uploaded files
    """
    # @retry decorator can't be pickled by Ray, so wrap upload in Retrying
    retrying = Retrying(
        wait=wait_random_exponential(multiplier=1, max=60),
        stop=stop_after_delay(UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY),
        retry=retry_if_exception_type(RetryableError),
    )

    manifest_entries = ManifestEntryList()
    table_record_count = get_table_length(table)

    if max_records_per_entry is None or not table_record_count:
        # write the whole table to a single file
        manifest_entries = retrying(
            write_table,
            table,
            f"{base_path}",  # cast any non-string arg to string
            filesystem,
            table_writer_fn,
            table_writer_kwargs,
            content_type,
            entry_params,
            entry_type,
        )
    else:
        # iteratively write table slices
        table_slices = table_slicer_fn(table, max_records_per_entry)
        for table_slice in table_slices:
            slice_entries = retrying(
                write_table,
                table_slice,
                f"{base_path}",  # cast any non-string arg to string
                filesystem,
                table_writer_fn,
                table_writer_kwargs,
                content_type,
                entry_params,
                entry_type,
            )
            manifest_entries.extend(slice_entries)
    return manifest_entries

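
# Editor's note: illustrative sketch, not part of this module. It shows the picklable
# retry pattern used by write_sliced_table above: instead of the @retry decorator
# (which Ray can't pickle), a Retrying object is built and called with the target
# function plus its arguments. _FlakyError and _flaky_write are placeholder names.
from tenacity import (
    Retrying,
    retry_if_exception_type,
    stop_after_delay,
    wait_random_exponential,
)

class _FlakyError(Exception):
    pass

_attempts = {"count": 0}

def _flaky_write(path: str) -> str:
    _attempts["count"] += 1
    if _attempts["count"] < 3:
        raise _FlakyError("transient failure")
    return f"wrote {path}"

_retrying = Retrying(
    wait=wait_random_exponential(multiplier=0.01, max=0.1),
    stop=stop_after_delay(10),
    retry=retry_if_exception_type(_FlakyError),
)
assert _retrying(_flaky_write, "example/path") == "wrote example/path"
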
def write_table(
    table: Union[LocalTable, DistributedDataset],
    base_path: str,
    filesystem: Optional[pa.fs.FileSystem],
    table_writer_fn: Callable,
    table_writer_kwargs: Optional[Dict[str, Any]],
    content_type: ContentType = ContentType.PARQUET,
    entry_params: Optional[EntryParams] = None,
    entry_type: Optional[EntryType] = EntryType.DATA,
) -> ManifestEntryList:
    """
    Writes the given table to 1 or more files and returns
    manifest entries describing the uploaded files.

    Args:
        table: The local table or distributed dataset to write
        base_path: The base path to write the table to
        filesystem: The filesystem to write the table to
        table_writer_fn: The function used to write the table
        table_writer_kwargs: Additional arguments to pass to the table writer
        content_type: The content type of the files to write
        entry_params: Manifest entry parameters
        entry_type: The manifest entry type to write

    Returns:
        Manifest entries describing the uploaded files
    """
    if table_writer_kwargs is None:
        table_writer_kwargs = {}

    # Determine content_encoding before writing files so we can include it in filenames
    content_encoding = None
    if content_type in EXPLICIT_COMPRESSION_CONTENT_TYPES:
        # TODO(pdames): Support other user-specified encodings at write time.
        content_encoding = ContentEncoding.GZIP

    wrapped_obj = (
        CapturedBlockWritePathsActor.remote()
        if isinstance(table, RayDataset)
        else CapturedBlockWritePathsBase()
    )
    capture_object = CapturedBlockWritePaths(wrapped_obj)
    block_write_path_provider = UuidBlockWritePathProvider(
        capture_object,
        base_path=base_path,
        content_type=content_type,
        content_encoding=content_encoding,
    )
    # Extract schema, schema_id, and sort_scheme_id from table_writer_kwargs
    schema = table_writer_kwargs.pop("schema", None)
    schema_id = table_writer_kwargs.pop("schema_id", None)
    sort_scheme_id = table_writer_kwargs.pop("sort_scheme_id", None)
    table_writer_fn(
        table,
        base_path,
        filesystem,
        block_write_path_provider,
        content_type.value,
        schema=schema,
        **table_writer_kwargs,
    )
    # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
    del block_write_path_provider
    blocks = capture_object.blocks()
    write_paths = capture_object.write_paths()
    metadata = get_block_metadata_list(table, write_paths, blocks)
    manifest_entries = ManifestEntryList()
    for block_idx, path in enumerate(write_paths):
        try:
            manifest_entry = ManifestEntry.from_path(
                path=path,
                filesystem=filesystem,
                record_count=metadata[block_idx].num_rows,
                source_content_length=metadata[block_idx].size_bytes,
                content_type=content_type.value,
                content_encoding=content_encoding,
                entry_type=entry_type,
                entry_params=entry_params,
                schema_id=schema_id,
                sort_scheme_id=sort_scheme_id,
            )
            manifest_entries.append(manifest_entry)
        except RETRYABLE_TRANSIENT_ERRORS as e:
            _handle_retryable_error(e, path, "write", RetryableUploadTableError)
        except BaseException as e:
            _handle_non_retryable_error(
                e,
                path,
                "upload",
                NonRetryableUploadTableError,
                f"and content_type={content_type}",
            )
    return manifest_entries

@ray.remote
class CapturedBlockWritePathsActor:
    def __init__(self):
        self._wrapped = CapturedBlockWritePathsBase()

    def extend(self, write_paths: List[str], blocks: List[Block]) -> None:
        self._wrapped.extend(write_paths, blocks)

    def write_paths(self) -> List[str]:
        return self._wrapped.write_paths()

    def blocks(self) -> List[Block]:
        return self._wrapped.blocks()


class CapturedBlockWritePathsBase:
    def __init__(self):
        self._write_paths: List[str] = []
        self._blocks: List[Block] = []

    def extend(self, write_paths: List[str], blocks: List[Block]) -> None:
        try:
            iter(write_paths)
        except TypeError:
            pass
        else:
            self._write_paths.extend(write_paths)
        try:
            iter(blocks)
        except TypeError:
            pass
        else:
            self._blocks.extend(blocks)

    def write_paths(self) -> List[str]:
        return self._write_paths

    def blocks(self) -> List[Block]:
        return self._blocks


class CapturedBlockWritePaths:
    def __init__(self, wrapped=CapturedBlockWritePathsBase()):
        self._wrapped = wrapped

    def extend(self, write_paths: List[str], blocks: List[Block]) -> None:
        return (
            self._wrapped.extend(write_paths, blocks)
            if isinstance(self._wrapped, CapturedBlockWritePathsBase)
            else ray.get(self._wrapped.extend.remote(write_paths, blocks))
        )

    def write_paths(self) -> List[str]:
        return (
            self._wrapped.write_paths()
            if isinstance(self._wrapped, CapturedBlockWritePathsBase)
            else ray.get(self._wrapped.write_paths.remote())
        )

    def blocks(self) -> List[Block]:
        return (
            self._wrapped.blocks()
            if isinstance(self._wrapped, CapturedBlockWritePathsBase)
            else ray.get(self._wrapped.blocks.remote())
        )

class UuidBlockWritePathProvider(FilenameProvider):
    """Block write path provider implementation that writes each
    dataset block out to a file of the form: {base_path}/{uuid}
    """

    def __init__(
        self,
        capture_object: CapturedBlockWritePaths,
        base_path: Optional[str] = None,
        content_type: Optional[ContentType] = None,
        content_encoding: Optional[ContentEncoding] = None,
    ):
        self.base_path = base_path
        self.content_type = content_type
        self.content_encoding = content_encoding
        self.write_paths: List[str] = []
        self.blocks: List[Block] = []
        self.capture_object = capture_object

    def __del__(self):
        if self.write_paths or self.blocks:
            self.capture_object.extend(
                self.write_paths,
                self.blocks,
            )

    def get_filename_for_block(
        self,
        block: Block,
        task_index: int,
        block_index: int,
    ) -> str:
        if self.base_path is None:
            raise ValueError(
                "Base path must be provided to UuidBlockWritePathProvider",
            )
        return self._get_write_path_for_block(
            base_path=self.base_path,
            block=block,
            block_index=block_index,
        )

    def _get_write_path_for_block(
        self,
        base_path: str,
        *,
        block: Optional[Block] = None,
        **kwargs,
    ) -> str:
        # Generate base UUID filename
        filename = str(uuid4())

        # Add content type extension if available
        if self.content_type:
            content_type_extension = CONTENT_TYPE_TO_EXT.get(self.content_type)
            if content_type_extension:
                filename += content_type_extension

        # Add content encoding extension if available
        if self.content_encoding:
            encoding_extension = CONTENT_ENCODING_TO_EXT.get(self.content_encoding)
            if encoding_extension:
                filename += encoding_extension

        write_path = f"{base_path}/{filename}"
        self.write_paths.append(write_path)
        if block is not None:
            self.blocks.append(block)
        return write_path

    def __call__(
        self,
        base_path: str,
        *,
        filesystem: Optional[pyarrow.fs.FileSystem] = None,
        dataset_uuid: Optional[str] = None,
        block: Optional[Block] = None,
        block_index: Optional[int] = None,
        file_format: Optional[str] = None,
    ) -> str:
        return self._get_write_path_for_block(
            base_path,
            filesystem=filesystem,
            dataset_uuid=dataset_uuid,
            block=block,
            block_index=block_index,
            file_format=file_format,
        )

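
# Editor's note: illustrative sketch, not part of this module. It mirrors the
# {base_path}/{uuid}{type_ext}{encoding_ext} naming scheme implemented by
# UuidBlockWritePathProvider above; the extension maps below are stand-ins for
# DeltaCAT's CONTENT_TYPE_TO_EXT and CONTENT_ENCODING_TO_EXT constants.
from uuid import uuid4

_TYPE_EXT = {"text/csv": ".csv", "application/parquet": ".parquet"}
_ENCODING_EXT = {"gzip": ".gz"}

def _example_write_path(base_path: str, content_type: str, content_encoding: str = None) -> str:
    filename = str(uuid4())
    filename += _TYPE_EXT.get(content_type, "")
    if content_encoding:
        filename += _ENCODING_EXT.get(content_encoding, "")
    return f"{base_path}/{filename}"

# e.g. "s3://bucket/table/8c2d...-....csv.gz"
print(_example_write_path("s3://bucket/table", "text/csv", "gzip"))
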
def get_block_metadata_list(
    table: LocalTable,
    write_paths: List[str],
    blocks: List[Block],
) -> List[BlockMetadata]:
    """
    Get the block metadata for a given table.

    Args:
        table: The local table or distributed dataset to get the block metadata for
        write_paths: The list of write paths for the table
        blocks: The list of blocks to get the metadata for

    Returns:
        List of block metadata
    """
    block_meta_list: List[BlockMetadata] = []
    if not blocks:
        # this must be a local table - ensure it was written to only 1 file
        assert len(write_paths) == 1, (
            f"Expected table of type '{type(table)}' to be written to 1 "
            f"file, but found {len(write_paths)} files."
        )
        blocks = [table]
    for block in blocks:
        block_meta_list.append(get_block_metadata(block))
    return block_meta_list


def get_block_metadata(
    table: Union[LocalTable, DistributedDataset, BlockAccessor],
) -> BlockMetadata:
    """
    Get the block metadata for a given table.

    Args:
        table: The local table or distributed dataset to get the block metadata for

    Returns:
        Block metadata
    """
    table_size = None
    table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
    if table_size_func:
        table_size = table_size_func(table)
    else:
        logger.warning(f"Unable to estimate '{type(table)}' table size.")
    if isinstance(table, BlockAccessor):
        table = table.to_block()
    return BlockMetadata(
        num_rows=get_table_length(table),
        size_bytes=table_size,
        schema=None,
        input_files=None,
        exec_stats=None,
    )

def _reconstruct_manifest_entry_uri(
    manifest_entry: ManifestEntry,
    **kwargs,
) -> ManifestEntry:
    """
    Reconstruct the full URI for a manifest entry.

    Args:
        manifest_entry: The manifest entry to reconstruct the URI for
        **kwargs: Additional arguments to pass to the catalog properties

    Returns:
        Manifest entry with the reconstructed URI
    """
    # Reconstruct full URI with scheme for external readers (see GitHub issue #567)
    from deltacat.catalog import get_catalog_properties

    # Only pass kwargs that CatalogProperties actually accepts
    catalog_kwargs = _filter_kwargs_for_catalog_properties(kwargs)
    catalog_properties = get_catalog_properties(**catalog_kwargs)

    original_uri = manifest_entry.uri
    reconstructed_uri = catalog_properties.reconstruct_full_path(original_uri)
    if original_uri != reconstructed_uri:
        # Create a copy of the manifest entry with the reconstructed URI
        reconstructed_entry = ManifestEntry(
            uri=reconstructed_uri, url=manifest_entry.url, meta=manifest_entry.meta
        )
        return reconstructed_entry
    return manifest_entry


def _filter_kwargs_for_external_readers(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """
    Filter out DeltaCAT system kwargs that external file readers don't expect.

    Use this when passing kwargs to external libraries like PyArrow, Pandas, Polars, etc.
    This removes all DeltaCAT-specific parameters that would cause TypeErrors in external readers.

    Args:
        kwargs: The dictionary of arguments to filter

    Returns:
        Dictionary of arguments with DeltaCAT system kwargs removed
    """
    return {
        k: v
        for k, v in kwargs.items()
        if k
        not in [
            # DeltaCAT catalog/storage system kwargs
            "inner",
            "catalog",
            "ray_options_provider",
            "distributed_dataset_type",
            # DeltaCAT schema/reader kwargs
            "table_version_schema",
            "entry_params",
            # Daft-specific kwargs
            "io_config",
            "ray_init_options",
            # DeltaCAT processing kwargs
            "column_names",
            "include_columns",
            "file_reader_kwargs_provider",
            "file_path_column",
            "max_parallelism",
        ]
    }


def _filter_kwargs_for_catalog_properties(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """
    Filter kwargs to only include those that CatalogProperties accepts.

    Use this when calling get_catalog_properties() or CatalogProperties.__init__().
    Uses a whitelist approach - only passes known compatible parameters.

    CatalogProperties.__init__ accepts: root, filesystem, storage
    get_catalog_properties also accepts: catalog, inner

    Args:
        kwargs: The dictionary of arguments to filter

    Returns:
        Dictionary containing only CatalogProperties-compatible kwargs
    """
    return {
        k: v
        for k, v in kwargs.items()
        if k in ["root", "filesystem", "storage", "catalog", "inner"]
    }

def _filter_kwargs_for_reader_functions(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """
    Filter kwargs for internal DeltaCAT reader functions that need most params but not catalog-specific ones.

    Use this for internal DeltaCAT functions that need file reader kwargs, schema kwargs, etc.
    but should not receive catalog/storage system parameters.
    Preserves table_version_schema, entry_params, file reader kwargs, etc.

    Args:
        kwargs: The dictionary of arguments to filter

    Returns:
        Dictionary with catalog/storage system kwargs removed
    """
    return {
        k: v
        for k, v in kwargs.items()
        if k
        not in ["inner", "catalog", "ray_options_provider", "distributed_dataset_type"]
    }

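
# Editor's note: illustrative sketch, not part of this module. All three filtering
# helpers above follow the same dict-comprehension pattern against an allow- or
# deny-list of keys; a simplified deny-list version:
def _example_drop_keys(kwargs, deny):
    return {k: v for k, v in kwargs.items() if k not in deny}

assert _example_drop_keys(
    {"catalog": "cat", "columns": ["id"], "io_config": None},
    {"catalog", "io_config"},
) == {"columns": ["id"]}
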
def _extract_content_metadata(
    manifest_entry: ManifestEntry,
) -> Tuple[ContentType, ContentEncoding, str]:
    """
    Extract content type, encoding, and path from manifest entry.

    Args:
        manifest_entry: The manifest entry to extract the content metadata from

    Returns:
        Tuple of content type, encoding, and path
    """
    content_type = manifest_entry.meta.content_type
    assert content_type, f"Unknown content type for manifest entry: {manifest_entry}"
    content_type = ContentType(content_type)

    content_encoding = manifest_entry.meta.content_encoding
    assert (
        content_encoding
    ), f"Unknown content encoding for manifest entry: {manifest_entry}"
    content_encoding = ContentEncoding(content_encoding)

    path = manifest_entry.uri
    if path is None:
        path = manifest_entry.url

    return content_type, content_encoding, path


def _extract_partial_download_params(
    manifest_entry: ManifestEntry,
) -> Optional[PartialFileDownloadParams]:
    """
    Extract partial file download parameters from manifest entry.

    Args:
        manifest_entry: The manifest entry to extract the partial file download parameters from

    Returns:
        Partial file download parameters
    """
    if not manifest_entry.meta or not manifest_entry.meta.content_type_parameters:
        return None

    for type_params in manifest_entry.meta.content_type_parameters:
        if isinstance(type_params, PartialFileDownloadParams):
            return type_params
    return None


def _create_retry_wrapper():
    """
    Create a standardized Tenacity Retrying wrapper for file operations.

    Returns:
        Tenacity Retrying wrapper
    """
    return Retrying(
        wait=wait_random_exponential(multiplier=1, max=60),
        stop=stop_after_delay(DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY),
        retry=retry_if_exception_type(RetryableError),
    )


def _remove_file_path_column(
    include_columns: Optional[List[str]],
    file_path_column: Optional[str],
) -> Optional[List[str]]:
    """Remove the file path system column from the include_columns list.

    Args:
        include_columns: The list of columns to include in a selection
        file_path_column: Optional file path system column name to remove from the selection

    Returns:
        List of columns to include without the file path system column
    """
    if file_path_column and include_columns:
        return [col for col in include_columns if col != file_path_column]
    return include_columns


def _prepare_download_arguments(
    table_type: DatasetType,
    column_names: Optional[List[str]],
    include_columns: Optional[List[str]],
    file_reader_kwargs_provider: Optional[ReadKwargsProvider],
    file_path_column: Optional[str],
    **kwargs,
) -> Dict[str, Any]:
    """Prepare standardized arguments for download operations.

    Args:
        table_type: The type of table to download
        column_names: The list of column names in the table
        include_columns: The list of columns to include in the selection
        file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs
        file_path_column: The file path system column name
        **kwargs: Additional arguments to pass to the file reader

    Returns:
        Dictionary of arguments for the download operation
    """
    reader_kwargs = _filter_kwargs_for_external_readers(kwargs)
    processed_include_columns = _remove_file_path_column(
        include_columns, file_path_column
    )

    return {
        "table_type": table_type,
        "column_names": column_names,
        "include_columns": processed_include_columns,
        "file_reader_kwargs_provider": file_reader_kwargs_provider,
        "file_path_column": file_path_column,
        **reader_kwargs,
    }


def _handle_retryable_error(e: Exception, path: str, operation: str, error_class: type):
    """Handle retryable errors with standardized error message."""
    raise error_class(
        f"Retry {operation} for: {path} after receiving {type(e).__name__}: {e}"
    ) from e


def _handle_non_retryable_error(
    e: Exception, path: str, operation: str, error_class: type, extra_context: str = ""
):
    """Handle non-retryable errors with logging and standardized error message."""
    context = f" {extra_context}" if extra_context else ""
    logger.warning(
        f"{operation.title()} has failed for {path}{context}. Error: {e}",
        exc_info=True,
    )
    raise error_class(
        f"{operation.title()} has failed for {path}{context}: Error: {e}"
    ) from e

def from_manifest_table(
    manifest_table: Union[LocalDataset, DistributedDataset],
    dataset_type: DatasetType = DatasetType.DAFT,
    schema: Optional[pa.Schema] = None,
    **kwargs,
) -> Dataset:
    """
    Read a manifest table (containing file paths and metadata) and download the actual data.

    This utility function takes the output from a schemaless table read (which returns
    manifest entries instead of data) and downloads the actual file contents.

    Args:
        manifest_table: Dataset containing manifest entries with file paths and metadata
        dataset_type: The type of dataset to return (DAFT, RAY_DATASET, PYARROW, etc.)
        schema: Optional PyArrow schema to enforce consistent column names across formats
        **kwargs: Additional arguments forwarded to download functions

    Returns:
        Dataset containing the actual file contents
    """
    # Convert the manifest table to pandas for easier processing
    # TODO(pdames): Iterate over each input manifest table in its native format
    manifest_df = to_pandas(manifest_table)

    # Reconstruct ManifestEntry objects from the manifest data
    manifest_entries = []
    for _, row in manifest_df.iterrows():
        # Create ManifestMeta from the row data
        meta = ManifestMeta.of(
            content_length=row.get("meta_content_length"),
            record_count=row.get("meta_record_count"),
            content_type=row.get("meta_content_type"),
            content_encoding=row.get("meta_content_encoding"),
        )

        # Create ManifestEntry
        entry = ManifestEntry.of(
            url=row["path"],
            meta=meta,
            mandatory=row.get("mandatory", True),
            uuid=row.get("id"),
        )
        manifest_entries.append(entry)

    # Create a new Manifest from the entries
    reconstructed_manifest = Manifest.of(entries=manifest_entries)

    # Add schema to kwargs if provided
    if schema is not None:
        kwargs["table_version_schema"] = schema

    # Choose the appropriate download function based on dataset type
    if dataset_type in DatasetType.distributed():
        # Use distributed download function
        # Map DatasetType to DistributedDatasetType
        distributed_type_map = {
            DatasetType.DAFT: DistributedDatasetType.DAFT,
            DatasetType.RAY_DATASET: DistributedDatasetType.RAY_DATASET,
        }
        distributed_dataset_type = distributed_type_map.get(dataset_type)
        if distributed_dataset_type is None:
            raise ValueError(f"Unsupported distributed dataset type: {dataset_type}")

        return download_manifest_entries_distributed(
            manifest=reconstructed_manifest,
            distributed_dataset_type=distributed_dataset_type,
            **kwargs,
        )
    else:
        # Use local download function
        return download_manifest_entries(
            manifest=reconstructed_manifest,
            table_type=dataset_type,
            **kwargs,
        )

def download_manifest_entries(
    manifest: Manifest,
    table_type: DatasetType = DatasetType.PYARROW,
    max_parallelism: Optional[int] = 1,
    column_names: Optional[List[str]] = None,
    include_columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    file_path_column: Optional[str] = None,
    **kwargs,
) -> LocalDataset:
    """Download all entries in the manifest.

    Args:
        manifest: The manifest containing the entries to download
        table_type: Dataset type to load the entries into
        max_parallelism: Maximum parallelism to use
        column_names: The list of column names in the table
        include_columns: The list of columns to include in the selection
        file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
            (e.g., to pass in a custom schema for a Parquet file)
        file_path_column: Optional file path system column name
        **kwargs: Additional arguments to pass to the file reader

    Returns:
        Local dataset
    """
    if max_parallelism and max_parallelism <= 1:
        return _download_manifest_entries(
            manifest,
            table_type,
            column_names,
            include_columns,
            file_reader_kwargs_provider,
            file_path_column,
            **kwargs,
        )
    else:
        return _download_manifest_entries_parallel(
            manifest,
            table_type,
            max_parallelism,
            column_names,
            include_columns,
            file_reader_kwargs_provider,
            file_path_column,
            **kwargs,
        )


def _download_manifest_entries(
    manifest: Manifest,
    table_type: DatasetType = DatasetType.PYARROW,
    column_names: Optional[List[str]] = None,
    include_columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    file_path_column: Optional[str] = None,
    **kwargs,
) -> LocalDataset:
    """Download all entries in the manifest.

    Args:
        manifest: The manifest containing the entries to download
        table_type: Dataset type to load the entries into
        column_names: The list of column names in the table
        include_columns: The list of columns to include in the selection
        file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
            (e.g., to pass in a custom schema for a Parquet file)
        file_path_column: Optional file path system column name
        **kwargs: Additional arguments to pass to the file reader

    Returns:
        Local dataset
    """
    download_args = _prepare_download_arguments(
        table_type,
        column_names,
        include_columns,
        file_reader_kwargs_provider,
        file_path_column,
        **kwargs,
    )
    result = []
    for e in manifest.entries:
        manifest_entry = _reconstruct_manifest_entry_uri(e, **kwargs)
        result.append(
            download_manifest_entry(manifest_entry=manifest_entry, **download_args)
        )

    return result

@ray.remote
def download_manifest_entry_ray(
    manifest_entry: ManifestEntry,
    table_type: DatasetType = DatasetType.PYARROW,
    column_names: Optional[List[str]] = None,
    include_columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    content_type: Optional[ContentType] = None,
    content_encoding: Optional[ContentEncoding] = None,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
    file_path_column: Optional[str] = None,
    **kwargs,
) -> LocalTable:
    """
    Ray remote function for downloading manifest entries.

    Args:
        manifest_entry: The manifest entry to download
        table_type: Dataset type to load the entry into
        column_names: The list of column names in the table
        include_columns: The list of columns to include in the selection
        file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
            (e.g., to pass in a custom schema for a Parquet file)
        content_type: Optional content type of the file
        content_encoding: Optional content encoding of the file
        filesystem: Optional PyArrow filesystem to use to read the file
        file_path_column: Optional file path system column name
        **kwargs: Additional arguments to pass to the file reader

    Returns:
        Local table
    """
    # Make sure we normalize the table type to PyArrow to provide the correct
    # input type to from_arrow_refs
    effective_table_type = table_type
    if table_type == DatasetType.RAY_DATASET:
        effective_table_type = DatasetType.PYARROW

    # Call the regular download function
    result = download_manifest_entry(
        manifest_entry=manifest_entry,
        table_type=effective_table_type,
        column_names=column_names,
        include_columns=include_columns,
        file_reader_kwargs_provider=file_reader_kwargs_provider,
        content_type=content_type,
        content_encoding=content_encoding,
        filesystem=filesystem,
        file_path_column=file_path_column,
        **kwargs,
    )

    # Convert Polars DataFrame to Arrow Table for Ray dataset compatibility
    if isinstance(result, pl.DataFrame):
        result = result.to_arrow()

    # Cast string_view columns to string to avoid cloudpickle issues
    if isinstance(result, pa.Table):
        result = _cast_string_view_to_string(result)

    return result

def download_manifest_entries_distributed(
    manifest: Manifest,
    table_type: DatasetType = DatasetType.PYARROW,
    max_parallelism: Optional[int] = 1000,
    column_names: Optional[List[str]] = None,
    include_columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
    distributed_dataset_type: Optional[
        DistributedDatasetType
    ] = DistributedDatasetType.RAY_DATASET,
    file_path_column: Optional[str] = None,
    **kwargs,
) -> DistributedDataset:
    """Download all entries in the manifest using the given distributed dataset type.

    Args:
        manifest: The manifest containing the entries to download
        table_type: Dataset type to load the entries into
        max_parallelism: Maximum parallelism to use
        column_names: The list of column names in the table
        include_columns: The list of columns to include in the selection
        file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
            (e.g., to pass in a custom schema for a Parquet file)
        ray_options_provider: Optional provider of Ray options
        distributed_dataset_type: Optional distributed dataset type to use
        file_path_column: Optional file path system column name
        **kwargs: Additional arguments to pass to the file reader

    Returns:
        Distributed dataset
    """
    params = {
        "manifest": manifest,
        "table_type": table_type,
        "max_parallelism": max_parallelism,
        "column_names": column_names,
        "include_columns": include_columns,
        "file_reader_kwargs_provider": file_reader_kwargs_provider,
        "ray_options_provider": ray_options_provider,
        "file_path_column": file_path_column,
        **kwargs,
    }

    if (
        distributed_dataset_type
        and distributed_dataset_type.value == DistributedDatasetType.RAY_DATASET.value
    ):
        result = _download_manifest_entries_ray_data_distributed(**params)
        return result
    elif distributed_dataset_type is not None:
        params["distributed_dataset_type"] = distributed_dataset_type
        return _download_manifest_entries_all_dataset_distributed(**params)
    else:
        raise ValueError(
            f"Distributed dataset type {distributed_dataset_type} not supported."
        )

def _cast_string_view_to_string(table: pa.Table) -> pa.Table:
    """
    Cast any string_view columns to string type for Ray dataset compatibility.

    This addresses compatibility issues where Ray datasets may have trouble with
    string_view columns written by Polars to Feather.

    Args:
        table: PyArrow table that may contain string_view columns

    Returns:
        PyArrow table with string_view columns cast to string type
    """
    if not isinstance(table, pa.Table):
        return table

    schema = table.schema
    has_string_view = False

    # Check if any columns are string_view
    for field in schema:
        if pa.types.is_string_view(field.type):
            has_string_view = True
            break

    if not has_string_view:
        return table

    # Convert to pandas and back to normalize string types
    # This is a workaround since direct casting from string_view to string is not supported
    try:
        pandas_df = table.to_pandas()
        # Convert back to PyArrow table, which should use regular string type
        return pa.Table.from_pandas(pandas_df, preserve_index=False)
    except Exception:
        # If pandas conversion fails, return original table
        return table

def _download_manifest_entries_ray_data_distributed(
    manifest: Manifest,
    table_type: DatasetType = DatasetType.PYARROW,
    max_parallelism: Optional[int] = 1000,
    column_names: Optional[List[str]] = None,
    include_columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
    file_path_column: Optional[str] = None,
    **kwargs,
) -> DistributedDataset:
    """Download all entries in the manifest into a Ray dataset.

    Args:
        manifest: The manifest containing the entries to download
        table_type: Dataset type to load the entries into
        max_parallelism: Maximum parallelism to use
        column_names: The list of column names in the table
        include_columns: The list of columns to include in the selection
        file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
            (e.g., to pass in a custom schema for a Parquet file)
        ray_options_provider: Optional provider of Ray options
        file_path_column: Optional file path system column name
        **kwargs: Additional arguments to pass to the file reader

    Returns:
        Ray dataset
    """
    table_pending_ids = []
    manifest_entries = manifest.entries

    if manifest_entries:
        table_pending_ids = invoke_parallel(
            manifest_entries,
            download_manifest_entry_ray,
            table_type,
            column_names,
            include_columns,
            file_reader_kwargs_provider,
            max_parallelism=max_parallelism,
            options_provider=ray_options_provider,
            file_path_column=file_path_column,
            **kwargs,  # Pass through kwargs like include_paths
        )

    create_func = _get_table_type_function(
        table_type, TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS, "dataset create"
    )
    return create_func(table_pending_ids)

def _group_manifest_uris_by_content_type(
    manifest: Manifest, **kwargs
) -> Dict[Tuple[str, str], List[str]]:
    """
    Group manifest URIs by content type and content encoding.

    Args:
        manifest: The manifest containing the entries to group by content type
        **kwargs: Additional arguments to pass to the catalog properties

    Returns:
        Dictionary mapping (content_type, content_encoding) tuples to lists of URIs
    """
    from deltacat.catalog import get_catalog_properties

    catalog_properties = get_catalog_properties(**kwargs)

    uris_by_type = {}

    for entry in manifest.entries or []:
        content_type = entry.meta.content_type
        content_encoding = entry.meta.content_encoding
        key = (content_type, content_encoding)

        if key not in uris_by_type:
            uris_by_type[key] = []

        full_uri = catalog_properties.reconstruct_full_path(entry.uri)
        uris_by_type[key].append(full_uri)

    return uris_by_type

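
# Editor's note: illustrative sketch, not part of this module. It shows the
# (content_type, content_encoding) grouping performed by
# _group_manifest_uris_by_content_type above, using plain tuples in place of
# manifest entries.
from collections import defaultdict

_entries = [
    ("text/csv", "gzip", "s3://bucket/1.csv.gz"),
    ("application/parquet", "identity", "s3://bucket/2.parquet"),
    ("text/csv", "gzip", "s3://bucket/3.csv.gz"),
]
_uris_by_type = defaultdict(list)
for _content_type, _content_encoding, _uri in _entries:
    _uris_by_type[(_content_type, _content_encoding)].append(_uri)
assert len(_uris_by_type[("text/csv", "gzip")]) == 2
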
def _download_manifest_entries_all_dataset_distributed(
    manifest: Manifest,
    table_type: DatasetType = DatasetType.PYARROW,
    max_parallelism: Optional[int] = 1000,
    column_names: Optional[List[str]] = None,
    include_columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
    distributed_dataset_type: Optional[DatasetType] = DatasetType.DAFT,
    file_path_column: Optional[str] = None,
    **kwargs,
) -> DistributedDataset:
    """Download all entries in the manifest into a distributed dataset other than Ray Dataset.

    Args:
        manifest: The manifest containing the entries to download
        table_type: Dataset type to load the entries into
        max_parallelism: Maximum parallelism to use
        column_names: The list of column names in the table
        include_columns: The list of columns to include in the selection
        file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs,
            (e.g., to pass in a custom schema for a Parquet file)
        ray_options_provider: Optional provider of Ray options
        distributed_dataset_type: Optional distributed dataset type to use
        file_path_column: Optional file path system column name
        **kwargs: Additional arguments to pass to the file reader

    Returns:
        Distributed dataset
    """
    # Group manifest entries by content type instead of validating consistency
    # Filter out table_version_schema from kwargs passed to catalog properties
    filtered_kwargs = _filter_kwargs_for_catalog_properties(kwargs)
    uris_by_content_type = _group_manifest_uris_by_content_type(
        manifest, **filtered_kwargs
    )

    # If only one content type, use the original single-reader logic
    if len(uris_by_content_type) == 1:
        content_type, content_encoding = next(iter(uris_by_content_type.keys()))
        uris = next(iter(uris_by_content_type.values()))

        # Keep table_version_schema for the reader, but filter other system kwargs
        reader_kwargs = _filter_kwargs_for_reader_functions(kwargs)

        try:
            reader_func = DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[
                distributed_dataset_type.value
            ]
        except KeyError:
            raise ValueError(
                f"Unsupported distributed dataset type={distributed_dataset_type}. "
                f"Supported types: {list(DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC.keys())}"
            )

        return reader_func(
            uris=uris,
            content_type=content_type,
            content_encoding=content_encoding,
            column_names=column_names,
            include_columns=include_columns,
            read_func_kwargs_provider=file_reader_kwargs_provider,
            ray_options_provider=ray_options_provider,
            file_path_column=file_path_column,
            **reader_kwargs,
        )

    # Multiple content types - read each group and union them (only for Daft)
    if distributed_dataset_type != DistributedDatasetType.DAFT:
        raise ValueError(
            f"Mixed content types are only supported for Daft datasets. "
            f"Got {len(uris_by_content_type)} different content types with dataset type {distributed_dataset_type}"
        )

    # Keep table_version_schema for the reader, but filter other system kwargs
    reader_kwargs = _filter_kwargs_for_reader_functions(kwargs)

    try:
        reader_func = DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[
            distributed_dataset_type.value
        ]
    except KeyError:
        raise ValueError(
            f"Unsupported distributed dataset type={distributed_dataset_type}. "
            f"Supported types: {list(DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC.keys())}"
        )

    # Read each content type group into a separate DataFrame
    dataframes = []
    for (content_type, content_encoding), uris in uris_by_content_type.items():
        df = reader_func(
            uris=uris,
            content_type=content_type,
            content_encoding=content_encoding,
            column_names=column_names,
            include_columns=include_columns,
            read_func_kwargs_provider=file_reader_kwargs_provider,
            ray_options_provider=ray_options_provider,
            file_path_column=file_path_column,
            **reader_kwargs,
        )
        dataframes.append(df)

    # Union all DataFrames using Daft's union_all
    if len(dataframes) == 1:
        return dataframes[0]

    result = dataframes[0]
    for df in dataframes[1:]:
        result = result.union_all(df)

    return result

+def _download_manifest_entries_parallel(
+    manifest: Manifest,
+    table_type: DatasetType = DatasetType.PYARROW,
+    max_parallelism: Optional[int] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    file_path_column: Optional[str] = None,
+    **kwargs,
+) -> LocalDataset:
+    """Download all entries in the manifest into a local dataset using multiprocessing.
+
+    Args:
+        manifest: The manifest containing the entries to download
+        table_type: Dataset type to load the entries into
+        max_parallelism: Maximum parallel processes to use for entry downloads
+        column_names: The list of column names in the table
+        include_columns: The list of columns to include in the selection
+        file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs
+            (e.g., to pass in a custom schema for a Parquet file)
+        file_path_column: Optional file path system column name
+        **kwargs: Additional arguments to pass to the file reader
+
+    Returns:
+        Local dataset
+    """
+    download_args = _prepare_download_arguments(
+        table_type,
+        column_names,
+        include_columns,
+        file_reader_kwargs_provider,
+        file_path_column,
+        **kwargs,
+    )
+
+    entries_to_process = []
+    for e in manifest.entries:
+        manifest_entry = _reconstruct_manifest_entry_uri(e, **kwargs)
+        entries_to_process.append(manifest_entry)
+
+    tables = []
+    pool = multiprocessing.Pool(max_parallelism)
+
+    downloader = partial(download_manifest_entry, **download_args)
+    for table in pool.map(downloader, entries_to_process):
+        tables.append(table)
+    return tables
+
+
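The function above fans entry downloads out over a process pool: it binds the shared reader arguments once with functools.partial and then maps the bound worker over all manifest entries. A minimal sketch of the same pattern follows; fake_download and the entry URIs are made-up stand-ins, and the context-manager form of the pool is used here only to keep the example self-cleaning (the diff constructs the pool directly).

    # Minimal sketch of the partial + Pool.map fan-out pattern (not deltacat code).
    import multiprocessing
    from functools import partial

    def fake_download(entry, table_type="pyarrow"):
        # Stand-in for download_manifest_entry(manifest_entry, ...).
        return f"{entry} as {table_type}"

    if __name__ == "__main__":
        entries = ["s3://bucket/part-0.parquet", "s3://bucket/part-1.parquet"]
        downloader = partial(fake_download, table_type="pandas")
        with multiprocessing.Pool(processes=2) as pool:
            tables = pool.map(downloader, entries)
        print(tables)  # ['s3://bucket/part-0.parquet as pandas', ...]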
+def download_manifest_entry(
+    manifest_entry: ManifestEntry,
+    table_type: DatasetType = DatasetType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    content_type: Optional[ContentType] = None,
+    content_encoding: Optional[ContentEncoding] = None,
+    filesystem: Optional[pyarrow.fs.FileSystem] = None,
+    file_path_column: Optional[str] = None,
+    **kwargs,
+) -> LocalTable:
+    """Download a single entry in the manifest into a local table.
+
+    Args:
+        manifest_entry: The manifest entry to download
+        table_type: Dataset type to load the entry into
+        column_names: The list of column names in the table
+        include_columns: The list of columns to include in the selection
+        file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs
+            (e.g., to pass in a custom schema for a Parquet file)
+        content_type: Optional content type of the file
+        content_encoding: Optional content encoding of the file
+        filesystem: Optional PyArrow filesystem to use to read the file
+        file_path_column: Optional file path system column name
+        **kwargs: Additional arguments to pass to the file reader
+
+    Returns:
+        Local table
+    """
+    # Extract manifest metadata
+    (
+        extracted_content_type,
+        extracted_content_encoding,
+        path,
+    ) = _extract_content_metadata(manifest_entry)
+    content_type = content_type or extracted_content_type
+    content_encoding = content_encoding or extracted_content_encoding
+
+    # Extract partial download parameters
+    partial_file_download_params = _extract_partial_download_params(manifest_entry)
+
+    # Filter kwargs and process file path column
+    reader_kwargs = _filter_kwargs_for_external_readers(kwargs)
+    processed_include_columns = _remove_file_path_column(
+        include_columns, file_path_column
+    )
+
+    # Create retry wrapper and read file
+    retrying = _create_retry_wrapper()
+    table = retrying(
+        read_file,
+        path,
+        content_type,
+        content_encoding,
+        table_type,
+        column_names,
+        processed_include_columns,
+        file_reader_kwargs_provider,
+        partial_file_download_params,
+        filesystem,
+        **reader_kwargs,
+    )
+
+    # Add file path column if requested
+    if file_path_column:
+        if isinstance(table, papq.ParquetFile):
+            logger.warning(
+                f"Skipping file_path_column '{file_path_column}' for lazily materialized ParquetFile. "
+                f"File path information can be retrieved from the ParquetFile object's metadata. "
+                f"Use read_as=DatasetType.PYARROW to materialize with file path column."
+            )
+        else:
+            table = append_column_to_table(table, file_path_column, manifest_entry.uri)
+
+    return table
+
+
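When file_path_column is set, the function above records each entry's URI as an extra column via append_column_to_table, whose definition is not part of this hunk. As a hypothetical PyArrow-only illustration of that idea, the helper and values below are made up for the example:

    # Hypothetical sketch: attach a constant "source file" column to a PyArrow table.
    import pyarrow as pa

    def append_path_column(table: pa.Table, column_name: str, uri: str) -> pa.Table:
        # Repeat the URI once per row and append it as a new string column.
        path_column = pa.array([uri] * table.num_rows, type=pa.string())
        return table.append_column(column_name, path_column)

    table = pa.table({"id": [1, 2, 3]})
    table = append_path_column(table, "_source_file", "s3://bucket/part-0.parquet")
    print(table.column_names)  # ['id', '_source_file']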
+@categorize_errors
+def read_file(
+    path: str,
+    content_type: ContentType,
+    content_encoding: ContentEncoding = ContentEncoding.IDENTITY,
+    table_type: DatasetType = DatasetType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    filesystem: Optional[pyarrow.fs.FileSystem] = None,
+    **kwargs,
+) -> LocalTable:
+    """Read a file into a local table.
+
+    Args:
+        path: The path to the file to read
+        content_type: The content type of the file
+        content_encoding: The content encoding of the file
+        table_type: Dataset type to load the file into
+        column_names: The list of column names in the table
+        include_columns: The list of columns to include in the selection
+        file_reader_kwargs_provider: Optional per-content-type provider of file reader kwargs
+            (e.g., to pass in a custom schema for a Parquet file)
+        partial_file_download_params: Optional partial file download parameters
+        filesystem: Optional PyArrow filesystem to use to read the file
+        **kwargs: Additional arguments to pass to the file reader
+
+    Returns:
+        Local table
+    """
+    reader = TABLE_TYPE_TO_READER_FUNC[table_type.value]
+    try:
+        table = reader(
+            path,
+            content_type.value,
+            content_encoding.value,
+            filesystem,
+            column_names,
+            include_columns,
+            file_reader_kwargs_provider,
+            partial_file_download_params,
+            **kwargs,
+        )
+        return table
+    except RETRYABLE_TRANSIENT_ERRORS as e:
+        _handle_retryable_error(e, path, "download", RetryableDownloadTableError)
+    except BaseException as e:
+        _handle_non_retryable_error(
+            e,
+            path,
+            "read",
+            NonRetryableDownloadTableError,
+            f"and content_type={content_type} and encoding={content_encoding}",
+        )
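read_file distinguishes retryable transient errors from non-retryable ones, and download_manifest_entry invokes it through a retry wrapper built by _create_retry_wrapper(), which is not shown in this hunk. A plausible sketch of that call pattern using tenacity follows; the specific retry policy, error types, and the flaky_read stub are assumptions for illustration, not necessarily what deltacat 2.0.0 configures.

    # Assumed tenacity-style retry wrapper mirroring retrying(read_file, path, ...).
    from tenacity import (
        Retrying,
        retry_if_exception_type,
        stop_after_attempt,
        wait_random_exponential,
    )

    RETRYABLE_ERRORS = (ConnectionError, TimeoutError)  # illustrative transient errors

    def flaky_read(path: str) -> str:
        # Stand-in for read_file(path, ...); raise one of RETRYABLE_ERRORS to see retries.
        return f"contents of {path}"

    retrying = Retrying(
        retry=retry_if_exception_type(RETRYABLE_ERRORS),
        stop=stop_after_attempt(3),
        wait=wait_random_exponential(multiplier=1, max=10),
        reraise=True,
    )
    print(retrying(flaky_read, "s3://bucket/part-0.parquet"))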