deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,643 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import itertools
|
5
|
+
|
6
|
+
from enum import Enum
|
7
|
+
from typing import Optional, List, Dict, Any, TYPE_CHECKING
|
8
|
+
from uuid import uuid4
|
9
|
+
|
10
|
+
if TYPE_CHECKING:
|
11
|
+
from deltacat.storage.model.schema import FieldLocator
|
12
|
+
|
13
|
+
from deltacat import logs
|
14
|
+
|
15
|
+
from deltacat.types.media import (
|
16
|
+
ContentType,
|
17
|
+
ContentEncoding,
|
18
|
+
EXT_TO_CONTENT_TYPE,
|
19
|
+
EXT_TO_CONTENT_ENCODING,
|
20
|
+
)
|
21
|
+
|
22
|
+
import json
|
23
|
+
import pyarrow as pa
|
24
|
+
import posixpath
|
25
|
+
|
26
|
+
from deltacat.utils.filesystem import get_file_info
|
27
|
+
|
28
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
29
|
+
|
30
|
+
|
31
|
+
class EntryType(str, Enum):
    """
    Content categories that a manifest entry file may belong to.

    DATA: Fully qualified records, compliant with the parent table's schema,
    to insert and/or update. Data files of upsert Deltas locate the fields to
    update via this entry's parameters; when no entry parameters are given,
    the parent table's primary keys are used instead. Only records from
    entries in Deltas with lower stream positions than this entry are
    targeted for update.

    POSITIONAL_DELETE: Pointers to records in other entries that should be
    deleted. Deleted records are filtered from query results at runtime.

    EQUALITY_DELETE: A subset of field values from the table records to find
    and delete. The full record of any matching data entry in a Delta with a
    lower stream position than this entry's Delta is deleted. The fields used
    for record discovery come from this entry's parameters; when no entry
    parameters are given, they are linked to the parent table's merge keys.
    Any additional fields in the entry that are not used for delete record
    discovery are ignored. Deleted records are filtered from query results at
    runtime.
    """

    DATA = "data"
    POSITIONAL_DELETE = "positional_delete"
    EQUALITY_DELETE = "equality_delete"

    @classmethod
    def get_default(cls):
        """Return the default entry type (DATA)."""
        return cls.DATA

    @classmethod
    def list(cls):
        """Return the string value of every entry type, in definition order."""
        return [member.value for member in cls]
|
67
|
+
|
68
|
+
|
69
|
+
class EntryParams(dict):
    """
    Dictionary of parameters controlling how a manifest entry is interpreted.

    For EQUALITY_DELETE manifest entry types, the parameters include the
    equality field identifiers.
    """

    @staticmethod
    def of(
        equality_field_locators: Optional[List["FieldLocator"]] = None,
    ) -> EntryParams:
        """
        Create entry parameters.

        The "equality_field_locators" key is only stored when a non-None
        value is given (an empty list is stored as-is).
        """
        entry_params = EntryParams()
        if equality_field_locators is None:
            return entry_params
        entry_params["equality_field_locators"] = equality_field_locators
        return entry_params

    @property
    def equality_field_locators(self) -> Optional[List["FieldLocator"]]:
        """The stored equality field locators, or None when the key is absent."""
        return self.get("equality_field_locators", None)
|
89
|
+
|
90
|
+
|
91
|
+
class Manifest(dict):
    """
    A DeltaCAT manifest contains metadata common to multiple manifest formats
    like Amazon Redshift and Apache Iceberg to simplify dataset import/export.

    The manifest is stored as a plain dictionary with an "id" key and the
    optional keys "meta", "entries" and "author".
    """

    @staticmethod
    def _build_manifest(
        meta: Optional[ManifestMeta],
        entries: Optional[ManifestEntryList],
        author: Optional[ManifestAuthor] = None,
        uuid: Optional[str] = None,
    ) -> Manifest:
        """
        Assemble a manifest dictionary from already-computed parts.

        Args:
            meta: Manifest-level metadata; the "meta" key is omitted if None.
            entries: Manifest entries; the "entries" key is omitted if None.
            author: Manifest author; the "author" key is omitted if None.
            uuid: Manifest ID. A random UUID4 string is generated when this
                is None or empty.

        Returns:
            The new manifest.
        """
        if not uuid:
            uuid = str(uuid4())
        manifest = Manifest()
        manifest["id"] = uuid
        if meta is not None:
            manifest["meta"] = meta
        if entries is not None:
            manifest["entries"] = entries
        if author is not None:
            manifest["author"] = author
        return manifest

    @staticmethod
    def _check_uniform(description: str, expected: Any, actual: Any) -> None:
        """
        Raise ValueError when `expected` is truthy and `actual` differs from it.

        A falsy `expected` value disables the check.
        """
        if expected and (actual != expected):
            msg = (
                f"Expected all manifest entries to have {description} "
                f"'{expected}' but found '{actual}'"
            )
            raise ValueError(msg)

    @staticmethod
    def of(
        entries: ManifestEntryList,
        author: Optional[ManifestAuthor] = None,
        uuid: Optional[str] = None,
        entry_type: Optional[EntryType] = None,
        entry_params: Optional[EntryParams] = None,
    ) -> Manifest:
        """
        Create a manifest whose metadata is aggregated from its entries.

        Aggregation rules:
          * record count, content length and source content length are
            summed over all entries (missing values count as 0).
          * content type and content encoding are taken from the first entry
            and reset to None as soon as any entry differs.
          * credentials and content type parameters are taken from the first
            entry; when truthy, every entry must match them.
          * schema ID is the largest non-None entry schema ID, or None when
            no entry has one.
          * sort scheme ID is the single distinct non-None entry sort scheme
            ID, or None when there are zero or several.

        Args:
            entries: Manifest entries to include. May be empty.
            author: Optional manifest author.
            uuid: Manifest ID. A random UUID4 string is generated when this
                is None or empty.
            entry_type: When truthy, every entry must have this entry type.
            entry_params: When truthy, every entry must have these params.

        Returns:
            The new manifest.

        Raises:
            ValueError: If an entry's type, params, credentials or content
                type parameters differ from the expected value.
        """
        if not uuid:
            uuid = str(uuid4())
        total_record_count = 0
        total_content_length = 0
        total_source_content_length = 0
        content_type = None
        content_encoding = None
        credentials = None
        content_type_params = None
        schema_id = None
        sort_scheme_id = None
        if entries:
            first_meta = entries[0].meta
            content_type = first_meta.content_type
            content_encoding = first_meta.content_encoding
            credentials = first_meta.credentials
            content_type_params = first_meta.content_type_parameters

            # Keep the latest schema ID
            # Schema IDs are >= 0, and schema evolution always increments the last schema ID
            max_schema_id = max(
                (
                    entry.meta.schema_id if entry.meta.schema_id is not None else -1
                    for entry in entries
                ),
                default=-1,
            )
            schema_id = max_schema_id if max_schema_id >= 0 else None

            # Handle sort_scheme_id: set to None if entries have multiple different sort_scheme_ids
            entry_sort_scheme_ids = {
                entry.meta.sort_scheme_id
                for entry in entries
                if entry.meta.sort_scheme_id is not None
            }
            sort_scheme_id = (
                next(iter(entry_sort_scheme_ids))
                if len(entry_sort_scheme_ids) == 1
                else None
            )

            for entry in entries:
                meta = entry.meta
                # Mixed content types/encodings are allowed: the manifest-level
                # value simply becomes None. Because of this reset, a later
                # "content type mismatch" error can never trigger, so no such
                # check is performed.
                if meta.content_type != content_type:
                    content_type = None
                if meta.content_encoding != content_encoding:
                    content_encoding = None
                # NOTE(review): this compares the raw "content_encoding" key
                # rather than the property used above, so it can only raise
                # if the two differ -- confirm and remove if they never do.
                Manifest._check_uniform(
                    "content encoding",
                    content_encoding,
                    meta.get("content_encoding", None),
                )
                Manifest._check_uniform("type", entry_type, meta.entry_type)
                Manifest._check_uniform("params", entry_params, meta.entry_params)
                Manifest._check_uniform("credentials", credentials, meta.credentials)
                Manifest._check_uniform(
                    "content type params",
                    content_type_params,
                    meta.content_type_parameters,
                )

                total_record_count += meta.record_count or 0
                total_content_length += meta.content_length or 0
                total_source_content_length += meta.source_content_length or 0

        meta = ManifestMeta.of(
            record_count=total_record_count,
            content_length=total_content_length,
            content_type=content_type,
            content_encoding=content_encoding,
            source_content_length=total_source_content_length,
            credentials=credentials,
            content_type_parameters=content_type_params,
            entry_type=entry_type,
            entry_params=entry_params,
            schema_id=schema_id,
            sort_scheme_id=sort_scheme_id,
        )
        manifest = Manifest._build_manifest(meta, entries, author, uuid)
        return manifest

    @staticmethod
    def from_json(json_string: str) -> Manifest:
        """
        Parse a manifest from its JSON string representation.

        Only the "entries", "author" and "id" keys of the parsed document are
        read; manifest metadata is recomputed from the entries by `of`.

        Args:
            json_string: JSON document describing the manifest.

        Returns:
            The parsed manifest.
        """
        parsed_dict = json.loads(json_string)
        return Manifest.of(
            entries=ManifestEntryList.of(
                [
                    ManifestEntry.from_dict(entry)
                    for entry in parsed_dict.get("entries", [])
                ]
            ),
            author=ManifestAuthor.from_dict(parsed_dict.get("author")),
            uuid=parsed_dict.get("id"),
        )

    @staticmethod
    def merge_manifests(
        manifests: List[Manifest], author: Optional[ManifestAuthor] = None
    ) -> Manifest:
        """
        Combine the entries of several manifests into one new manifest.

        Entries are concatenated in the order of `manifests`. A manifest
        without entries contributes nothing. The merged manifest receives a
        newly generated ID.

        Args:
            manifests: Manifests to merge.
            author: Optional author of the merged manifest.

        Returns:
            The merged manifest.
        """
        all_entries = ManifestEntryList(
            # `entries` is None when the "entries" key is absent; treat that
            # as an empty manifest instead of failing on iteration.
            itertools.chain.from_iterable(m.entries or () for m in manifests)
        )
        return Manifest.of(all_entries, author)

    @property
    def meta(self) -> Optional[ManifestMeta]:
        """Manifest metadata, or None if absent (a plain dict is wrapped and cached)."""
        val: Optional[Dict[str, Any]] = self.get("meta")
        if val is not None and not isinstance(val, ManifestMeta):
            self["meta"] = val = ManifestMeta(val)
        return val

    @property
    def entries(self) -> Optional[ManifestEntryList]:
        """Manifest entries, or None if absent (a plain list is wrapped and cached)."""
        val: Optional[List[ManifestEntry]] = self.get("entries")
        if val is not None and not isinstance(val, ManifestEntryList):
            self["entries"] = val = ManifestEntryList.of(val)
        return val

    @property
    def id(self) -> str:
        """Manifest ID stored under the "id" key."""
        return self["id"]

    @property
    def author(self) -> Optional[ManifestAuthor]:
        """Manifest author, or None if absent (a plain dict is wrapped and cached)."""
        val: Optional[Dict[str, Any]] = self.get("author")
        if val is not None and not isinstance(val, ManifestAuthor):
            self["author"] = val = ManifestAuthor(val)
        return val
|
283
|
+
|
284
|
+
|
285
|
+
class ManifestMeta(dict):
    """
    Dictionary-backed metadata record.

    Fields are stored as plain dict keys. Fields whose value is None are
    never written by of(), so the read-only properties return None for them.
    """

    @staticmethod
    def of(
        record_count: Optional[int],
        content_length: Optional[int],
        content_type: Optional[str],
        content_encoding: Optional[str],
        source_content_length: Optional[int] = None,
        credentials: Optional[Dict[str, str]] = None,
        content_type_parameters: Optional[List[Dict[str, str]]] = None,
        entry_type: Optional[EntryType] = None,
        entry_params: Optional[EntryParams] = None,
        schema_id: Optional[int] = None,
        sort_scheme_id: Optional[str] = None,
    ) -> ManifestMeta:
        """
        Creates a ManifestMeta holding every argument that is not None.

        An EntryType member passed as entry_type is stored as its `.value`;
        any other non-None entry_type is stored unchanged.

        Returns:
            A new ManifestMeta containing only the non-None fields.
        """
        # Listed in key insertion order, which intentionally differs from
        # the parameter order of this method.
        candidate_fields = (
            ("record_count", record_count),
            ("content_length", content_length),
            ("source_content_length", source_content_length),
            ("content_type", content_type),
            ("content_type_parameters", content_type_parameters),
            ("content_encoding", content_encoding),
            ("credentials", credentials),
            ("entry_type", entry_type),
            ("entry_params", entry_params),
            ("schema_id", schema_id),
            ("sort_scheme_id", sort_scheme_id),
        )
        manifest_meta = ManifestMeta()
        for field_name, field_value in candidate_fields:
            if field_value is None:
                continue
            if field_name == "entry_type" and isinstance(field_value, EntryType):
                field_value = field_value.value
            manifest_meta[field_name] = field_value
        return manifest_meta

    @staticmethod
    def from_dict(obj: dict) -> Optional[ManifestMeta]:
        """
        Rebuilds a ManifestMeta from a plain dict via of().

        Keys other than the known metadata fields are ignored.

        Returns:
            None when obj is None, otherwise a new ManifestMeta.
        """
        if obj is None:
            return None

        field_names = (
            "record_count",
            "content_length",
            "content_type",
            "content_encoding",
            "source_content_length",
            "credentials",
            "content_type_parameters",
            "entry_type",
            "entry_params",
            "schema_id",
            "sort_scheme_id",
        )
        return ManifestMeta.of(**{name: obj.get(name) for name in field_names})

    @property
    def record_count(self) -> Optional[int]:
        """Value stored under "record_count", or None."""
        return self.get("record_count")

    @property
    def content_length(self) -> Optional[int]:
        """Value stored under "content_length", or None."""
        return self.get("content_length")

    @property
    def content_type(self) -> Optional[str]:
        """Value stored under "content_type", or None."""
        return self.get("content_type")

    @property
    def content_encoding(self) -> Optional[str]:
        """Value stored under "content_encoding", or None."""
        return self.get("content_encoding")

    @property
    def source_content_length(self) -> Optional[int]:
        """Value stored under "source_content_length", or None."""
        return self.get("source_content_length")

    @property
    def content_type_parameters(self) -> Optional[List[Dict[str, str]]]:
        """Value stored under "content_type_parameters", or None."""
        return self.get("content_type_parameters")

    @content_type_parameters.setter
    def content_type_parameters(self, params: List[Dict[str, str]]) -> None:
        """Stores params under "content_type_parameters"."""
        self["content_type_parameters"] = params

    @property
    def credentials(self) -> Optional[Dict[str, str]]:
        """Value stored under "credentials", or None."""
        return self.get("credentials")

    @property
    def entry_type(self) -> Optional[EntryType]:
        """
        Stored "entry_type" converted through the EntryType constructor,
        or None when no entry type is stored.
        """
        stored_type = self.get("entry_type")
        if stored_type is None:
            return stored_type
        return EntryType(stored_type)

    @property
    def entry_params(self) -> Optional[EntryParams]:
        """
        Value stored under "entry_params", or None.

        A raw dict value is wrapped in EntryParams on first access and the
        wrapped object is stored back under the same key.
        """
        raw_params: Dict[str, Any] = self.get("entry_params")
        if raw_params is None or isinstance(raw_params, EntryParams):
            return raw_params
        wrapped_params = EntryParams(raw_params)
        self["entry_params"] = wrapped_params
        return wrapped_params

    @property
    def schema_id(self) -> Optional[int]:
        """Value stored under "schema_id", or None."""
        return self.get("schema_id")

    @property
    def sort_scheme_id(self) -> Optional[str]:
        """Value stored under "sort_scheme_id", or None."""
        return self.get("sort_scheme_id")
|
399
|
+
|
400
|
+
|
401
|
+
class ManifestEntry(dict):
    """
    Dictionary-backed description of a single manifest entry.

    The entry location is stored under both the "url" and "uri" keys.
    """

    @staticmethod
    def of(
        url: Optional[str],
        meta: Optional[ManifestMeta],
        mandatory: bool = True,
        uri: Optional[str] = None,
        uuid: Optional[str] = None,
    ) -> ManifestEntry:
        """
        Creates a manifest entry for the given location.

        Args:
            url: Location of the entry contents.
            meta: Optional metadata; stored under "meta" when not None.
            mandatory: Stored under "mandatory" when not None.
            uri: Alternative way to pass the location. If both url and uri
                are given they must be equal.
            uuid: Optional entry ID; stored under "id" when not None.

        Returns:
            A new ManifestEntry.

        Raises:
            ValueError: If neither url nor uri is given, or if both are
                given with different values.
        """
        if not (uri or url):
            raise ValueError("No URI or URL specified for manifest entry contents.")
        if uri and url and uri != url:
            raise ValueError(f"Manifest entry URI ({uri}) != URL ({url})")

        # At least one of the two is truthy here; url wins when both are set.
        location = url if url else uri
        manifest_entry = ManifestEntry()
        manifest_entry["url"] = location
        manifest_entry["uri"] = location
        if meta is not None:
            manifest_entry["meta"] = meta
        if mandatory is not None:
            manifest_entry["mandatory"] = mandatory
        if uuid is not None:
            manifest_entry["id"] = uuid
        return manifest_entry

    @staticmethod
    def from_s3_obj_url(
        url: str,
        record_count: int,
        source_content_length: Optional[int] = None,
        credentials: Optional[Dict[str, str]] = None,
        content_type_parameters: Optional[List[Dict[str, str]]] = None,
        entry_type: Optional[EntryType] = None,
        entry_params: Optional[EntryParams] = None,
        **s3_client_kwargs,
    ) -> ManifestEntry:
        """
        Creates a manifest entry for the S3 object at the given URL.

        The object is fetched with s3u.get_object_at_url() and its
        "ContentLength", "ContentType" and "ContentEncoding" values are
        copied into the entry metadata.

        Args:
            url: URL of the S3 object.
            record_count: Number of records in the object.
            source_content_length: Optional source content length.
            credentials: Optional credentials to store in the metadata.
            content_type_parameters: Optional content type parameters.
            entry_type: Optional entry type.
            entry_params: Optional entry type parameters.
            **s3_client_kwargs: Forwarded to s3u.get_object_at_url().

        Returns:
            A new ManifestEntry for url.
        """
        # Imported lazily, only when this constructor is actually used.
        from deltacat.aws import s3u as s3_utils

        s3_obj = s3_utils.get_object_at_url(url, **s3_client_kwargs)
        logger.debug(f"Building manifest entry from {url}: {s3_obj}")
        entry_meta = ManifestMeta.of(
            record_count=record_count,
            content_length=s3_obj["ContentLength"],
            content_type=s3_obj["ContentType"],
            content_encoding=s3_obj["ContentEncoding"],
            source_content_length=source_content_length,
            credentials=credentials,
            content_type_parameters=content_type_parameters,
            entry_type=entry_type,
            entry_params=entry_params,
        )
        return ManifestEntry.of(url, entry_meta)

    @staticmethod
    def from_dict(obj: dict) -> ManifestEntry:
        """
        Rebuilds a ManifestEntry from a plain dict via of().

        "mandatory" defaults to True when the key is missing, and "meta" is
        rebuilt through ManifestMeta.from_dict().
        """
        entry_meta = ManifestMeta.from_dict(obj.get("meta"))
        return ManifestEntry.of(
            url=obj.get("url"),
            uri=obj.get("uri"),
            meta=entry_meta,
            mandatory=obj.get("mandatory", True),
            uuid=obj.get("id"),
        )

    @staticmethod
    def from_path(
        path: str,
        filesystem: pa.fs.FileSystem,
        record_count: int,
        source_content_length: Optional[int] = None,
        content_type: Optional[str] = None,
        content_encoding: Optional[str] = None,
        credentials: Optional[Dict[str, str]] = None,
        content_type_parameters: Optional[List[Dict[str, str]]] = None,
        entry_type: Optional[EntryType] = None,
        entry_params: Optional[EntryParams] = None,
        schema_id: Optional[int] = None,
        sort_scheme_id: Optional[str] = None,
    ) -> ManifestEntry:
        """
        Creates a manifest entry for a file reachable through a pyarrow
        filesystem.

        The content length is taken from the file info. Content type and
        content encoding are derived from the file extension(s) unless
        explicit overrides are given: the last extension is checked against
        the known encodings, and if it is one, the extension before it is
        checked against the known content types. Otherwise the last
        extension itself is checked against the known content types.
        Unknown or missing extensions yield a binary content type and an
        identity encoding.

        Args:
            path: Path to the file.
            filesystem: PyArrow filesystem used to look up the file.
            record_count: Number of records in the file.
            source_content_length: Optional original in-memory content
                length before writing to disk.
            content_type: Optional content type override.
            content_encoding: Optional content encoding override.
            credentials: Optional credentials required to read this entry.
            content_type_parameters: Optional content type parameters.
            entry_type: Optional entry type; omitted from the metadata
                when None.
            entry_params: Optional entry type parameters.
            schema_id: Schema ID used to write this manifest entry.
            sort_scheme_id: Sort scheme ID used to write this manifest entry.

        Returns:
            A new ManifestEntry for path.

        Raises:
            FileNotFoundError: If path does not refer to a regular file.
        """
        file_info = get_file_info(path, filesystem)
        if file_info.type != pa.fs.FileType.File:
            raise FileNotFoundError(f"Path does not point to a file: {path}")

        # Peel extensions off from right to left; the last one may name an
        # encoding.
        stem, last_ext = posixpath.splitext(path)
        if last_ext:
            derived_content_encoding = EXT_TO_CONTENT_ENCODING.get(
                last_ext,
                ContentEncoding.IDENTITY,
            )
        else:
            derived_content_encoding = ContentEncoding.IDENTITY

        # If the last extension was an encoding, the content type (if any)
        # is named by the extension before it; otherwise by the last
        # extension itself.
        if derived_content_encoding != ContentEncoding.IDENTITY:
            type_ext = posixpath.splitext(stem)[1]
        else:
            type_ext = last_ext
        if type_ext:
            derived_content_type = EXT_TO_CONTENT_TYPE.get(
                type_ext,
                ContentType.BINARY,
            )
        else:
            derived_content_type = ContentType.BINARY

        if (
            derived_content_encoding != ContentEncoding.IDENTITY
            and derived_content_type == ContentType.BINARY
        ):
            logger.debug(
                f"Found encoding {derived_content_encoding.value} but no "
                f"content type for {path}, assuming binary"
            )

        # Explicit overrides take precedence over the derived values.
        if content_type is None:
            content_type = derived_content_type.value
        if content_encoding is None:
            content_encoding = derived_content_encoding.value

        entry_meta = ManifestMeta.of(
            record_count=record_count,
            content_length=file_info.size,
            content_type=content_type,
            content_encoding=content_encoding,
            source_content_length=source_content_length,
            credentials=credentials,
            content_type_parameters=content_type_parameters,
            entry_type=entry_type,
            entry_params=entry_params,
            schema_id=schema_id,
            sort_scheme_id=sort_scheme_id,
        )
        return ManifestEntry.of(path, entry_meta)

    @property
    def uri(self) -> Optional[str]:
        """Value stored under "uri", or None."""
        return self.get("uri")

    @property
    def url(self) -> Optional[str]:
        """Value stored under "url", or None."""
        return self.get("url")

    @property
    def meta(self) -> Optional[ManifestMeta]:
        """
        Metadata stored under "meta", or None.

        A raw dict value is wrapped in ManifestMeta on first access and the
        wrapped object is stored back under the same key.
        """
        raw_meta: Dict[str, Any] = self.get("meta")
        if raw_meta is None or isinstance(raw_meta, ManifestMeta):
            return raw_meta
        wrapped_meta = ManifestMeta(raw_meta)
        self["meta"] = wrapped_meta
        return wrapped_meta

    @property
    def mandatory(self) -> bool:
        """
        Value stored under "mandatory".

        Raises:
            KeyError: If no "mandatory" key is present.
        """
        is_mandatory: bool = self["mandatory"]
        return is_mandatory

    @property
    def id(self) -> Optional[str]:
        """Value stored under "id", or None."""
        return self.get("id")
|
598
|
+
|
599
|
+
|
600
|
+
class ManifestAuthor(dict):
    """Dictionary-backed record of a manifest author's name and version."""

    @staticmethod
    def of(name: Optional[str], version: Optional[str]) -> ManifestAuthor:
        """
        Creates a ManifestAuthor holding whichever of name and version is
        not None.
        """
        manifest_author = ManifestAuthor()
        for field_name, field_value in (("name", name), ("version", version)):
            if field_value is not None:
                manifest_author[field_name] = field_value
        return manifest_author

    @staticmethod
    def from_dict(obj: dict) -> Optional[ManifestAuthor]:
        """
        Rebuilds a ManifestAuthor from a plain dict via of().

        Returns:
            None when obj is None, otherwise a new ManifestAuthor.
        """
        if obj is None:
            return None
        author_name = obj.get("name")
        author_version = obj.get("version")
        return ManifestAuthor.of(author_name, author_version)

    @property
    def name(self) -> Optional[str]:
        """Value stored under "name", or None."""
        return self.get("name")

    @property
    def version(self) -> Optional[str]:
        """Value stored under "version", or None."""
        return self.get("version")
|
623
|
+
|
624
|
+
|
625
|
+
class ManifestEntryList(List[ManifestEntry]):
    """
    List of manifest entries that wraps raw dict items as ManifestEntry.

    Items may be stored as plain dicts. They are wrapped in ManifestEntry
    either eagerly by of() or lazily on first access through indexing or
    iteration.
    """

    @staticmethod
    def of(entries: List[ManifestEntry]) -> ManifestEntryList:
        """
        Creates a ManifestEntryList from an iterable of entries.

        Items that are neither None nor already a ManifestEntry are wrapped
        in ManifestEntry. None items are kept as-is.
        """
        manifest_entries = ManifestEntryList()
        for entry in entries:
            if entry is not None and not isinstance(entry, ManifestEntry):
                entry = ManifestEntry(entry)
            manifest_entries.append(entry)
        return manifest_entries

    def __getitem__(self, item):
        """
        Returns the entry (or entries) at the given index or slice.

        For an integer index, a raw dict item is wrapped in ManifestEntry
        and the wrapped object is stored back at that index. For a slice, a
        new ManifestEntryList holding the wrapped items is returned.
        """
        val = super().__getitem__(item)
        if isinstance(item, slice):
            # A slice yields a plain list of items, not a single entry.
            # Treating it as one raw entry would call ManifestEntry(list)
            # and slice-assign the result back, which raises or corrupts
            # this list. Convert the items individually instead.
            return ManifestEntryList.of(val)
        if val is not None and not isinstance(val, ManifestEntry):
            self[item] = val = ManifestEntry(val)
        return val

    def __iter__(self):
        """Yields every item through __getitem__ so raw dicts get wrapped."""
        # Index-based on purpose: list's native iterator would bypass the
        # conversion performed by __getitem__.
        for i in range(len(self)):
            yield self[i]  # This triggers __getitem__ conversion
|