deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/utils/export.py
ADDED
@@ -0,0 +1,61 @@
import logging
import json
import pyarrow as pa
import pyarrow.parquet
import pyarrow.feather
from typing import Callable, Dict

from deltacat.experimental.storage.rivulet.reader.query_expression import (
    QueryExpression,
)
from deltacat import logs

logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


def export_parquet(dataset, file_uri: str, query: QueryExpression = QueryExpression()):
    records = dataset.scan(query).to_arrow()
    table = pa.Table.from_batches(records)
    pyarrow.parquet.write_table(table, file_uri)


def export_feather(dataset, file_uri: str, query: QueryExpression = QueryExpression()):
    records = dataset.scan(query).to_arrow()
    table = pa.Table.from_batches(records)
    pyarrow.feather.write_feather(table, file_uri)


def export_json(dataset, file_uri: str, query: QueryExpression = QueryExpression()):
    with open(file_uri, "w") as f:
        for batch in dataset.scan(query).to_pydict():
            json.dump(batch, f, indent=2)
            f.write("\n")


def export_dataset(dataset, file_uri: str, format: str = "parquet", query=None):
    """
    Export the dataset to a file.

    TODO: Make this pluggable for custom formats.

    Args:
        dataset: The dataset to export.
        file_uri: The URI to write the dataset to.
        format: The format to write the dataset in. Options are [parquet, feather, json].
        query: QueryExpression to filter the dataset before exporting.
    """
    # Supported format handlers
    export_handlers: Dict[str, Callable] = {
        "parquet": export_parquet,
        "feather": export_feather,
        "json": export_json,
    }

    if format not in export_handlers:
        raise ValueError(
            f"Unsupported format: {format}. Supported formats are {list(export_handlers.keys())}"
        )

    export_handlers[format](dataset, file_uri, query or QueryExpression())

    logger.info(f"Dataset exported to {file_uri} in {format} format.")
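
A minimal usage sketch of the new export helpers, assuming an already-created rivulet Dataset (the `my_dataset` variable and output paths below are placeholders, not part of the diff):

# Usage sketch for deltacat/utils/export.py (placeholder dataset and paths).
from deltacat.utils.export import export_dataset
from deltacat.experimental.storage.rivulet.reader.query_expression import (
    QueryExpression,
)

# `my_dataset` stands in for a previously created rivulet Dataset whose
# scan(query) results can be collected to Arrow, as export.py expects.
export_dataset(my_dataset, "/tmp/events.parquet", format="parquet")

# Pass an explicit query to filter rows before export; QueryExpression() is the
# default, match-all filter.
export_dataset(my_dataset, "/tmp/events.feather", format="feather", query=QueryExpression())

# Unknown formats fail fast with a ValueError listing the supported handlers,
# before the dataset is ever scanned.
try:
    export_dataset(my_dataset, "/tmp/events.csv", format="csv")
except ValueError as e:
    print(e)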
deltacat/utils/filesystem.py
ADDED
@@ -0,0 +1,450 @@
from __future__ import annotations

import re
from typing import Optional, Tuple, Union, List
from datetime import timedelta
from enum import Enum

import sys
import urllib
import pathlib

import pyarrow as pa
from pyarrow.fs import (
    _resolve_filesystem_and_path,
    FileSelector,
    FileInfo,
    FileType,
    FileSystem,
    FSSpecHandler,
    PyFileSystem,
    GcsFileSystem,
    LocalFileSystem,
    S3FileSystem,
    AzureFileSystem,
    HadoopFileSystem,
)

_LOCAL_SCHEME = "local"


class FilesystemType(str, Enum):
    LOCAL = "local"
    S3 = "s3"
    GCS = "gcs"
    AZURE = "azure"
    HADOOP = "hadoop"
    UNKNOWN = "unknown"

    @classmethod
    def from_filesystem(cls, filesystem: FileSystem) -> FilesystemType:
        if isinstance(filesystem, LocalFileSystem):
            return cls.LOCAL
        elif isinstance(filesystem, S3FileSystem):
            return cls.S3
        elif isinstance(filesystem, GcsFileSystem):
            return cls.GCS
        elif isinstance(filesystem, AzureFileSystem):
            return cls.AZURE
        elif isinstance(filesystem, HadoopFileSystem):
            return cls.HADOOP
        else:
            return cls.UNKNOWN

    @classmethod
    def to_filesystem(cls, filesystem_type: FilesystemType) -> FileSystem:
        if filesystem_type == cls.LOCAL:
            return LocalFileSystem()
        elif filesystem_type == cls.S3:
            return S3FileSystem()
        elif filesystem_type == cls.GCS:
            return GcsFileSystem()
        elif filesystem_type == cls.AZURE:
            return AzureFileSystem()
        elif filesystem_type == cls.HADOOP:
            return HadoopFileSystem()
        else:
            raise ValueError(f"Unsupported filesystem type: {filesystem_type}")


def resolve_paths_and_filesystem(
    paths: Union[str, List[str]],
    filesystem: FileSystem = None,
) -> Tuple[List[str], FileSystem]:
    """
    Resolves and normalizes all provided paths, infers a filesystem from the
    paths or validates the provided filesystem against the paths and ensures
    that all paths use the same filesystem.

    Args:
        paths: A single file/directory path or a list of file/directory paths.
            A list of paths can contain both files and directories.
        filesystem: The filesystem implementation that should be used for
            reading these files. If None, a filesystem will be inferred. If not
            None, the provided filesystem will still be validated against all
            filesystems inferred from the provided paths to ensure
            compatibility.
    """
    if isinstance(paths, str):
        paths = [paths]
    if isinstance(paths, pathlib.Path):
        paths = [str(paths)]
    elif not isinstance(paths, list) or any(not isinstance(p, str) for p in paths):
        raise ValueError(
            "Expected `paths` to be a `str`, `pathlib.Path`, or `list[str]`, but got "
            f"`{paths}`."
        )
    elif len(paths) == 0:
        raise ValueError("Must provide at least one path.")

    need_unwrap_path_protocol = True
    if filesystem and not isinstance(filesystem, FileSystem):
        err_msg = (
            f"The filesystem passed must either conform to "
            f"pyarrow.fs.FileSystem, or "
            f"fsspec.spec.AbstractFileSystem. The provided "
            f"filesystem was: {filesystem}"
        )
        try:
            import fsspec
            from fsspec.implementations.http import HTTPFileSystem
        except ModuleNotFoundError:
            # If filesystem is not a pyarrow filesystem and fsspec isn't
            # installed, then filesystem is neither a pyarrow filesystem nor
            # an fsspec filesystem, so we raise a TypeError.
            raise TypeError(err_msg) from None
        if not isinstance(filesystem, fsspec.spec.AbstractFileSystem):
            raise TypeError(err_msg) from None
        if isinstance(filesystem, HTTPFileSystem):
            # If filesystem is fsspec HTTPFileSystem, the protocol/scheme of paths
            # should not be unwrapped/removed, because HTTPFileSystem expects full file
            # paths including protocol/scheme. This is different behavior compared to
            # file systems implementation in pyarrow.fs.FileSystem.
            need_unwrap_path_protocol = False

        filesystem = PyFileSystem(FSSpecHandler(filesystem))

    resolved_paths = []
    for path in paths:
        path = _resolve_custom_scheme(path)
        try:
            resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
                path, filesystem
            )
        except pa.lib.ArrowInvalid as e:
            if "Cannot parse URI" in str(e):
                resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
                    _encode_url(path), filesystem
                )
                resolved_path = _decode_url(resolved_path)
            elif "Unrecognized filesystem type in URI" in str(e):
                scheme = urllib.parse.urlparse(path, allow_fragments=False).scheme
                if scheme in ["http", "https"]:
                    # If scheme of path is HTTP and filesystem is not resolved,
                    # try to use fsspec HTTPFileSystem. This expects fsspec is
                    # installed.
                    try:
                        from fsspec.implementations.http import HTTPFileSystem
                    except ModuleNotFoundError:
                        raise ImportError(
                            "Please install fsspec to read files from HTTP."
                        ) from None

                    resolved_filesystem = PyFileSystem(FSSpecHandler(HTTPFileSystem()))
                    resolved_path = path
                    need_unwrap_path_protocol = False
                else:
                    raise
            else:
                raise
        if filesystem is None:
            if isinstance(resolved_filesystem, GcsFileSystem):
                # Configure a retry time limit for GcsFileSystem so that it
                # doesn't hang forever trying to get file info (e.g., when
                # trying to get a public file w/o anonymous=True).
                filesystem = GcsFileSystem(
                    retry_time_limit=timedelta(seconds=60),
                )
            else:
                filesystem = resolved_filesystem
        elif need_unwrap_path_protocol:
            resolved_path = _unwrap_protocol(resolved_path)
        resolved_path = filesystem.normalize_path(resolved_path)
        resolved_paths.append(resolved_path)
    return resolved_paths, filesystem


def resolve_path_and_filesystem(
    path: str,
    filesystem: Optional[FileSystem] = None,
) -> Tuple[str, FileSystem]:
    """
    Resolves and normalizes the provided path, infers a filesystem from the
    path or validates the provided filesystem against the path.

    Args:
        path: A single file/directory path.
        filesystem: The filesystem implementation that should be used for
            reading these files. If None, a filesystem will be inferred. If not
            None, the provided filesystem will still be validated against all
            filesystems inferred from the provided paths to ensure
            compatibility.
    """
    paths, filesystem = resolve_paths_and_filesystem(
        paths=path,
        filesystem=filesystem,
    )
    assert len(paths) == 1, len(paths)
    return paths[0], filesystem


def list_directory(
    path: str,
    filesystem: FileSystem,
    exclude_prefixes: Optional[List[str]] = None,
    ignore_missing_path: bool = False,
    recursive: bool = False,
) -> List[Tuple[str, int]]:
    """
    Expand the provided directory path to a list of file paths.

    Args:
        path: The directory path to expand.
        filesystem: The filesystem implementation that should be used for
            reading these files.
        exclude_prefixes: The file relative path prefixes that should be
            excluded from the returned file set. Default excluded prefixes are
            "." and "_".
        recursive: Whether to expand subdirectories or not.

    Returns:
        An iterator of (file_path, file_size) tuples.
    """
    if exclude_prefixes is None:
        exclude_prefixes = [".", "_"]

    selector = FileSelector(
        base_dir=path,
        recursive=recursive,
        allow_not_found=ignore_missing_path,
    )
    try:
        files = filesystem.get_file_info(selector)
    except OSError as e:
        if isinstance(e, FileNotFoundError):
            files = []
        else:
            _handle_read_os_error(e, path)
    base_path = selector.base_dir
    out = []
    for file_ in files:
        file_path = file_.path
        if not file_path.startswith(base_path):
            continue
        relative = file_path[len(base_path) :]
        if any(relative.startswith(prefix) for prefix in exclude_prefixes):
            continue
        out.append((file_path, file_.size))
    # We sort the paths to guarantee a stable order.
    return sorted(out)


def get_file_info(
    path: str,
    filesystem: FileSystem,
    ignore_missing_path: bool = False,
) -> FileInfo:
    """Get the file info for the provided path."""
    try:
        file_info = filesystem.get_file_info(path)
    except OSError as e:
        _handle_read_os_error(e, path)
    if file_info.type == FileType.NotFound and not ignore_missing_path:
        raise FileNotFoundError(path)

    return file_info


def write_file(
    path: str,
    data: Union[str, bytes],
    filesystem: Optional[FileSystem] = None,
) -> None:
    """
    Write data to a file using any filesystem.

    Args:
        path: The file path to write to.
        data: The data to write (string or bytes).
        filesystem: The filesystem implementation to use. If None, will be inferred from the path.
    """
    resolved_path, resolved_filesystem = resolve_path_and_filesystem(
        path=path,
        filesystem=filesystem,
    )

    # Convert string to bytes if necessary
    if isinstance(data, str):
        data = data.encode("utf-8")

    with resolved_filesystem.open_output_stream(resolved_path) as f:
        f.write(data)


def read_file(
    path: str,
    filesystem: Optional[FileSystem] = None,
    fail_if_not_found: bool = True,
) -> Optional[bytes]:
    """
    Read data from a file using any filesystem.

    Args:
        path: The file path to read from.
        filesystem: The filesystem implementation to use. If None, will be inferred from the path.
        fail_if_not_found: Whether to raise an error if the file is not found.

    Returns:
        The file data as bytes, or None if file not found and fail_if_not_found is False.
    """
    try:
        resolved_path, resolved_filesystem = resolve_path_and_filesystem(
            path=path,
            filesystem=filesystem,
        )

        with resolved_filesystem.open_input_stream(resolved_path) as f:
            return f.read()
    except FileNotFoundError:
        if fail_if_not_found:
            raise
        return None


def _handle_read_os_error(
    error: OSError,
    paths: Union[str, List[str]],
) -> str:
    # NOTE: this is not comprehensive yet, and should be extended as more errors arise.
    # NOTE: The latter patterns are raised in Arrow 10+, while the former is raised in
    # Arrow < 10.
    aws_error_pattern = (
        r"^(?:(.*)AWS Error \[code \d+\]: No response body\.(.*))|"
        r"(?:(.*)AWS Error UNKNOWN \(HTTP status 400\) during HeadObject operation: "
        r"No response body\.(.*))|"
        r"(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response "
        r"body\.(.*))$"
    )
    gcp_error_pattern = (
        r"^(?:(.*)google::cloud::Status\(UNAVAILABLE:(.*?)Couldn't resolve host name)"
    )
    if re.match(aws_error_pattern, str(error)):
        # Specially handle AWS error when reading files, to give a clearer error
        # message to avoid confusing users. The real issue is most likely that the AWS
        # S3 file credentials have not been properly configured yet.
        if isinstance(paths, str):
            # Quote to highlight single file path in error message for better
            # readability. List of file paths will be shown up as ['foo', 'boo'],
            # so only quote single file path here.
            paths = f'"{paths}"'
        raise OSError(
            (
                f"Failing to read AWS S3 file(s): {paths}. "
                "Please check that file exists and has properly configured access. "
                "You can also run AWS CLI command to get more detailed error message "
                "(e.g., aws s3 ls <file-name>). "
                "See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html "  # noqa
                "and https://arrow.apache.org/docs/python/generated/pyarrow.fs.S3FileSystem.html "
                "for more information."
            )
        )
    elif re.match(gcp_error_pattern, str(error)):
        # Special handling for GCP errors (e.g., handling the special case of
        # requiring the filesystem to be instantiated with anonymous access to
        # read public files).
        if isinstance(paths, str):
            paths = f'"{paths}"'
        raise OSError(
            (
                f"Failing to read GCP GS file(s): {paths}. "
                "Please check that file exists and has properly configured access. "
                "If this is a public file, please instantiate a filesystem with "
                "anonymous access via `pyarrow.fs.GcsFileSystem(anonymous=True)` "
                "to read it. See https://google.aip.dev/auth/4110 and "
                "https://arrow.apache.org/docs/python/generated/pyarrow.fs.GcsFileSystem.html"  # noqa
                "for more information."
            )
        )

    else:
        raise error


def _is_local_windows_path(path: str) -> bool:
    """Determines if path is a Windows file-system location."""
    if sys.platform != "win32":
        return False

    if len(path) >= 1 and path[0] == "\\":
        return True
    if (
        len(path) >= 3
        and path[1] == ":"
        and (path[2] == "/" or path[2] == "\\")
        and path[0].isalpha()
    ):
        return True
    return False


def _unwrap_protocol(path):
    """
    Slice off any protocol prefixes on path.
    """
    if sys.platform == "win32" and _is_local_windows_path(path):
        # Represent as posix path such that downstream functions properly handle it.
        # This is executed when 'file://' is NOT included in the path.
        return pathlib.Path(path).as_posix()

    parsed = urllib.parse.urlparse(path, allow_fragments=False)  # support '#' in path
    query = "?" + parsed.query if parsed.query else ""  # support '?' in path
    netloc = parsed.netloc
    if parsed.scheme == "s3" and "@" in parsed.netloc:
        # If the path contains an @, it is assumed to be an anonymous
        # credentialed path, and we need to strip off the credentials.
        netloc = parsed.netloc.split("@")[-1]

    parsed_path = parsed.path
    # urlparse prepends the path with a '/'. This does not work on Windows
    # so if this is the case strip the leading slash.
    if (
        sys.platform == "win32"
        and not netloc
        and len(parsed_path) >= 3
        and parsed_path[0] == "/"  # The problematic leading slash
        and parsed_path[1].isalpha()  # Ensure it is a drive letter.
        and parsed_path[2:4] in (":", ":/")
    ):
        parsed_path = parsed_path[1:]

    return netloc + parsed_path + query


def _encode_url(path):
    return urllib.parse.quote(path, safe="/:")


def _decode_url(path):
    return urllib.parse.unquote(path)


def _resolve_custom_scheme(path: str) -> str:
    """Returns the resolved path if the given path follows a Ray-specific custom
    scheme. Othewise, returns the path unchanged.

    The supported custom schemes are: "local", "example".
    """
    parsed_uri = urllib.parse.urlparse(path)
    if parsed_uri.scheme == _LOCAL_SCHEME:
        path = parsed_uri.netloc + parsed_uri.path
    return path
deltacat/utils/metafile_locator.py
ADDED
@@ -0,0 +1,74 @@
import posixpath
import pyarrow.fs

from deltacat.constants import REV_DIR_NAME
from deltacat.storage.model.partition import PartitionLocator
from deltacat.utils.filesystem import resolve_path_and_filesystem

"""
Helper functions to work with deltacat metadata paths.
TODO: Replace with direct calls to Deltacat storage interface.
"""


def _find_first_child_with_rev(
    parent_path: str, filesystem: pyarrow.fs.FileSystem
) -> str:
    """
    Walks the filesystem to find the first child directory with a `rev/` folder.

    This is a temporary solution to locate the first Namespace and Table directories.
    The Deltacat Storage interface will provide a more robust way to locate these directories.

    param: parent_path: The parent directory to search for a child with a `rev/` folder.
    param: filesystem: The filesystem to search for the child directory.
    returns: The name of the first child directory with a `rev/` folder.
    """
    children = filesystem.get_file_info(
        pyarrow.fs.FileSelector(parent_path, allow_not_found=True)
    )
    for child in children:
        if child.type == pyarrow.fs.FileType.Directory:
            rev_path = posixpath.join(child.path, REV_DIR_NAME)
            if filesystem.get_file_info(rev_path).type == pyarrow.fs.FileType.Directory:
                return child.base_name
    raise ValueError(f"No directory with 'rev/' found under {parent_path}")


def _find_table_path(root_path: str, filesystem: pyarrow.fs.FileSystem):
    """
    Finds a path with structure: root/namespace_id/table_id
    Uses _find_first_child_with_rev to determine the namespace and table ids.

    param: root_path: The root directory to search for the namespace and table directories.
    param: filesystem: The filesystem to search for the namespace and table directories.
    returns: The path to the table directory.
    raises: ValueError if the namespace or table directories are not found.
    """
    try:
        # Find Namespace (first directory under root with rev/)
        namespace_id = _find_first_child_with_rev(root_path, filesystem)
        namespace_path = posixpath.join(root_path, namespace_id)

        # Find Table (first directory under namespace with rev/)
        table_id = _find_first_child_with_rev(namespace_path, filesystem)
        return posixpath.join(namespace_path, table_id)

    except ValueError as e:
        raise ValueError(f"Failed to locate Namespace or Table: {e}") from e


def _find_partition_path(root_path: str, locator: PartitionLocator) -> str:
    """
    Finds the path to the partition directory for the specified locator.

    param: root_uri: The root URI of the dataset.
    param: locator: The DeltaLocator for the delta.
    returns: The path to the delta directory.
    """
    root_path, filesystem = resolve_path_and_filesystem(root_path)
    return posixpath.join(
        _find_table_path(root_path, filesystem),
        locator.table_version,
        locator.stream_id,
    )