deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,23 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
-
|
2
|
+
|
3
|
+
import base64
|
3
4
|
from enum import Enum
|
5
|
+
from typing import Dict, Any, Optional
|
6
|
+
import pyarrow as pa
|
7
|
+
|
8
|
+
from deltacat.constants import METAFILE_FORMAT, METAFILE_FORMAT_JSON
|
4
9
|
|
5
10
|
|
6
11
|
class TransformName(str, Enum):
|
7
12
|
IDENTITY = "identity"
|
8
13
|
BUCKET = "bucket"
|
14
|
+
YEAR = "year"
|
15
|
+
MONTH = "month"
|
16
|
+
DAY = "day"
|
17
|
+
HOUR = "hour"
|
18
|
+
TRUNCATE = "truncate"
|
19
|
+
VOID = "void"
|
20
|
+
UNKNOWN = "unknown"
|
9
21
|
|
10
22
|
|
11
23
|
class TransformParameters(dict):
|
@@ -17,63 +29,42 @@ class TransformParameters(dict):
|
|
17
29
|
pass
|
18
30
|
|
19
31
|
|
20
|
-
class
|
32
|
+
class BucketingStrategy(str, Enum):
|
21
33
|
"""
|
22
|
-
|
34
|
+
A bucketing strategy for the transform
|
23
35
|
"""
|
24
36
|
|
25
|
-
|
26
|
-
|
27
|
-
identify_transform_parameters = IdentityTransformParameters()
|
28
|
-
identify_transform_parameters["columnName"] = column_name
|
29
|
-
return identify_transform_parameters
|
30
|
-
|
31
|
-
@property
|
32
|
-
def column_name(self) -> str:
|
33
|
-
"""
|
34
|
-
The name of the column to use for identity transform
|
35
|
-
"""
|
36
|
-
return self["columnName"]
|
37
|
+
# Default DeltaCAT SHA-1 based hash bucketing strategy.
|
38
|
+
DEFAULT = "default"
|
37
39
|
|
38
|
-
|
39
|
-
|
40
|
-
self["columnName"] = value
|
40
|
+
# Iceberg-compliant murmur3 based hash bucketing strategy.
|
41
|
+
ICEBERG = "iceberg"
|
41
42
|
|
42
43
|
|
43
|
-
class
|
44
|
+
class TruncateStrategy(str, Enum):
|
44
45
|
"""
|
45
|
-
A
|
46
|
+
A truncation strategy for the transform
|
46
47
|
"""
|
47
48
|
|
48
|
-
#
|
49
|
-
# This strategy supports hashing on composite keys
|
50
|
-
# and uses SHA1 hashing for determining the bucket.
|
51
|
-
# If no columns passed, it will use a random UUID
|
52
|
-
# for determining the bucket.
|
49
|
+
# Default DeltaCAT truncate strategy.
|
53
50
|
DEFAULT = "default"
|
54
51
|
|
55
|
-
#
|
56
|
-
# As indicated in the iceberg spec, it does not support
|
57
|
-
# composite keys and uses murmur3 hash for determining
|
58
|
-
# the bucket.
|
59
|
-
# See https://iceberg.apache.org/spec/#partitioning
|
52
|
+
# Iceberg-compliant truncate strategy.
|
60
53
|
ICEBERG = "iceberg"
|
61
54
|
|
62
55
|
|
63
56
|
class BucketTransformParameters(TransformParameters):
|
64
57
|
"""
|
65
|
-
|
58
|
+
Parameters for the bucket transform.
|
66
59
|
"""
|
67
60
|
|
61
|
+
@staticmethod
|
68
62
|
def of(
|
69
|
-
self,
|
70
63
|
num_buckets: int,
|
71
|
-
|
72
|
-
bucketing_strategy: BucketingStrategy,
|
64
|
+
bucketing_strategy: BucketingStrategy = BucketingStrategy.DEFAULT,
|
73
65
|
) -> BucketTransformParameters:
|
74
66
|
bucket_transform_parameters = BucketTransformParameters()
|
75
67
|
bucket_transform_parameters["numBuckets"] = num_buckets
|
76
|
-
bucket_transform_parameters["columnNames"] = column_names
|
77
68
|
bucket_transform_parameters["bucketingStrategy"] = bucketing_strategy
|
78
69
|
|
79
70
|
return bucket_transform_parameters
|
@@ -81,47 +72,272 @@ class BucketTransformParameters(TransformParameters):
|
|
81
72
|
@property
|
82
73
|
def num_buckets(self) -> int:
|
83
74
|
"""
|
84
|
-
The total number of buckets to create
|
75
|
+
The total number of buckets to create.
|
85
76
|
"""
|
86
77
|
return self["numBuckets"]
|
87
78
|
|
88
79
|
@property
|
89
|
-
def
|
80
|
+
def bucketing_strategy(self) -> BucketingStrategy:
|
81
|
+
"""
|
82
|
+
The bucketing strategy to use.
|
83
|
+
"""
|
84
|
+
return BucketingStrategy(self["bucketingStrategy"])
|
85
|
+
|
86
|
+
|
87
|
+
class TruncateTransformParameters(TransformParameters):
|
88
|
+
"""
|
89
|
+
Parameters for the truncate transform.
|
90
|
+
"""
|
91
|
+
|
92
|
+
@staticmethod
|
93
|
+
def of(
|
94
|
+
width: int,
|
95
|
+
truncate_strategy: TruncateStrategy = TruncateStrategy.DEFAULT,
|
96
|
+
) -> TruncateTransformParameters:
|
97
|
+
truncate_transform_parameters = TruncateTransformParameters()
|
98
|
+
truncate_transform_parameters["width"] = width
|
99
|
+
truncate_transform_parameters["truncateStrategy"] = truncate_strategy
|
100
|
+
return truncate_transform_parameters
|
101
|
+
|
102
|
+
@property
|
103
|
+
def width(self) -> int:
|
90
104
|
"""
|
91
|
-
|
92
|
-
to use for bucketings.
|
105
|
+
The width to truncate the field to.
|
93
106
|
"""
|
94
|
-
return self["
|
107
|
+
return self["width"]
|
95
108
|
|
96
109
|
@property
|
97
|
-
def
|
110
|
+
def truncate_strategy(self) -> TruncateStrategy:
|
98
111
|
"""
|
99
|
-
The
|
112
|
+
The truncate strategy to use.
|
100
113
|
"""
|
101
|
-
return self["
|
114
|
+
return TruncateStrategy(self["truncateStrategy"])
|
102
115
|
|
103
116
|
|
104
117
|
class Transform(dict):
|
105
118
|
"""
|
106
|
-
A transform
|
107
|
-
transformed into a new value.
|
108
|
-
|
119
|
+
A transform represents how a particular column value can be
|
120
|
+
transformed into a new value. For example, transforms may be used
|
121
|
+
to determine partition or sort values for table records.
|
122
|
+
"""
|
123
|
+
|
124
|
+
@property
|
125
|
+
def name(self) -> TransformName:
|
126
|
+
return TransformName(self["name"])
|
127
|
+
|
128
|
+
@name.setter
|
129
|
+
def name(self, name: TransformName) -> None:
|
130
|
+
self["name"] = name
|
131
|
+
|
132
|
+
@property
|
133
|
+
def parameters(self) -> Optional[TransformParameters]:
|
134
|
+
return NAME_TO_TRANSFORM[self.name].parameters
|
135
|
+
|
136
|
+
@parameters.setter
|
137
|
+
def parameters(
|
138
|
+
self,
|
139
|
+
parameters: Optional[TransformParameters] = None,
|
140
|
+
) -> None:
|
141
|
+
NAME_TO_TRANSFORM[self.name].parameters = parameters
|
142
|
+
|
143
|
+
@property
|
144
|
+
def return_type(self) -> Optional[pa.DataType]:
|
145
|
+
"""
|
146
|
+
The PyArrow data type that this transform returns.
|
147
|
+
A return value of "None" indicates that the return type is the same
|
148
|
+
as the source type. Transforms that always return null return pa.null().
|
149
|
+
"""
|
150
|
+
return_type = self.get("return_type")
|
151
|
+
if return_type is not None:
|
152
|
+
schema_bytes = (
|
153
|
+
base64.b64decode(return_type)
|
154
|
+
if METAFILE_FORMAT == METAFILE_FORMAT_JSON
|
155
|
+
else return_type
|
156
|
+
)
|
157
|
+
return_type = pa.ipc.read_schema(
|
158
|
+
pa.py_buffer(schema_bytes),
|
159
|
+
)[0].type
|
160
|
+
return return_type
|
161
|
+
|
162
|
+
@return_type.setter
|
163
|
+
def return_type(self, return_type: pa.Schema) -> None:
|
164
|
+
"""
|
165
|
+
Set the PyArrow data type that this transform returns.
|
166
|
+
"""
|
167
|
+
self["return_type"] = return_type.serialize().to_pybytes()
|
168
|
+
|
169
|
+
@property
|
170
|
+
def is_multi_field_transform(self) -> bool:
|
171
|
+
"""
|
172
|
+
Whether this transform is a multi-field transform.
|
173
|
+
"""
|
174
|
+
return False
|
175
|
+
|
176
|
+
|
177
|
+
class BucketTransform(Transform):
|
178
|
+
"""
|
179
|
+
A transform that hashes field values into a fixed number of buckets.
|
180
|
+
Returns a PyArrow int32() type.
|
109
181
|
"""
|
110
182
|
|
111
183
|
@staticmethod
|
112
|
-
def of(
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
partition_transform["parameters"] = parameters
|
119
|
-
return partition_transform
|
184
|
+
def of(parameters: BucketTransformParameters) -> BucketTransform:
|
185
|
+
transform = BucketTransform()
|
186
|
+
transform.name = TransformName.BUCKET
|
187
|
+
transform.parameters = parameters
|
188
|
+
transform.return_type = pa.schema([("return_type", pa.int32())])
|
189
|
+
return transform
|
120
190
|
|
121
191
|
@property
|
122
|
-
def
|
123
|
-
|
192
|
+
def parameters(self) -> BucketTransformParameters:
|
193
|
+
val: Dict[str, Any] = self.get("parameters")
|
194
|
+
if val is not None and not isinstance(val, BucketTransformParameters.__class__):
|
195
|
+
self["parameters"] = val = BucketTransformParameters(val)
|
196
|
+
return val
|
197
|
+
|
198
|
+
@parameters.setter
|
199
|
+
def parameters(
|
200
|
+
self,
|
201
|
+
parameters: Optional[BucketTransformParameters] = None,
|
202
|
+
) -> None:
|
203
|
+
self["parameters"] = parameters
|
124
204
|
|
125
205
|
@property
|
126
|
-
def
|
127
|
-
return
|
206
|
+
def is_multi_field_transform(self) -> bool:
|
207
|
+
return True
|
208
|
+
|
209
|
+
|
210
|
+
class TruncateTransform(Transform):
|
211
|
+
"""
|
212
|
+
A transform that truncates field values to a fixed width.
|
213
|
+
Returns the same type as the input field.
|
214
|
+
"""
|
215
|
+
|
216
|
+
@staticmethod
|
217
|
+
def of(parameters: TruncateTransformParameters) -> TruncateTransform:
|
218
|
+
transform = TruncateTransform()
|
219
|
+
transform.name = TransformName.TRUNCATE
|
220
|
+
transform.parameters = parameters
|
221
|
+
return transform
|
222
|
+
|
223
|
+
@property
|
224
|
+
def parameters(self) -> TruncateTransformParameters:
|
225
|
+
val: Dict[str, Any] = self.get("parameters")
|
226
|
+
if val is not None and not isinstance(val, TruncateTransformParameters):
|
227
|
+
self["parameters"] = val = TruncateTransformParameters(val)
|
228
|
+
return val
|
229
|
+
|
230
|
+
@parameters.setter
|
231
|
+
def parameters(
|
232
|
+
self,
|
233
|
+
parameters: Optional[TruncateTransformParameters] = None,
|
234
|
+
) -> None:
|
235
|
+
self["parameters"] = parameters
|
236
|
+
|
237
|
+
|
238
|
+
class IdentityTransform(Transform):
|
239
|
+
"""
|
240
|
+
A no-op transform that returns unmodified field values.
|
241
|
+
Returns the same PyArrow type as the input.
|
242
|
+
"""
|
243
|
+
|
244
|
+
@staticmethod
|
245
|
+
def of() -> IdentityTransform:
|
246
|
+
transform = IdentityTransform()
|
247
|
+
transform.name = TransformName.IDENTITY
|
248
|
+
return transform
|
249
|
+
|
250
|
+
|
251
|
+
class HourTransform(Transform):
|
252
|
+
"""
|
253
|
+
A transform that returns the hour of a datetime value.
|
254
|
+
Returns a PyArrow int32 type representing the hour (0-23).
|
255
|
+
"""
|
256
|
+
|
257
|
+
@staticmethod
|
258
|
+
def of() -> HourTransform:
|
259
|
+
transform = HourTransform()
|
260
|
+
transform.name = TransformName.HOUR
|
261
|
+
transform.return_type = pa.schema([("return_type", pa.int32())])
|
262
|
+
return transform
|
263
|
+
|
264
|
+
|
265
|
+
class DayTransform(Transform):
    """
    Transform yielding the day of a datetime value.
    The result is a PyArrow int32 representing the day (1-31).
    """

    @staticmethod
    def of() -> DayTransform:
        """Create a DayTransform with an int32 declared return type."""
        day = DayTransform()
        day.name = TransformName.DAY
        # The return type is expressed as a one-field schema.
        declared_type = pa.schema([("return_type", pa.int32())])
        day.return_type = declared_type
        return day
|
277
|
+
|
278
|
+
|
279
|
+
class MonthTransform(Transform):
    """
    Transform yielding the month of a datetime value.
    The result is a PyArrow int32 representing the month (1-12).
    """

    @staticmethod
    def of() -> MonthTransform:
        """Create a MonthTransform with an int32 declared return type."""
        month = MonthTransform()
        month.name = TransformName.MONTH
        # The return type is expressed as a one-field schema.
        declared_type = pa.schema([("return_type", pa.int32())])
        month.return_type = declared_type
        return month
|
291
|
+
|
292
|
+
|
293
|
+
class YearTransform(Transform):
    """
    Transform yielding the year of a datetime value.
    The result is a PyArrow int32 representing the year.
    """

    @staticmethod
    def of() -> YearTransform:
        """Create a YearTransform with an int32 declared return type."""
        year = YearTransform()
        year.name = TransformName.YEAR
        # The return type is expressed as a one-field schema.
        declared_type = pa.schema([("return_type", pa.int32())])
        year.return_type = declared_type
        return year
|
305
|
+
|
306
|
+
|
307
|
+
class VoidTransform(Transform):
    """
    Transform that coerces every field value to None.
    The result is a PyArrow null type.
    """

    @staticmethod
    def of() -> VoidTransform:
        """Create a VoidTransform with a null declared return type."""
        void = VoidTransform()
        void.name = TransformName.VOID
        # The return type is expressed as a one-field schema.
        declared_type = pa.schema([("return_type", pa.null())])
        void.return_type = declared_type
        return void
|
319
|
+
|
320
|
+
|
321
|
+
class UnknownTransform(Transform):
    """
    Placeholder for a transform that is unknown or invalid.
    """

    @staticmethod
    def of() -> UnknownTransform:
        """Create an UnknownTransform named TransformName.UNKNOWN."""
        unknown = UnknownTransform()
        unknown.name = TransformName.UNKNOWN
        return unknown
|
331
|
+
|
332
|
+
|
333
|
+
# Maps each transform name to the Transform subclass registered for it.
# The values are the classes themselves (not instances), so the value type
# is annotated as type[Transform].
NAME_TO_TRANSFORM: Dict[TransformName, type[Transform]] = {
    TransformName.IDENTITY: IdentityTransform,
    TransformName.BUCKET: BucketTransform,
    TransformName.YEAR: YearTransform,
    TransformName.MONTH: MonthTransform,
    TransformName.DAY: DayTransform,
    TransformName.HOUR: HourTransform,
    TransformName.TRUNCATE: TruncateTransform,
    TransformName.VOID: VoidTransform,
    TransformName.UNKNOWN: UnknownTransform,
}
|
deltacat/storage/model/types.py
CHANGED
@@ -1,16 +1,41 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
from enum import Enum
|
2
4
|
from typing import List, Union
|
3
5
|
|
4
|
-
from pyarrow.parquet import ParquetFile
|
5
6
|
import numpy as np
|
6
7
|
import pandas as pd
|
7
8
|
import pyarrow as pa
|
8
|
-
|
9
|
+
import polars as pl
|
10
|
+
from ray.data.dataset import Dataset as RayDataset
|
9
11
|
from daft import DataFrame as DaftDataFrame
|
10
12
|
|
11
|
-
|
12
|
-
|
13
|
-
|
13
|
+
from deltacat.constants import (
|
14
|
+
RUNNING_TXN_DIR_NAME,
|
15
|
+
PAUSED_TXN_DIR_NAME,
|
16
|
+
FAILED_TXN_DIR_NAME,
|
17
|
+
SUCCESS_TXN_DIR_NAME,
|
18
|
+
)
|
19
|
+
|
20
|
+
# Any single local (non-distributed) table representation.
LocalTable = Union[
    pa.Table, pd.DataFrame, pl.DataFrame, np.ndarray, pa.parquet.ParquetFile
]
# One local table, or a list of them.
LocalDataset = Union[
    LocalTable,
    List[LocalTable],
]
# Dataset types backed by Ray Data or Daft.
DistributedDataset = Union[
    RayDataset,
    DaftDataFrame,
]
# Any local or distributed dataset.
Dataset = Union[
    LocalDataset,
    DistributedDataset,
]
|
30
|
+
|
31
|
+
|
32
|
+
class StreamFormat(str, Enum):
    """String-valued enum of stream format identifiers."""

    DELTACAT = "deltacat"
    ICEBERG = "iceberg"
    HIVE = "hive"
    HUDI = "hudi"
    DELTA_LAKE = "delta_lake"
    # Used by tests; note that this value is upper-case, unlike the others.
    SQLITE3 = "SQLITE3"
|
14
39
|
|
15
40
|
|
16
41
|
class DeltaType(str, Enum):
|
@@ -19,7 +44,81 @@ class DeltaType(str, Enum):
|
|
19
44
|
DELETE = "delete"
|
20
45
|
|
21
46
|
|
47
|
+
class TransactionOperationType(str, Enum):
    """
    String-valued enum of transaction operation types, grouped into write
    operations and read operations.
    """

    # Write operations.
    CREATE = "create"
    UPDATE = "update"
    REPLACE = "replace"
    DELETE = "delete"

    # Read operations.
    READ_SIBLINGS = "read_siblings"
    READ_CHILDREN = "read_children"
    READ_LATEST = "read_latest"
    READ_EXISTS = "read_exists"

    @staticmethod
    def write_operations():
        """Return a new set holding every write operation type."""
        ops = TransactionOperationType
        return {ops.CREATE, ops.UPDATE, ops.REPLACE, ops.DELETE}

    @staticmethod
    def read_operations():
        """Return a new set holding every read operation type."""
        ops = TransactionOperationType
        return {
            ops.READ_SIBLINGS,
            ops.READ_CHILDREN,
            ops.READ_LATEST,
            ops.READ_EXISTS,
        }

    def is_write_operation(self) -> bool:
        """Return True if this type is a member of write_operations()."""
        writes = TransactionOperationType.write_operations()
        return self in writes

    def is_read_operation(self) -> bool:
        """Return True if this type is a member of read_operations()."""
        reads = TransactionOperationType.read_operations()
        return self in reads
|
81
|
+
|
82
|
+
|
83
|
+
class TransactionStatus(str, Enum):
    """
    User-facing transaction status types. Each status corresponds to its own
    distinct transaction log directory.
    """

    SUCCESS = "SUCCESS"
    RUNNING = "RUNNING"
    PAUSED = "PAUSED"
    FAILED = "FAILED"

    def dir_name(self) -> str:
        """Return the transaction log directory name for this status."""
        dir_names = {
            TransactionStatus.RUNNING: RUNNING_TXN_DIR_NAME,
            TransactionStatus.PAUSED: PAUSED_TXN_DIR_NAME,
            TransactionStatus.FAILED: FAILED_TXN_DIR_NAME,
            TransactionStatus.SUCCESS: SUCCESS_TXN_DIR_NAME,
        }
        return dir_names.get(self)
|
103
|
+
|
104
|
+
|
105
|
+
class TransactionState(str, Enum):
    """
    System-level transaction state types. Unlike statuses, states have no
    dedicated transaction log directory; a state is inferred from the
    presence of a transaction in one or more directories. States are used to
    decide whether to run system activities such as transaction cleanup jobs.
    """

    FAILED = "FAILED"
    PURGED = "PURGED"
    TIMEOUT = "TIMEOUT"
    RUNNING = "RUNNING"
    SUCCESS = "SUCCESS"
    PAUSED = "PAUSED"
|
118
|
+
|
119
|
+
|
22
120
|
class LifecycleState(str, Enum):
|
121
|
+
CREATED = "created"
|
23
122
|
UNRELEASED = "unreleased"
|
24
123
|
ACTIVE = "active"
|
25
124
|
DEPRECATED = "deprecated"
|
@@ -35,22 +134,45 @@ class CommitState(str, Enum):
|
|
35
134
|
|
36
135
|
class SchemaConsistencyType(str, Enum):
    """
    Field-level data consistency policies that a DeltaCAT table schema can
    enforce at table load time. When a schema is present, it informs the
    consistency checks run for each field:

    NONE: No consistency checks are run.

    COERCE: Coerce fields to fit the schema whenever possible.

    VALIDATE: Raise an error for any fields that don't fit the schema.
    """

    NONE = "none"
    COERCE = "coerce"
    VALIDATE = "validate"
|
151
|
+
|
152
|
+
|
153
|
+
class SortOrder(str, Enum):
    """
    Sort direction.

    In addition to the canonical values, the pyiceberg SortDirection
    spellings "asc" and "desc" are accepted case-insensitively when looking
    a member up by value.
    """

    ASCENDING = "ascending"
    DESCENDING = "descending"

    @classmethod
    def _missing_(cls, value: str):
        """Resolve pyiceberg-style aliases for values that match no member.

        Args:
            value: The looked-up value that matched no member. It may be of
                any type, since Enum passes through whatever the caller gave.

        Returns:
            The matching member, or None so that Enum raises ValueError.
        """
        # Non-string values cannot be aliases. Returning None here makes the
        # lookup fail with Enum's ValueError instead of an AttributeError
        # from calling .lower() on a non-string.
        if not isinstance(value, str):
            return None
        # pyiceberg.table.sorting.SortDirection mappings
        aliases = {"asc": cls.ASCENDING, "desc": cls.DESCENDING}
        return aliases.get(value.lower())
|
165
|
+
|
166
|
+
|
167
|
+
class NullOrder(str, Enum):
    """
    Placement of nulls within a sort order.

    In addition to the canonical values, the pyiceberg NullOrder spellings
    "nulls-first" and "nulls-last" are accepted case-insensitively when
    looking a member up by value.
    """

    AT_START = "at_start"
    AT_END = "at_end"

    @classmethod
    def _missing_(cls, value: str):
        """Resolve pyiceberg-style aliases for values that match no member.

        Args:
            value: The looked-up value that matched no member. It may be of
                any type, since Enum passes through whatever the caller gave.

        Returns:
            The matching member, or None so that Enum raises ValueError.
        """
        # Non-string values cannot be aliases. Returning None here makes the
        # lookup fail with Enum's ValueError instead of an AttributeError
        # from calling .lower() on a non-string.
        if not isinstance(value, str):
            return None
        # pyiceberg.table.sorting.NullOrder mappings
        aliases = {"nulls-first": cls.AT_START, "nulls-last": cls.AT_END}
        return aliases.get(value.lower())
|
File without changes
|
@@ -0,0 +1,26 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
from deltacat.storage.model.scan.push_down import Pushdown
|
5
|
+
from deltacat.storage.model.scan.scan_plan import ScanPlan
|
6
|
+
|
7
|
+
|
8
|
+
class ScanPlanner(ABC):
    """Abstract interface for objects that create a ScanPlan for a table."""

    @abstractmethod
    def create_scan_plan(
        self,
        table_name: str,
        namespace: Optional[str] = None,
        pushdown: Optional[Pushdown] = None,
    ) -> ScanPlan:
        """Return a ScanPlan for a given DeltaCAT Table after applying pushdown predicates

        Args:
            table_name: Name of the table
            namespace: Optional namespace of the table. Uses default namespace if not specified.
            pushdown: Pushdown predicates used to filter partitions/data files

        Returns:
            a ScanPlan object containing list of ScanTasks
        """
        pass
|
@@ -0,0 +1 @@
|
|
1
|
+
# NOTE: This module was renamed because its previous name shadowed the stdlib `io` module when running tests in PyCharm.
|
File without changes
|
File without changes
|
@@ -45,7 +45,11 @@ class TestCloudpickleBugFix(unittest.TestCase):
|
|
45
45
|
def test_sanity(self):
    """Run calculate_pickled_length remotely and bound both returned values.

    The first returned value must be below 1000 and the second must be at
    least 5000000.
    """
    ray.init(local_mode=True, ignore_reinit_error=True)

    try:
        result = ray.get(calculate_pickled_length.remote(AnyObject()))

        # assertLess/assertGreaterEqual report the compared values when they
        # fail, unlike assertTrue on a pre-evaluated comparison.
        self.assertLess(result[0], 1000)
        self.assertGreaterEqual(result[1], 5000000)
    finally:
        # Shut Ray down even when an assertion above fails.
        if ray.is_initialized():
            ray.shutdown()
|
deltacat/tests/aws/test_s3u.py
CHANGED
@@ -2,9 +2,8 @@ import unittest
|
|
2
2
|
|
3
3
|
import botocore
|
4
4
|
|
5
|
-
from deltacat.
|
6
|
-
from deltacat.
|
7
|
-
|
5
|
+
from deltacat.constants import RETRYABLE_TRANSIENT_ERRORS
|
6
|
+
from deltacat.types.tables import CapturedBlockWritePaths, UuidBlockWritePathProvider
|
8
7
|
|
9
8
|
import os
|
10
9
|
from unittest import mock
|
@@ -99,34 +98,6 @@ class TestDownloadUpload(unittest.TestCase):
|
|
99
98
|
|
100
99
|
assert mock_s3.put_object.call_count > 3
|
101
100
|
|
102
|
-
@patch("deltacat.aws.s3u.UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY", 1)
|
103
|
-
@patch("deltacat.aws.s3u.ManifestEntry")
|
104
|
-
@patch("deltacat.aws.s3u._get_metadata")
|
105
|
-
@patch("deltacat.aws.s3u.CapturedBlockWritePaths")
|
106
|
-
def test_upload_sliced_table_retry(
|
107
|
-
self,
|
108
|
-
mock_captured_block_write_paths,
|
109
|
-
mock_get_metadata,
|
110
|
-
mock_manifest_entry,
|
111
|
-
):
|
112
|
-
mock_manifest_entry.from_s3_obj_url.side_effect = OSError(
|
113
|
-
"Please reduce your request rate.."
|
114
|
-
)
|
115
|
-
mock_get_metadata.return_value = [mock.MagicMock()]
|
116
|
-
cbwp = CapturedBlockWritePaths()
|
117
|
-
cbwp._write_paths = ["s3_write_path"]
|
118
|
-
cbwp._block_refs = [mock.MagicMock()]
|
119
|
-
mock_captured_block_write_paths.return_value = cbwp
|
120
|
-
with pytest.raises(RetryError):
|
121
|
-
s3u.upload_sliced_table(
|
122
|
-
mock.MagicMock(),
|
123
|
-
"s3-prefix",
|
124
|
-
mock.MagicMock(),
|
125
|
-
mock.MagicMock(),
|
126
|
-
mock.MagicMock(),
|
127
|
-
mock.MagicMock(),
|
128
|
-
)
|
129
|
-
|
130
101
|
@patch("deltacat.aws.s3u.UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY", 1)
|
131
102
|
@patch("deltacat.aws.s3u.s3_client_cache")
|
132
103
|
def test_upload_transient_error_retry(self, mock_s3_client_cache):
|
File without changes
|
File without changes
|