deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,163 @@
|
|
1
|
+
import argparse
|
2
|
+
|
3
|
+
from datetime import datetime
|
4
|
+
|
5
|
+
import ray
|
6
|
+
|
7
|
+
import deltacat
|
8
|
+
import daft
|
9
|
+
import pyarrow as pa
|
10
|
+
import pandas as pd
|
11
|
+
import polars as pl
|
12
|
+
import numpy as np
|
13
|
+
|
14
|
+
from deltacat import DeltaCatUrl
|
15
|
+
|
16
|
+
|
17
|
+
def print_package_version_info() -> None:
    """Print the versions of DeltaCAT, Ray, Daft, NumPy, PyArrow, Polars and Pandas.

    One line is printed per package, in that order, in the form
    ``<Label> Version: <module.__version__>``.
    """
    # Display label paired with the imported module it reports on.
    labeled_modules = (
        ("DeltaCAT", deltacat),
        ("Ray", ray),
        ("Daft", daft),
        ("NumPy", np),
        ("PyArrow", pa),
        ("Polars", pl),
        ("Pandas", pd),
    )
    for label, module in labeled_modules:
        print(f"{label} Version: {module.__version__}")
|
25
|
+
|
26
|
+
|
27
|
+
def json_path_to_regex(path: str) -> str:
    """Build a regex that captures the string value found at a JSON key path.

    The path is split on ``/``. Every non-blank parent segment becomes a
    pattern matching ``"<segment>":`` followed by an opening ``{`` or ``[``
    and a lazy wildcard. The last segment (the leaf key) becomes a pattern
    matching ``"<leaf>": "<value>"`` whose value is captured in a named group
    called after the leaf key.

    Args:
        path: Key path such as ``"a/b/leaf"``. Leading or repeated
            separators among the parent segments are ignored.

    Returns:
        The regex pattern as a string.

    Raises:
        ValueError: If ``path`` is empty, or if its last segment is blank
            (for example ``"/"`` or ``"a/"``).
    """
    if not path:
        raise ValueError("Path cannot be empty")
    *parents, leaf_key = path.split("/")
    if not leaf_key.strip():
        # A blank leaf would produce a capture group with an empty name,
        # i.e. `(?<>...)`, so reject it up front instead of returning it.
        raise ValueError(f"Path must end with a non-empty key: {path!r}")
    # NOTE(review): segments are interpolated verbatim; keys containing regex
    # metacharacters are not escaped -- confirm callers only pass plain keys.
    prefix = "".join(
        rf'"{part}"\s*:\s*[{{\[].*?'
        for part in parents
        if part.strip()  # discard leading and/or redundant separators
    )
    # NOTE(review): the `(?<name>...)` named-group spelling is rejected by
    # Python's `re` module (which requires `(?P<name>...)`); presumably this
    # pattern targets a different regex engine -- confirm before using `re`.
    return prefix + rf'"{leaf_key}"\s*:\s*"(?<{leaf_key}>.*?)"'
|
38
|
+
|
39
|
+
|
40
|
+
def run(
    source: str,
    dest: str,
) -> None:
    """Copy ``source`` to ``dest`` with DeltaCAT, adding index columns on the way.

    Prints package version info, then calls ``deltacat.copy`` with reader
    arguments, writer arguments and two dataframe transforms. The transforms
    rename the ``text`` column to ``utf8_body`` and then add three columns:
    ``utf8_body_hash``, ``processing_time`` and ``source_file_path``.

    Args:
        source: Source DeltaCAT URL string, wrapped in ``DeltaCatUrl``.
        dest: Destination DeltaCAT URL string, wrapped in ``DeltaCatUrl``.
    """
    # Imported here so the block stays self-contained; the file itself only
    # imports `datetime` from the `datetime` module.
    from datetime import timezone

    # print package version info
    print_package_version_info()

    # run a synchronous copy from the source to the destination
    deltacat.copy(
        DeltaCatUrl(source),
        DeltaCatUrl(dest),
        # reader arguments to pass to the default reader (polars)
        # for the given text-based datasource, it accepts the same
        # arguments as polars.read_csv except for `source`, `n_threads`
        # `new_columns`, `separator`, `has_header`, `quote_char`, and
        # `infer_schema`.
        reader_args={
            "low_memory": True,  # try to use less memory (++stability, --perf)
            "batch_size": 1024,  # text line count read into a buffer at once
            "use_pyarrow": True,  # use the native pyarrow reader
        },
        # writer arguments to pass to the default writer (polars)
        # for the given parquet-based datasink, it generally accepts the same
        # arguments as polars.DataFrame.write_{dest-type} except for `file`
        writer_args={
            "compression": "lz4",  # faster compression & decompression
            # "compression": "zstd",  # better compression ratio
            # "compression": "snappy",  # compatible w/ older Parquet readers
        },
        # Transforms to run against the default polars dataframe read.
        # By default, each transform takes a polars dataframe `df` as input
        # and produces a polars dataframe as output. All transforms listed
        # are run in order (i.e., the dataframe output from transform[0]
        # is the dataframe input to transform[1]).
        #
        # See:
        # https://docs.pola.rs/api/python/stable/reference/dataframe/index.html
        # https://docs.pola.rs/api/python/stable/reference/expressions/index.html
        transforms=[
            lambda df, src: df.rename(
                {"text": "utf8_body"},
            ),
            lambda df, src: df.with_columns(
                pl.col("utf8_body").hash().alias("utf8_body_hash"),
                # `datetime.utcnow()` is deprecated since Python 3.12; this
                # expression yields the same naive UTC timestamp.
                # NOTE(review): `Expr.dt.datetime()` may be deprecated or
                # removed in newer polars releases -- confirm against the
                # polars version this example is run with.
                pl.lit(datetime.now(timezone.utc).replace(tzinfo=None))
                .dt.datetime()
                .alias("processing_time"),
                pl.lit(src.url_path).alias("source_file_path"),
            ),
        ],
    )
|
89
|
+
|
90
|
+
|
91
|
+
if __name__ == "__main__":
    """
    Example 1: Run this script locally using Ray:
    $ python indexer.py \
        --source 'text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31' \
        --dest 'parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet'

    Example 2: Submit this script as a local Ray job using a local job client:
    >>> from deltacat import local_job_client
    >>> client = local_job_client()
    >>> # read the source file as line-delimited text
    >>> src = "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31"
    >>> # write to the destination file using the default DeltaCAT Parquet writer (i.e., polars.DataFrame.write_parquet)
    >>> dst = "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet"
    >>> try:
    >>>     job_run_result = client.run_job(
    >>>         # Entrypoint shell command to run the indexer job
    >>>         entrypoint=f"python indexer.py --source '{src}' --dest '{dst}'",
    >>>         # Path to the local directory that contains the indexer.py file
    >>>         runtime_env={"working_dir": "./deltacat/examples/indexer"},
    >>>     )
    >>>     print(f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}")
    >>>     print(f"Job ID {job_run_result.job_id} logs: ")
    >>>     print(job_run_result.job_logs)
    >>> except RuntimeError as e:
    >>>     print(f"Job Run Failed: {e}")
    >>> except TimeoutError as e:
    >>>     print(f"Job Run Timed Out: {e}")

    Example 3: Submit this script as a remote Ray job using a remote job client:
    >>> from deltacat import job_client
    >>> # use `deltacat.yaml` from the current working directory as the ray cluster launcher config file
    >>> # automatically launches the cluster if it doesn't exist or has died
    >>> # automatically forwards the ray cluster's dashboard for viewing in a web browser @ http://localhost:8265
    >>> client = job_client()
    >>> # ... follow the same steps as above to submit a synchronous indexer job ...
    >>>
    >>> # OR use an explicit cluster launcher config file path
    >>> client = job_client("/path/to/deltacat.yaml")
    >>> # ... follow the same steps as above to submit a synchronous indexer job ...
    """
    # Each entry is (positional flag names, keyword options) for
    # `ArgumentParser.add_argument`.
    script_args = [
        (
            [
                "--source",
            ],
            {
                "help": "Source DeltaCAT URL to index.",
                "type": str,
            },
        ),
        (
            [
                "--dest",
            ],
            {
                "help": "Destination DeltaCAT URL to store the indexed file.",
                "type": str,
            },
        ),
    ]
    # parse CLI input arguments
    parser = argparse.ArgumentParser()
    # Distinct loop names keep the parsed namespace below from shadowing them.
    for flag_names, flag_options in script_args:
        parser.add_argument(*flag_names, **flag_options)
    args = parser.parse_args()
    print(f"Command Line Arguments: {args}")

    # initialize deltacat
    deltacat.init()

    # run the example using the parsed arguments
    # (the namespace attributes `source` and `dest` match run()'s parameters)
    run(**vars(args))
|
@@ -0,0 +1,198 @@
|
|
1
|
+
import argparse
|
2
|
+
import pathlib
|
3
|
+
|
4
|
+
from deltacat.compute import (
|
5
|
+
job_client,
|
6
|
+
JobStatus,
|
7
|
+
)
|
8
|
+
|
9
|
+
|
10
|
+
def run_async(
    source: str,
    dest: str,
    jobs_to_submit: int,
    job_timeout: int,
    cloud: str,
    restart_ray: bool,
):
    """Submit every indexer job first, then wait on each one in submission order.

    Job ``i`` writes to ``dest`` suffixed with ``.i``. If any awaited job ends
    in a status other than ``JobStatus.SUCCEEDED``, its logs are printed and a
    ``RuntimeError`` is raised. Otherwise the logs of all jobs are printed
    together at the end.

    Args:
        source: Source URL passed to ``indexer.py --source``.
        dest: Destination URL prefix passed to ``indexer.py --dest``.
        jobs_to_submit: Number of jobs to submit.
        job_timeout: Value passed as ``timeout_seconds`` when awaiting a job.
        cloud: Name of the sub-directory (next to this file) that holds
            ``deltacat.yaml``.
        restart_ray: Forwarded to ``job_client``.

    Raises:
        RuntimeError: If a job does not finish with ``JobStatus.SUCCEEDED``.
    """
    script_dir = pathlib.Path(__file__).parent
    launcher_config = script_dir.joinpath(cloud).joinpath("deltacat.yaml")
    client = job_client(launcher_config, restart_ray=restart_ray)

    # Phase 1: submit every job before waiting on any of them.
    job_ids = []
    for submit_index in range(jobs_to_submit):
        job_dest = dest + "." + str(submit_index)
        submitted_id = client.submit_job(
            # Entrypoint shell command to execute
            entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
            # Path to the local directory that contains the indexer.py file
            # This entire directory will be zipped into a job package, so keep
            # it small.
            runtime_env={"working_dir": script_dir},
        )
        job_ids.append(submitted_id)

    # Phase 2: await each job and collect its logs.
    print("Waiting for all jobs to complete...")
    combined_logs = ""
    for await_index, job_id in enumerate(job_ids):
        final_status = client.await_job(job_id, timeout_seconds=job_timeout)
        if final_status != JobStatus.SUCCEEDED:
            # Surface the failing job's logs before aborting.
            print(f"Job `{job_id}` logs: ")
            print(client.get_job_logs(job_id))
            raise RuntimeError(f"Job `{job_id}` terminated with status: {final_status}")
        combined_logs += f"\nJob #{await_index} logs: \n"
        combined_logs += client.get_job_logs(job_id)

    print("All jobs completed!")
    print("Job Logs: ")
    print(combined_logs)
|
53
|
+
|
54
|
+
|
55
|
+
def run_sync(
    source: str,
    dest: str,
    jobs_to_submit: int,
    job_timeout: int,
    cloud: str,
    restart_ray: bool,
):
    """Call ``client.run_job`` once per job and print each returned status and logs.

    Job ``i`` writes to ``dest`` suffixed with ``.i``.

    Args:
        source: Source URL passed to ``indexer.py --source``.
        dest: Destination URL prefix passed to ``indexer.py --dest``.
        jobs_to_submit: Number of jobs to run.
        job_timeout: Value passed as ``timeout_seconds`` to ``run_job``.
        cloud: Name of the sub-directory (next to this file) that holds
            ``deltacat.yaml``.
        restart_ray: Forwarded to ``job_client``.
    """
    script_dir = pathlib.Path(__file__).parent
    client = job_client(
        script_dir.joinpath(cloud).joinpath("deltacat.yaml"),
        restart_ray=restart_ray,
    )
    for run_index in range(jobs_to_submit):
        job_dest = dest + "." + str(run_index)
        outcome = client.run_job(
            # Entrypoint shell command to execute
            entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
            # Path to the local directory that contains the indexer.py file
            # This entire directory will be zipped into a job package, so keep
            # it small.
            runtime_env={"working_dir": script_dir},
            timeout_seconds=job_timeout,
        )
        print(f"Job ID {outcome.job_id} terminal state: {outcome.job_status}")
        print(f"Job ID {outcome.job_id} logs: ")
        print(outcome.job_logs)
|
84
|
+
|
85
|
+
|
86
|
+
def run(
    source: str,
    dest: str,
    restart_ray: bool,
    jobs_to_submit: int,
    job_timeout: int,
    asynchronous: bool,
    cloud_provider: str,
):
    """Dispatch to ``run_async`` or ``run_sync`` with the same job settings.

    Args:
        source: Forwarded unchanged as ``source``.
        dest: Forwarded unchanged as ``dest``.
        restart_ray: Forwarded unchanged as ``restart_ray``.
        jobs_to_submit: Forwarded unchanged as ``jobs_to_submit``.
        job_timeout: Forwarded unchanged as ``job_timeout``.
        asynchronous: When truthy ``run_async`` is called, otherwise
            ``run_sync``.
        cloud_provider: Forwarded as the ``cloud`` keyword argument.
    """
    job_settings = {
        "source": source,
        "dest": dest,
        "jobs_to_submit": jobs_to_submit,
        "job_timeout": job_timeout,
        "cloud": cloud_provider,
        "restart_ray": restart_ray,
    }
    if asynchronous:
        run_async(**job_settings)
    else:
        run_sync(**job_settings)
|
104
|
+
|
105
|
+
|
106
|
+
if __name__ == "__main__":
    # Run this example through a command of the form below. Every option is
    # optional and falls back to the default declared in `script_args`.
    # Do not put a bare `--` before the options: argparse would then treat
    # everything after it as positional arguments and reject the command.
    #
    #   $ python ./deltacat/examples/job_runner.py \
    #       --source text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31 \
    #       --dest parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet \
    #       --asynchronous \
    #       --jobs-to-submit 100 \
    #       --job-timeout 90 \
    #       --cloud-provider aws

    # Each entry is (option strings, keyword arguments) for add_argument().
    script_args = [
        (
            [
                "--source",
            ],
            {
                "help": "Source DeltaCAT URL to index.",
                "type": str,
                "default": "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31",
            },
        ),
        (
            [
                "--dest",
            ],
            {
                "help": "Destination DeltaCAT URL to store the indexed file.",
                "type": str,
                "default": "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet",
            },
        ),
        (
            [
                "--restart-ray",
            ],
            {
                "help": "Restart Ray on an existing cluster.",
                "action": "store_true",
                "default": False,
            },
        ),
        (
            [
                "--asynchronous",
            ],
            {
                "help": "Run jobs asynchronously.",
                "action": "store_true",
                "default": False,
            },
        ),
        (
            [
                "--jobs-to-submit",
            ],
            {
                "help": "Number of indexer jobs to submit for execution.",
                "type": int,
                "default": 1,
            },
        ),
        (
            [
                "--job-timeout",
            ],
            {
                "help": "Job timeout in seconds.",
                "type": int,
                "default": 300,
            },
        ),
        (
            [
                "--cloud-provider",
            ],
            {
                "help": "Ray Cluster Cloud Provider ('aws' or 'gcp')",
                "type": str,
                "default": "aws",
            },
        ),
    ]

    # parse CLI input arguments
    parser = argparse.ArgumentParser()
    # Distinct loop names so the parsed namespace below does not reuse `args`.
    for option_strings, option_kwargs in script_args:
        parser.add_argument(*option_strings, **option_kwargs)
    args = parser.parse_args()
    print(f"Command Line Arguments: {args}")

    # run the example using the parsed CLI arguments as kwargs; argparse maps
    # e.g. `--jobs-to-submit` to `jobs_to_submit`, matching run()'s parameters
    run(**vars(args))
|
deltacat/exceptions.py
CHANGED
@@ -1,10 +1,16 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
from enum import Enum
|
3
|
-
import
|
4
|
-
import ray
|
3
|
+
from typing import Callable, Optional, TYPE_CHECKING
|
5
4
|
import logging
|
5
|
+
|
6
6
|
import tenacity
|
7
|
-
|
7
|
+
|
8
|
+
from pyarrow.lib import ArrowException, ArrowInvalid, ArrowCapacityError
|
9
|
+
|
10
|
+
import botocore
|
11
|
+
from botocore.exceptions import BotoCoreError
|
12
|
+
|
13
|
+
import ray
|
8
14
|
from ray.exceptions import (
|
9
15
|
RayError,
|
10
16
|
RayTaskError,
|
@@ -13,14 +19,17 @@ from ray.exceptions import (
|
|
13
19
|
NodeDiedError,
|
14
20
|
OutOfMemoryError,
|
15
21
|
)
|
16
|
-
|
17
|
-
from
|
18
|
-
|
19
|
-
|
22
|
+
|
23
|
+
from daft.exceptions import DaftTransientError, DaftCoreException
|
24
|
+
|
25
|
+
import deltacat as dc
|
26
|
+
from deltacat import logs
|
20
27
|
from deltacat.utils.ray_utils.runtime import (
|
21
28
|
get_current_ray_task_id,
|
22
29
|
)
|
23
|
-
|
30
|
+
|
31
|
+
if TYPE_CHECKING:
|
32
|
+
from deltacat.storage.model.schema import FieldLocator
|
24
33
|
|
25
34
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
26
35
|
|
@@ -64,6 +73,23 @@ class DeltaCatErrorNames(str, Enum):
|
|
64
73
|
UNCLASSIFIED_DELTACAT_ERROR = "UnclassifiedDeltaCatError"
|
65
74
|
UNRECOGNIZED_RAY_TASK_ERROR = "UnrecognizedRayTaskError"
|
66
75
|
|
76
|
+
NAMESPACE_NOT_FOUND_ERROR = "NamespaceNotFoundError"
|
77
|
+
TABLE_NOT_FOUND_ERROR = "TableNotFoundError"
|
78
|
+
TABLE_VERSION_NOT_FOUND_ERROR = "TableVersionNotFoundError"
|
79
|
+
STREAM_NOT_FOUND_ERROR = "StreamNotFoundError"
|
80
|
+
PARTITION_NOT_FOUND_ERROR = "PartitionNotFoundError"
|
81
|
+
DELTA_NOT_FOUND_ERROR = "DeltaNotFoundError"
|
82
|
+
TABLE_ALREADY_EXISTS_ERROR = "TableAlreadyExistsError"
|
83
|
+
TABLE_VERSION_ALREADY_EXISTS_ERROR = "TableVersionAlreadyExistsError"
|
84
|
+
NAMESPACE_ALREADY_EXISTS_ERROR = "NamespaceAlreadyExistsError"
|
85
|
+
SCHEMA_COMPATIBILITY_ERROR = "SchemaCompatibilityError"
|
86
|
+
SCHEMA_VALIDATION_ERROR = "SchemaValidationError"
|
87
|
+
TABLE_VALIDATION_ERROR = "TableValidationError"
|
88
|
+
CONCURRENT_MODIFICATION_ERROR = "ConcurrentModificationError"
|
89
|
+
OBJECT_NOT_FOUND_ERROR = "ObjectNotFoundError"
|
90
|
+
OBJECT_DELETED_ERROR = "ObjectDeletedError"
|
91
|
+
OBJECT_ALREADY_EXISTS_ERROR = "ObjectAlreadyExistsError"
|
92
|
+
|
67
93
|
|
68
94
|
class DeltaCatError(Exception):
|
69
95
|
def __init__(self, *args, **kwargs):
|
@@ -73,9 +99,12 @@ class DeltaCatError(Exception):
|
|
73
99
|
super().__init__(*args, **kwargs)
|
74
100
|
|
75
101
|
def _get_ray_task_id_and_node_ip(self):
|
76
|
-
|
77
|
-
|
78
|
-
|
102
|
+
if ray.is_initialized():
|
103
|
+
task_id = get_current_ray_task_id()
|
104
|
+
node_ip = ray.util.get_node_ip_address()
|
105
|
+
return task_id, node_ip
|
106
|
+
else:
|
107
|
+
return None, None
|
79
108
|
|
80
109
|
|
81
110
|
class NonRetryableError(DeltaCatError):
|
@@ -206,6 +235,81 @@ class UnrecognizedRayTaskError(NonRetryableError):
|
|
206
235
|
error_name = DeltaCatErrorNames.UNRECOGNIZED_RAY_TASK_ERROR.value
|
207
236
|
|
208
237
|
|
238
|
+
class NamespaceNotFoundError(NonRetryableError):
    # Non-retryable error reported under the NAMESPACE_NOT_FOUND_ERROR name.
    error_name = DeltaCatErrorNames.NAMESPACE_NOT_FOUND_ERROR.value
|
240
|
+
|
241
|
+
|
242
|
+
class TableNotFoundError(NonRetryableError):
    # Non-retryable error reported under the TABLE_NOT_FOUND_ERROR name.
    error_name = DeltaCatErrorNames.TABLE_NOT_FOUND_ERROR.value
|
244
|
+
|
245
|
+
|
246
|
+
class TableVersionNotFoundError(NonRetryableError):
    # Non-retryable error reported under the TABLE_VERSION_NOT_FOUND_ERROR name.
    error_name = DeltaCatErrorNames.TABLE_VERSION_NOT_FOUND_ERROR.value
|
248
|
+
|
249
|
+
|
250
|
+
class PartitionNotFoundError(NonRetryableError):
    # Non-retryable error reported under the PARTITION_NOT_FOUND_ERROR name.
    error_name = DeltaCatErrorNames.PARTITION_NOT_FOUND_ERROR.value
|
252
|
+
|
253
|
+
|
254
|
+
class StreamNotFoundError(NonRetryableError):
    # Non-retryable error reported under the STREAM_NOT_FOUND_ERROR name.
    error_name = DeltaCatErrorNames.STREAM_NOT_FOUND_ERROR.value
|
256
|
+
|
257
|
+
|
258
|
+
class DeltaNotFoundError(NonRetryableError):
    # Non-retryable error reported under the DELTA_NOT_FOUND_ERROR name.
    error_name = DeltaCatErrorNames.DELTA_NOT_FOUND_ERROR.value
|
260
|
+
|
261
|
+
|
262
|
+
class TableAlreadyExistsError(NonRetryableError):
    # Non-retryable error reported under the TABLE_ALREADY_EXISTS_ERROR name.
    error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
|
264
|
+
|
265
|
+
|
266
|
+
class TableVersionAlreadyExistsError(NonRetryableError):
    # Non-retryable error reported under the
    # TABLE_VERSION_ALREADY_EXISTS_ERROR name.
    error_name = DeltaCatErrorNames.TABLE_VERSION_ALREADY_EXISTS_ERROR.value
|
268
|
+
|
269
|
+
|
270
|
+
class NamespaceAlreadyExistsError(NonRetryableError):
    # Use the namespace-specific name; the table name would make this error
    # indistinguishable from TableAlreadyExistsError wherever `error_name`
    # is inspected.
    error_name = DeltaCatErrorNames.NAMESPACE_ALREADY_EXISTS_ERROR.value
|
272
|
+
|
273
|
+
|
274
|
+
class ObjectNotFoundError(NonRetryableError):
    # Non-retryable error reported under the OBJECT_NOT_FOUND_ERROR name.
    error_name = DeltaCatErrorNames.OBJECT_NOT_FOUND_ERROR.value
|
276
|
+
|
277
|
+
|
278
|
+
class ObjectDeletedError(NonRetryableError):
    # Non-retryable error reported under the OBJECT_DELETED_ERROR name.
    error_name = DeltaCatErrorNames.OBJECT_DELETED_ERROR.value
|
280
|
+
|
281
|
+
|
282
|
+
class ObjectAlreadyExistsError(NonRetryableError):
    # Non-retryable error reported under the OBJECT_ALREADY_EXISTS_ERROR name.
    error_name = DeltaCatErrorNames.OBJECT_ALREADY_EXISTS_ERROR.value
|
284
|
+
|
285
|
+
|
286
|
+
class ConcurrentModificationError(NonRetryableError):
    # Non-retryable error reported under the CONCURRENT_MODIFICATION_ERROR name.
    error_name = DeltaCatErrorNames.CONCURRENT_MODIFICATION_ERROR.value
|
288
|
+
|
289
|
+
|
290
|
+
class SchemaValidationError(NonRetryableError):
    # Non-retryable error reported under the SCHEMA_VALIDATION_ERROR name.
    error_name = DeltaCatErrorNames.SCHEMA_VALIDATION_ERROR.value
|
292
|
+
|
293
|
+
|
294
|
+
class TableValidationError(NonRetryableError):
    # Non-retryable error reported under the TABLE_VALIDATION_ERROR name.
    error_name = DeltaCatErrorNames.TABLE_VALIDATION_ERROR.value
|
296
|
+
|
297
|
+
|
298
|
+
class SchemaCompatibilityError(NonRetryableError):
    """Raised when a schema update would break backward compatibility.

    Attributes:
        field_locator: Optional ``FieldLocator`` given at construction time
            (presumably the field that caused the incompatibility -- confirm
            against the code that raises this error).
    """

    # The docstring must be the first statement of the class body to become
    # ``__doc__``; it previously followed this assignment and was discarded.
    error_name = DeltaCatErrorNames.SCHEMA_COMPATIBILITY_ERROR.value

    def __init__(
        self,
        message: str,
        field_locator: Optional[FieldLocator] = None,
        *args,
        **kwargs,
    ):
        """Initialize the error and remember the optional field locator.

        Args:
            message: Error message, passed first to the base class.
            field_locator: Optional locator stored as ``self.field_locator``.
            *args: Extra positional arguments passed to the base class.
            **kwargs: Extra keyword arguments passed to the base class.
        """
        super().__init__(message, *args, **kwargs)
        self.field_locator = field_locator
|
311
|
+
|
312
|
+
|
209
313
|
def categorize_errors(func: Callable):
|
210
314
|
def wrapper(*args, **kwargs):
|
211
315
|
try:
|
@@ -238,7 +342,7 @@ def categorize_errors(func: Callable):
|
|
238
342
|
|
239
343
|
def categorize_deltacat_exception(
|
240
344
|
e: BaseException,
|
241
|
-
deltacat_storage:
|
345
|
+
deltacat_storage: dc.storage.interface = None,
|
242
346
|
deltacat_storage_kwargs: dict = None,
|
243
347
|
):
|
244
348
|
if deltacat_storage_kwargs is None:
|
File without changes
|
File without changes
|
@@ -0,0 +1,26 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import Any, Dict
|
3
|
+
|
4
|
+
from attr import dataclass
|
5
|
+
from pyiceberg.catalog import CatalogType
|
6
|
+
|
7
|
+
|
8
|
+
@dataclass
class IcebergCatalogConfig:
    """
    Configuration properties for Iceberg catalog implementation.

    This class holds the PyIceberg catalog type and the catalog properties
    used for interaction with Iceberg tables and metadata. It does not hold a
    catalog instance itself.

    This configuration is passed through to PyIceberg by invoking load_catalog.
    The properties provided must match the properties accepted by PyIceberg
    for the chosen catalog type.
    See: :func:`deltacat.experimental.catalog.iceberg.initialize`

    Attributes:
        type: The PyIceberg catalog type (a ``CatalogType`` value)
        properties: Dict of properties passed to pyiceberg load_catalog
    """

    type: CatalogType
    properties: Dict[str, Any]
|