deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,263 @@ deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py (new file)

```python
"""
Spark SQL utilities for Iceberg table operations.

This module provides Beam DoFn classes that use Spark SQL to work with
Iceberg tables.
"""

import os
import apache_beam as beam
from apache_beam import Row


class SparkSQLIcebergRead(beam.DoFn):
    """
    Custom Beam DoFn that uses Spark SQL to read Iceberg tables.
    """

    def __init__(
        self,
        table_name: str,
        catalog_uri: str = "http://localhost:8181",
        warehouse: str = "warehouse/",
    ):
        """
        Initialize the Spark SQL reader.

        Args:
            table_name: Name of the Iceberg table
            catalog_uri: URI of the Iceberg REST catalog
            warehouse: Warehouse path
        """
        self.table_name = table_name
        self.catalog_uri = catalog_uri
        self.warehouse = warehouse
        self.spark = None

    def setup(self):
        """Set up Spark session (called once per worker)."""
        try:
            from pyspark.sql import SparkSession
            import importlib.metadata

            # Get Spark version for dependency resolution
            try:
                spark_version = ".".join(
                    importlib.metadata.version("pyspark").split(".")[:2]
                )
            except Exception:
                spark_version = "3.5"  # Default fallback

            scala_version = "2.12"
            iceberg_version = "1.6.0"

            print(f"🔧 Setting up Spark session for reading {self.table_name}")
            print(f"   - Spark version: {spark_version}")
            print(f"   - Iceberg version: {iceberg_version}")

            # Set Spark packages for Iceberg runtime
            os.environ["PYSPARK_SUBMIT_ARGS"] = (
                f"--packages org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version} "
                f"pyspark-shell"
            )

            # Create Spark session with Iceberg REST catalog configuration
            self.spark = (
                SparkSession.builder.appName(f"DeltaCAT Read - {self.table_name}")
                .config("spark.sql.session.timeZone", "UTC")
                .config(
                    "spark.serializer", "org.apache.spark.serializer.KryoSerializer"
                )
                .config(
                    "spark.sql.extensions",
                    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
                )
                # Configure REST catalog
                .config(
                    "spark.sql.catalog.rest", "org.apache.iceberg.spark.SparkCatalog"
                )
                .config("spark.sql.catalog.rest.type", "rest")
                .config("spark.sql.catalog.rest.uri", self.catalog_uri)
                .config("spark.sql.catalog.rest.warehouse", self.warehouse)
                # Set REST as default catalog
                .config("spark.sql.defaultCatalog", "rest")
                # Local mode configuration (within Beam workers)
                .config("spark.master", "local[1]")  # Single thread per worker
                .config("spark.sql.adaptive.enabled", "true")
                # Networking binding
                .config("spark.driver.bindAddress", "127.0.0.1")
                .config("spark.driver.host", "127.0.0.1")
                .config("spark.ui.enabled", "false")
                .config("spark.sql.adaptive.coalescePartitions.enabled", "false")
                .getOrCreate()
            )

            print("✅ Spark session created successfully")

        except Exception as e:
            print(f"❌ Failed to set up Spark session: {e}")
            raise

    def teardown(self):
        """Clean up Spark session (called once per worker)."""
        if self.spark:
            try:
                self.spark.stop()
                print("✅ Spark session stopped")
            except Exception as e:
                print(f"⚠️ Error stopping Spark session: {e}")

    def process(self, element):
        """
        Process element (read from Iceberg table using Spark SQL).

        Args:
            element: Input element (not used, just triggers the read)

        Yields:
            Records from the Iceberg table
        """
        try:
            if not self.spark:
                raise RuntimeError("Spark session not initialized")

            print(f"📖 Reading table {self.table_name} using Spark SQL")

            # Read from Iceberg table using Spark SQL
            df = self.spark.sql(f"SELECT * FROM {self.table_name}")

            # Collect all records
            records = df.collect()

            print(f"📊 Successfully read {len(records)} records from {self.table_name}")

            # Convert Spark rows to Beam Row objects and yield
            for row in records:
                row_dict = row.asDict()
                # Convert to Beam Row for consistency with write mode
                beam_row = Row(**row_dict)
                yield beam_row

        except Exception as e:
            print(f"❌ Failed to read from table {self.table_name}: {e}")
            raise


class SparkSQLIcebergRewrite(beam.DoFn):
    """
    Custom Beam DoFn that uses Spark SQL to rewrite Iceberg table data files.

    This uses Spark's rewrite_data_files procedure to materialize positional deletes
    by rewriting data files. The result is a "clean" table without positional deletes.
    """

    def __init__(self, catalog_uri, warehouse_path, table_name):
        self.catalog_uri = catalog_uri
        self.warehouse_path = warehouse_path
        self.table_name = table_name

    def setup(self):
        """Initialize Spark session for rewrite operations."""
        try:
            from pyspark.sql import SparkSession
            import importlib.metadata

            print(f"🔧 Setting up Spark session for rewriting {self.table_name}")

            # Detect Spark version for appropriate Iceberg runtime
            spark_version = importlib.metadata.version("pyspark")
            major_minor = ".".join(spark_version.split(".")[:2])
            print(f"   - Spark version: {major_minor}")
            print("   - Iceberg version: 1.6.0")

            # Configure Spark with Iceberg
            self.spark = (
                SparkSession.builder.appName("IcebergRewrite")
                .config(
                    "spark.jars.packages",
                    f"org.apache.iceberg:iceberg-spark-runtime-{major_minor}_2.12:1.6.0",
                )
                .config(
                    "spark.sql.extensions",
                    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
                )
                .config(
                    "spark.sql.catalog.spark_catalog",
                    "org.apache.iceberg.spark.SparkSessionCatalog",
                )
                .config("spark.sql.catalog.spark_catalog.type", "rest")
                .config("spark.sql.catalog.spark_catalog.uri", self.catalog_uri)
                .config(
                    "spark.sql.catalog.spark_catalog.warehouse", self.warehouse_path
                )
                .config("spark.driver.bindAddress", "127.0.0.1")
                .config("spark.driver.host", "127.0.0.1")
                .config("spark.ui.enabled", "false")
                .getOrCreate()
            )

            print("✅ Spark session created successfully")

        except ImportError as e:
            raise RuntimeError(
                "PySpark is required for rewrite mode. Install with: pip install pyspark"
            ) from e
        except Exception as e:
            raise RuntimeError(f"Failed to create Spark session: {e}") from e

    def process(self, element):
        """Rewrite table data files to materialize positional deletes."""
        try:
            print(
                f"📋 Rewriting table {self.table_name} to materialize positional deletes"
            )

            # Use Spark's rewrite_data_files procedure with delete_file_threshold=1.
            # This forces a rewrite even when there's only 1 positional delete file.
            rewrite_sql = f"""
                CALL spark_catalog.system.rewrite_data_files(
                    table => '{self.table_name}',
                    options => map('delete-file-threshold', '1')
                )
            """

            print("🔄 Executing rewrite procedure with delete_file_threshold=1...")
            print(f"   SQL: {rewrite_sql.strip()}")
            print("   Rationale: Forces rewrite even with single positional delete file")

            result = self.spark.sql(rewrite_sql)

            # Collect results to see what was rewritten
            rewrite_result = result.collect()[0]
            print(f"📊 Rewrite result: {rewrite_result}")

            # Check if we actually rewrote anything
            if rewrite_result.rewritten_data_files_count > 0:
                print(
                    f"✅ Successfully rewrote {rewrite_result.rewritten_data_files_count} data files"
                )
                print(
                    f"   - Added {rewrite_result.added_data_files_count} new data files"
                )
                print(f"   - Rewrote {rewrite_result.rewritten_bytes_count} bytes")
                print("   - Positional deletes have been materialized!")
            else:
                print("⚠️ No files were rewritten (rewritten_data_files_count=0)")
                print("   - This may indicate no positional deletes exist")
                print("   - Or the table may already be in optimal state")

            yield f"Rewrite completed for {self.table_name}"

        except Exception as e:
            print(f"❌ Error during rewrite: {e}")
            import traceback

            traceback.print_exc()
            yield f"Rewrite failed for {self.table_name}: {e}"

    def teardown(self):
        """Clean up Spark session."""
        if hasattr(self, "spark"):
            self.spark.stop()
            print("✅ Spark session stopped")
```
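Both DoFns above are driven by a trigger element rather than a splittable Beam source. A minimal sketch of how the read DoFn might be wired into a pipeline; the trigger element, table name, catalog URI, and warehouse path are illustrative placeholders, not values taken from this diff:

```python
# Hypothetical wiring for the SparkSQLIcebergRead DoFn defined above;
# assumes it is importable from the module in this diff.
import apache_beam as beam

with beam.Pipeline() as pipeline:
    (
        pipeline
        | "Trigger" >> beam.Create(["go"])  # one element fires one table read
        | "ReadIceberg"
        >> beam.ParDo(
            SparkSQLIcebergRead(
                table_name="rest.default.quotes",  # placeholder table name
                catalog_uri="http://localhost:8181",
                warehouse="warehouse/",
            )
        )
        | "Print" >> beam.Map(print)
    )
```

`SparkSQLIcebergRewrite` would be wired the same way, with a single trigger element per table to rewrite.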
@@ -0,0 +1,184 @@ deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py (new file)

````python
import os
import logging

import uuid
import daft
from pyiceberg.catalog import CatalogType

import deltacat as dc

from deltacat import logs
from deltacat import IcebergCatalog
from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
from env import store_cli_args_in_os_environ

from pyiceberg.schema import (
    Schema,
    NestedField,
    DoubleType,
    StringType,
)
from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.transforms import BucketTransform

from deltacat.experimental.storage.iceberg.model import (
    SchemaMapper,
    PartitionSchemeMapper,
)
from deltacat.env import create_ray_runtime_environment

# initialize the driver logger
driver_logger = logs.configure_application_logger(logging.getLogger(__name__))


def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
    """
    This is an e2e example that
    1. creates a DeltaCAT Table (backed by an Iceberg Table) in Glue
    2. writes data into the DeltaCAT Table
    3. reads data from the DeltaCAT Table using Daft

    To run the script:
    1. prepare an AWS Account
        1. prepare a S3 location where the data will be written to, which will be used in Step 3.
        2. prepare an IAM Role that has access to the S3 location and Glue
    2. retrieve the IAM Role AWS Credential and cache locally in ~/.aws/credentials
    3. run below command to execute the example
    ```
    make venv && source venv/bin/activate
    python -m deltacat.examples.iceberg.iceberg_bucket_writer --warehouse=s3://<YOUR_S3_LOCATION>
    ```
    """
    # create any runtime environment required to run the example
    runtime_env = create_ray_runtime_environment()

    # Start by initializing DeltaCAT and registering available Catalogs.
    # Ray will be initialized automatically via `ray.init()`.
    # Only the `iceberg` data catalog is provided so it will become the default.
    # If initializing multiple catalogs, use the `default_catalog_name` param
    # to specify which catalog should be the default.

    dc.init(
        catalogs={
            # the name of the DeltaCAT catalog is "iceberg"
            "iceberg": dc.Catalog(
                # Apache Iceberg implementation of deltacat.catalog.interface
                impl=IcebergCatalog,
                # kwargs for pyiceberg.catalog.load_catalog start here...
                # the name of the Iceberg catalog is "example-iceberg-catalog"
                name="example-iceberg-catalog",
                # for additional properties see:
                # https://py.iceberg.apache.org/configuration/
                config=IcebergCatalogConfig(
                    type=CatalogType.GLUE,
                    properties={
                        "warehouse": warehouse,
                        "region_name": "us-east-1",
                    },
                ),
            )
        },
        # pass the runtime environment into ray.init()
        ray_init_args={"runtime_env": runtime_env},
    )

    # define a native Iceberg table schema
    schema = Schema(
        NestedField(field_id=1, name="symbol", field_type=StringType(), required=True),
        NestedField(field_id=2, name="bid", field_type=DoubleType(), required=False),
        NestedField(field_id=3, name="ask", field_type=DoubleType(), required=False),
    )

    # define a native Iceberg partition spec
    partition_spec = PartitionSpec(
        PartitionField(
            source_id=1,
            field_id=1000,
            transform=BucketTransform(2),
            name="symbol_bucket",
        )
    )

    # define a native Iceberg sort order
    # sort_order = SortOrder(SortField(source_id=1, transform=IdentityTransform()))

    # define the Daft dataframe to write
    df = daft.from_pydict(
        {
            "symbol": ["amzn", "goog", "meta", "msft"],
            "bid": [157.16, 150.55, 392.03, 403.25],
            "ask": [157.17, 150.56, 392.09, 403.27],
        }
    )

    # write to a table named `test_namespace.test_table_bucketed-<SUFFIX>`
    # we don't need to specify which catalog to create this table in since
    # only the "iceberg" catalog is available
    table_name = f"test_table_bucketed-{uuid.uuid4().hex[:8]}"
    namespace = "test_namespace"
    print(f"Creating Glue Table: {namespace}.{table_name}")
    dc.write_to_table(
        data=df,
        # path=warehouse + "/datafiles",
        table=table_name,
        namespace=namespace,
        schema=SchemaMapper.map(schema),
        partition_scheme=PartitionSchemeMapper.map(partition_spec, schema),
        # sort_keys=SortSchemeMapper.map(sort_order, schema),
    )

    print(f"Getting Glue Table: {namespace}.{table_name}")
    table_definition = dc.get_table(name=table_name, namespace=namespace)
    print(f"Retrieved Glue Table: {table_definition}")

    # Read Data from DeltaCAT Table (backed by Iceberg) using Daft
    daft_dataframe = dc.read_table(table=table_name, namespace=namespace)

    daft_dataframe.where(df["bid"] > 200.0).show()
    # Expected result:
    # ╭────────┬─────────┬─────────╮
    # │ symbol ┆ bid     ┆ ask     │
    # │ ---    ┆ ---     ┆ ---     │
    # │ Utf8   ┆ Float64 ┆ Float64 │
    # ╞════════╪═════════╪═════════╡
    # │ meta   ┆ 392.03  ┆ 392.09  │
    # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
    # │ msft   ┆ 403.25  ┆ 403.27  │
    # ╰────────┴─────────┴─────────╯

    daft_dataframe.select("symbol").show()
    # Expected result:
    # ╭────────╮
    # │ symbol │
    # │ ---    │
    # │ Utf8   │
    # ╞════════╡
    # │ meta   │
    # ├╌╌╌╌╌╌╌╌┤
    # │ amzn   │
    # ├╌╌╌╌╌╌╌╌┤
    # │ goog   │
    # ├╌╌╌╌╌╌╌╌┤
    # │ msft   │
    # ╰────────╯


if __name__ == "__main__":
    example_script_args = [
        (
            [
                "--warehouse",
            ],
            {
                "help": "S3 path for Iceberg file storage.",
                "type": str,
            },
        ),
    ]

    # store any CLI args in the runtime environment
    store_cli_args_in_os_environ(example_script_args)

    # run the example using os.environ as kwargs
    run(**os.environ)
````
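The `run(**os.environ)` call at the bottom works because `os.environ` is a string-to-string mapping: a `warehouse` entry (if set) overrides the default argument, and every other environment variable is absorbed by `**kwargs`. A minimal self-contained demonstration of the mechanism, using a placeholder bucket name:

```python
import os


def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
    # `warehouse` is overridden if an env var of the same name exists;
    # all other environment variables land in **kwargs and are ignored
    print(f"warehouse={warehouse} ({len(kwargs)} other env vars ignored)")


os.environ["warehouse"] = "s3://example-bucket/prefix"  # what the CLI helper stores
run(**os.environ)  # prints the overridden warehouse path
```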
@@ -0,0 +1,147 @@ deltacat/examples/experimental/iceberg/iceberg_reader.py (new file)

```python
import os
import logging
import deltacat as dc

from deltacat import logs
from deltacat import IcebergCatalog
from env import store_cli_args_in_os_environ

from pyiceberg.schema import (
    Schema,
    NestedField,
    DoubleType,
    StringType,
    TimestampType,
    FloatType,
    StructType,
)
from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.transforms import DayTransform, IdentityTransform
from pyiceberg.table.sorting import SortField, SortOrder

from deltacat.exceptions import TableAlreadyExistsError
from deltacat.experimental.storage.iceberg.model import (
    SchemaMapper,
    PartitionSchemeMapper,
    SortSchemeMapper,
)
from deltacat.env import create_ray_runtime_environment

# initialize the driver logger
driver_logger = logs.configure_application_logger(logging.getLogger(__name__))


def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
    # create any runtime environment required to run the example
    runtime_env = create_ray_runtime_environment()

    # Start by initializing DeltaCAT and registering available Catalogs.
    # Ray will be initialized automatically via `ray.init()`.
    # Only the `iceberg` data catalog is provided so it will become the default.
    # If initializing multiple catalogs, use the `default_catalog_name` param
    # to specify which catalog should be the default.
    dc.init(
        catalogs={
            # the name of the DeltaCAT catalog is "iceberg"
            "iceberg": dc.Catalog(
                # Apache Iceberg implementation of deltacat.catalog.interface
                impl=IcebergCatalog,
                # kwargs for pyiceberg.catalog.load_catalog start here...
                # the name of the Iceberg catalog is "example-iceberg-catalog"
                name="example-iceberg-catalog",
                # for additional properties see:
                # https://py.iceberg.apache.org/configuration/
                properties={
                    "type": "glue",
                    "region_name": "us-east-1",
                    "warehouse": warehouse,
                },
            )
        },
        # pass the runtime environment into ray.init()
        ray_init_args={"runtime_env": runtime_env},
    )

    # define a native Iceberg table schema
    schema = Schema(
        NestedField(
            field_id=1, name="datetime", field_type=TimestampType(), required=True
        ),
        NestedField(field_id=2, name="symbol", field_type=StringType(), required=True),
        NestedField(field_id=3, name="bid", field_type=FloatType(), required=False),
        NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False),
        NestedField(
            field_id=5,
            name="details",
            field_type=StructType(
                NestedField(
                    field_id=6,
                    name="created_by",
                    field_type=StringType(),
                    required=False,
                ),
            ),
            required=False,
        ),
    )

    # define a native Iceberg partition spec
    partition_spec = PartitionSpec(
        PartitionField(
            source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
        )
    )

    # define a native Iceberg sort order
    sort_order = SortOrder(SortField(source_id=2, transform=IdentityTransform()))

    # create a table named `test_namespace.test_table`
    # we don't need to specify which catalog to create this table in since
    # only the "iceberg" catalog is available
    table_name = "test_table"
    namespace = "test_namespace"
    print(f"Creating Glue Table: {namespace}.{table_name}")
    try:
        table_definition = dc.create_table(
            table=table_name,
            namespace=namespace,
            schema=SchemaMapper.map(schema),
            partition_scheme=PartitionSchemeMapper.map(partition_spec, schema),
            sort_keys=SortSchemeMapper.map(sort_order, schema),
        )
        print(f"Created Glue Table: {table_definition}")
    except TableAlreadyExistsError:
        print(f"Glue Table `{namespace}.{table_name}` already exists.")

    print(f"Getting Glue Table: {namespace}.{table_name}")
    table_definition = dc.get_table(table_name, namespace)
    print(f"Retrieved Glue Table: {table_definition}")


if __name__ == "__main__":
    example_script_args = [
        (
            [
                "--warehouse",
            ],
            {
                "help": "S3 path for Iceberg file storage.",
                "type": str,
            },
        ),
        (
            [
                "--STAGE",
            ],
            {
                "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
                "type": str,
            },
        ),
    ]

    # store any CLI args in the runtime environment
    store_cli_args_in_os_environ(example_script_args)

    # run the example using os.environ as kwargs
    run(**os.environ)
```
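Both example scripts import `store_cli_args_in_os_environ` from a local `env` module that does not appear in this diff. Judging from how it is called, a hypothetical sketch of such a helper might look like this (the name is real, but the body below is assumed, not taken from the package):

```python
# Hypothetical stand-in for the examples' `env.store_cli_args_in_os_environ`;
# the packaged implementation is not shown in this diff.
import argparse
import os


def store_cli_args_in_os_environ(script_args):
    parser = argparse.ArgumentParser()
    for args, kwargs in script_args:
        parser.add_argument(*args, **kwargs)
    # tolerate unrelated flags so the examples keep running
    parsed, _ = parser.parse_known_args()
    for key, value in vars(parsed).items():
        if value is not None:
            # each provided CLI arg becomes an environment variable,
            # which `run(**os.environ)` later receives as a keyword
            os.environ[key] = str(value)
```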
@@ -0,0 +1,29 @@ deltacat/examples/hello_world.py (new file)

```python
import ray
import deltacat
import daft


def print_package_version_info():
    print(f"DeltaCAT Version: {deltacat.__version__}")
    print(f"Ray Version: {ray.__version__}")
    print(f"Daft Version: {daft.__version__}")


@ray.remote
def hello_worker():
    print("Hello, Worker!")
    print_package_version_info()


def run():
    print("Hello, Driver!")
    print_package_version_info()
    hello_worker.remote()


if __name__ == "__main__":
    # initialize deltacat
    deltacat.init()

    # run the example
    run()
```
File without changes
|
File without changes
|
File without changes
|