deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,369 +0,0 @@
|
|
1
|
-
from typing import Any, Dict, List, Optional, Set, Union, Tuple
|
2
|
-
import pyarrow as pa
|
3
|
-
import logging
|
4
|
-
from deltacat.catalog.model.table_definition import TableDefinition
|
5
|
-
from deltacat.storage.model.sort_key import SortKey
|
6
|
-
from deltacat.storage.model.list_result import ListResult
|
7
|
-
from deltacat.storage.model.namespace import Namespace
|
8
|
-
from deltacat.storage.model.types import (
|
9
|
-
DistributedDataset,
|
10
|
-
LifecycleState,
|
11
|
-
LocalDataset,
|
12
|
-
LocalTable,
|
13
|
-
SchemaConsistencyType,
|
14
|
-
)
|
15
|
-
from deltacat.storage.model.partition import PartitionLocator, Partition
|
16
|
-
from deltacat.storage.model.table_version import TableVersion
|
17
|
-
from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
|
18
|
-
from deltacat.storage.model.delta import DeltaType
|
19
|
-
import deltacat.storage.interface as deltacat_storage
|
20
|
-
from deltacat.types.media import ContentType, TableType, DistributedDatasetType
|
21
|
-
from deltacat.types.tables import TableWriteMode
|
22
|
-
from deltacat.compute.merge_on_read import MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE
|
23
|
-
from deltacat import logs
|
24
|
-
|
25
|
-
# Module-level logger configured through deltacat's shared logging setup.
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

# Handle to the deltacat storage implementation backing this catalog.
# Remains None until initialize() is called; read_table() validates this.
STORAGE = None
|
28
|
-
|
29
|
-
|
30
|
-
# table functions
|
31
|
-
def write_to_table(
    data: Union[LocalTable, LocalDataset, DistributedDataset],  # type: ignore
    table: str,
    namespace: Optional[str] = None,
    mode: TableWriteMode = TableWriteMode.AUTO,
    content_type: ContentType = ContentType.PARQUET,
    *args,
    **kwargs,
) -> None:
    """Write local or distributed data to a table.

    An error is raised if the table does not exist and ``mode`` is neither
    CREATE nor AUTO. When a table is created, any ``create_table`` keyword
    argument may be supplied; when an existing table is appended to or
    replaced, any ``alter_table`` keyword argument may be supplied.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("write_to_table not implemented")
|
48
|
-
|
49
|
-
|
50
|
-
def read_table(
    table: str,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    table_type: Optional[TableType] = TableType.PYARROW,
    distributed_dataset_type: Optional[
        DistributedDatasetType
    ] = DistributedDatasetType.RAY_DATASET,
    partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
    stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
    merge_on_read: Optional[bool] = False,
    reader_kwargs: Optional[Dict[Any, Any]] = None,
    deltacat_storage_kwargs: Optional[Dict[Any, Any]] = None,
    *args,
    **kwargs,
) -> DistributedDataset:  # type: ignore
    """Read a table into a distributed dataset.

    Resolves the target table version, collects the qualified deltas from the
    requested partitions, and dispatches to the merge function registered for
    ``distributed_dataset_type``.

    NOTE(review): _validate_read_table_args rejects everything except
    table_type=PYARROW and distributed_dataset_type=DAFT, yet the default
    here is RAY_DATASET — callers relying on the default would always get a
    ValueError. Confirm intended default before reuse.
    """

    # Normalize optional kwarg dicts so they can be splatted below.
    if reader_kwargs is None:
        reader_kwargs = {}

    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}

    # Also verifies that initialize() has set the module-level STORAGE.
    _validate_read_table_args(
        namespace=namespace,
        table_type=table_type,
        distributed_dataset_type=distributed_dataset_type,
        merge_on_read=merge_on_read,
    )

    # Falls back to the latest table version when table_version is None
    # (sibling helper; defined elsewhere in this module).
    table_version_obj = _get_latest_or_given_table_version(
        namespace=namespace,
        table_name=table,
        table_version=table_version,
        **deltacat_storage_kwargs,
    )
    table_version = table_version_obj.table_version

    # The downstream reader assumes a single homogeneous content type.
    if (
        table_version_obj.content_types is None
        or len(table_version_obj.content_types) != 1
    ):
        raise ValueError(
            "Expected exactly one content type but "
            f"found {table_version_obj.content_types}."
        )

    logger.info(
        f"Reading metadata for table={namespace}/{table}/{table_version} "
        f"with partition_filters={partition_filter} and stream position"
        f" range={stream_position_range_inclusive}"
    )

    # No explicit partition filter means "read every partition".
    if partition_filter is None:
        logger.info(
            f"Reading all partitions metadata in the table={table} "
            "as partition_filter was None."
        )
        partition_filter = STORAGE.list_partitions(
            table_name=table,
            namespace=namespace,
            table_version=table_version,
            **deltacat_storage_kwargs,
        ).all_items()

    # Restrict deltas to the requested stream-position range (inclusive).
    qualified_deltas = _get_deltas_from_partition_filter(
        stream_position_range_inclusive=stream_position_range_inclusive,
        partition_filter=partition_filter,
        **deltacat_storage_kwargs,
    )

    logger.info(
        f"Total qualified deltas={len(qualified_deltas)} "
        f"from {len(partition_filter)} partitions."
    )

    merge_on_read_params = MergeOnReadParams.of(
        {
            "deltas": qualified_deltas,
            "deltacat_storage": STORAGE,
            "deltacat_storage_kwargs": deltacat_storage_kwargs,
            "reader_kwargs": reader_kwargs,
        }
    )

    # Dispatch on the dataset type's value (e.g. "daft") to the registered
    # merge/read implementation.
    return MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE[distributed_dataset_type.value](
        params=merge_on_read_params, **kwargs
    )
|
139
|
-
|
140
|
-
|
141
|
-
def alter_table(
    table: str,
    namespace: Optional[str] = None,
    lifecycle_state: Optional[LifecycleState] = None,
    schema_updates: Optional[Dict[str, Any]] = None,
    partition_updates: Optional[Dict[str, Any]] = None,
    primary_keys: Optional[Set[str]] = None,
    sort_keys: Optional[List[SortKey]] = None,
    description: Optional[str] = None,
    properties: Optional[Dict[str, str]] = None,
    *args,
    **kwargs,
) -> None:
    """Alter the definition of an existing table.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("alter_table not implemented")
|
156
|
-
|
157
|
-
|
158
|
-
def create_table(
    table: str,
    namespace: Optional[str] = None,
    lifecycle_state: Optional[LifecycleState] = None,
    schema: Optional[Union[pa.Schema, str, bytes]] = None,
    schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
    partition_keys: Optional[List[Dict[str, Any]]] = None,
    primary_keys: Optional[Set[str]] = None,
    sort_keys: Optional[List[SortKey]] = None,
    description: Optional[str] = None,
    properties: Optional[Dict[str, str]] = None,
    permissions: Optional[Dict[str, Any]] = None,
    content_types: Optional[List[ContentType]] = None,
    replace_existing_table: bool = False,
    *args,
    **kwargs,
) -> TableDefinition:
    """Create an empty table.

    An error is raised when the table already exists and
    ``replace_existing_table`` is False.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("create_table not implemented")
|
178
|
-
|
179
|
-
|
180
|
-
def drop_table(
    table: str, namespace: Optional[str] = None, purge: bool = False, *args, **kwargs
) -> None:
    """Drop a table from the catalog, optionally purging its data.

    An error is raised when the table does not exist.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("drop_table not implemented")
|
186
|
-
|
187
|
-
|
188
|
-
def refresh_table(table: str, namespace: Optional[str] = None, *args, **kwargs) -> None:
    """Refresh the Ray-cluster metadata cache for the given table.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("refresh_table not implemented")
|
191
|
-
|
192
|
-
|
193
|
-
def list_tables(
    namespace: Optional[str] = None, *args, **kwargs
) -> ListResult[TableDefinition]:
    """List one page of table definitions.

    An error is raised when the given namespace does not exist.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("list_tables not implemented")
|
199
|
-
|
200
|
-
|
201
|
-
def get_table(
    table: str, namespace: Optional[str] = None, *args, **kwargs
) -> Optional[TableDefinition]:
    """Get a table's definition metadata, or None when the table is absent.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("get_table not implemented")
|
207
|
-
|
208
|
-
|
209
|
-
def truncate_table(
    table: str, namespace: Optional[str] = None, *args, **kwargs
) -> None:
    """Truncate a table's data; an error is raised when it does not exist.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("truncate_table not implemented")
|
214
|
-
|
215
|
-
|
216
|
-
def rename_table(
    table: str, new_name: str, namespace: Optional[str] = None, *args, **kwargs
) -> None:
    """Rename an existing table.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("rename_table not implemented")
|
221
|
-
|
222
|
-
|
223
|
-
def table_exists(table: str, namespace: Optional[str] = None, *args, **kwargs) -> bool:
    """Report whether the given table exists.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("table_exists not implemented")
|
226
|
-
|
227
|
-
|
228
|
-
# namespace functions
|
229
|
-
def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
    """List one page of table namespaces.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("list_namespaces not implemented")
|
232
|
-
|
233
|
-
|
234
|
-
def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
    """Get metadata for a table namespace, or None when it does not exist.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("get_namespace not implemented")
|
238
|
-
|
239
|
-
|
240
|
-
def namespace_exists(namespace: str, *args, **kwargs) -> bool:
    """Report whether the given table namespace exists.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("namespace_exists not implemented")
|
243
|
-
|
244
|
-
|
245
|
-
def create_namespace(
    namespace: str, permissions: Dict[str, Any], *args, **kwargs
) -> Namespace:
    """Create a table namespace with the given name and permissions.

    Returns the created namespace; an error is raised when the namespace
    already exists.

    Raises:
        NotImplementedError: Always; this catalog stub is not implemented.
    """
    raise NotImplementedError("create_namespace not implemented")
|
251
|
-
|
252
|
-
|
253
|
-
def alter_namespace(
    namespace: str,
    permissions: Optional[Dict[str, Any]] = None,
    new_namespace: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """Alter a table namespace's permissions and/or rename it."""
    raise NotImplementedError("alter_namespace not implemented")
|
262
|
-
|
263
|
-
|
264
|
-
def drop_namespace(namespace: str, purge: bool = False, *args, **kwargs) -> None:
    """Drop the namespace and every table it contains from the catalog,
    optionally purging the underlying data."""
    raise NotImplementedError("drop_namespace not implemented")
|
268
|
-
|
269
|
-
|
270
|
-
def default_namespace() -> str:
    """Return the catalog's default namespace name."""
    raise NotImplementedError("default_namespace not implemented")
|
273
|
-
|
274
|
-
|
275
|
-
# catalog functions
|
276
|
-
def initialize(ds: deltacat_storage, *args, **kwargs) -> None:
    """Initializes the data catalog with the given arguments.

    Args:
        ds: deltacat storage implementation that backs all subsequent
            catalog operations in this process.

    Note: extra *args/**kwargs are accepted but currently unused.
    """
    # Bind the backend to the module-level STORAGE global consulted by the
    # validation/read helpers below.
    global STORAGE
    STORAGE = ds
|
280
|
-
|
281
|
-
|
282
|
-
def _validate_read_table_args(
    namespace: Optional[str] = None,
    table_type: Optional[TableType] = None,
    distributed_dataset_type: Optional[DistributedDatasetType] = None,
    merge_on_read: Optional[bool] = None,
):
    """Validate read_table arguments, raising ValueError for the first
    unsupported or missing option.

    Check order (and therefore error priority) matches the original
    top-to-bottom validation: uninitialized catalog, merge-on-read,
    table type, dataset type, then missing namespace.
    """
    # Ordered (condition, message) pairs; the first truthy condition raises.
    failed_checks = (
        (
            STORAGE is None,
            "Catalog not initialized. Did you miss calling "
            "initialize(ds=<deltacat_storage>)?",
        ),
        (bool(merge_on_read), "Merge on read not supported currently."),
        (
            table_type is not TableType.PYARROW,
            "Only PYARROW table type is supported as of now",
        ),
        (
            distributed_dataset_type is not DistributedDatasetType.DAFT,
            "Only DAFT dataset type is supported as of now",
        ),
        (
            namespace is None,
            "namespace must be passed to uniquely identify a table in the catalog.",
        ),
    )
    for condition, message in failed_checks:
        if condition:
            raise ValueError(message)
|
307
|
-
|
308
|
-
|
309
|
-
def _get_latest_or_given_table_version(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> TableVersion:
    """Resolve a TableVersion: the latest one when no version is given,
    otherwise the explicitly requested version."""
    if table_version is None:
        # No explicit version requested — ask storage for the latest.
        return STORAGE.get_latest_table_version(
            namespace=namespace, table_name=table_name, *args, **kwargs
        )
    return STORAGE.get_table_version(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        *args,
        **kwargs,
    )
|
332
|
-
|
333
|
-
|
334
|
-
def _get_deltas_from_partition_filter(
    partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
    stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
    *args,
    **kwargs,
):
    """List all qualifying deltas across the partitions in the filter.

    Args:
        partition_filter: Partitions (or partition locators) whose deltas
            should be listed. A None or empty filter now yields an empty
            result; previously the None default raised TypeError on
            iteration.
        stream_position_range_inclusive: Optional inclusive
            (start, end) stream-position bounds applied to each delta.

    Returns:
        Deltas (with manifests included) in ascending stream-position
        order per partition.

    Raises:
        ValueError: if any qualifying delta is a DELETE-type delta.
    """
    result_deltas = []
    # Robustness fix: treat a missing/empty partition filter as "no deltas"
    # instead of crashing on `for ... in None`.
    if not partition_filter:
        return result_deltas
    start_stream_position, end_stream_position = stream_position_range_inclusive or (
        None,
        None,
    )
    for partition_like in partition_filter:
        deltas = STORAGE.list_partition_deltas(
            partition_like=partition_like,
            ascending_order=True,
            include_manifest=True,
            start_stream_position=start_stream_position,
            last_stream_position=end_stream_position,
            *args,
            **kwargs,
        ).all_items()

        for delta in deltas:
            # Re-check the bounds client-side in case the storage
            # implementation ignores the position hints.
            if (
                start_stream_position is None
                or delta.stream_position >= start_stream_position
            ) and (
                end_stream_position is None
                or delta.stream_position <= end_stream_position
            ):
                if delta.type == DeltaType.DELETE:
                    raise ValueError("DELETE type deltas are not supported")
                result_deltas.append(delta)

    return result_deltas
|
@@ -1,97 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
import logging
|
3
|
-
from typing import Dict, Any
|
4
|
-
from deltacat import logs
|
5
|
-
from deltacat.compute.compactor import RoundCompletionInfo
|
6
|
-
from deltacat.storage import PartitionLocator
|
7
|
-
from deltacat.aws import s3u as s3_utils
|
8
|
-
from typing import Optional
|
9
|
-
from deltacat.utils.metrics import metrics
|
10
|
-
|
11
|
-
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
12
|
-
|
13
|
-
|
14
|
-
def get_round_completion_file_s3_url(
    bucket: str,
    source_partition_locator: PartitionLocator,
    destination_partition_locator: Optional[PartitionLocator] = None,
) -> str:
    """Build the S3 URL of the round completion file (RCF).

    When a destination locator is given, the RCF lives under the source
    locator's hexdigest prefix inside the bucket; otherwise it lives at the
    source locator's own path. Either way the object key ends in ``.json``.
    """
    if destination_partition_locator:
        # Destination-aware layout: nest under the source locator's digest.
        digest_root = f"s3://{bucket}/{source_partition_locator.hexdigest()}"
        base_url = destination_partition_locator.path(digest_root)
    else:
        base_url = source_partition_locator.path(f"s3://{bucket}")
    return f"{base_url}.json"
|
27
|
-
|
28
|
-
|
29
|
-
@metrics
def read_round_completion_file(
    bucket: str,
    source_partition_locator: PartitionLocator,
    destination_partition_locator: Optional[PartitionLocator] = None,
    **s3_client_kwargs: Optional[Dict[str, Any]],
) -> RoundCompletionInfo:
    """Read the round completion file (RCF) for a compaction round.

    Tries the destination-aware URL first (when a destination locator is
    given), then falls back to the legacy source-only URL for backward
    compatibility. Returns None when no RCF exists at either location.
    """
    candidate_uris = []
    if destination_partition_locator:
        candidate_uris.append(
            get_round_completion_file_s3_url(
                bucket,
                source_partition_locator,
                destination_partition_locator,
            )
        )
    # Legacy (source-only) location is always tried for backward
    # compatibility with RCFs written before destination-aware URLs.
    candidate_uris.append(
        get_round_completion_file_s3_url(
            bucket,
            source_partition_locator,
        )
    )

    round_completion_info = None
    for rcf_uri in candidate_uris:
        logger.info(f"Reading round completion file from: {rcf_uri}")
        result = s3_utils.download(rcf_uri, False, **s3_client_kwargs)
        if not result:
            logger.warning(f"Round completion file not present at {rcf_uri}")
            continue
        json_str = result["Body"].read().decode("utf-8")
        round_completion_info = RoundCompletionInfo(json.loads(json_str))
        logger.info(f"Read round completion info: {round_completion_info}")
        break

    return round_completion_info
|
69
|
-
|
70
|
-
|
71
|
-
@metrics
def write_round_completion_file(
    bucket: Optional[str],
    source_partition_locator: Optional[PartitionLocator],
    destination_partition_locator: Optional[PartitionLocator],
    round_completion_info: RoundCompletionInfo,
    completion_file_s3_url: Optional[str] = None,
    **s3_client_kwargs: Optional[Dict[str, Any]],
) -> str:
    """Serialize and upload a round completion file (RCF) to S3.

    Args:
        bucket: Bucket to derive the RCF URL from; may be None only when
            completion_file_s3_url is passed explicitly.
        source_partition_locator: Source partition of the compaction round;
            used to derive the URL when completion_file_s3_url is None.
        destination_partition_locator: Optional destination partition used
            for the destination-aware URL layout.
        round_completion_info: The RCF contents to serialize as JSON.
        completion_file_s3_url: Explicit destination URL; overrides URL
            derivation from the locators.
        **s3_client_kwargs: Extra arguments forwarded to the S3 upload.

    Returns:
        The S3 URL the RCF was written to.

    Raises:
        AssertionError: if neither bucket nor completion_file_s3_url is given.
    """
    if bucket is None and completion_file_s3_url is None:
        raise AssertionError("Either bucket or completion_file_s3_url must be passed")

    logger.info(f"writing round completion file contents: {round_completion_info}")
    if completion_file_s3_url is None:
        completion_file_s3_url = get_round_completion_file_s3_url(
            bucket,
            source_partition_locator,
            destination_partition_locator,
        )
    logger.info(f"writing round completion file to: {completion_file_s3_url}")
    # json.dumps already returns str; the previous str(...) wrapper was a no-op.
    s3_utils.upload(
        completion_file_s3_url,
        json.dumps(round_completion_info),
        **s3_client_kwargs,
    )
    logger.info(f"round completion file written to: {completion_file_s3_url}")
    return completion_file_s3_url
|
@@ -1,40 +0,0 @@
|
|
1
|
-
import logging
|
2
|
-
from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
|
3
|
-
from deltacat.storage.model.types import DistributedDataset
|
4
|
-
from deltacat.types.media import TableType, DistributedDatasetType
|
5
|
-
from deltacat.compute.merge_on_read.utils.delta import create_df_from_all_deltas
|
6
|
-
from deltacat import logs
|
7
|
-
|
8
|
-
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
9
|
-
|
10
|
-
|
11
|
-
def merge(params: MergeOnReadParams, **kwargs) -> DistributedDataset:
    """
    Merge the given deltas and return the result as a distributed dataframe.

    Each delta is read into a Daft dataframe; the frames are then
    concatenated using operations Daft runs on the Ray cluster.

    TODO(raghumdani): Perform actual merge.
    """
    delta_dfs = create_df_from_all_deltas(
        deltas=params.deltas,
        table_type=TableType.PYARROW,
        distributed_dataset_type=DistributedDatasetType.DAFT,
        reader_kwargs=params.reader_kwargs,
        deltacat_storage=params.deltacat_storage,
        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
        **kwargs,
    )

    logger.info(f"Merging {len(delta_dfs)} delta dfs...")

    # TODO: This code should be optimized from daft side
    # Seed with the first dataframe (or None when the list is empty), then
    # fold the rest in via concat.
    df_iter = iter(delta_dfs)
    merged = next(df_iter, None)
    for df in df_iter:
        merged = merged.concat(df)

    return merged
|
@@ -1,66 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
from typing import Optional, Dict, List, Union, Any
|
3
|
-
from deltacat.storage import (
|
4
|
-
Delta,
|
5
|
-
DeltaLocator,
|
6
|
-
interface as unimplemented_deltacat_storage,
|
7
|
-
)
|
8
|
-
|
9
|
-
|
10
|
-
class MergeOnReadParams(dict):
    """
    Dict-backed parameter container for merge-on-read operations: the deltas
    to merge, reader options, and the deltacat storage implementation used to
    fetch them.
    """

    @staticmethod
    def of(params: Optional[Dict]) -> MergeOnReadParams:
        """Wrap the given dict (or an empty one) and apply defaults."""
        if params is None:
            params = {}

        result = MergeOnReadParams(params)
        # NOTE(review): a missing "deltas" key raises KeyError from the
        # property lookup before this assert can fire — confirm intended.
        assert result.deltas is not None, "deltas is a required arg"

        result.deltacat_storage = params.get(
            "deltacat_storage", unimplemented_deltacat_storage
        )
        result.reader_kwargs = params.get("reader_kwargs", {})
        result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
        return result

    @property
    def deltas(self) -> List[Union[Delta, DeltaLocator]]:
        """The deltas (or delta locators) to compact in-memory."""
        return self["deltas"]

    @deltas.setter
    def deltas(self, value: List[Union[Delta, DeltaLocator]]) -> None:
        self["deltas"] = value

    @property
    def reader_kwargs(self) -> Dict[Any, Any]:
        """Keyword arguments forwarded to the delta reader."""
        return self["reader_kwargs"]

    @reader_kwargs.setter
    def reader_kwargs(self, value: Dict[Any, Any]) -> None:
        self["reader_kwargs"] = value

    @property
    def deltacat_storage(self) -> unimplemented_deltacat_storage:
        """The deltacat storage interface implementation to read through."""
        return self["deltacat_storage"]

    @deltacat_storage.setter
    def deltacat_storage(self, value: unimplemented_deltacat_storage) -> None:
        self["deltacat_storage"] = value

    @property
    def deltacat_storage_kwargs(self) -> dict:
        """Keyword arguments forwarded to the storage implementation."""
        return self["deltacat_storage_kwargs"]

    @deltacat_storage_kwargs.setter
    def deltacat_storage_kwargs(self, value: dict) -> None:
        self["deltacat_storage_kwargs"] = value
|
@@ -1,42 +0,0 @@
|
|
1
|
-
from typing import List, Dict, Any, Optional, Union
|
2
|
-
from deltacat.storage.model.delta import Delta, DeltaLocator
|
3
|
-
from deltacat.storage.model.types import DistributedDataset
|
4
|
-
from deltacat.storage import (
|
5
|
-
interface as unimplemented_deltacat_storage,
|
6
|
-
)
|
7
|
-
from deltacat.types.media import TableType, StorageType, DistributedDatasetType
|
8
|
-
|
9
|
-
|
10
|
-
def create_df_from_all_deltas(
    deltas: List[Union[Delta, DeltaLocator]],
    table_type: TableType,
    distributed_dataset_type: DistributedDatasetType,
    reader_kwargs: Optional[Dict[Any, Any]] = None,
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[Any, Any]] = None,
    *args,
    **kwargs
) -> List[DistributedDataset]:  # type: ignore
    """
    Download each delta as its own distributed dataset and return the list of
    dataset references, one per input delta.
    """
    reader_kwargs = reader_kwargs or {}
    deltacat_storage_kwargs = deltacat_storage_kwargs or {}

    return [
        deltacat_storage.download_delta(
            delta_like=delta,
            table_type=table_type,
            distributed_dataset_type=distributed_dataset_type,
            storage_type=StorageType.DISTRIBUTED,
            **reader_kwargs,
            **deltacat_storage_kwargs
        )
        for delta in deltas
    ]
|
deltacat/io/dataset.py
DELETED
@@ -1,73 +0,0 @@
|
|
1
|
-
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
-
from __future__ import annotations
|
3
|
-
|
4
|
-
from typing import Any, Callable, Dict, Optional, TypeVar, Union, cast
|
5
|
-
|
6
|
-
import pyarrow as pa
|
7
|
-
import s3fs
|
8
|
-
from ray.data import Dataset
|
9
|
-
|
10
|
-
T = TypeVar("T")
|
11
|
-
|
12
|
-
|
13
|
-
class DeltacatDataset(Dataset[T]):
    """Ray Dataset subclass that adds DeltaCAT-specific write methods."""

    @staticmethod
    def from_dataset(dataset: Dataset[T]) -> DeltacatDataset[T]:
        """Reinterpret an existing Ray Dataset as a DeltacatDataset."""
        # cast to DeltacatDataset in-place since it only adds new methods
        dataset.__class__ = DeltacatDataset
        return cast(DeltacatDataset[T], dataset)

    def write_redshift(
        self,
        path: str,
        *,
        filesystem: Optional[Union[pa.fs.FileSystem, s3fs.S3FileSystem]] = None,
        try_create_dir: bool = True,
        arrow_open_stream_args: Optional[Dict[str, Any]] = None,
        arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
        **arrow_parquet_args,
    ) -> None:
        """Writes the dataset to Parquet files and commits a Redshift manifest
        back to S3 indexing the files written. The output can be loaded into
        Redshift by providing it to the Redshift COPY command, or via AWS Data
        Wrangler's `wr.redshift.copy_from_files()` API.

        This is only supported for datasets convertible to Arrow records.
        To control the number of files, use ``.repartition()``.

        Unless a custom block path provider is given, the format of the output
        files will be {uuid}_{block_idx}.parquet, where ``uuid`` is an unique
        id for the dataset.

        The Redshift manifest will be written to ``f"{path}/manifest``

        Examples:
            >>> ds.write_redshift("s3://bucket/path")

        Time complexity: O(dataset size / parallelism)

        Args:
            path: The path to the destination root directory where Parquet
                files and the Redshift manifest will be written to.
            filesystem: The filesystem implementation to write to. This should
                be either PyArrow's S3FileSystem or s3fs.
            try_create_dir: Try to create all directories in destination path
                if True. Does nothing if all directories already exist.
            arrow_open_stream_args: kwargs passed to
                pyarrow.fs.FileSystem.open_output_stream
            arrow_parquet_args_fn: Callable that returns a dictionary of write
                arguments to use when writing each block to a file. Overrides
                any duplicate keys from arrow_parquet_args. This should be used
                instead of arrow_parquet_args if any of your write arguments
                cannot be pickled, or if you'd like to lazily resolve the write
                arguments for each dataset block.
            arrow_parquet_args: Options to pass to
                pyarrow.parquet.write_table(), which is used to write out each
                block to a file.
        """
        # Not implemented yet; fail loudly rather than silently doing nothing.
        raise NotImplementedError(
            "Writing to Redshift is not yet supported. "
            "Please use DeltacatDataset.write_parquet() instead."
        )
|