deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py ADDED
@@ -0,0 +1,305 @@
+from __future__ import annotations
+
+import threading
+from threading import Thread
+from typing import Any, List, Set, Protocol, TypeVar, Dict, Iterable
+
+from pyarrow import RecordBatch, Table
+from deltacat.storage.model.partition import PartitionLocator
+from deltacat.experimental.storage.rivulet.metastore.delta import (
+    ManifestIO,
+    DeltacatManifestIO,
+)
+
+from deltacat.experimental.storage.rivulet import Schema
+from deltacat.experimental.storage.rivulet.metastore.json_sst import JsonSstWriter
+from deltacat.experimental.storage.rivulet.serializer import (
+    MEMTABLE_DATA,
+    DataSerializer,
+)
+from deltacat.experimental.storage.rivulet.serializer_factory import (
+    DataSerializerFactory,
+)
+from deltacat.experimental.storage.rivulet.writer.dataset_writer import (
+    DatasetWriter,
+    DATA,
+)
+from deltacat.experimental.storage.rivulet.metastore.sst import SSTWriter
+from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+
+INPUT_ROW = TypeVar("INPUT_ROW")
+
+
+class Memtable(Protocol[INPUT_ROW]):
+    """
+    Protocol defining the interface for a memtable that can store and sort records of type T.
+    """
+
+    def add_record(self, record: INPUT_ROW) -> bool:
+        """
+        Add a record to the memtable.
+
+        Args:
+            record: The record to add of type INPUT_ROW
+
+        Returns:
+            bool: True if the memtable is full after adding the record, False otherwise
+        """
+        ...
+
+    def get_sorted_records(self, schema: Schema) -> MEMTABLE_DATA:
+        """
+        Get all records in the memtable in sorted order.
+
+        Returns:
+            List[T]: A list of sorted records
+        """
+        ...
+
+
+class DictMemTable(Memtable[Dict[str, Any]]):
+    """
+    Unit of in memory buffering of sorted records before records are written to file
+
+    TODO future improvements:
+    1. build b+ tree of record indexes on insertion
+       OR If we end up using arrow as intermediate format, we can use
+       pyarrow compute sort
+    2. Probably we will re-write in rust
+    """
+
+    def __init__(self, merge_key: str):
+        self.row_size = 0
+        self.merge_key = merge_key
+
+        self._records: List[Dict[str, Any]] = []
+        self.lock = threading.Lock()
+
+    def add_record(self, record: Dict[str, Any]):
+        with self.lock:
+            self._records.append(record)
+            self.row_size += 1
+
+        if self.row_size >= MemtableDatasetWriter.MAX_ROW_SIZE:
+            return True
+        return False
+
+    def get_sorted_records(self, schema: Schema) -> List[Dict[str, Any]]:
+        """
+        Gets sorted records
+
+        :return: iterator over sorted record
+        """
+        with self.lock:
+            self._records.sort(key=lambda x: x.__getitem__(self.merge_key))
+            return self._records
+
+
+class RecordBatchMemTable(Memtable[RecordBatch]):
+    """
+    Note that this will not respect max row size.
+    """
+
+    def __init__(self, merge_key: str):
+        self.row_size = 0
+        self.merge_key = merge_key
+
+        # list of full record batches in memtable
+        self._records_batches: List[RecordBatch] = []
+        self.lock = threading.Lock()
+
+    def add_record(self, record: RecordBatch):
+        with self.lock:
+            self._records_batches.append(record)
+            self.row_size += record.num_rows
+
+        if self.row_size >= MemtableDatasetWriter.MAX_ROW_SIZE:
+            return True
+        return False
+
+    def get_sorted_records(self, schema: Schema) -> Table:
+        """
+        Gets sorted records
+
+        :return: iterator over sorted record
+        """
+        with self.lock:
+            # Note that we are providing schema so that pyarrow does not infer it
+            table = Table.from_batches(self._records_batches, schema.to_pyarrow())
+            return table.sort_by(self.merge_key)
+
+
+class MemtableDatasetWriter(DatasetWriter):
+    # Note that this max row size is not respected when PyArrow RecordBatches are used
+    # In that case, the entire record batch is written within one memtable even if the row count overflows
+    MAX_ROW_SIZE = 1000000
+    """
+    Buffers data into rotating memtables. When a memtable reaches a certain size, it is flushed to disk and a new memtable is allocated
+
+    Uses DataWriter which will be format specific for writing data
+    Uses MetadataWriter for writing metadata
+
+    TODO Future Improvements
+    1. Maybe we should re-write this class in Rust (pending testing)
+    """
+
+    def __init__(
+        self,
+        file_provider: FileProvider,
+        schema: Schema,
+        locator: PartitionLocator,
+        file_format: str | None = None,
+        sst_writer: SSTWriter = None,
+        manifest_io: ManifestIO = None,
+    ):
+
+        if not sst_writer:
+            sst_writer = JsonSstWriter()
+        if not manifest_io:
+            manifest_io = DeltacatManifestIO(file_provider.uri, locator)
+
+        self.schema = schema
+
+        self.file_provider = file_provider
+        self.data_serializer: DataSerializer = DataSerializerFactory.get_serializer(
+            self.schema, self.file_provider, file_format
+        )
+        self.sst_writer = sst_writer
+        self.manifest_io = manifest_io
+
+        self._sst_files: Set[str] = set()
+        self.__curr_memtable = None
+        self.__open_memtables = []
+        self.__rlock = threading.RLock()
+        self.__open_threads: List[Thread] = []
+        self._locator = locator
+
+    def write_dict(self, record: Dict[str, Any]) -> None:
+
+        # Construct memtable if doesn't exist. If previous memtable wrong type, rotate
+        memtable_ctor = lambda: DictMemTable(self.schema.get_merge_key())
+        if not self.__curr_memtable:
+            self.__curr_memtable = memtable_ctor()
+        try:
+            isinstance(self.__curr_memtable, DictMemTable)
+        except TypeError:
+            self.__rotate_memtable(memtable_ctor)
+
+        # Write record(s). If memtable is full, rotate
+        if self.__curr_memtable.add_record(record):
+            self.__rotate_memtable(memtable_ctor)
+
+    def write_record_batch(self, record: RecordBatch) -> None:
+        # Construct memtable if doesn't exist. If previous memtable wrong type, rotate
+        memtable_ctor = lambda: RecordBatchMemTable(self.schema.get_merge_key())
+        if not self.__curr_memtable:
+            self.__curr_memtable = memtable_ctor()
+
+        try:
+            isinstance(self.__curr_memtable, RecordBatchMemTable)
+        except TypeError:
+            self.__rotate_memtable(memtable_ctor)
+
+        # Write record(s). If memtable is full, rotate
+        if self.__curr_memtable.add_record(record):
+            self.__rotate_memtable(memtable_ctor)
+
+    def write(self, data: DATA) -> None:
+        if isinstance(data, RecordBatch):
+            self.write_record_batch(data)
+        elif isinstance(data, Iterable):
+            for x in data:
+                if isinstance(x, dict):
+                    self.write_dict(x)
+                elif isinstance(x, RecordBatch):
+                    self.write_record_batch(x)
+                else:
+                    raise ValueError(
+                        f"Iterable contained unsupported type {type(x).__name__}."
+                        f" Supported data types to write are: {DATA}"
+                    )
+        else:
+            raise ValueError(
+                f"Unsupported data type {type(data).__name__}. Supported data types to write are: {DATA}"
+            )
+
+    def flush(self) -> str:
+        """
+        Explicitly flush any data and metadata and commit to dataset
+        """
+        self.__flush_memtable(self.__curr_memtable)
+        for thread in [t for t in self.__open_threads if t.is_alive()]:
+            thread.join()
+
+        manifest_location = self.__write_manifest_file()
+        self._sst_files.clear()
+
+        return manifest_location
+
+    def __enter__(self) -> Any:
+        """
+        Enter and exit method allows python "with" statement
+        """
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        Closes all open memtables and ensures all data is flushed.
+        """
+        self.flush()
+        # return False to propogate up error messages
+        return False
+
+    def __rotate_memtable(self, memtable_constructor_closure):
+        """
+        Replace the active memtable
+        :return:
+        """
+        with self.__rlock:
+            self.__flush_memtable(self.__curr_memtable)
+            self.__curr_memtable = memtable_constructor_closure()
+            self.__open_memtables.append(self.__curr_memtable)
+
+            # Reap dead threads
+            self.__open_threads = [t for t in self.__open_threads if t.is_alive()]
+
+    def __flush_memtable(self, memtable):
+        thread = threading.Thread(target=self.__flush_memtable_async, args=(memtable,))
+        thread.start()
+        with self.__rlock:
+            self.__open_threads.append(thread)
+
+    def __flush_memtable_async(self, memtable: Memtable):
+        """
+        Flushes data and metadata for a given memtable
+        Called asynchronously in background thread
+        """
+        if not memtable:
+            return
+
+        sst_metadata_list = self.data_serializer.flush_batch(
+            memtable.get_sorted_records(self.schema)
+        )
+
+        # short circuit if no data/metadata written
+        if not sst_metadata_list:
+            with self.__rlock:
+                self.__open_memtables.remove(memtable)
+            return
+
+        # Write SST. Each memtable is going to have a dedicated L0 SST file because that is the unit at which
+        # we have contiguously sorted data
+        sst_file = self.file_provider.provide_l0_sst_file()
+
+        with self.__rlock:
+            self.sst_writer.write(sst_file, sst_metadata_list)
+            self._sst_files.add(sst_file.location)
+
+            if memtable in self.__open_memtables:
+                self.__open_memtables.remove(memtable)
+
+    def __write_manifest_file(self) -> str:
+        """
+        Write the manifest file to the filesystem at the given URI.
+        """
+        return self.manifest_io.write(list(self._sst_files), self.schema, 0)
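For orientation, a minimal usage sketch of the memtable writer added above. The file_provider, schema, and locator values are assumed to be constructed elsewhere (their factories live in other rivulet modules in this release that are not expanded in this diff), so treat this as illustrative rather than a verified recipe:

    from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
        MemtableDatasetWriter,
    )

    # Assumed inputs: a rivulet FileProvider, a Schema with a merge key, and a
    # PartitionLocator built via the new deltacat 2.0.0 APIs (not shown here).
    writer = MemtableDatasetWriter(file_provider, schema, locator)

    # Dict rows are buffered in a DictMemTable; pyarrow RecordBatches in a
    # RecordBatchMemTable. Full memtables are flushed to L0 SST files on a
    # background thread once MAX_ROW_SIZE is exceeded.
    writer.write_dict({"id": 1, "value": "a"})
    writer.write([{"id": 2, "value": "b"}])  # write() also accepts iterables of rows/batches

    # flush() joins outstanding flush threads, writes the manifest, and returns
    # its location; __exit__ does the same when the writer is used as a context manager.
    manifest_location = writer.flush()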
deltacat/io/__init__.py CHANGED
@@ -0,0 +1,13 @@
+from deltacat.io.reader.deltacat_read_api import read_deltacat
+from deltacat.io.datasource.deltacat_datasource import DeltacatReadType
+from deltacat.io.datasource.deltacat_datasource import (
+    METAFILE_DATA_COLUMN_NAME,
+    METAFILE_TYPE_COLUMN_NAME,
+)
+
+__all__ = [
+    "read_deltacat",
+    "DeltacatReadType",
+    "METAFILE_DATA_COLUMN_NAME",
+    "METAFILE_TYPE_COLUMN_NAME",
+]
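The rewritten deltacat.io package now re-exports the Ray Data reader entry point and the metafile column constants. A minimal import sketch (the read_deltacat signature is defined in deltacat/io/reader/deltacat_read_api.py, which is not expanded in this diff, so no call is shown):

    from deltacat.io import (
        read_deltacat,
        DeltacatReadType,
        METAFILE_DATA_COLUMN_NAME,
        METAFILE_TYPE_COLUMN_NAME,
    )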
deltacat/io/dataset/deltacat_dataset.py ADDED
@@ -0,0 +1,91 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+
+from typing import Any, Callable, Dict, Optional, cast
+
+import pyarrow as pa
+from ray.data import Dataset
+
+from deltacat.utils.url import DeltaCatUrl
+from deltacat.io.datasink.deltacat_datasink import DeltaCatDatasink
+
+
+class DeltaCatDataset(Dataset):
+    @staticmethod
+    def from_dataset(dataset: Dataset) -> DeltaCatDataset:
+        # cast to DeltacatDataset in-place since it only adds new methods
+        dataset.__class__ = DeltaCatDataset
+        return cast(DeltaCatDataset, dataset)
+
+    def write_deltacat(
+        self,
+        url: DeltaCatUrl,
+        *,
+        # if the source dataset only contains DeltaCAT metadata, then only copy the metadata to the destination... if it contains external source file paths, then register them in a new Delta.
+        metadata_only: bool = False,
+        # merge all deltas as part of the write operation
+        copy_on_write: Optional[bool] = False,
+        filesystem: Optional[pa.fs.S3FileSystem] = None,
+        try_create_dir: bool = True,
+        arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+        arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
+        min_rows_per_file: Optional[int] = None,
+        ray_remote_args: Dict[str, Any] = None,
+        concurrency: Optional[int] = None,
+        **arrow_parquet_args,
+    ) -> None:
+        """Writes the dataset to files and commits DeltaCAT metadata indexing
+        the files written.
+
+        This is only supported for datasets convertible to Arrow records.
+        To control the number of files, use ``.repartition()``.
+
+        Unless a custom block path provider is given, the format of the output
+        files will be {uuid}_{block_idx}.{extension}, where ``uuid`` is a
+        unique id for the dataset.
+
+        The DeltaCAT manifest will be written to ``f"{path}/manifest``
+
+        Examples:
+            >>> ds.write_deltacat("s3://catalog/root/path")
+
+        Time complexity: O(dataset size / parallelism)
+
+        Args:
+            url: The path to the root directory where materialized files and
+                DeltaCAT manifest will be written.
+            filesystem: The filesystem implementation to write to. This should
+                be either a PyArrow S3FileSystem.
+            try_create_dir: Try to create all directories in destination path
+                if True. Does nothing if all directories already exist.
+            arrow_open_stream_args: kwargs passed to
+                pyarrow.fs.S3FileSystem.open_output_stream
+            filename_provider: FilenameProvider implementation
+                to write each dataset block to a custom output path.
+            arrow_parquet_args_fn: Callable that returns a dictionary of write
+                arguments to use when writing each block to a file. Overrides
+                any duplicate keys from arrow_parquet_args. This should be used
+                instead of arrow_parquet_args if any of your write arguments
+                cannot be pickled, or if you'd like to lazily resolve the write
+                arguments for each dataset block.
+            arrow_parquet_args: Options to pass to
+                pyarrow.parquet.write_table(), which is used to write out each
+                block to a file.
+        """
+        datasink = DeltaCatDatasink(
+            url,
+            metadata_only=metadata_only,
+            copy_on_write=copy_on_write,
+            arrow_parquet_args_fn=arrow_parquet_args_fn,
+            arrow_parquet_args=arrow_parquet_args,
+            min_rows_per_file=min_rows_per_file,
+            filesystem=filesystem,
+            try_create_dir=try_create_dir,
+            open_stream_args=arrow_open_stream_args,
+            dataset_uuid=self._uuid,
+        )
+        self.write_datasink(
+            datasink,
+            ray_remote_args=ray_remote_args,
+            concurrency=concurrency,
+        )
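A short, hedged sketch of the wrapper above, following the docstring's own example. ray.data.range() stands in for any Ray Dataset convertible to Arrow records, and the S3 path is illustrative only:

    import ray
    from deltacat.io.dataset.deltacat_dataset import DeltaCatDataset

    # from_dataset() casts in place; it only adds the write_deltacat() method.
    ds = DeltaCatDataset.from_dataset(ray.data.range(100))

    # Writes dataset blocks and commits DeltaCAT metadata indexing the files
    # (see the Examples section of the docstring above).
    ds.write_deltacat("s3://catalog/root/path")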
deltacat/io/datasink/deltacat_datasink.py ADDED
@@ -0,0 +1,207 @@
+import logging
+
+from collections import OrderedDict
+from typing import Dict, Any, Optional, List, Iterable
+
+from ray.data import Datasink
+from ray.data._internal.execution.interfaces import TaskContext
+from ray.data.block import Block, BlockAccessor
+from ray.data.datasource import WriteResult
+
+from ray.data.datasource.filename_provider import (
+    FilenameProvider,
+)
+
+from deltacat import logs
+
+from deltacat.constants import METAFILE_FORMAT_MSGPACK
+from deltacat.storage import Metafile
+from deltacat.io.datasource.deltacat_datasource import (
+    METAFILE_DATA_COLUMN_NAME,
+    METAFILE_TYPE_COLUMN_NAME,
+)
+from deltacat.utils.url import DeltaCatUrl, DeltaCatUrlWriter
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class CapturingBlockWritePathProvider(FilenameProvider):
+    """Delegating block write path provider that saves an ordered dictionary of
+    input keyword arguments for every block write path returned."""
+
+    def __init__(
+        self,
+        block_write_path_provider: FilenameProvider,
+        base_path: Optional[str] = None,
+    ):
+        self.base_path = base_path
+        self.block_write_path_provider = block_write_path_provider
+        self.write_path_kwargs: Dict[str, Dict[str, Any]] = OrderedDict()
+
+    def get_filename_for_block(
+        self,
+        block: Any,
+        task_index: int,
+        block_index: int,
+    ) -> str:
+        if self.base_path is None:
+            raise ValueError(
+                "Base path must be provided to CapturingBlockWritePathProvider",
+            )
+        return self._get_write_path_for_block(
+            base_path=self.base_path,
+            block=block,
+            block_index=block_index,
+        )
+
+    def _get_write_path_for_block(
+        self,
+        base_path: str,
+        *args,
+        **kwargs,
+    ) -> str:
+        filename = self.block_write_path_provider.get_filename_for_block(
+            *args,
+            **kwargs,
+        )
+        write_path = f"{base_path}/{filename}"
+        kwargs["base_path"] = base_path
+        self.write_path_kwargs[write_path] = kwargs
+        return write_path
+
+
+class DeltaCatWriteResult:
+    def __init__(self):
+        self.metadata = None
+        self.path = None
+        self.dataset_uuid = None
+        self.block_write_path_provider = None
+        self.content_type = None
+        self.content_encoding = None
+        self.filesystem = None
+
+
+class DeltaCatDatasink(Datasink[List[Metafile]]):
+    def __init__(
+        self,
+        url: DeltaCatUrl,
+        *,
+        metadata_only: bool = False,
+        copy_on_write: Optional[bool] = False,
+    ):
+        self._url = url
+        self._metadata_only = metadata_only
+        self._copy_on_write = copy_on_write
+
+    def on_write_start(self) -> None:
+        pass
+
+    def write(
+        self,
+        blocks: Iterable[Block],
+        ctx: TaskContext,
+    ) -> List[Metafile]:
+        for block in blocks:
+            pa_table = BlockAccessor.for_block(block).to_arrow()
+            if (
+                METAFILE_DATA_COLUMN_NAME in pa_table.column_names
+                and METAFILE_TYPE_COLUMN_NAME in pa_table.column_names
+            ):
+                for pa_scalar in pa_table[METAFILE_DATA_COLUMN_NAME]:
+                    metafile_msgpack_bytes = pa_scalar.as_py()
+                    metafile = Metafile.deserialize(
+                        serialized=metafile_msgpack_bytes,
+                        meta_format=METAFILE_FORMAT_MSGPACK,
+                    )
+                    # TODO(pdames): Add `metafile` to writer as a kwarg instead
+                    # of constructing a new URL with the metafile as input.
+                    writer_url = DeltaCatUrlWriter(self._url, metafile=metafile)
+                    # TODO(pdames): Run writes in order from catalog -> delta
+                    # by truncating the URL down to just dc://{catalog-name}
+                    # and rebuilding all path elements from there.
+                    writer_url.write(metafile)
+            else:
+                raise NotImplementedError(
+                    f"Expected {METAFILE_DATA_COLUMN_NAME} and "
+                    f"{METAFILE_TYPE_COLUMN_NAME} columns in the input block, "
+                    f"but found {pa_table.column_names}."
+                )
+
+    def on_write_complete(
+        self,
+        write_result: WriteResult[List[Metafile]],
+    ):
+        pass
+
+
+"""
+def write(
+    self,
+    blocks: Iterable[Block],
+    ctx: TaskContext,
+) -> List[ObjectRef[DeltacatWriteResult]]:
+    paths, filesystem = resolve_paths_and_filesystem(
+        self.path,
+        self.filesystem,
+    )
+    assert len(paths) == 1, f"Expected 1 write path, found {len(paths)}."
+    path = paths[0]
+    write_results = super().write(blocks)
+    # append a summary of this write operation in the last write result
+    metadata = [BlockAccessor.for_block(_).get_metadata() for _ in blocks]
+    rwr = DeltacatWriteResult()
+    rwr.metadata = metadata
+    rwr.path = path
+    rwr.dataset_uuid = self.dataset_uuid
+    rwr.block_write_path_provider = self.filename_provider
+    rwr.content_type = ContentType.PARQUET.value
+    rwr.content_encoding = ContentEncoding.IDENTITY.value
+    rwr.filesystem = filesystem
+    rwr_obj_ref = ray.put(rwr)
+    write_results.append(rwr_obj_ref)
+    return write_results
+
+def on_write_complete(self, write_results: List[Any], **kwargs) -> None:
+    # TODO (pdames): time latency of this operation - overall s3 write times
+    # are 2-3x pure read_parquet_fast() times
+    # restore the write operation summary from the last write result
+    result: DeltacatWriteResult = write_results[len(write_results) - 1]
+    write_path_args = result.block_write_path_provider.write_path_kwargs
+    blocks_written = len(write_path_args)
+    expected_blocks_written = len(result.metadata)
+    # TODO(pdames): Corner cases where mismatch is expected? Emply blocks?
+    # Blocks filtered/split/merged to more/less write paths?
+    assert blocks_written == expected_blocks_written, (
+        f"Dataset write result validation failed. Found "
+        f"{blocks_written}/{expected_blocks_written} Dataset blocks "
+        f"written. Refusing to commit DeltaCAT Manifest."
+    )
+    manifest_entries = ManifestEntryList()
+    for block_idx, path in enumerate(write_path_args.keys()):
+        file_info = result.filesystem.get_file_info(path)
+        if file_info.type == pyarrow.fs.FileType.File:
+            content_length = file_info.size
+        else:
+            raise FileNotFoundError(ENOENT, strerror(ENOENT), path)
+        num_rows = result.metadata[block_idx].num_rows
+        source_content_length = result.metadata[block_idx].size_bytes
+        manifest_entry_meta = ManifestMeta.of(
+            int(num_rows) if num_rows is not None else None,
+            int(content_length) if content_length is not None else None,
+            result.content_type,
+            result.content_encoding,
+            int(source_content_length) if source_content_length else None,
+        )
+        parsed_url = parse_s3_url(path)
+        manifest_entry = ManifestEntry.of(
+            parsed_url.url,
+            manifest_entry_meta,
+        )
+        manifest_entries.append(manifest_entry)
+    manifest = Manifest.of(manifest_entries)
+    manifest_path = f"{result.path}/manifest"
+    logger.debug(f"Write succeeded for Dataset ID: {result.dataset_uuid}")
+    with result.filesystem.open_output_stream(manifest_path) as f:
+        f.write(json.dumps(manifest).encode("utf-8"))
+    logger.debug(f"Manifest committed to: {manifest_path}")
+"""
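A hedged sketch of how this datasink is driven through Ray Data. It assumes ds is a Ray Dataset whose blocks already carry the METAFILE_DATA/METAFILE_TYPE columns (for example, one produced by the DeltaCAT datasource added in this release), and it assumes DeltaCatUrl accepts a "dc://{catalog-name}"-style URL string as hinted by the TODO comment above; neither assumption is verified here:

    from deltacat.io.datasink.deltacat_datasink import DeltaCatDatasink
    from deltacat.utils.url import DeltaCatUrl

    # Hypothetical URL construction; see deltacat/utils/url.py (+1325 lines in this
    # release) for the actual DeltaCatUrl API.
    sink = DeltaCatDatasink(DeltaCatUrl("dc://my-catalog"), metadata_only=True)

    # Each block's serialized metafiles are deserialized from msgpack and re-written
    # through a DeltaCatUrlWriter, as implemented in write() above.
    ds.write_datasink(sink)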