deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@ import functools
|
|
3
3
|
from deltacat.storage import (
|
4
4
|
PartitionLocator,
|
5
5
|
Delta,
|
6
|
-
|
6
|
+
metastore,
|
7
7
|
)
|
8
8
|
from deltacat import logs
|
9
9
|
from deltacat.compute.compactor.utils import io as io_v1
|
@@ -38,7 +38,7 @@ def discover_deltas(
|
|
38
38
|
rebase_source_partition_locator: Optional[PartitionLocator] = None,
|
39
39
|
rebase_source_partition_high_watermark: Optional[int] = None,
|
40
40
|
rcf_high_watermark: Optional[int] = None,
|
41
|
-
deltacat_storage=
|
41
|
+
deltacat_storage=metastore,
|
42
42
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
43
43
|
list_deltas_kwargs: Optional[Dict[str, Any]] = {},
|
44
44
|
) -> List[Delta]:
|
@@ -67,6 +67,11 @@ def discover_deltas(
|
|
67
67
|
f"Length of input deltas from delta source table is {len(delta_source_incremental_deltas)}"
|
68
68
|
f" from ({previous_compacted_high_watermark}, {last_stream_position_to_compact}]"
|
69
69
|
)
|
70
|
+
logger.info(f"DEBUG: source_partition_locator = {source_partition_locator}")
|
71
|
+
logger.info(
|
72
|
+
f"DEBUG: source_partition_locator.partition_id = {getattr(source_partition_locator, 'partition_id', 'NO_PARTITION_ID')}"
|
73
|
+
)
|
74
|
+
logger.info(f"DEBUG: total input deltas found = {len(result)}")
|
70
75
|
|
71
76
|
if rebase_source_partition_locator:
|
72
77
|
previous_compacted_deltas = io_v1._discover_deltas(
|
@@ -93,7 +98,8 @@ def create_uniform_input_deltas(
|
|
93
98
|
hash_bucket_count: int,
|
94
99
|
compaction_audit: CompactionSessionAuditInfo,
|
95
100
|
compact_partition_params: CompactPartitionParams,
|
96
|
-
|
101
|
+
all_column_names: List[str],
|
102
|
+
deltacat_storage=metastore,
|
97
103
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
98
104
|
) -> List[DeltaAnnotated]:
|
99
105
|
|
@@ -101,7 +107,6 @@ def create_uniform_input_deltas(
|
|
101
107
|
delta_manifest_entries_count = 0
|
102
108
|
estimated_da_bytes = 0
|
103
109
|
input_da_list = []
|
104
|
-
|
105
110
|
for delta in input_deltas:
|
106
111
|
if (
|
107
112
|
compact_partition_params.enable_input_split
|
@@ -114,10 +119,12 @@ def create_uniform_input_deltas(
|
|
114
119
|
)
|
115
120
|
append_content_type_params(
|
116
121
|
delta=delta,
|
122
|
+
all_column_names=all_column_names,
|
117
123
|
deltacat_storage=deltacat_storage,
|
118
124
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
119
125
|
task_max_parallelism=compact_partition_params.task_max_parallelism,
|
120
126
|
max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
|
127
|
+
file_reader_kwargs_provider=compact_partition_params.read_kwargs_provider,
|
121
128
|
)
|
122
129
|
|
123
130
|
manifest_entries = delta.manifest.entries
|
@@ -23,6 +23,7 @@ from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
|
|
23
23
|
|
24
24
|
from deltacat.utils.performance import timed_invocation
|
25
25
|
from deltacat.storage import (
|
26
|
+
DeltaType,
|
26
27
|
Partition,
|
27
28
|
)
|
28
29
|
from deltacat.compute.compactor_v2.deletes.delete_strategy import (
|
@@ -47,13 +48,21 @@ def materialize(
|
|
47
48
|
# TODO (pdames): compare performance to pandas-native materialize path
|
48
49
|
df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
|
49
50
|
compacted_table = df
|
51
|
+
# Extract schema from table_writer_kwargs to pass as direct parameter
|
52
|
+
# This ensures schema_id is properly set in the manifest
|
53
|
+
schema = None
|
54
|
+
if input.table_writer_kwargs and "schema" in input.table_writer_kwargs:
|
55
|
+
schema = input.table_writer_kwargs["schema"]
|
56
|
+
|
50
57
|
delta, stage_delta_time = timed_invocation(
|
51
58
|
input.deltacat_storage.stage_delta,
|
52
59
|
compacted_table,
|
53
60
|
input.write_to_partition,
|
61
|
+
delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
|
54
62
|
max_records_per_entry=input.max_records_per_output_file,
|
55
63
|
content_type=input.compacted_file_content_type,
|
56
|
-
|
64
|
+
schema=schema, # Pass schema as direct parameter for schema_id extraction
|
65
|
+
table_writer_kwargs=input.table_writer_kwargs,
|
57
66
|
**input.deltacat_storage_kwargs,
|
58
67
|
)
|
59
68
|
compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
|
@@ -112,6 +121,7 @@ def generate_local_merge_input(
|
|
112
121
|
return MergeInput.of(
|
113
122
|
merge_file_groups_provider=LocalMergeFileGroupsProvider(
|
114
123
|
annotated_deltas,
|
124
|
+
all_column_names=params.all_column_names,
|
115
125
|
read_kwargs_provider=params.read_kwargs_provider,
|
116
126
|
deltacat_storage=params.deltacat_storage,
|
117
127
|
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
@@ -119,12 +129,13 @@ def generate_local_merge_input(
|
|
119
129
|
write_to_partition=compacted_partition,
|
120
130
|
compacted_file_content_type=params.compacted_file_content_type,
|
121
131
|
primary_keys=params.primary_keys,
|
132
|
+
all_column_names=params.all_column_names,
|
122
133
|
sort_keys=params.sort_keys,
|
123
134
|
drop_duplicates=params.drop_duplicates,
|
124
135
|
max_records_per_output_file=params.records_per_compacted_file,
|
125
136
|
enable_profiler=params.enable_profiler,
|
126
137
|
metrics_config=params.metrics_config,
|
127
|
-
|
138
|
+
table_writer_kwargs=params.table_writer_kwargs,
|
128
139
|
read_kwargs_provider=params.read_kwargs_provider,
|
129
140
|
round_completion_info=round_completion_info,
|
130
141
|
object_store=params.object_store,
|
@@ -133,4 +144,6 @@ def generate_local_merge_input(
|
|
133
144
|
delete_strategy=delete_strategy,
|
134
145
|
delete_file_envelopes=delete_file_envelopes,
|
135
146
|
disable_copy_by_reference=params.disable_copy_by_reference,
|
147
|
+
hash_bucket_count=params.hash_bucket_count,
|
148
|
+
original_fields=params.original_fields,
|
136
149
|
)
|
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
|
|
10
10
|
TOTAL_BYTES_IN_SHA1_HASH,
|
11
11
|
PK_DELIMITER,
|
12
12
|
MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
|
13
|
+
SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
|
13
14
|
)
|
14
15
|
import time
|
15
16
|
from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
|
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
|
|
48
49
|
f"Found total length of hash column={total_len} and total_size={total_size}"
|
49
50
|
)
|
50
51
|
|
52
|
+
if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
|
53
|
+
logger.info(
|
54
|
+
f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
|
55
|
+
f"Returning False for is_sha1_desired"
|
56
|
+
)
|
57
|
+
return False
|
58
|
+
|
51
59
|
return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
|
52
60
|
|
53
61
|
|
@@ -70,13 +78,25 @@ def _append_table_by_hash_bucket(
|
|
70
78
|
f"Grouping a pki table of length {len(pki_table)} took {groupby_latency}s"
|
71
79
|
)
|
72
80
|
|
81
|
+
hb_pk_grouped_by = hb_pk_grouped_by.sort_by(sc._HASH_BUCKET_IDX_COLUMN_NAME)
|
73
82
|
group_count_array = hb_pk_grouped_by[f"{sc._HASH_BUCKET_IDX_COLUMN_NAME}_count"]
|
74
83
|
hb_group_array = hb_pk_grouped_by[sc._HASH_BUCKET_IDX_COLUMN_NAME]
|
75
84
|
|
76
85
|
result_len = 0
|
77
86
|
for i, group_count in enumerate(group_count_array):
|
78
87
|
hb_idx = hb_group_array[i].as_py()
|
79
|
-
|
88
|
+
group_count_py = group_count.as_py()
|
89
|
+
pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count_py)
|
90
|
+
assert group_count_py == len(
|
91
|
+
pyarrow_table
|
92
|
+
), f"Group count {group_count_py} not equal to {len(pyarrow_table)}"
|
93
|
+
all_buckets = pc.unique(pyarrow_table[sc._HASH_BUCKET_IDX_COLUMN_NAME])
|
94
|
+
assert (
|
95
|
+
len(all_buckets) == 1
|
96
|
+
), f"Only one hash bucket is allowed but found {len(all_buckets)}"
|
97
|
+
assert (
|
98
|
+
all_buckets[0].as_py() == hb_idx
|
99
|
+
), f"Hash bucket not equal, {all_buckets[0]} and {hb_idx}"
|
80
100
|
pyarrow_table = pyarrow_table.drop(
|
81
101
|
[sc._HASH_BUCKET_IDX_COLUMN_NAME, sc._PK_HASH_STRING_COLUMN_NAME]
|
82
102
|
)
|
@@ -108,9 +128,10 @@ def _optimized_group_record_batches_by_hash_bucket(
|
|
108
128
|
record_batches = []
|
109
129
|
result_len = 0
|
110
130
|
for record_batch in table_batches:
|
111
|
-
|
112
|
-
|
113
|
-
|
131
|
+
if (
|
132
|
+
record_batches
|
133
|
+
and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
|
134
|
+
):
|
114
135
|
logger.info(
|
115
136
|
f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
|
116
137
|
f"is {len(record_batches)} and size {current_bytes}"
|
@@ -128,6 +149,9 @@ def _optimized_group_record_batches_by_hash_bucket(
|
|
128
149
|
current_bytes = 0
|
129
150
|
record_batches.clear()
|
130
151
|
|
152
|
+
current_bytes += record_batch.nbytes
|
153
|
+
record_batches.append(record_batch)
|
154
|
+
|
131
155
|
if record_batches:
|
132
156
|
appended_len, append_latency = timed_invocation(
|
133
157
|
_append_table_by_hash_bucket,
|
@@ -1,12 +1,17 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import Dict, Optional, List, Tuple, Any
|
3
3
|
from deltacat import logs
|
4
|
+
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
5
|
+
from deltacat.compute.compactor_v2.constants import (
|
6
|
+
AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
|
7
|
+
)
|
4
8
|
from deltacat.compute.compactor_v2.model.merge_file_group import (
|
5
9
|
LocalMergeFileGroupsProvider,
|
6
10
|
)
|
7
11
|
from deltacat.storage import (
|
8
12
|
Manifest,
|
9
|
-
|
13
|
+
ManifestEntry,
|
14
|
+
metastore,
|
10
15
|
)
|
11
16
|
from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
|
12
17
|
from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
|
@@ -72,8 +77,6 @@ def _get_merge_task_options(
|
|
72
77
|
round_completion_info: Optional[RoundCompletionInfo] = None,
|
73
78
|
compacted_delta_manifest: Optional[Manifest] = None,
|
74
79
|
primary_keys: Optional[List[str]] = None,
|
75
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
76
|
-
deltacat_storage_kwargs: Optional[Dict] = {},
|
77
80
|
memory_logs_enabled: Optional[bool] = None,
|
78
81
|
) -> Dict[str, Any]:
|
79
82
|
if (
|
@@ -81,16 +84,27 @@ def _get_merge_task_options(
|
|
81
84
|
and compacted_delta_manifest
|
82
85
|
and round_completion_info.hb_index_to_entry_range
|
83
86
|
):
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
87
|
+
logger.debug_conditional(
|
88
|
+
f"[Merge task {index}]: Using previous compaction rounds to calculate merge memory: {round_completion_info.compacted_pyarrow_write_result}",
|
89
|
+
memory_logs_enabled,
|
90
|
+
)
|
91
|
+
previous_inflation: float = (
|
92
|
+
(
|
93
|
+
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
94
|
+
/ round_completion_info.compacted_pyarrow_write_result.file_bytes
|
95
|
+
)
|
96
|
+
if round_completion_info.compacted_pyarrow_write_result.file_bytes
|
97
|
+
else PYARROW_INFLATION_MULTIPLIER
|
88
98
|
)
|
89
99
|
debug_memory_params["previous_inflation"] = previous_inflation
|
90
100
|
|
91
|
-
average_record_size = (
|
92
|
-
|
93
|
-
|
101
|
+
average_record_size: float = (
|
102
|
+
(
|
103
|
+
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
104
|
+
/ round_completion_info.compacted_pyarrow_write_result.records
|
105
|
+
)
|
106
|
+
if round_completion_info.compacted_pyarrow_write_result.records
|
107
|
+
else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
|
94
108
|
)
|
95
109
|
debug_memory_params["average_record_size"] = average_record_size
|
96
110
|
|
@@ -106,31 +120,36 @@ def _get_merge_task_options(
|
|
106
120
|
str(hb_idx)
|
107
121
|
]
|
108
122
|
for entry_index in range(entry_start, entry_end):
|
109
|
-
entry = compacted_delta_manifest.entries[entry_index]
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
123
|
+
entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
|
124
|
+
current_entry_size: float = (
|
125
|
+
estimate_manifest_entry_size_bytes(
|
126
|
+
entry=entry,
|
127
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
128
|
+
estimate_resources_params=estimate_resources_params,
|
129
|
+
)
|
130
|
+
or 0.0
|
115
131
|
)
|
116
|
-
current_entry_rows =
|
117
|
-
|
118
|
-
|
119
|
-
|
132
|
+
current_entry_rows: int = (
|
133
|
+
estimate_manifest_entry_num_rows(
|
134
|
+
entry=entry,
|
135
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
136
|
+
estimate_resources_params=estimate_resources_params,
|
137
|
+
)
|
138
|
+
or 0
|
120
139
|
)
|
121
|
-
|
140
|
+
# NOTE: We can treat the current_entry_size and current_entry_rows as 0 as a None estimated entry size implies a 0 value
|
122
141
|
data_size += current_entry_size
|
123
142
|
num_rows += current_entry_rows
|
124
|
-
|
125
143
|
if primary_keys:
|
126
|
-
pk_size
|
144
|
+
pk_size: Optional[
|
145
|
+
float
|
146
|
+
] = estimate_manifest_entry_column_size_bytes(
|
127
147
|
entry=entry,
|
128
148
|
columns=primary_keys,
|
129
149
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
130
150
|
estimate_resources_params=estimate_resources_params,
|
131
151
|
)
|
132
|
-
|
133
|
-
if pk_size is None:
|
152
|
+
if not pk_size:
|
134
153
|
pk_size_bytes += current_entry_size
|
135
154
|
else:
|
136
155
|
pk_size_bytes += pk_size
|
@@ -159,7 +178,6 @@ def _get_merge_task_options(
|
|
159
178
|
f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
|
160
179
|
memory_logs_enabled,
|
161
180
|
)
|
162
|
-
|
163
181
|
return _get_task_options(0.01, total_memory, ray_custom_resources)
|
164
182
|
|
165
183
|
|
@@ -255,8 +273,6 @@ def merge_resource_options_provider(
|
|
255
273
|
compacted_delta_manifest: Optional[Manifest] = None,
|
256
274
|
ray_custom_resources: Optional[Dict] = None,
|
257
275
|
primary_keys: Optional[List[str]] = None,
|
258
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
259
|
-
deltacat_storage_kwargs: Optional[Dict] = {},
|
260
276
|
memory_logs_enabled: Optional[bool] = None,
|
261
277
|
**kwargs,
|
262
278
|
) -> Dict:
|
@@ -286,8 +302,6 @@ def merge_resource_options_provider(
|
|
286
302
|
round_completion_info=round_completion_info,
|
287
303
|
compacted_delta_manifest=compacted_delta_manifest,
|
288
304
|
primary_keys=primary_keys,
|
289
|
-
deltacat_storage=deltacat_storage,
|
290
|
-
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
291
305
|
memory_logs_enabled=memory_logs_enabled,
|
292
306
|
estimate_resources_params=estimate_resources_params,
|
293
307
|
)
|
@@ -302,7 +316,7 @@ def local_merge_resource_options_provider(
|
|
302
316
|
compacted_delta_manifest: Optional[Manifest] = None,
|
303
317
|
ray_custom_resources: Optional[Dict] = None,
|
304
318
|
primary_keys: Optional[List[str]] = None,
|
305
|
-
deltacat_storage=
|
319
|
+
deltacat_storage=metastore,
|
306
320
|
deltacat_storage_kwargs: Optional[Dict] = {},
|
307
321
|
memory_logs_enabled: Optional[bool] = None,
|
308
322
|
**kwargs,
|
@@ -328,8 +342,6 @@ def local_merge_resource_options_provider(
|
|
328
342
|
round_completion_info=round_completion_info,
|
329
343
|
compacted_delta_manifest=compacted_delta_manifest,
|
330
344
|
primary_keys=primary_keys,
|
331
|
-
deltacat_storage=deltacat_storage,
|
332
|
-
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
333
345
|
memory_logs_enabled=memory_logs_enabled,
|
334
346
|
estimate_resources_params=estimate_resources_params,
|
335
347
|
)
|
@@ -2,3 +2,8 @@ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM = 4096
|
|
2
2
|
|
3
3
|
# Safe limit ONLY considering CPU limit, typically 32 for a 8x-large worker
|
4
4
|
DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD = 30
|
5
|
+
|
6
|
+
|
7
|
+
# Unique identifier delimiter to ensure different primary key don't end up with same hash when concatenated.
|
8
|
+
# e.g.: pk column a with value: 1, 12; pk column b with value: 12, 1; Without delimiter will both become "121".
|
9
|
+
IDENTIFIER_FIELD_DELIMITER = "c303282d"
|