deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -2,17 +2,19 @@ from __future__ import annotations
|
|
2
2
|
import importlib
|
3
3
|
import copy
|
4
4
|
import json
|
5
|
-
|
5
|
+
import posixpath
|
6
|
+
from typing import Any, Dict, List, Optional, Set
|
6
7
|
from deltacat.io.object_store import IObjectStore
|
7
8
|
from deltacat.utils.common import ReadKwargsProvider
|
8
9
|
from deltacat.types.media import ContentType
|
9
10
|
from deltacat.utils.placement import PlacementGroupConfig
|
10
11
|
from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
|
11
12
|
from deltacat.storage import (
|
12
|
-
|
13
|
+
metastore,
|
13
14
|
PartitionLocator,
|
14
15
|
SortKey,
|
15
16
|
)
|
17
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
16
18
|
from deltacat.compute.resource_estimation import (
|
17
19
|
ResourceEstimationMethod,
|
18
20
|
EstimateResourcesParams,
|
@@ -52,11 +54,22 @@ class CompactPartitionParams(dict):
|
|
52
54
|
assert (
|
53
55
|
params.get("source_partition_locator") is not None
|
54
56
|
), "source_partition_locator is a required arg"
|
57
|
+
assert params.get("catalog") is not None, "catalog is a required arg"
|
55
58
|
assert (
|
56
|
-
params.get("
|
57
|
-
), "
|
59
|
+
params.get("all_column_names") is not None
|
60
|
+
), "all_column_names is a required arg"
|
58
61
|
|
59
62
|
result = CompactPartitionParams(params)
|
63
|
+
assert (
|
64
|
+
result.destination_partition_locator.partition_id
|
65
|
+
), "destination_partition_locator must have a globally unique partition_id"
|
66
|
+
assert (
|
67
|
+
result.source_partition_locator.partition_id
|
68
|
+
), "source_partition_locator must have a globally unique partition_id"
|
69
|
+
if result.rebase_source_partition_locator:
|
70
|
+
assert (
|
71
|
+
result.rebase_source_partition_locator.partition_id
|
72
|
+
), "rebase_source_partition_locator must have a globally unique partition_id"
|
60
73
|
|
61
74
|
result.records_per_compacted_file = params.get(
|
62
75
|
"records_per_compacted_file", MAX_RECORDS_PER_COMPACTED_FILE
|
@@ -65,15 +78,18 @@ class CompactPartitionParams(dict):
|
|
65
78
|
"compacted_file_content_type", ContentType.PARQUET
|
66
79
|
)
|
67
80
|
result.object_store = params.get("object_store", RayPlasmaObjectStore())
|
81
|
+
result.table_writer_kwargs = params.get("table_writer_kwargs", {})
|
68
82
|
|
69
83
|
result.enable_profiler = params.get("enable_profiler", False)
|
70
|
-
result.deltacat_storage = params.get(
|
71
|
-
|
72
|
-
)
|
73
|
-
result.s3_client_kwargs = params.get("s3_client_kwargs", {})
|
84
|
+
result.deltacat_storage = params.get("deltacat_storage", metastore)
|
85
|
+
result.catalog = params.get("catalog")
|
74
86
|
result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
|
75
87
|
result.list_deltas_kwargs = params.get("list_deltas_kwargs", {})
|
76
|
-
result.
|
88
|
+
result.all_column_names = params.get("all_column_names")
|
89
|
+
|
90
|
+
# Add catalog to deltacat_storage_kwargs
|
91
|
+
result.deltacat_storage_kwargs["catalog"] = result.catalog
|
92
|
+
|
77
93
|
result.bit_width_of_sort_keys = validate_sort_keys(
|
78
94
|
result.source_partition_locator,
|
79
95
|
result.sort_keys,
|
@@ -133,6 +149,8 @@ class CompactPartitionParams(dict):
|
|
133
149
|
if result.primary_keys:
|
134
150
|
result.primary_keys = sorted(result.primary_keys)
|
135
151
|
|
152
|
+
result.original_fields = params.get("original_fields")
|
153
|
+
|
136
154
|
# assertions
|
137
155
|
assert (
|
138
156
|
result.source_partition_locator.partition_values
|
@@ -177,21 +195,32 @@ class CompactPartitionParams(dict):
|
|
177
195
|
self["source_partition_locator"] = locator
|
178
196
|
|
179
197
|
@property
|
180
|
-
def
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
self["compaction_artifact_s3_bucket"] = s3_bucket
|
198
|
+
def compaction_artifact_path(self) -> str:
|
199
|
+
"""
|
200
|
+
Returns the compaction artifact path based on catalog root.
|
201
|
+
"""
|
202
|
+
return posixpath.join(self.catalog.root, "compute", "compactor")
|
186
203
|
|
187
204
|
@property
|
188
|
-
def deltacat_storage(self) ->
|
205
|
+
def deltacat_storage(self) -> metastore:
|
189
206
|
return self["deltacat_storage"]
|
190
207
|
|
191
208
|
@deltacat_storage.setter
|
192
|
-
def deltacat_storage(self, storage:
|
209
|
+
def deltacat_storage(self, storage: metastore) -> None:
|
193
210
|
self["deltacat_storage"] = storage
|
194
211
|
|
212
|
+
@property
|
213
|
+
def catalog(self) -> CatalogProperties:
|
214
|
+
return self["catalog"]
|
215
|
+
|
216
|
+
@catalog.setter
|
217
|
+
def catalog(self, catalog: CatalogProperties) -> None:
|
218
|
+
self["catalog"] = catalog
|
219
|
+
# Update deltacat_storage_kwargs when catalog is set
|
220
|
+
if "deltacat_storage_kwargs" not in self:
|
221
|
+
self["deltacat_storage_kwargs"] = {}
|
222
|
+
self["deltacat_storage_kwargs"]["catalog"] = catalog
|
223
|
+
|
195
224
|
@property
|
196
225
|
def object_store(self) -> IObjectStore:
|
197
226
|
return self["object_store"]
|
@@ -286,14 +315,6 @@ class CompactPartitionParams(dict):
|
|
286
315
|
def list_deltas_kwargs(self, kwargs: dict) -> None:
|
287
316
|
self["list_deltas_kwargs"] = kwargs
|
288
317
|
|
289
|
-
@property
|
290
|
-
def s3_table_writer_kwargs(self) -> dict:
|
291
|
-
return self["s3_table_writer_kwargs"]
|
292
|
-
|
293
|
-
@s3_table_writer_kwargs.setter
|
294
|
-
def s3_table_writer_kwargs(self, kwargs: dict) -> None:
|
295
|
-
self["s3_table_writer_kwargs"] = kwargs
|
296
|
-
|
297
318
|
@property
|
298
319
|
def deltacat_storage_kwargs(self) -> dict:
|
299
320
|
return self["deltacat_storage_kwargs"]
|
@@ -303,12 +324,12 @@ class CompactPartitionParams(dict):
|
|
303
324
|
self["deltacat_storage_kwargs"] = kwargs
|
304
325
|
|
305
326
|
@property
|
306
|
-
def
|
307
|
-
return self
|
327
|
+
def all_column_names(self) -> List[str]:
|
328
|
+
return self.get("all_column_names")
|
308
329
|
|
309
|
-
@
|
310
|
-
def
|
311
|
-
self["
|
330
|
+
@all_column_names.setter
|
331
|
+
def all_column_names(self, column_names: List[str]) -> None:
|
332
|
+
self["all_column_names"] = column_names
|
312
333
|
|
313
334
|
@property
|
314
335
|
def records_per_compacted_file(self) -> int:
|
@@ -489,6 +510,30 @@ class CompactPartitionParams(dict):
|
|
489
510
|
average_record_size_bytes=self.average_record_size_bytes,
|
490
511
|
)
|
491
512
|
|
513
|
+
@property
|
514
|
+
def table_writer_kwargs(self) -> dict:
|
515
|
+
return self["table_writer_kwargs"]
|
516
|
+
|
517
|
+
@table_writer_kwargs.setter
|
518
|
+
def table_writer_kwargs(self, kwargs: dict) -> None:
|
519
|
+
self["table_writer_kwargs"] = kwargs
|
520
|
+
|
521
|
+
@property
|
522
|
+
def expected_previous_partition_id(self) -> Optional[str]:
|
523
|
+
return self.get("expected_previous_partition_id")
|
524
|
+
|
525
|
+
@expected_previous_partition_id.setter
|
526
|
+
def expected_previous_partition_id(self, partition_id: Optional[str]) -> None:
|
527
|
+
self["expected_previous_partition_id"] = partition_id
|
528
|
+
|
529
|
+
@property
|
530
|
+
def original_fields(self) -> Optional[Set[str]]:
|
531
|
+
return self.get("original_fields")
|
532
|
+
|
533
|
+
@original_fields.setter
|
534
|
+
def original_fields(self, fields: Optional[Set[str]]) -> None:
|
535
|
+
self["original_fields"] = fields
|
536
|
+
|
492
537
|
@staticmethod
|
493
538
|
def json_handler_for_compact_partition_params(obj):
|
494
539
|
"""
|
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
from typing import Optional
|
4
4
|
import pyarrow as pa
|
5
5
|
import logging
|
6
|
+
from pathlib import PosixPath
|
6
7
|
from deltacat import logs
|
7
8
|
from typing import List, Union
|
8
9
|
from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
|
@@ -919,3 +920,19 @@ class CompactionSessionAuditInfo(dict):
|
|
919
920
|
)
|
920
921
|
|
921
922
|
self.set_pyarrow_version(pa.__version__)
|
923
|
+
|
924
|
+
def to_serializable(self, catalog_root: str) -> CompactionSessionAuditInfo:
|
925
|
+
root_path = PosixPath(catalog_root)
|
926
|
+
target_path = PosixPath(self.audit_url)
|
927
|
+
if root_path == target_path:
|
928
|
+
raise ValueError(
|
929
|
+
"Target and root are identical, but expected target to be a child of root."
|
930
|
+
)
|
931
|
+
try:
|
932
|
+
relative_path = target_path.relative_to(root_path)
|
933
|
+
# Create a copy of the audit info with the relative path
|
934
|
+
audit_copy = CompactionSessionAuditInfo(**dict(self))
|
935
|
+
audit_copy["auditUrl"] = str(relative_path)
|
936
|
+
return audit_copy
|
937
|
+
except ValueError:
|
938
|
+
raise ValueError("Expected target to be a child of root.")
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
from typing import
|
4
|
+
from typing import Tuple, Union
|
5
5
|
from deltacat.storage import DeltaLocator, PartitionLocator
|
6
6
|
from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
|
7
7
|
from typing import Any, Dict, Optional
|
@@ -10,7 +10,7 @@ from typing import Any, Dict, Optional
|
|
10
10
|
class HighWatermark(dict):
|
11
11
|
"""
|
12
12
|
Inherit from dict to make it easy for serialization/deserialization.
|
13
|
-
Keep both partition locator and high watermark as a tuple to be persisted in the
|
13
|
+
Keep both partition locator and high watermark as a tuple to be persisted in the rci
|
14
14
|
"""
|
15
15
|
|
16
16
|
def set(self, partition_locator: PartitionLocator, delta_stream_position: int):
|
@@ -46,6 +46,7 @@ class RoundCompletionInfo(dict):
|
|
46
46
|
compactor_version: Optional[str] = None,
|
47
47
|
input_inflation: Optional[float] = None,
|
48
48
|
input_average_record_size_bytes: Optional[float] = None,
|
49
|
+
prev_source_partition_locator: Optional[PartitionLocator] = None,
|
49
50
|
) -> RoundCompletionInfo:
|
50
51
|
|
51
52
|
rci = RoundCompletionInfo()
|
@@ -63,6 +64,7 @@ class RoundCompletionInfo(dict):
|
|
63
64
|
rci["compactorVersion"] = compactor_version
|
64
65
|
rci["inputInflation"] = input_inflation
|
65
66
|
rci["inputAverageRecordSizeBytes"] = input_average_record_size_bytes
|
67
|
+
rci["prevSourcePartitionLocator"] = prev_source_partition_locator
|
66
68
|
return rci
|
67
69
|
|
68
70
|
@property
|
@@ -100,7 +102,11 @@ class RoundCompletionInfo(dict):
|
|
100
102
|
|
101
103
|
@property
|
102
104
|
def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
|
103
|
-
|
105
|
+
val = self.get("rebaseSourcePartitionLocator")
|
106
|
+
if val is not None and not isinstance(val, PartitionLocator):
|
107
|
+
val = PartitionLocator(val)
|
108
|
+
self["rebaseSourcePartitionLocator"] = val # Cache the converted value
|
109
|
+
return val
|
104
110
|
|
105
111
|
@property
|
106
112
|
def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
|
@@ -129,6 +135,10 @@ class RoundCompletionInfo(dict):
|
|
129
135
|
def input_average_record_size_bytes(self) -> Optional[float]:
|
130
136
|
return self.get("inputAverageRecordSizeBytes")
|
131
137
|
|
132
|
-
@
|
133
|
-
def
|
134
|
-
|
138
|
+
@property
|
139
|
+
def prev_source_partition_locator(self) -> Optional[PartitionLocator]:
|
140
|
+
val = self.get("prevSourcePartitionLocator")
|
141
|
+
if val is not None and not isinstance(val, PartitionLocator):
|
142
|
+
val = PartitionLocator(val)
|
143
|
+
self["prevSourcePartitionLocator"] = val # Cache the converted value
|
144
|
+
return val
|
@@ -21,14 +21,13 @@ from deltacat.utils.placement import PlacementGroupConfig
|
|
21
21
|
from typing import List, Optional, Dict, Any
|
22
22
|
from deltacat.utils.ray_utils.runtime import live_node_resource_keys
|
23
23
|
from deltacat.compute.compactor.utils import io
|
24
|
-
from deltacat.compute.compactor.utils import round_completion_file as rcf
|
25
24
|
from deltacat.compute.compactor.steps import repartition as repar
|
26
25
|
from deltacat.compute.compactor.steps.repartition import RepartitionType
|
27
26
|
from deltacat.storage import (
|
28
27
|
Delta,
|
29
28
|
DeltaLocator,
|
30
29
|
PartitionLocator,
|
31
|
-
|
30
|
+
metastore,
|
32
31
|
)
|
33
32
|
from deltacat.utils.metrics import MetricsConfig
|
34
33
|
from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
|
@@ -41,7 +40,6 @@ def repartition(
|
|
41
40
|
source_partition_locator: PartitionLocator,
|
42
41
|
destination_partition_locator: PartitionLocator,
|
43
42
|
repartition_args: Any,
|
44
|
-
repartition_completion_file_s3_url: str,
|
45
43
|
last_stream_position_to_compact: int,
|
46
44
|
repartition_type: RepartitionType = RepartitionType.RANGE,
|
47
45
|
sort_keys: List[SortKey] = None,
|
@@ -54,9 +52,8 @@ def repartition(
|
|
54
52
|
pg_config: Optional[PlacementGroupConfig] = None,
|
55
53
|
list_deltas_kwargs: Optional[Dict[str, Any]] = None,
|
56
54
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
57
|
-
|
58
|
-
|
59
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
55
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
56
|
+
deltacat_storage=metastore,
|
60
57
|
**kwargs,
|
61
58
|
) -> Optional[str]:
|
62
59
|
|
@@ -132,7 +129,7 @@ def repartition(
|
|
132
129
|
enable_profiler=enable_profiler,
|
133
130
|
metrics_config=metrics_config,
|
134
131
|
read_kwargs_provider=read_kwargs_provider,
|
135
|
-
|
132
|
+
table_writer_kwargs=table_writer_kwargs,
|
136
133
|
repartitioned_file_content_type=repartitioned_file_content_type,
|
137
134
|
deltacat_storage=deltacat_storage,
|
138
135
|
)
|
@@ -153,9 +150,6 @@ def repartition(
|
|
153
150
|
compacted_delta = deltacat_storage.commit_delta(
|
154
151
|
merged_delta, properties=kwargs.get("properties", {})
|
155
152
|
)
|
156
|
-
deltacat_storage.commit_partition(partition)
|
157
|
-
logger.info(f"Committed final delta: {compacted_delta}")
|
158
|
-
logger.info(f"Job run completed successfully!")
|
159
153
|
new_compacted_delta_locator = DeltaLocator.of(
|
160
154
|
new_compacted_partition_locator,
|
161
155
|
compacted_delta.stream_position,
|
@@ -173,14 +167,7 @@ def repartition(
|
|
173
167
|
bit_width_of_sort_keys,
|
174
168
|
None,
|
175
169
|
)
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
None,
|
181
|
-
None,
|
182
|
-
None,
|
183
|
-
repartition_completion_info,
|
184
|
-
repartition_completion_file_s3_url,
|
185
|
-
**s3_client_kwargs,
|
186
|
-
)
|
170
|
+
partition.compaction_round_completion_info = repartition_completion_info
|
171
|
+
deltacat_storage.commit_partition(partition)
|
172
|
+
logger.info(f"Committed final delta: {compacted_delta}")
|
173
|
+
logger.info(f"Job run completed successfully!")
|
@@ -21,7 +21,7 @@ from deltacat.compute.compactor.utils.primary_key_index import (
|
|
21
21
|
group_hash_bucket_indices,
|
22
22
|
group_record_indices_by_hash_bucket,
|
23
23
|
)
|
24
|
-
from deltacat.storage import
|
24
|
+
from deltacat.storage import metastore
|
25
25
|
from deltacat.types.media import StorageType
|
26
26
|
from deltacat.utils.common import sha1_digest
|
27
27
|
from deltacat.utils.ray_utils.runtime import (
|
@@ -90,7 +90,7 @@ def _group_file_records_by_pk_hash_bucket(
|
|
90
90
|
sort_key_names: List[str],
|
91
91
|
is_src_delta: np.bool_ = True,
|
92
92
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
93
|
-
deltacat_storage=
|
93
|
+
deltacat_storage=metastore,
|
94
94
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
95
95
|
**kwargs,
|
96
96
|
) -> Tuple[Optional[DeltaFileEnvelopeGroups], int]:
|
@@ -139,7 +139,7 @@ def _read_delta_file_envelopes(
|
|
139
139
|
primary_keys: List[str],
|
140
140
|
sort_key_names: List[str],
|
141
141
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
142
|
-
deltacat_storage=
|
142
|
+
deltacat_storage=metastore,
|
143
143
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
144
144
|
**kwargs,
|
145
145
|
) -> Tuple[Optional[List[DeltaFileEnvelope]], int]:
|
@@ -190,7 +190,7 @@ def _timed_hash_bucket(
|
|
190
190
|
enable_profiler: bool,
|
191
191
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
192
192
|
object_store: Optional[IObjectStore] = None,
|
193
|
-
deltacat_storage=
|
193
|
+
deltacat_storage=metastore,
|
194
194
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
195
195
|
**kwargs,
|
196
196
|
):
|
@@ -249,7 +249,7 @@ def hash_bucket(
|
|
249
249
|
metrics_config: MetricsConfig,
|
250
250
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
251
251
|
object_store: Optional[IObjectStore],
|
252
|
-
deltacat_storage=
|
252
|
+
deltacat_storage=metastore,
|
253
253
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
254
254
|
**kwargs,
|
255
255
|
) -> HashBucketResult:
|
@@ -29,7 +29,7 @@ from deltacat.storage import (
|
|
29
29
|
ManifestEntryList,
|
30
30
|
)
|
31
31
|
from deltacat.storage.model.manifest import Manifest
|
32
|
-
|
32
|
+
|
33
33
|
from deltacat.utils.common import ReadKwargsProvider
|
34
34
|
from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
|
35
35
|
from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
|
@@ -46,6 +46,7 @@ from deltacat.utils.ray_utils.runtime import (
|
|
46
46
|
)
|
47
47
|
from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
|
48
48
|
from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
|
49
|
+
from deltacat.storage import metastore
|
49
50
|
|
50
51
|
if importlib.util.find_spec("memray"):
|
51
52
|
import memray
|
@@ -67,9 +68,9 @@ def materialize(
|
|
67
68
|
metrics_config: MetricsConfig,
|
68
69
|
schema: Optional[pa.Schema] = None,
|
69
70
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
70
|
-
|
71
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
71
72
|
object_store: Optional[IObjectStore] = None,
|
72
|
-
deltacat_storage=
|
73
|
+
deltacat_storage=metastore,
|
73
74
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
74
75
|
):
|
75
76
|
if deltacat_storage_kwargs is None:
|
@@ -78,11 +79,11 @@ def materialize(
|
|
78
79
|
def _stage_delta_from_manifest_entry_reference_list(
|
79
80
|
manifest_entry_list_reference: List[ManifestEntry],
|
80
81
|
partition: Partition,
|
81
|
-
delta_type: DeltaType = DeltaType.
|
82
|
+
delta_type: DeltaType = DeltaType.APPEND,
|
82
83
|
) -> Delta:
|
83
84
|
assert (
|
84
|
-
delta_type == DeltaType.
|
85
|
-
), "
|
85
|
+
delta_type == DeltaType.APPEND
|
86
|
+
), "Compaction should always produce APPEND deltas for consistent read operations!"
|
86
87
|
manifest = Manifest.of(
|
87
88
|
entries=ManifestEntryList.of(manifest_entry_list_reference),
|
88
89
|
uuid=str(uuid4()),
|
@@ -110,9 +111,10 @@ def materialize(
|
|
110
111
|
deltacat_storage.stage_delta,
|
111
112
|
compacted_table,
|
112
113
|
partition,
|
114
|
+
delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
|
113
115
|
max_records_per_entry=max_records_per_output_file,
|
114
116
|
content_type=compacted_file_content_type,
|
115
|
-
|
117
|
+
table_writer_kwargs=table_writer_kwargs,
|
116
118
|
**deltacat_storage_kwargs,
|
117
119
|
)
|
118
120
|
compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
|
@@ -10,7 +10,7 @@ import ray
|
|
10
10
|
from deltacat import logs
|
11
11
|
from deltacat.compute.compactor import DeltaAnnotated
|
12
12
|
from deltacat.compute.compactor.model.repartition_result import RepartitionResult
|
13
|
-
from deltacat.storage import
|
13
|
+
from deltacat.storage import metastore
|
14
14
|
from deltacat.storage import Partition
|
15
15
|
from deltacat.utils.ray_utils.runtime import (
|
16
16
|
get_current_ray_task_id,
|
@@ -19,7 +19,7 @@ from deltacat.utils.ray_utils.runtime import (
|
|
19
19
|
from deltacat.utils.common import ReadKwargsProvider
|
20
20
|
from deltacat.utils.performance import timed_invocation
|
21
21
|
from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
|
22
|
-
from deltacat.storage import Delta
|
22
|
+
from deltacat.storage import Delta, DeltaType
|
23
23
|
from enum import Enum
|
24
24
|
|
25
25
|
if importlib.util.find_spec("memray"):
|
@@ -56,9 +56,9 @@ def repartition_range(
|
|
56
56
|
destination_partition: Partition,
|
57
57
|
repartition_args: dict,
|
58
58
|
max_records_per_output_file: int,
|
59
|
-
|
59
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
60
60
|
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
|
61
|
-
deltacat_storage=
|
61
|
+
deltacat_storage=metastore,
|
62
62
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
63
63
|
**kwargs,
|
64
64
|
):
|
@@ -144,9 +144,10 @@ def repartition_range(
|
|
144
144
|
partition_delta: Delta = deltacat_storage.stage_delta(
|
145
145
|
partition_table,
|
146
146
|
destination_partition,
|
147
|
+
delta_type=DeltaType.APPEND, # Repartition always produces APPEND deltas
|
147
148
|
max_records_per_entry=max_records_per_output_file,
|
148
149
|
content_type=repartitioned_file_content_type,
|
149
|
-
|
150
|
+
table_writer_kwargs=table_writer_kwargs,
|
150
151
|
**deltacat_storage_kwargs,
|
151
152
|
)
|
152
153
|
partition_deltas.append(partition_delta)
|
@@ -168,9 +169,9 @@ def _timed_repartition(
|
|
168
169
|
max_records_per_output_file: int,
|
169
170
|
enable_profiler: bool,
|
170
171
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
171
|
-
|
172
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
172
173
|
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
|
173
|
-
deltacat_storage=
|
174
|
+
deltacat_storage=metastore,
|
174
175
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
175
176
|
**kwargs,
|
176
177
|
) -> RepartitionResult:
|
@@ -192,7 +193,7 @@ def _timed_repartition(
|
|
192
193
|
destination_partition=destination_partition,
|
193
194
|
repartition_args=repartition_args,
|
194
195
|
max_records_per_output_file=max_records_per_output_file,
|
195
|
-
|
196
|
+
table_writer_kwargs=table_writer_kwargs,
|
196
197
|
repartitioned_file_content_type=repartitioned_file_content_type,
|
197
198
|
deltacat_storage=deltacat_storage,
|
198
199
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
@@ -213,9 +214,9 @@ def repartition(
|
|
213
214
|
enable_profiler: bool,
|
214
215
|
metrics_config: Optional[MetricsConfig],
|
215
216
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
216
|
-
|
217
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
217
218
|
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
|
218
|
-
deltacat_storage=
|
219
|
+
deltacat_storage=metastore,
|
219
220
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
220
221
|
**kwargs,
|
221
222
|
) -> RepartitionResult:
|
@@ -231,7 +232,7 @@ def repartition(
|
|
231
232
|
max_records_per_output_file=max_records_per_output_file,
|
232
233
|
enable_profiler=enable_profiler,
|
233
234
|
read_kwargs_provider=read_kwargs_provider,
|
234
|
-
|
235
|
+
table_writer_kwargs=table_writer_kwargs,
|
235
236
|
repartitioned_file_content_type=repartitioned_file_content_type,
|
236
237
|
deltacat_storage=deltacat_storage,
|
237
238
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
@@ -11,7 +11,7 @@ from deltacat.storage import (
|
|
11
11
|
PartitionLocator,
|
12
12
|
Delta,
|
13
13
|
ManifestEntry,
|
14
|
-
|
14
|
+
metastore,
|
15
15
|
)
|
16
16
|
from deltacat import logs
|
17
17
|
from deltacat.compute.compactor import DeltaAnnotated
|
@@ -31,12 +31,13 @@ def discover_deltas(
|
|
31
31
|
compacted_partition_locator: Optional[PartitionLocator],
|
32
32
|
rebase_source_partition_locator: Optional[PartitionLocator],
|
33
33
|
rebase_source_partition_high_watermark: Optional[int],
|
34
|
-
deltacat_storage=
|
34
|
+
deltacat_storage=metastore,
|
35
35
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
36
36
|
list_deltas_kwargs: Optional[Dict[str, Any]] = {},
|
37
37
|
) -> Tuple[List[Delta], int]:
|
38
38
|
if deltacat_storage_kwargs is None:
|
39
39
|
deltacat_storage_kwargs = {}
|
40
|
+
|
40
41
|
# Source One: new deltas from uncompacted table for incremental compaction or deltas from compacted table for rebase
|
41
42
|
start_position_exclusive = (
|
42
43
|
high_watermark.get(source_partition_locator)
|
@@ -109,7 +110,7 @@ def limit_input_deltas(
|
|
109
110
|
user_hash_bucket_chunk_size: int,
|
110
111
|
input_deltas_stats: Dict[int, DeltaStats],
|
111
112
|
compaction_audit: CompactionSessionAuditInfo,
|
112
|
-
deltacat_storage=
|
113
|
+
deltacat_storage=metastore,
|
113
114
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
114
115
|
**kwargs,
|
115
116
|
) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
|
@@ -272,7 +273,7 @@ def fit_input_deltas(
|
|
272
273
|
cluster_resources: Dict[str, float],
|
273
274
|
compaction_audit: CompactionSessionAuditInfo,
|
274
275
|
hash_bucket_count: Optional[int],
|
275
|
-
deltacat_storage=
|
276
|
+
deltacat_storage=metastore,
|
276
277
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
277
278
|
**kwargs,
|
278
279
|
) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
|
@@ -359,7 +360,7 @@ def _discover_deltas(
|
|
359
360
|
source_partition_locator: PartitionLocator,
|
360
361
|
start_position_exclusive: Optional[int],
|
361
362
|
end_position_inclusive: Optional[int],
|
362
|
-
deltacat_storage=
|
363
|
+
deltacat_storage=metastore,
|
363
364
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
364
365
|
list_deltas_kwargs: Optional[Dict[str, Any]] = {},
|
365
366
|
) -> List[Delta]:
|