deltacat 0.1.18b3.tar.gz → 0.1.18b6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deltacat-0.1.18b3/deltacat.egg-info → deltacat-0.1.18b6}/PKG-INFO +1 -1
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/__init__.py +1 -1
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/compaction_session.py +165 -29
- deltacat-0.1.18b6/deltacat/compute/compactor/model/compact_partition_params.py +153 -0
- deltacat-0.1.18b6/deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/dedupe_result.py +3 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_file_locator.py +11 -6
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/materialize_result.py +27 -6
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/round_completion_info.py +9 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/dedupe.py +35 -19
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/hash_bucket.py +41 -16
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/materialize.py +73 -70
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/io.py +15 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/primary_key_index.py +9 -15
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/round_completion_file.py +2 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/system_columns.py +32 -0
- deltacat-0.1.18b6/deltacat/io/file_object_store.py +48 -0
- deltacat-0.1.18b6/deltacat/io/memcached_object_store.py +121 -0
- deltacat-0.1.18b6/deltacat/io/object_store.py +51 -0
- deltacat-0.1.18b6/deltacat/io/ray_plasma_object_store.py +23 -0
- deltacat-0.1.18b6/deltacat/io/redis_object_store.py +114 -0
- deltacat-0.1.18b6/deltacat/io/s3_object_store.py +44 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/delta.py +2 -1
- deltacat-0.1.18b6/deltacat/tests/compactor/test_compact_partition_params.py +237 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/compactor/utils/test_io.py +27 -5
- deltacat-0.1.18b6/deltacat/tests/io/test_file_object_store.py +86 -0
- deltacat-0.1.18b6/deltacat/tests/io/test_memcached_object_store.py +158 -0
- deltacat-0.1.18b6/deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
- deltacat-0.1.18b6/deltacat/tests/io/test_redis_object_store.py +103 -0
- deltacat-0.1.18b6/deltacat/tests/io/test_s3_object_store.py +59 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/utils/test_record_batch_tables.py +1 -1
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/utils/test_resources.py +9 -0
- deltacat-0.1.18b6/deltacat/utils/__init__.py +0 -0
- deltacat-0.1.18b6/deltacat/utils/ray_utils/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/concurrency.py +0 -2
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/resources.py +30 -18
- {deltacat-0.1.18b3 → deltacat-0.1.18b6/deltacat.egg-info}/PKG-INFO +1 -1
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat.egg-info/SOURCES.txt +15 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat.egg-info/requires.txt +2 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/setup.py +2 -0
- deltacat-0.1.18b3/deltacat/io/__init__.py +0 -7
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/LICENSE +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/MANIFEST.in +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/README.md +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/clients.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/constants.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/redshift/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/redshift/model/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/redshift/model/manifest.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/s3u.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/delegate.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/interface.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/model/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/model/catalog.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/model/table_definition.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/repartition_result.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/sort_key.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/repartition_session.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/rehash_bucket.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/rewrite_index.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/repartition.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/config/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/meta_stats.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/partition_stats_dict.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/stats.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/constants.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/io.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/ray_utils.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/basic.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_stats.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/stats_result.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/types.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/intervals.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/io.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/manifest_stats_file.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/constants.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/exceptions.py +0 -0
- {deltacat-0.1.18b3/deltacat/io/aws → deltacat-0.1.18b6/deltacat/io}/__init__.py +0 -0
- {deltacat-0.1.18b3/deltacat/io/aws/redshift → deltacat-0.1.18b6/deltacat/io/aws}/__init__.py +0 -0
- {deltacat-0.1.18b3/deltacat/storage/model → deltacat-0.1.18b6/deltacat/io/aws/redshift}/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/io/aws/redshift/redshift_datasource.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/io/dataset.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/io/read_api.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/logs.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/interface.py +0 -0
- {deltacat-0.1.18b3/deltacat/tests → deltacat-0.1.18b6/deltacat/storage/model}/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/list_result.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/locator.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/namespace.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/partition.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/stream.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/table.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/table_version.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/types.py +0 -0
- {deltacat-0.1.18b3/deltacat/tests/compactor → deltacat-0.1.18b6/deltacat/tests}/__init__.py +0 -0
- {deltacat-0.1.18b3/deltacat/tests/compactor/utils → deltacat-0.1.18b6/deltacat/tests/compactor}/__init__.py +0 -0
- {deltacat-0.1.18b3/deltacat/tests/stats → deltacat-0.1.18b6/deltacat/tests/compactor/utils}/__init__.py +0 -0
- {deltacat-0.1.18b3/deltacat/tests/test_utils → deltacat-0.1.18b6/deltacat/tests/io}/__init__.py +0 -0
- {deltacat-0.1.18b3/deltacat/tests/utils → deltacat-0.1.18b6/deltacat/tests/stats}/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/stats/test_intervals.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/test_repartition.py +0 -0
- {deltacat-0.1.18b3/deltacat/types → deltacat-0.1.18b6/deltacat/tests/test_utils}/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/test_utils/constants.py +0 -0
- {deltacat-0.1.18b3/deltacat → deltacat-0.1.18b6/deltacat/tests}/utils/__init__.py +0 -0
- {deltacat-0.1.18b3/deltacat/utils/ray_utils → deltacat-0.1.18b6/deltacat/types}/__init__.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/types/media.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/types/tables.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/common.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/metrics.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/numpy.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/pandas.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/performance.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/placement.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/pyarrow.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/collections.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/dataset.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/performance.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/runtime.py +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat.egg-info/dependency_links.txt +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat.egg-info/top_level.txt +0 -0
- {deltacat-0.1.18b3 → deltacat-0.1.18b6}/setup.cfg +0 -0
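
The new `deltacat/io` modules introduce a pluggable object store abstraction: an `IObjectStore` base class plus Ray Plasma, Memcached, Redis, S3, and file-backed implementations, and `compact_partition` gains an `object_store` parameter that defaults to `RayPlasmaObjectStore()` (see the diff below). A minimal sketch of selecting the default store, assuming deltacat 0.1.18b6 and Ray are installed; it only constructs the store and does not run a compaction:

```python
from deltacat.io.object_store import IObjectStore
from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore

# RayPlasmaObjectStore() is the default value of the new `object_store`
# parameter on compact_partition; any other IObjectStore implementation
# (Redis, Memcached, S3, file-backed) could be passed in its place.
object_store: IObjectStore = RayPlasmaObjectStore()
print(isinstance(object_store, IObjectStore))  # True
```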
{deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/compaction_session.py (+165 -29)

@@ -3,6 +3,10 @@ from contextlib import nullcontext
 import functools
 import logging
 import ray
+import time
+import json
+from deltacat.aws import s3u as s3_utils
+import deltacat
 from deltacat import logs
 import pyarrow as pa
 from deltacat.compute.compactor import (
@@ -12,6 +16,9 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+from deltacat.io.object_store import IObjectStore
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
+from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.stats.models.delta_stats import DeltaStats
 from deltacat.storage import (
     Delta,
@@ -20,6 +27,9 @@ from deltacat.storage import (
     PartitionLocator,
     interface as unimplemented_deltacat_storage,
 )
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
+)
 from deltacat.utils.ray_utils.concurrency import (
     invoke_parallel,
     round_robin_options_provider,
@@ -37,7 +47,11 @@ from deltacat.utils.placement import PlacementGroupConfig
 from typing import List, Set, Optional, Tuple, Dict, Any
 from collections import defaultdict
 from deltacat.utils.metrics import MetricsConfig
-from deltacat.
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)
+from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
+

 if importlib.util.find_spec("memray"):
     import memray
@@ -100,6 +114,7 @@ def compact_partition(
     list_deltas_kwargs: Optional[Dict[str, Any]] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Optional[str]:
@@ -139,6 +154,7 @@ def compact_partition(
             list_deltas_kwargs,
             read_kwargs_provider,
             s3_table_writer_kwargs,
+            object_store,
             deltacat_storage,
             **kwargs,
         )
@@ -184,10 +200,28 @@ def _execute_compaction_round(
     list_deltas_kwargs: Optional[Dict[str, Any]],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     s3_table_writer_kwargs: Optional[Dict[str, Any]],
+    object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:

+    rcf_source_partition_locator = (
+        rebase_source_partition_locator
+        if rebase_source_partition_locator
+        else source_partition_locator
+    )
+
+    base_audit_url = rcf_source_partition_locator.path(
+        f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
+    )
+    audit_url = f"{base_audit_url}.json"
+
+    logger.info(f"Compaction audit will be written to {audit_url}")
+
+    compaction_audit = CompactionSessionAuditInfo(deltacat.__version__, audit_url)
+
+    compaction_start = time.monotonic()
+
     if not primary_keys:
         # TODO (pdames): run simple rebatch to reduce all deltas into 1 delta
         # with normalized manifest entry sizes
@@ -230,6 +264,7 @@ def _execute_compaction_round(
             f"{node_resource_keys}"
         )

+    compaction_audit.set_cluster_cpu_max(cluster_cpus)
     # create a remote options provider to round-robin tasks across all nodes or allocated bundles
     logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
     round_robin_opt_provider = functools.partial(
@@ -257,6 +292,13 @@
     )
     logger.info(f"Round completion file: {round_completion_info}")

+    enable_manifest_entry_copy_by_reference = (
+        False if rebase_source_partition_locator else True
+    )
+    logger.info(
+        f"Enable manifest entry copy by reference is set to: {enable_manifest_entry_copy_by_reference}"
+    )
+
     # discover input delta files
     # For rebase:
     # Copy the old compacted table to a new destination, plus any new deltas from rebased source
@@ -268,6 +310,7 @@
         round_completion_info.high_watermark if round_completion_info else None
     )

+    delta_discovery_start = time.monotonic()
     (
         input_deltas,
         previous_last_stream_position_compacted_on_destination_table,
@@ -282,6 +325,13 @@
         **list_deltas_kwargs,
     )

+    delta_discovery_end = time.monotonic()
+    compaction_audit.set_delta_discovery_time_in_seconds(
+        delta_discovery_end - delta_discovery_start
+    )
+
+    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+
     if not input_deltas:
         logger.info("No input deltas found to compact.")
         return None, None, None
@@ -298,6 +348,7 @@
         io.fit_input_deltas(
             input_deltas,
             cluster_resources,
+            compaction_audit,
             hash_bucket_count,
             deltacat_storage=deltacat_storage,
         )
@@ -307,11 +358,14 @@
             cluster_resources,
             hash_bucket_count,
             min_hash_bucket_chunk_size,
+            compaction_audit=compaction_audit,
             input_deltas_stats=input_deltas_stats,
             deltacat_storage=deltacat_storage,
         )
     )

+    compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
+
     assert hash_bucket_count is not None and hash_bucket_count > 0, (
         f"Expected hash bucket count to be a positive integer, but found "
         f"`{hash_bucket_count}`"
@@ -335,6 +389,8 @@
         "Multiple rounds are not supported. Please increase the cluster size and run again."
     )

+    hb_start = time.monotonic()
+
     hb_tasks_pending = invoke_parallel(
         items=uniform_deltas,
         ray_task=hb.hash_bucket,
@@ -348,11 +404,28 @@
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )
+
+    hb_invoke_end = time.monotonic()
+
     logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
     hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
     logger.info(f"Got {len(hb_results)} hash bucket results.")
+    hb_end = time.monotonic()
+    hb_results_retrieved_at = time.time()
+
+    telemetry_time_hb = compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
+        hb_results,
+        hb_results_retrieved_at,
+        hb_invoke_end - hb_start,
+        hb_end - hb_start,
+    )
+
+    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+
     all_hash_group_idx_to_obj_id = defaultdict(list)
     for hb_result in hb_results:
         for hash_group_index, object_id in enumerate(
@@ -367,6 +440,8 @@
         f"Got {total_hb_record_count} hash bucket records from hash bucketing step..."
     )

+    compaction_audit.set_input_records(total_hb_record_count.item())
+
     # TODO (pdames): when resources are freed during the last round of hash
     # bucketing, start running dedupe tasks that read existing dedupe
     # output from S3 then wait for hash bucketing to finish before continuing
@@ -389,10 +464,18 @@
     # identify the index of records to keep or drop based on sort keys
     num_materialize_buckets = max_parallelism
     logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")
+
+    dedupe_start = time.monotonic()
+    dd_max_parallelism = int(
+        max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
+    )
+    logger.info(
+        f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
+    )
     dd_tasks_pending = invoke_parallel(
         items=all_hash_group_idx_to_obj_id.values(),
         ray_task=dd.dedupe,
-        max_parallelism=
+        max_parallelism=dd_max_parallelism,
         options_provider=round_robin_opt_provider,
         kwargs_provider=lambda index, item: {
             "dedupe_task_index": index,
@@ -402,12 +485,33 @@
         num_materialize_buckets=num_materialize_buckets,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
+        object_store=object_store,
     )
+
+    dedupe_invoke_end = time.monotonic()
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe results...")
     dd_results: List[DedupeResult] = ray.get(dd_tasks_pending)
     logger.info(f"Got {len(dd_results)} dedupe results.")
+
+    # we use time.time() here because time.monotonic() has no reference point
+    # whereas time.time() measures epoch seconds. Hence, it will be reasonable
+    # to compare time.time()s captured in different nodes.
+    dedupe_results_retrieved_at = time.time()
+    dedupe_end = time.monotonic()
+
     total_dd_record_count = sum([ddr.deduped_record_count for ddr in dd_results])
     logger.info(f"Deduped {total_dd_record_count} records...")
+
+    telemetry_time_dd = compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.DEDUPE_STEP_NAME,
+        dd_results,
+        dedupe_results_retrieved_at,
+        dedupe_invoke_end - dedupe_start,
+        dedupe_end - dedupe_start,
+    )
+
+    compaction_audit.set_records_deduped(total_dd_record_count.item())
+
     all_mat_buckets_to_obj_id = defaultdict(list)
     for dd_result in dd_results:
         for (
@@ -420,6 +524,8 @@
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe result stat(s)...")
     logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")

+    compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
+
     # TODO(pdames): when resources are freed during the last round of deduping
     # start running materialize tasks that read materialization source file
     # tables from S3 then wait for deduping to finish before continuing
@@ -432,6 +538,11 @@

     # parallel step 3:
     # materialize records to keep by index
+
+    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+
+    materialize_start = time.monotonic()
+
     mat_tasks_pending = invoke_parallel(
         items=all_mat_buckets_to_obj_id.items(),
         ray_task=mat.materialize,
@@ -445,38 +556,34 @@
         round_completion_info=round_completion_info,
         source_partition_locator=source_partition_locator,
         partition=partition,
+        enable_manifest_entry_copy_by_reference=enable_manifest_entry_copy_by_reference,
         max_records_per_output_file=records_per_compacted_file,
         compacted_file_content_type=compacted_file_content_type,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
         s3_table_writer_kwargs=s3_table_writer_kwargs,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )
+
+    materialize_invoke_end = time.monotonic()
+
     logger.info(f"Getting {len(mat_tasks_pending)} materialize result(s)...")
-    mat_results = ray.get(mat_tasks_pending)
-    total_count_of_src_dfl_not_touched = sum(
-        m.count_of_src_dfl_not_touched for m in mat_results
-    )
-    total_length_src_dfl = sum(m.count_of_src_dfl for m in mat_results)
-    logger.info(
-        f"Got total of {total_count_of_src_dfl_not_touched} manifest files not touched."
-    )
-    logger.info(
-        f"Got total of {total_length_src_dfl} manifest files during compaction."
-    )
-    manifest_entry_copied_by_reference_ratio = (
-        (round(total_count_of_src_dfl_not_touched / total_length_src_dfl, 4) * 100)
-        if total_length_src_dfl != 0
-        else None
-    )
-    logger.info(
-        f"{manifest_entry_copied_by_reference_ratio} percent of manifest files are copied by reference during materialize."
-    )
+    mat_results: List[MaterializeResult] = ray.get(mat_tasks_pending)

     logger.info(f"Got {len(mat_results)} materialize result(s).")

-
+    materialize_end = time.monotonic()
+    materialize_results_retrieved_at = time.time()
+
+    telemetry_time_materialize = compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.MATERIALIZE_STEP_NAME,
+        mat_results,
+        materialize_results_retrieved_at,
+        materialize_invoke_end - materialize_start,
+        materialize_end - materialize_start,
+    )

     mat_results = sorted(mat_results, key=lambda m: m.task_index)
     deltas = [m.delta for m in mat_results]
@@ -494,6 +601,7 @@
         f" Materialized records: {merged_delta.meta.record_count}"
     )
     logger.info(record_info_msg)
+
     assert (
         total_hb_record_count - total_dd_record_count == merged_delta.meta.record_count
     ), (
@@ -506,6 +614,9 @@
     )
     logger.info(f"Committed compacted delta: {compacted_delta}")

+    compaction_end = time.monotonic()
+    compaction_audit.set_compaction_time_in_seconds(compaction_end - compaction_start)
+
     new_compacted_delta_locator = DeltaLocator.of(
         new_compacted_partition_locator,
         compacted_delta.stream_position,
@@ -516,26 +627,51 @@
         if round_completion_info
         else None
     )
+
+    pyarrow_write_result = PyArrowWriteResult.union(
+        [m.pyarrow_write_result for m in mat_results]
+    )
+
+    session_peak_memory = get_current_node_peak_memory_usage_in_bytes()
+    compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
+        session_peak_memory
+    )
+
+    compaction_audit.save_round_completion_stats(
+        mat_results, telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
+    )
+
+    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+
     new_round_completion_info = RoundCompletionInfo.of(
         last_stream_position_compacted,
         new_compacted_delta_locator,
-
+        pyarrow_write_result,
         bit_width_of_sort_keys,
         last_rebase_source_partition_locator,
-
-
-    rcf_source_partition_locator = (
-        rebase_source_partition_locator
-        if rebase_source_partition_locator
-        else source_partition_locator
+        compaction_audit.untouched_file_ratio,
+        audit_url,
     )
+
     logger.info(
         f"partition-{source_partition_locator.partition_values},"
         f"compacted at: {last_stream_position_compacted},"
         f"last position: {last_stream_position_to_compact}"
     )
+
     return (
         partition,
         new_round_completion_info,
         rcf_source_partition_locator,
     )
+
+
+def compact_partition_from_request(
+    compact_partition_params: CompactPartitionParams,
+) -> Optional[str]:
+    """
+    Wrapper for compact_partition that allows for the compact_partition parameters to be
+    passed in as a custom dictionary-like CompactPartitionParams object.
+    :param compact_partition_params:
+    """
+    return compact_partition(**compact_partition_params)
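
The audit code above times each step with two clocks: durations come from `time.monotonic()`, while the "results retrieved at" timestamps use `time.time()` so that values captured on different nodes remain comparable. A minimal, deltacat-independent sketch of that pattern (the `run_step` helper and its return keys are illustrative, not part of the deltacat API):

```python
import time
from typing import Callable, Dict, List


def run_step(submit: Callable[[], List[Callable[[], int]]]) -> Dict[str, float]:
    # Illustrative timing pattern: monotonic clock for durations,
    # epoch seconds for the cross-node-comparable "retrieved at" stamp.
    step_start = time.monotonic()
    pending = submit()                      # e.g. invoke_parallel(...) returning futures
    invoke_end = time.monotonic()           # time spent submitting tasks only
    results = [task() for task in pending]  # e.g. ray.get(pending)
    step_end = time.monotonic()
    return {
        "invoke_time_in_seconds": invoke_end - step_start,
        "total_time_in_seconds": step_end - step_start,
        "results_retrieved_at": time.time(),
        "result_count": float(len(results)),
    }


if __name__ == "__main__":
    stats = run_step(lambda: [lambda: 42, lambda: 7])
    print(stats)
```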
deltacat-0.1.18b6/deltacat/compute/compactor/model/compact_partition_params.py (new file, +153)

@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+import copy
+import json
+from typing import Any, Dict, List, Optional
+
+from deltacat.types.media import ContentType
+
+
+class CompactPartitionParams(dict):
+    """
+    This class represents the parameters passed to compact_partition (deltacat/compute/compactor/compaction_session.py)
+    """
+
+    @staticmethod
+    def of(params: Optional[Dict]) -> CompactPartitionParams:
+        if params is None:
+            params = {}
+        compact_partition_params = CompactPartitionParams()
+        compact_partition_params["destination_partition_locator"] = params.get(
+            "destination_partition_locator"
+        )
+        compact_partition_params["last_stream_position_to_compact"] = params.get(
+            "last_stream_position_to_compact"
+        )
+        compact_partition_params["source_partition_locator"] = params.get(
+            "source_partition_locator"
+        )
+        compact_partition_params["primary_keys"] = params.get("primary_keys")
+        compact_partition_params["rebase_source_partition_locator"] = params.get(
+            "rebase_source_partition_locator"
+        )
+        compact_partition_params["rebase_source_partition_high_watermark"] = params.get(
+            "rebase_source_partition_high_watermark"
+        )
+        compact_partition_params["hash_bucket_count"] = params.get("hash_bucket_count")
+        compact_partition_params["deltacat_storage"] = params.get("deltacat_storage")
+        compact_partition_params["compaction_artifact_s3_bucket"] = params.get(
+            "compaction_artifact_s3_bucket"
+        )
+        compact_partition_params["properties"] = params.get("properties")
+        compact_partition_params["compacted_file_content_type"] = params.get(
+            "compacted_file_content_type"
+        )
+        compact_partition_params["list_deltas_kwargs"] = params.get(
+            "list_deltas_kwargs"
+        )
+        compact_partition_params["pg_config"] = params.get("pg_config")
+        compact_partition_params["read_kwargs_provider"] = params.get(
+            "read_kwargs_provider"
+        )
+        compact_partition_params["s3_table_writer_kwargs"] = params.get(
+            "s3_table_writer_kwargs"
+        )
+        return compact_partition_params
+
+    @property
+    def destination_partition_locator(self) -> Optional[dict]:
+        return self["destination_partition_locator"]
+
+    @property
+    def last_stream_position_to_compact(self) -> Optional[int]:
+        return self["last_stream_position_to_compact"]
+
+    @property
+    def source_partition_locator(self) -> Optional[dict]:
+        return self["source_partition_locator"]
+
+    @property
+    def primary_keys(self) -> Optional[List[str]]:
+        return list(self["primary_keys"])
+
+    @property
+    def rebase_source_partition_locator(self) -> Optional[dict]:
+        return self["rebase_source_partition_locator"]
+
+    @property
+    def rebase_source_partition_high_watermark(self) -> Optional[int]:
+        return self["rebase_source_partition_high_watermark"]
+
+    @property
+    def hash_bucket_count(self) -> Optional[int]:
+        return self["hash_bucket_count"]
+
+    @property
+    def deltacat_storage(self) -> Optional[str]:
+        return self["deltacat_storage"]
+
+    @property
+    def compaction_artifact_s3_bucket(self) -> Optional[str]:
+        return self["compaction_artifact_s3_bucket"]
+
+    @property
+    def properties(self) -> Optional[Dict[str, str]]:
+        return self["properties"]
+
+    @property
+    def compacted_file_content_type(self) -> Optional[ContentType]:
+        return self["compacted_file_content_type"]
+
+    @property
+    def list_deltas_kwargs(self) -> Optional[dict]:
+        return self["list_deltas_kwargs"]
+
+    @property
+    def pg_config(self) -> Optional[Any]:
+        return self["pg_config"]
+
+    @property
+    def read_kwargs_provider(self) -> Optional[Any]:
+        return self["read_kwargs_provider"]
+
+    @property
+    def s3_table_writer_kwargs(self) -> Optional[Any]:
+        return self["s3_table_writer_kwargs"]
+
+    @staticmethod
+    def json_handler_for_compact_partition_params(obj):
+        """
+        A handler for the `json.dumps()` function that can be used to serialize sets to JSON.
+        If the `set_default()` handler is passed as the `default` argument to the `json.dumps()` function, it will be called whenever a set object is encountered.
+        The `set_default()` handler will then serialize the set as a list.
+        """
+        try:
+            if isinstance(obj, set):
+                return list(obj)
+            elif hasattr(obj, "toJSON"):
+                return obj.toJSON()
+            else:
+                return obj.__dict__
+        except Exception:
+            return obj.__class__.__name__
+
+    def serialize(self) -> str:
+        """
+        Serializes itself to a json-formatted string
+
+        Returns:
+            The serialized object.
+
+        """
+        to_serialize: Dict[str, Any] = {}
+        # individually try deepcopy the values from the self dictionary and just use the class name for the value when it is not possible to deepcopy
+        for attr, value in self.items():
+            try:
+                to_serialize[attr] = copy.deepcopy(value)
+            except Exception:  # if unable to deep copy the objects like module objects for example then just provide the class name at minimum
+                to_serialize[attr] = value.__class__.__name__
+        serialized_arguments_compact_partition_args: str = json.dumps(
+            to_serialize,
+            default=CompactPartitionParams.json_handler_for_compact_partition_params,
+        )
+        return serialized_arguments_compact_partition_args
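
`CompactPartitionParams` is a plain `dict` subclass, so it can be built from an ordinary dictionary and fed back into the session via `compact_partition_from_request`, which simply calls `compact_partition(**params)`. A short usage sketch, assuming deltacat 0.1.18b6 is installed; the locator dictionaries and key values below are placeholders, not real partition locators:

```python
from deltacat.compute.compactor.model.compact_partition_params import (
    CompactPartitionParams,
)

params = CompactPartitionParams.of(
    {
        # Placeholder locators; real values come from the deltacat storage layer.
        "source_partition_locator": {"partition_values": ["2023-01-01"]},
        "destination_partition_locator": {"partition_values": ["2023-01-01"]},
        "last_stream_position_to_compact": 1678000000000,
        "primary_keys": {"order_id"},
        "hash_bucket_count": 100,
        "compaction_artifact_s3_bucket": "my-compaction-bucket",
    }
)

print(params.primary_keys)       # ['order_id']
print(params.hash_bucket_count)  # 100
print(params.serialize())        # JSON string; sets are serialized as lists

# compact_partition_from_request(params) would then expand these keys into
# keyword arguments for compact_partition().
```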