deltacat 1.1.18__py3-none-any.whl → 1.1.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/model/compact_partition_params.py +76 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
- deltacat/compute/compactor/model/delta_annotated.py +16 -9
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +11 -5
- deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
- deltacat/compute/compactor_v2/utils/io.py +28 -14
- deltacat/compute/compactor_v2/utils/task_options.py +128 -183
- deltacat/compute/resource_estimation/__init__.py +27 -0
- deltacat/compute/resource_estimation/delta.py +271 -0
- deltacat/compute/resource_estimation/manifest.py +394 -0
- deltacat/compute/resource_estimation/model.py +165 -0
- deltacat/compute/resource_estimation/parquet.py +108 -0
- deltacat/constants.py +5 -0
- deltacat/logs.py +8 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
- deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
- deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
- deltacat/tests/compute/test_util_common.py +2 -0
- deltacat/tests/test_logs.py +34 -0
- deltacat/tests/test_utils/pyarrow.py +15 -5
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/METADATA +2 -2
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/RECORD +30 -46
- deltacat/compute/metastats/meta_stats.py +0 -479
- deltacat/compute/metastats/model/__init__.py +0 -0
- deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
- deltacat/compute/metastats/stats.py +0 -182
- deltacat/compute/metastats/utils/__init__.py +0 -0
- deltacat/compute/metastats/utils/constants.py +0 -16
- deltacat/compute/metastats/utils/io.py +0 -223
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
- deltacat/compute/metastats/utils/ray_utils.py +0 -129
- deltacat/compute/stats/basic.py +0 -226
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +0 -98
- deltacat/compute/stats/models/delta_stats.py +0 -233
- deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
- deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
- deltacat/compute/stats/models/stats_result.py +0 -104
- deltacat/compute/stats/utils/__init__.py +0 -0
- deltacat/compute/stats/utils/intervals.py +0 -94
- deltacat/compute/stats/utils/io.py +0 -230
- deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
- deltacat/tests/stats/__init__.py +0 -0
- deltacat/tests/stats/test_intervals.py +0 -49
- /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
- /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/LICENSE +0 -0
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/WHEEL +0 -0
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -13,6 +13,10 @@ from deltacat.storage import (
|
|
13
13
|
PartitionLocator,
|
14
14
|
SortKey,
|
15
15
|
)
|
16
|
+
from deltacat.compute.resource_estimation import (
|
17
|
+
ResourceEstimationMethod,
|
18
|
+
EstimateResourcesParams,
|
19
|
+
)
|
16
20
|
from deltacat.compute.compactor_v2.constants import (
|
17
21
|
MAX_RECORDS_PER_COMPACTED_FILE,
|
18
22
|
MIN_DELTA_BYTES_IN_BATCH,
|
@@ -23,6 +27,8 @@ from deltacat.compute.compactor_v2.constants import (
|
|
23
27
|
TOTAL_MEMORY_BUFFER_PERCENTAGE,
|
24
28
|
DEFAULT_DISABLE_COPY_BY_REFERENCE,
|
25
29
|
DEFAULT_NUM_ROUNDS,
|
30
|
+
PARQUET_TO_PYARROW_INFLATION,
|
31
|
+
MAX_PARQUET_METADATA_SIZE,
|
26
32
|
)
|
27
33
|
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
28
34
|
from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
|
@@ -104,6 +110,22 @@ class CompactPartitionParams(dict):
|
|
104
110
|
result.metrics_config = params.get("metrics_config")
|
105
111
|
|
106
112
|
result.num_rounds = params.get("num_rounds", DEFAULT_NUM_ROUNDS)
|
113
|
+
result.parquet_to_pyarrow_inflation = params.get(
|
114
|
+
"parquet_to_pyarrow_inflation", PARQUET_TO_PYARROW_INFLATION
|
115
|
+
)
|
116
|
+
result.resource_estimation_method = ResourceEstimationMethod[
|
117
|
+
params.get(
|
118
|
+
"resource_estimation_method", ResourceEstimationMethod.DEFAULT.value
|
119
|
+
)
|
120
|
+
]
|
121
|
+
|
122
|
+
# disable input split during rebase as the rebase files are already uniform
|
123
|
+
result.enable_input_split = (
|
124
|
+
params.get("rebase_source_partition_locator") is None
|
125
|
+
)
|
126
|
+
result.max_parquet_meta_size_bytes = params.get(
|
127
|
+
"max_parquet_meta_size_bytes", MAX_PARQUET_METADATA_SIZE
|
128
|
+
)
|
107
129
|
|
108
130
|
if not importlib.util.find_spec("memray"):
|
109
131
|
result.enable_profiler = False
|
@@ -413,6 +435,60 @@ class CompactPartitionParams(dict):
|
|
413
435
|
def num_rounds(self, num_rounds: int) -> None:
|
414
436
|
self["num_rounds"] = num_rounds
|
415
437
|
|
438
|
+
@property
|
439
|
+
def parquet_to_pyarrow_inflation(self) -> float:
|
440
|
+
"""
|
441
|
+
The inflation factor for the parquet uncompressed_size_bytes to pyarrow table size.
|
442
|
+
"""
|
443
|
+
return self["parquet_to_pyarrow_inflation"]
|
444
|
+
|
445
|
+
@parquet_to_pyarrow_inflation.setter
|
446
|
+
def parquet_to_pyarrow_inflation(self, value: float) -> None:
|
447
|
+
self["parquet_to_pyarrow_inflation"] = value
|
448
|
+
|
449
|
+
@property
|
450
|
+
def enable_input_split(self) -> bool:
|
451
|
+
"""
|
452
|
+
When this is True, the input split will be always enabled for parquet files.
|
453
|
+
The input split feature will split the parquet files into individual row groups
|
454
|
+
so that we could process them in different nodes in parallel.
|
455
|
+
By default, input split is enabled for incremental compaction and disabled for rebase or backfill.
|
456
|
+
"""
|
457
|
+
return self["enable_input_split"]
|
458
|
+
|
459
|
+
@enable_input_split.setter
|
460
|
+
def enable_input_split(self, value: bool) -> None:
|
461
|
+
self["enable_input_split"] = value
|
462
|
+
|
463
|
+
@property
|
464
|
+
def max_parquet_meta_size_bytes(self) -> int:
|
465
|
+
"""
|
466
|
+
The maximum size of the parquet metadata in bytes. Used for allocating tasks
|
467
|
+
to fetch parquet metadata.
|
468
|
+
"""
|
469
|
+
return self["max_parquet_meta_size_bytes"]
|
470
|
+
|
471
|
+
@max_parquet_meta_size_bytes.setter
|
472
|
+
def max_parquet_meta_size_bytes(self, value: int) -> None:
|
473
|
+
self["max_parquet_meta_size_bytes"] = value
|
474
|
+
|
475
|
+
@property
|
476
|
+
def resource_estimation_method(self) -> ResourceEstimationMethod:
|
477
|
+
return self["resource_estimation_method"]
|
478
|
+
|
479
|
+
@resource_estimation_method.setter
|
480
|
+
def resource_estimation_method(self, value: ResourceEstimationMethod) -> None:
|
481
|
+
self["resource_estimation_method"] = value
|
482
|
+
|
483
|
+
@property
|
484
|
+
def estimate_resources_params(self) -> EstimateResourcesParams:
|
485
|
+
return EstimateResourcesParams.of(
|
486
|
+
resource_estimation_method=self.resource_estimation_method,
|
487
|
+
previous_inflation=self.previous_inflation,
|
488
|
+
parquet_to_pyarrow_inflation=self.parquet_to_pyarrow_inflation,
|
489
|
+
average_record_size_bytes=self.average_record_size_bytes,
|
490
|
+
)
|
491
|
+
|
416
492
|
@staticmethod
|
417
493
|
def json_handler_for_compact_partition_params(obj):
|
418
494
|
"""
|
@@ -436,6 +436,22 @@ class CompactionSessionAuditInfo(dict):
|
|
436
436
|
"""
|
437
437
|
return self.get("compactorVersion")
|
438
438
|
|
439
|
+
@property
|
440
|
+
def observed_input_inflation(self) -> float:
|
441
|
+
"""
|
442
|
+
The average inflation observed for input files only.
|
443
|
+
This only accounts for files in the source.
|
444
|
+
"""
|
445
|
+
return self.get("observedInputInflation")
|
446
|
+
|
447
|
+
@property
|
448
|
+
def observed_input_average_record_size_bytes(self) -> float:
|
449
|
+
"""
|
450
|
+
The average record size observed for input files only.
|
451
|
+
This only accounts for files in the source.
|
452
|
+
"""
|
453
|
+
return self.get("observedInputAverageRecordSizeBytes")
|
454
|
+
|
439
455
|
# Setters follow
|
440
456
|
|
441
457
|
def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
|
@@ -756,6 +772,16 @@ class CompactionSessionAuditInfo(dict):
|
|
756
772
|
self["compactorVersion"] = value
|
757
773
|
return self
|
758
774
|
|
775
|
+
def set_observed_input_inflation(self, value: float) -> CompactionSessionAuditInfo:
|
776
|
+
self["observedInputInflation"] = value
|
777
|
+
return self
|
778
|
+
|
779
|
+
def set_observed_input_average_record_size_bytes(
|
780
|
+
self, value: float
|
781
|
+
) -> CompactionSessionAuditInfo:
|
782
|
+
self["observedInputAverageRecordSizeBytes"] = value
|
783
|
+
return self
|
784
|
+
|
759
785
|
# High level methods to save stats
|
760
786
|
def save_step_stats(
|
761
787
|
self,
|
@@ -69,6 +69,7 @@ class DeltaAnnotated(Delta):
|
|
69
69
|
estimation_function: Optional[
|
70
70
|
Callable[[ManifestEntry], float]
|
71
71
|
] = lambda entry: entry.meta.content_length,
|
72
|
+
enable_input_split: Optional[bool] = False,
|
72
73
|
) -> List[DeltaAnnotated]:
|
73
74
|
"""
|
74
75
|
Simple greedy algorithm to split/merge 1 or more annotated deltas into
|
@@ -86,13 +87,19 @@ class DeltaAnnotated(Delta):
|
|
86
87
|
new_da_bytes = 0
|
87
88
|
da_group_entry_count = 0
|
88
89
|
|
89
|
-
|
90
|
-
|
90
|
+
if enable_input_split:
|
91
|
+
for delta_annotated in annotated_deltas:
|
92
|
+
split_annotated_deltas.extend(
|
93
|
+
DeltaAnnotated._split_single(delta_annotated)
|
94
|
+
)
|
91
95
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
+
logger.info(
|
97
|
+
f"Split the {len(annotated_deltas)} annotated deltas "
|
98
|
+
f"into {len(split_annotated_deltas)} groups."
|
99
|
+
)
|
100
|
+
else:
|
101
|
+
logger.info("Skipping input split as it is disabled...")
|
102
|
+
split_annotated_deltas = annotated_deltas
|
96
103
|
|
97
104
|
for src_da in split_annotated_deltas:
|
98
105
|
src_da_annotations = src_da.annotations
|
@@ -107,7 +114,7 @@ class DeltaAnnotated(Delta):
|
|
107
114
|
# (i.e. the previous compaction round ran a rebase)
|
108
115
|
if new_da and src_da.locator != new_da.locator:
|
109
116
|
groups.append(new_da)
|
110
|
-
logger.
|
117
|
+
logger.debug(
|
111
118
|
f"Due to different delta locator, Appending group of {da_group_entry_count} elements "
|
112
119
|
f"and {new_da_bytes} bytes"
|
113
120
|
)
|
@@ -126,12 +133,12 @@ class DeltaAnnotated(Delta):
|
|
126
133
|
or da_group_entry_count >= min_file_counts
|
127
134
|
):
|
128
135
|
if new_da_bytes >= min_delta_bytes:
|
129
|
-
logger.
|
136
|
+
logger.debug(
|
130
137
|
f"Appending group of {da_group_entry_count} elements "
|
131
138
|
f"and {new_da_bytes} bytes to meet file size limit"
|
132
139
|
)
|
133
140
|
if da_group_entry_count >= min_file_counts:
|
134
|
-
logger.
|
141
|
+
logger.debug(
|
135
142
|
f"Appending group of {da_group_entry_count} elements "
|
136
143
|
f"and {da_group_entry_count} files to meet file count limit"
|
137
144
|
)
|
@@ -41,6 +41,9 @@ DROP_DUPLICATES = True
|
|
41
41
|
# size in metadata to pyarrow table size.
|
42
42
|
PARQUET_TO_PYARROW_INFLATION = 4
|
43
43
|
|
44
|
+
# Maximum size of the parquet metadata
|
45
|
+
MAX_PARQUET_METADATA_SIZE = 100_000_000 # 100 MB
|
46
|
+
|
44
47
|
# By default, copy by reference is enabled
|
45
48
|
DEFAULT_DISABLE_COPY_BY_REFERENCE = False
|
46
49
|
|
@@ -148,12 +148,8 @@ def _build_uniform_deltas(
|
|
148
148
|
input_deltas=input_deltas,
|
149
149
|
hash_bucket_count=params.hash_bucket_count,
|
150
150
|
compaction_audit=mutable_compaction_audit,
|
151
|
+
compact_partition_params=params,
|
151
152
|
deltacat_storage=params.deltacat_storage,
|
152
|
-
previous_inflation=params.previous_inflation,
|
153
|
-
min_delta_bytes=params.min_delta_bytes_in_batch,
|
154
|
-
min_file_counts=params.min_files_in_batch,
|
155
|
-
# disable input split during rebase as the rebase files are already uniform
|
156
|
-
enable_input_split=params.rebase_source_partition_locator is None,
|
157
153
|
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
158
154
|
)
|
159
155
|
delta_discovery_end: float = time.monotonic()
|
@@ -369,6 +365,8 @@ def _run_hash_and_merge(
|
|
369
365
|
mutable_compaction_audit.set_telemetry_time_in_seconds(
|
370
366
|
telemetry_this_round + previous_telemetry
|
371
367
|
)
|
368
|
+
params.object_store.clear()
|
369
|
+
|
372
370
|
return merge_results
|
373
371
|
|
374
372
|
|
@@ -400,6 +398,7 @@ def _merge(
|
|
400
398
|
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
401
399
|
ray_custom_resources=params.ray_custom_resources,
|
402
400
|
memory_logs_enabled=params.memory_logs_enabled,
|
401
|
+
estimate_resources_params=params.estimate_resources_params,
|
403
402
|
)
|
404
403
|
|
405
404
|
def merge_input_provider(index, item) -> dict[str, MergeInput]:
|
@@ -463,6 +462,7 @@ def _hash_bucket(
|
|
463
462
|
primary_keys=params.primary_keys,
|
464
463
|
ray_custom_resources=params.ray_custom_resources,
|
465
464
|
memory_logs_enabled=params.memory_logs_enabled,
|
465
|
+
estimate_resources_params=params.estimate_resources_params,
|
466
466
|
)
|
467
467
|
|
468
468
|
def hash_bucket_input_provider(index, item) -> dict[str, HashBucketInput]:
|
@@ -537,6 +537,7 @@ def _run_local_merge(
|
|
537
537
|
ray_custom_resources=params.ray_custom_resources,
|
538
538
|
primary_keys=params.primary_keys,
|
539
539
|
memory_logs_enabled=params.memory_logs_enabled,
|
540
|
+
estimate_resources_params=params.estimate_resources_params,
|
540
541
|
)
|
541
542
|
local_merge_result = ray.get(
|
542
543
|
mg.merge.options(**local_merge_options).remote(local_merge_input)
|
@@ -666,6 +667,11 @@ def _write_new_round_completion_file(
|
|
666
667
|
f" and average record size={input_average_record_size_bytes}"
|
667
668
|
)
|
668
669
|
|
670
|
+
mutable_compaction_audit.set_observed_input_inflation(input_inflation)
|
671
|
+
mutable_compaction_audit.set_observed_input_average_record_size_bytes(
|
672
|
+
input_average_record_size_bytes
|
673
|
+
)
|
674
|
+
|
669
675
|
_update_and_upload_compaction_audit(
|
670
676
|
params,
|
671
677
|
mutable_compaction_audit,
|
@@ -1,66 +1,217 @@
|
|
1
1
|
import logging
|
2
|
+
import ray
|
3
|
+
import functools
|
4
|
+
from deltacat.compute.compactor_v2.constants import (
|
5
|
+
TASK_MAX_PARALLELISM,
|
6
|
+
MAX_PARQUET_METADATA_SIZE,
|
7
|
+
)
|
8
|
+
from deltacat.utils.ray_utils.concurrency import invoke_parallel
|
2
9
|
from deltacat import logs
|
3
10
|
from deltacat.storage import (
|
4
11
|
Delta,
|
12
|
+
ManifestEntry,
|
5
13
|
interface as unimplemented_deltacat_storage,
|
6
14
|
)
|
7
15
|
from typing import Dict, Optional, Any
|
8
|
-
from deltacat.types.media import TableType
|
16
|
+
from deltacat.types.media import TableType
|
9
17
|
from deltacat.types.media import ContentType
|
10
18
|
from deltacat.types.partial_download import PartialParquetParameters
|
19
|
+
from deltacat.exceptions import RetryableError
|
11
20
|
|
12
21
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
13
22
|
|
14
23
|
|
24
|
+
def append_content_type_params_options_provider(
|
25
|
+
index: int, item: Any, max_parquet_meta_size_bytes: int, **kwargs
|
26
|
+
) -> Dict:
|
27
|
+
task_opts = {
|
28
|
+
"num_cpus": 0.01,
|
29
|
+
"memory": max_parquet_meta_size_bytes,
|
30
|
+
"scheduling_strategy": "DEFAULT",
|
31
|
+
}
|
32
|
+
|
33
|
+
task_opts["max_retries"] = 3
|
34
|
+
task_opts["retry_exceptions"] = [RetryableError]
|
35
|
+
|
36
|
+
return task_opts
|
37
|
+
|
38
|
+
|
39
|
+
def _contains_partial_parquet_parameters(entry: ManifestEntry) -> bool:
|
40
|
+
return (
|
41
|
+
entry.meta
|
42
|
+
and entry.meta.content_type_parameters
|
43
|
+
and any(
|
44
|
+
isinstance(type_params, PartialParquetParameters)
|
45
|
+
for type_params in entry.meta.content_type_parameters
|
46
|
+
)
|
47
|
+
)
|
48
|
+
|
49
|
+
|
50
|
+
APPEND_CONTENT_TYPE_PARAMS_CACHE = "append_content_type_params_cache"
|
51
|
+
# At this point, it's better to fetch all parquet than to cache and
|
52
|
+
# call actor which is not expected to support high throughput.
|
53
|
+
MINIMUM_ENTRIES_TO_CACHE = 10
|
54
|
+
|
55
|
+
|
56
|
+
@ray.remote
|
57
|
+
class AppendContentTypeParamsCache:
|
58
|
+
"""
|
59
|
+
This actor caches the delta that contains content type meta.
|
60
|
+
"""
|
61
|
+
|
62
|
+
def __init__(self):
|
63
|
+
self.cache = {}
|
64
|
+
|
65
|
+
def get(self, key):
|
66
|
+
return self.cache.get(key)
|
67
|
+
|
68
|
+
def put(self, key, value):
|
69
|
+
self.cache[key] = value
|
70
|
+
|
71
|
+
|
72
|
+
@ray.remote
|
73
|
+
def _download_parquet_metadata_for_manifest_entry(
|
74
|
+
delta: Delta,
|
75
|
+
entry_index: int,
|
76
|
+
deltacat_storage: unimplemented_deltacat_storage,
|
77
|
+
deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
|
78
|
+
) -> Dict[str, Any]:
|
79
|
+
pq_file = deltacat_storage.download_delta_manifest_entry(
|
80
|
+
delta,
|
81
|
+
entry_index=entry_index,
|
82
|
+
table_type=TableType.PYARROW_PARQUET,
|
83
|
+
**deltacat_storage_kwargs,
|
84
|
+
)
|
85
|
+
|
86
|
+
return {
|
87
|
+
"entry_index": entry_index,
|
88
|
+
"partial_parquet_params": PartialParquetParameters.of(
|
89
|
+
pq_metadata=pq_file.metadata
|
90
|
+
),
|
91
|
+
}
|
92
|
+
|
93
|
+
|
15
94
|
def append_content_type_params(
|
16
95
|
delta: Delta,
|
17
|
-
|
96
|
+
task_max_parallelism: int = TASK_MAX_PARALLELISM,
|
97
|
+
max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
|
18
98
|
deltacat_storage=unimplemented_deltacat_storage,
|
19
99
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
20
100
|
) -> None:
|
101
|
+
"""
|
102
|
+
This operation appends content type params into the delta entry. Note
|
103
|
+
that this operation can be time consuming, hence we cache it in a Ray actor.
|
104
|
+
"""
|
21
105
|
|
22
|
-
if delta.meta
|
23
|
-
logger.
|
24
|
-
f"Delta with locator {delta.locator} is not a parquet delta, "
|
25
|
-
"skipping appending content type parameters."
|
26
|
-
)
|
106
|
+
if not delta.meta:
|
107
|
+
logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
|
27
108
|
return
|
28
109
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
110
|
+
entry_indices_to_download = []
|
111
|
+
for entry_index, entry in enumerate(delta.manifest.entries):
|
112
|
+
if (
|
113
|
+
not _contains_partial_parquet_parameters(entry)
|
114
|
+
and entry.meta
|
115
|
+
and entry.meta.content_type == ContentType.PARQUET.value
|
116
|
+
):
|
117
|
+
entry_indices_to_download.append(entry_index)
|
34
118
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
table_type=TableType.PYARROW_PARQUET,
|
39
|
-
**deltacat_storage_kwargs,
|
119
|
+
if not entry_indices_to_download:
|
120
|
+
logger.info(
|
121
|
+
f"No parquet type params to download for delta with locator {delta.locator}."
|
40
122
|
)
|
123
|
+
return None
|
41
124
|
|
42
|
-
|
43
|
-
|
125
|
+
ray_namespace = ray.get_runtime_context().namespace
|
126
|
+
logger.info(
|
127
|
+
f"Got Ray namespace: {ray_namespace}. "
|
128
|
+
"Note that caching only works with non-anonymous namespace."
|
129
|
+
"To set a non-anonymous namespace, call ray.init(namespace='X')."
|
130
|
+
)
|
131
|
+
if len(entry_indices_to_download) >= MINIMUM_ENTRIES_TO_CACHE:
|
132
|
+
logger.info(
|
133
|
+
f"Checking if cache contains parquet meta in namespace {ray_namespace} for "
|
134
|
+
f"delta locator {delta.locator} and digest {delta.locator.hexdigest()}..."
|
135
|
+
)
|
136
|
+
cache = AppendContentTypeParamsCache.options(
|
137
|
+
name=APPEND_CONTENT_TYPE_PARAMS_CACHE,
|
138
|
+
namespace=ray_namespace,
|
139
|
+
get_if_exists=True,
|
140
|
+
).remote()
|
44
141
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
142
|
+
logger.info(f"Got cache actor: {cache}")
|
143
|
+
cached_value = ray.get(cache.get.remote(delta.locator.hexdigest()))
|
144
|
+
if cached_value is not None:
|
145
|
+
logger.info(
|
146
|
+
"Using cached parquet meta for delta with locator"
|
147
|
+
f" {delta.locator} and digest {delta.locator.hexdigest()}."
|
148
|
+
)
|
149
|
+
delta.manifest = cached_value.manifest
|
150
|
+
return
|
151
|
+
logger.info(
|
152
|
+
f"Cache doesn't contain parquet meta for delta with locator {delta.locator}."
|
51
153
|
)
|
52
154
|
|
53
|
-
|
54
|
-
|
55
|
-
|
155
|
+
options_provider = functools.partial(
|
156
|
+
append_content_type_params_options_provider,
|
157
|
+
max_parquet_meta_size_bytes=max_parquet_meta_size_bytes,
|
158
|
+
)
|
56
159
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
160
|
+
def input_provider(index, item) -> Dict:
|
161
|
+
return {
|
162
|
+
"deltacat_storage_kwargs": deltacat_storage_kwargs,
|
163
|
+
"deltacat_storage": deltacat_storage,
|
164
|
+
"delta": delta,
|
165
|
+
"entry_index": item,
|
166
|
+
}
|
61
167
|
|
62
|
-
|
168
|
+
logger.info(
|
169
|
+
f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
|
170
|
+
)
|
171
|
+
pq_files_promise = invoke_parallel(
|
172
|
+
entry_indices_to_download,
|
173
|
+
ray_task=_download_parquet_metadata_for_manifest_entry,
|
174
|
+
max_parallelism=task_max_parallelism,
|
175
|
+
options_provider=options_provider,
|
176
|
+
kwargs_provider=input_provider,
|
177
|
+
)
|
178
|
+
|
179
|
+
partial_file_meta_list = ray.get(pq_files_promise)
|
180
|
+
|
181
|
+
logger.info(
|
182
|
+
f"Downloaded parquet meta for {len(entry_indices_to_download)} manifest entries"
|
183
|
+
)
|
184
|
+
|
185
|
+
assert len(partial_file_meta_list) == len(
|
186
|
+
entry_indices_to_download
|
187
|
+
), f"Expected {len(entry_indices_to_download)} pq files, got {len(partial_file_meta_list)}"
|
188
|
+
|
189
|
+
for index, entry_index in enumerate(entry_indices_to_download):
|
190
|
+
assert (
|
191
|
+
entry_index == partial_file_meta_list[index]["entry_index"]
|
192
|
+
), "entry_index must match with the associated parquet meta"
|
193
|
+
entry = delta.manifest.entries[entry_index]
|
63
194
|
if not entry.meta.content_type_parameters:
|
64
195
|
entry.meta.content_type_parameters = []
|
196
|
+
entry.meta.content_type_parameters.append(
|
197
|
+
partial_file_meta_list[index]["partial_parquet_params"]
|
198
|
+
)
|
199
|
+
|
200
|
+
for entry_index, entry in enumerate(delta.manifest.entries):
|
201
|
+
assert _contains_partial_parquet_parameters(
|
202
|
+
entry
|
203
|
+
), "partial parquet params validation failed."
|
65
204
|
|
66
|
-
|
205
|
+
if len(entry_indices_to_download) >= MINIMUM_ENTRIES_TO_CACHE:
|
206
|
+
cache = AppendContentTypeParamsCache.options(
|
207
|
+
name=APPEND_CONTENT_TYPE_PARAMS_CACHE,
|
208
|
+
namespace=ray_namespace,
|
209
|
+
get_if_exists=True,
|
210
|
+
).remote()
|
211
|
+
logger.info(f"Got cache actor when writing: {cache}")
|
212
|
+
logger.info(
|
213
|
+
f"Caching parquet meta for delta with locator {delta.locator} "
|
214
|
+
f"and digest {delta.locator.hexdigest()}..."
|
215
|
+
)
|
216
|
+
ray.get(cache.put.remote(delta.locator.hexdigest(), delta))
|
217
|
+
assert ray.get(cache.get.remote(delta.locator.hexdigest())) is not None
|
@@ -1,6 +1,5 @@
|
|
1
1
|
import logging
|
2
2
|
import functools
|
3
|
-
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
4
3
|
from deltacat.storage import (
|
5
4
|
PartitionLocator,
|
6
5
|
Delta,
|
@@ -9,11 +8,10 @@ from deltacat.storage import (
|
|
9
8
|
from deltacat import logs
|
10
9
|
from deltacat.compute.compactor.utils import io as io_v1
|
11
10
|
from deltacat.compute.compactor import DeltaAnnotated
|
12
|
-
from
|
13
|
-
|
14
|
-
MIN_FILES_IN_BATCH,
|
15
|
-
MIN_DELTA_BYTES_IN_BATCH,
|
11
|
+
from deltacat.compute.compactor.model.compact_partition_params import (
|
12
|
+
CompactPartitionParams,
|
16
13
|
)
|
14
|
+
from typing import Dict, List, Optional, Any
|
17
15
|
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
18
16
|
CompactionSessionAuditInfo,
|
19
17
|
)
|
@@ -25,6 +23,10 @@ from deltacat.compute.compactor_v2.utils.content_type_params import (
|
|
25
23
|
)
|
26
24
|
from deltacat.utils.metrics import metrics
|
27
25
|
from deltacat.compute.compactor_v2.constants import DISCOVER_DELTAS_METRIC_PREFIX
|
26
|
+
from deltacat.compute.resource_estimation.manifest import (
|
27
|
+
does_require_content_type_params,
|
28
|
+
)
|
29
|
+
from deltacat.compute.resource_estimation.model import OperationType
|
28
30
|
|
29
31
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
30
32
|
|
@@ -90,10 +92,7 @@ def create_uniform_input_deltas(
|
|
90
92
|
input_deltas: List[Delta],
|
91
93
|
hash_bucket_count: int,
|
92
94
|
compaction_audit: CompactionSessionAuditInfo,
|
93
|
-
|
94
|
-
min_file_counts: Optional[float] = MIN_FILES_IN_BATCH,
|
95
|
-
previous_inflation: Optional[float] = PYARROW_INFLATION_MULTIPLIER,
|
96
|
-
enable_input_split: Optional[bool] = False,
|
95
|
+
compact_partition_params: CompactPartitionParams,
|
97
96
|
deltacat_storage=unimplemented_deltacat_storage,
|
98
97
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
99
98
|
) -> List[DeltaAnnotated]:
|
@@ -104,11 +103,21 @@ def create_uniform_input_deltas(
|
|
104
103
|
input_da_list = []
|
105
104
|
|
106
105
|
for delta in input_deltas:
|
107
|
-
if
|
106
|
+
if (
|
107
|
+
compact_partition_params.enable_input_split
|
108
|
+
or does_require_content_type_params(
|
109
|
+
compact_partition_params.resource_estimation_method
|
110
|
+
)
|
111
|
+
):
|
112
|
+
logger.debug(
|
113
|
+
f"Delta with locator: {delta.locator} requires content type params..."
|
114
|
+
)
|
108
115
|
append_content_type_params(
|
109
116
|
delta=delta,
|
110
117
|
deltacat_storage=deltacat_storage,
|
111
118
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
119
|
+
task_max_parallelism=compact_partition_params.task_max_parallelism,
|
120
|
+
max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
|
112
121
|
)
|
113
122
|
|
114
123
|
manifest_entries = delta.manifest.entries
|
@@ -118,7 +127,9 @@ def create_uniform_input_deltas(
|
|
118
127
|
entry = manifest_entries[entry_index]
|
119
128
|
delta_bytes += entry.meta.content_length
|
120
129
|
estimated_da_bytes += estimate_manifest_entry_size_bytes(
|
121
|
-
entry=entry,
|
130
|
+
entry=entry,
|
131
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
132
|
+
estimate_resources_params=compact_partition_params.estimate_resources_params,
|
122
133
|
)
|
123
134
|
|
124
135
|
delta_annotated = DeltaAnnotated.of(delta)
|
@@ -129,13 +140,16 @@ def create_uniform_input_deltas(
|
|
129
140
|
logger.info(f"Input delta files to compact: {delta_manifest_entries_count}")
|
130
141
|
|
131
142
|
size_estimation_function = functools.partial(
|
132
|
-
estimate_manifest_entry_size_bytes,
|
143
|
+
estimate_manifest_entry_size_bytes,
|
144
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
145
|
+
estimate_resources_params=compact_partition_params.estimate_resources_params,
|
133
146
|
)
|
134
147
|
rebatched_da_list = DeltaAnnotated.rebatch(
|
135
148
|
input_da_list,
|
136
|
-
min_delta_bytes=
|
137
|
-
min_file_counts=
|
149
|
+
min_delta_bytes=compact_partition_params.min_delta_bytes_in_batch,
|
150
|
+
min_file_counts=compact_partition_params.min_files_in_batch,
|
138
151
|
estimation_function=size_estimation_function,
|
152
|
+
enable_input_split=compact_partition_params.enable_input_split,
|
139
153
|
)
|
140
154
|
|
141
155
|
compaction_audit.set_input_size_bytes(delta_bytes)
|