deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +0 -1
- deltacat/compute/compactor/model/compact_partition_params.py +76 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
- deltacat/compute/compactor/model/delta_annotated.py +16 -9
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
- deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
- deltacat/compute/compactor_v2/utils/io.py +28 -14
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +128 -183
- deltacat/compute/resource_estimation/__init__.py +27 -0
- deltacat/compute/resource_estimation/delta.py +271 -0
- deltacat/compute/resource_estimation/manifest.py +394 -0
- deltacat/compute/resource_estimation/model.py +165 -0
- deltacat/compute/resource_estimation/parquet.py +108 -0
- deltacat/constants.py +5 -0
- deltacat/exceptions.py +2 -4
- deltacat/logs.py +8 -0
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
- deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
- deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
- deltacat/tests/compute/test_util_common.py +2 -0
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
- deltacat/tests/test_logs.py +34 -0
- deltacat/tests/test_utils/pyarrow.py +15 -5
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
- deltacat/compute/metastats/meta_stats.py +0 -479
- deltacat/compute/metastats/model/__init__.py +0 -0
- deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
- deltacat/compute/metastats/stats.py +0 -182
- deltacat/compute/metastats/utils/__init__.py +0 -0
- deltacat/compute/metastats/utils/constants.py +0 -16
- deltacat/compute/metastats/utils/io.py +0 -223
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
- deltacat/compute/metastats/utils/ray_utils.py +0 -129
- deltacat/compute/stats/basic.py +0 -226
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +0 -98
- deltacat/compute/stats/models/delta_stats.py +0 -233
- deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
- deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
- deltacat/compute/stats/models/stats_result.py +0 -104
- deltacat/compute/stats/utils/__init__.py +0 -0
- deltacat/compute/stats/utils/intervals.py +0 -94
- deltacat/compute/stats/utils/io.py +0 -230
- deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
- deltacat/tests/stats/__init__.py +0 -0
- deltacat/tests/stats/test_intervals.py +0 -49
- /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
- /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/aws/constants.py
CHANGED
deltacat/compute/compactor/model/compact_partition_params.py
CHANGED

```diff
@@ -13,6 +13,10 @@ from deltacat.storage import (
     PartitionLocator,
     SortKey,
 )
+from deltacat.compute.resource_estimation import (
+    ResourceEstimationMethod,
+    EstimateResourcesParams,
+)
 from deltacat.compute.compactor_v2.constants import (
     MAX_RECORDS_PER_COMPACTED_FILE,
     MIN_DELTA_BYTES_IN_BATCH,
@@ -23,6 +27,8 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_MEMORY_BUFFER_PERCENTAGE,
     DEFAULT_DISABLE_COPY_BY_REFERENCE,
     DEFAULT_NUM_ROUNDS,
+    PARQUET_TO_PYARROW_INFLATION,
+    MAX_PARQUET_METADATA_SIZE,
 )
 from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -104,6 +110,22 @@ class CompactPartitionParams(dict):
         result.metrics_config = params.get("metrics_config")

         result.num_rounds = params.get("num_rounds", DEFAULT_NUM_ROUNDS)
+        result.parquet_to_pyarrow_inflation = params.get(
+            "parquet_to_pyarrow_inflation", PARQUET_TO_PYARROW_INFLATION
+        )
+        result.resource_estimation_method = ResourceEstimationMethod[
+            params.get(
+                "resource_estimation_method", ResourceEstimationMethod.DEFAULT.value
+            )
+        ]
+
+        # disable input split during rebase as the rebase files are already uniform
+        result.enable_input_split = (
+            params.get("rebase_source_partition_locator") is None
+        )
+        result.max_parquet_meta_size_bytes = params.get(
+            "max_parquet_meta_size_bytes", MAX_PARQUET_METADATA_SIZE
+        )

         if not importlib.util.find_spec("memray"):
             result.enable_profiler = False
@@ -413,6 +435,60 @@ class CompactPartitionParams(dict):
     def num_rounds(self, num_rounds: int) -> None:
         self["num_rounds"] = num_rounds

+    @property
+    def parquet_to_pyarrow_inflation(self) -> float:
+        """
+        The inflation factor for the parquet uncompressed_size_bytes to pyarrow table size.
+        """
+        return self["parquet_to_pyarrow_inflation"]
+
+    @parquet_to_pyarrow_inflation.setter
+    def parquet_to_pyarrow_inflation(self, value: float) -> None:
+        self["parquet_to_pyarrow_inflation"] = value
+
+    @property
+    def enable_input_split(self) -> bool:
+        """
+        When this is True, the input split will be always enabled for parquet files.
+        The input split feature will split the parquet files into individual row groups
+        so that we could process them in different nodes in parallel.
+        By default, input split is enabled for incremental compaction and disabled for rebase or backfill.
+        """
+        return self["enable_input_split"]
+
+    @enable_input_split.setter
+    def enable_input_split(self, value: bool) -> None:
+        self["enable_input_split"] = value
+
+    @property
+    def max_parquet_meta_size_bytes(self) -> int:
+        """
+        The maximum size of the parquet metadata in bytes. Used for allocating tasks
+        to fetch parquet metadata.
+        """
+        return self["max_parquet_meta_size_bytes"]
+
+    @max_parquet_meta_size_bytes.setter
+    def max_parquet_meta_size_bytes(self, value: int) -> None:
+        self["max_parquet_meta_size_bytes"] = value
+
+    @property
+    def resource_estimation_method(self) -> ResourceEstimationMethod:
+        return self["resource_estimation_method"]
+
+    @resource_estimation_method.setter
+    def resource_estimation_method(self, value: ResourceEstimationMethod) -> None:
+        self["resource_estimation_method"] = value
+
+    @property
+    def estimate_resources_params(self) -> EstimateResourcesParams:
+        return EstimateResourcesParams.of(
+            resource_estimation_method=self.resource_estimation_method,
+            previous_inflation=self.previous_inflation,
+            parquet_to_pyarrow_inflation=self.parquet_to_pyarrow_inflation,
+            average_record_size_bytes=self.average_record_size_bytes,
+        )
+
     @staticmethod
     def json_handler_for_compact_partition_params(obj):
         """
```
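All four new knobs flow through the plain-dict constructor shown above. A minimal sketch of how a caller might set them, assuming `CompactPartitionParams.of` tolerates a partial dict (the other required compaction settings are omitted for brevity):

```python
from deltacat.compute.compactor.model.compact_partition_params import (
    CompactPartitionParams,
)

params = CompactPartitionParams.of(
    {
        # overrides the PARQUET_TO_PYARROW_INFLATION default of 4
        "parquet_to_pyarrow_inflation": 2.5,
        # looked up by enum *name*; falls back to ResourceEstimationMethod.DEFAULT
        "resource_estimation_method": "DEFAULT",
        # memory budget for tasks that fetch parquet footers (default 100 MB)
        "max_parquet_meta_size_bytes": 50_000_000,
    }
)

# enable_input_split is derived rather than passed: it is True only when
# no rebase_source_partition_locator is supplied.
assert params.enable_input_split is True
```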
deltacat/compute/compactor/model/compaction_session_audit_info.py
CHANGED

```diff
@@ -436,6 +436,22 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("compactorVersion")

+    @property
+    def observed_input_inflation(self) -> float:
+        """
+        The average inflation observed for input files only.
+        This only accounts for files in the source.
+        """
+        return self.get("observedInputInflation")
+
+    @property
+    def observed_input_average_record_size_bytes(self) -> float:
+        """
+        The average record size observed for input files only.
+        This only accounts for files in the source.
+        """
+        return self.get("observedInputAverageRecordSizeBytes")
+
     # Setters follow

     def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
@@ -756,6 +772,16 @@ class CompactionSessionAuditInfo(dict):
         self["compactorVersion"] = value
         return self

+    def set_observed_input_inflation(self, value: float) -> CompactionSessionAuditInfo:
+        self["observedInputInflation"] = value
+        return self
+
+    def set_observed_input_average_record_size_bytes(
+        self, value: float
+    ) -> CompactionSessionAuditInfo:
+        self["observedInputAverageRecordSizeBytes"] = value
+        return self
+
     # High level methods to save stats
     def save_step_stats(
         self,
```
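Since `CompactionSessionAuditInfo` is a `dict` subclass whose setters return `self`, the two new fields chain like the existing ones. A sketch with made-up values, assuming a bare constructor is acceptable:

```python
from deltacat.compute.compactor.model.compaction_session_audit_info import (
    CompactionSessionAuditInfo,
)

audit = CompactionSessionAuditInfo()  # assumption: bare construction is valid

audit.set_observed_input_inflation(3.8).set_observed_input_average_record_size_bytes(
    142.0  # illustrative value
)

assert audit.observed_input_inflation == 3.8
assert audit["observedInputAverageRecordSizeBytes"] == 142.0
```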
deltacat/compute/compactor/model/delta_annotated.py
CHANGED

```diff
@@ -69,6 +69,7 @@ class DeltaAnnotated(Delta):
         estimation_function: Optional[
             Callable[[ManifestEntry], float]
         ] = lambda entry: entry.meta.content_length,
+        enable_input_split: Optional[bool] = False,
     ) -> List[DeltaAnnotated]:
         """
         Simple greedy algorithm to split/merge 1 or more annotated deltas into
@@ -86,13 +87,19 @@ class DeltaAnnotated(Delta):
         new_da_bytes = 0
         da_group_entry_count = 0

-        for delta_annotated in annotated_deltas:
-            split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
+        if enable_input_split:
+            for delta_annotated in annotated_deltas:
+                split_annotated_deltas.extend(
+                    DeltaAnnotated._split_single(delta_annotated)
+                )

-        logger.info(
-            f"Split the {len(annotated_deltas)} annotated deltas "
-            f"into {len(split_annotated_deltas)} groups."
-        )
+            logger.info(
+                f"Split the {len(annotated_deltas)} annotated deltas "
+                f"into {len(split_annotated_deltas)} groups."
+            )
+        else:
+            logger.info("Skipping input split as it is disabled...")
+            split_annotated_deltas = annotated_deltas

         for src_da in split_annotated_deltas:
             src_da_annotations = src_da.annotations
@@ -107,7 +114,7 @@ class DeltaAnnotated(Delta):
             # (i.e. the previous compaction round ran a rebase)
             if new_da and src_da.locator != new_da.locator:
                 groups.append(new_da)
-                logger.info(
+                logger.debug(
                     f"Due to different delta locator, Appending group of {da_group_entry_count} elements "
                     f"and {new_da_bytes} bytes"
                 )
@@ -126,12 +133,12 @@ class DeltaAnnotated(Delta):
                 or da_group_entry_count >= min_file_counts
             ):
                 if new_da_bytes >= min_delta_bytes:
-                    logger.info(
+                    logger.debug(
                         f"Appending group of {da_group_entry_count} elements "
                         f"and {new_da_bytes} bytes to meet file size limit"
                     )
                 if da_group_entry_count >= min_file_counts:
-                    logger.info(
+                    logger.debug(
                         f"Appending group of {da_group_entry_count} elements "
                         f"and {da_group_entry_count} files to meet file count limit"
                     )
```
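For orientation, the thresholds above drive a simple greedy grouping: accumulate manifest entries until either the byte floor or the file-count floor is reached, then emit the batch. A standalone sketch of that policy (not the deltacat implementation, which also tracks annotations and delta locators):

```python
from typing import List


def greedy_rebatch(
    entry_sizes: List[int], min_delta_bytes: int, min_file_counts: int
) -> List[List[int]]:
    batches, current, current_bytes = [], [], 0
    for size in entry_sizes:
        current.append(size)
        current_bytes += size
        # emit once either floor is met, mirroring the two logger.debug branches
        if current_bytes >= min_delta_bytes or len(current) >= min_file_counts:
            batches.append(current)
            current, current_bytes = [], 0
    if current:  # trailing partial batch
        batches.append(current)
    return batches


assert greedy_rebatch([5, 5, 1, 9], min_delta_bytes=10, min_file_counts=3) == [
    [5, 5],
    [1, 9],
]
```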
deltacat/compute/compactor_v2/constants.py
CHANGED

```diff
@@ -41,6 +41,9 @@ DROP_DUPLICATES = True
 # size in metadata to pyarrow table size.
 PARQUET_TO_PYARROW_INFLATION = 4

+# Maximum size of the parquet metadata
+MAX_PARQUET_METADATA_SIZE = 100_000_000  # 100 MB
+
 # By default, copy by reference is enabled
 DEFAULT_DISABLE_COPY_BY_REFERENCE = False

```
deltacat/compute/compactor_v2/private/compaction_utils.py
CHANGED

```diff
@@ -148,12 +148,8 @@ def _build_uniform_deltas(
         input_deltas=input_deltas,
         hash_bucket_count=params.hash_bucket_count,
         compaction_audit=mutable_compaction_audit,
+        compact_partition_params=params,
         deltacat_storage=params.deltacat_storage,
-        previous_inflation=params.previous_inflation,
-        min_delta_bytes=params.min_delta_bytes_in_batch,
-        min_file_counts=params.min_files_in_batch,
-        # disable input split during rebase as the rebase files are already uniform
-        enable_input_split=params.rebase_source_partition_locator is None,
        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
     )
     delta_discovery_end: float = time.monotonic()
@@ -400,6 +396,7 @@ def _merge(
         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
         ray_custom_resources=params.ray_custom_resources,
         memory_logs_enabled=params.memory_logs_enabled,
+        estimate_resources_params=params.estimate_resources_params,
     )

     def merge_input_provider(index, item) -> dict[str, MergeInput]:
@@ -463,6 +460,7 @@ def _hash_bucket(
         primary_keys=params.primary_keys,
         ray_custom_resources=params.ray_custom_resources,
         memory_logs_enabled=params.memory_logs_enabled,
+        estimate_resources_params=params.estimate_resources_params,
     )

     def hash_bucket_input_provider(index, item) -> dict[str, HashBucketInput]:
@@ -537,6 +535,7 @@ def _run_local_merge(
         ray_custom_resources=params.ray_custom_resources,
         primary_keys=params.primary_keys,
         memory_logs_enabled=params.memory_logs_enabled,
+        estimate_resources_params=params.estimate_resources_params,
     )
     local_merge_result = ray.get(
         mg.merge.options(**local_merge_options).remote(local_merge_input)
@@ -666,6 +665,11 @@ def _write_new_round_completion_file(
         f" and average record size={input_average_record_size_bytes}"
     )

+    mutable_compaction_audit.set_observed_input_inflation(input_inflation)
+    mutable_compaction_audit.set_observed_input_average_record_size_bytes(
+        input_average_record_size_bytes
+    )
+
     _update_and_upload_compaction_audit(
         params,
         mutable_compaction_audit,
```
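The `estimate_resources_params` property threaded into `_merge`, `_hash_bucket`, and `_run_local_merge` above is just a bundle of the estimation knobs. Built by hand it would look like the following sketch (field names come from this diff; the values are illustrative):

```python
from deltacat.compute.resource_estimation import (
    EstimateResourcesParams,
    ResourceEstimationMethod,
)

estimate_resources_params = EstimateResourcesParams.of(
    resource_estimation_method=ResourceEstimationMethod.DEFAULT,
    previous_inflation=3.0,           # illustrative
    parquet_to_pyarrow_inflation=4,   # the 1.1.19 default constant
    average_record_size_bytes=256.0,  # illustrative
)
```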
deltacat/compute/compactor_v2/utils/content_type_params.py
CHANGED

```diff
@@ -1,66 +1,217 @@
 import logging
+import ray
+import functools
+from deltacat.compute.compactor_v2.constants import (
+    TASK_MAX_PARALLELISM,
+    MAX_PARQUET_METADATA_SIZE,
+)
+from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat import logs
 from deltacat.storage import (
     Delta,
+    ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )
 from typing import Dict, Optional, Any
-from deltacat.types.media import TableType
+from deltacat.types.media import TableType
 from deltacat.types.media import ContentType
 from deltacat.types.partial_download import PartialParquetParameters
+from deltacat.exceptions import RetryableError

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


+def append_content_type_params_options_provider(
+    index: int, item: Any, max_parquet_meta_size_bytes: int, **kwargs
+) -> Dict:
+    task_opts = {
+        "num_cpus": 0.01,
+        "memory": max_parquet_meta_size_bytes,
+        "scheduling_strategy": "DEFAULT",
+    }
+
+    task_opts["max_retries"] = 3
+    task_opts["retry_exceptions"] = [RetryableError]
+
+    return task_opts
+
+
+def _contains_partial_parquet_parameters(entry: ManifestEntry) -> bool:
+    return (
+        entry.meta
+        and entry.meta.content_type_parameters
+        and any(
+            isinstance(type_params, PartialParquetParameters)
+            for type_params in entry.meta.content_type_parameters
+        )
+    )
+
+
+APPEND_CONTENT_TYPE_PARAMS_CACHE = "append_content_type_params_cache"
+# At this point, it's better to fetch all parquet than to cache and
+# call actor which is not expected to support high throughput.
+MINIMUM_ENTRIES_TO_CACHE = 10
+
+
+@ray.remote
+class AppendContentTypeParamsCache:
+    """
+    This actor caches the delta that contains content type meta.
+    """
+
+    def __init__(self):
+        self.cache = {}
+
+    def get(self, key):
+        return self.cache.get(key)
+
+    def put(self, key, value):
+        self.cache[key] = value
+
+
+@ray.remote
+def _download_parquet_metadata_for_manifest_entry(
+    delta: Delta,
+    entry_index: int,
+    deltacat_storage: unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
+) -> Dict[str, Any]:
+    pq_file = deltacat_storage.download_delta_manifest_entry(
+        delta,
+        entry_index=entry_index,
+        table_type=TableType.PYARROW_PARQUET,
+        **deltacat_storage_kwargs,
+    )
+
+    return {
+        "entry_index": entry_index,
+        "partial_parquet_params": PartialParquetParameters.of(
+            pq_metadata=pq_file.metadata
+        ),
+    }
+
+
 def append_content_type_params(
     delta: Delta,
+    task_max_parallelism: int = TASK_MAX_PARALLELISM,
+    max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
 ) -> None:
+    """
+    This operation appends content type params into the delta entry. Note
+    that this operation can be time consuming, hence we cache it in a Ray actor.
+    """

-    if delta.meta.content_type != ContentType.PARQUET.value:
-        logger.info(
-            f"Delta with locator {delta.locator} is not a parquet delta, "
-            "skipping appending content type parameters."
-        )
+    if not delta.meta:
+        logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
         return

+    entry_indices_to_download = []
+    for entry_index, entry in enumerate(delta.manifest.entries):
+        if (
+            not _contains_partial_parquet_parameters(entry)
+            and entry.meta
+            and entry.meta.content_type == ContentType.PARQUET.value
+        ):
+            entry_indices_to_download.append(entry_index)

-            table_type=TableType.PYARROW_PARQUET,
-            **deltacat_storage_kwargs,
+    if not entry_indices_to_download:
+        logger.info(
+            f"No parquet type params to download for delta with locator {delta.locator}."
         )
+        return None

+    ray_namespace = ray.get_runtime_context().namespace
+    logger.info(
+        f"Got Ray namespace: {ray_namespace}. "
+        "Note that caching only works with non-anonymous namespace."
+        "To set a non-anonymous namespace, call ray.init(namespace='X')."
+    )
+    if len(entry_indices_to_download) >= MINIMUM_ENTRIES_TO_CACHE:
+        logger.info(
+            f"Checking if cache contains parquet meta in namespace {ray_namespace} for "
+            f"delta locator {delta.locator} and digest {delta.locator.hexdigest()}..."
+        )
+        cache = AppendContentTypeParamsCache.options(
+            name=APPEND_CONTENT_TYPE_PARAMS_CACHE,
+            namespace=ray_namespace,
+            get_if_exists=True,
+        ).remote()

+        logger.info(f"Got cache actor: {cache}")
+        cached_value = ray.get(cache.get.remote(delta.locator.hexdigest()))
+        if cached_value is not None:
+            logger.info(
+                "Using cached parquet meta for delta with locator"
+                f" {delta.locator} and digest {delta.locator.hexdigest()}."
+            )
+            delta.manifest = cached_value.manifest
+            return
+        logger.info(
+            f"Cache doesn't contain parquet meta for delta with locator {delta.locator}."
         )

+    options_provider = functools.partial(
+        append_content_type_params_options_provider,
+        max_parquet_meta_size_bytes=max_parquet_meta_size_bytes,
+    )

+    def input_provider(index, item) -> Dict:
+        return {
+            "deltacat_storage_kwargs": deltacat_storage_kwargs,
+            "deltacat_storage": deltacat_storage,
+            "delta": delta,
+            "entry_index": item,
+        }

+    logger.info(
+        f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
+    )
+    pq_files_promise = invoke_parallel(
+        entry_indices_to_download,
+        ray_task=_download_parquet_metadata_for_manifest_entry,
+        max_parallelism=task_max_parallelism,
+        options_provider=options_provider,
+        kwargs_provider=input_provider,
+    )
+
+    partial_file_meta_list = ray.get(pq_files_promise)
+
+    logger.info(
+        f"Downloaded parquet meta for {len(entry_indices_to_download)} manifest entries"
+    )
+
+    assert len(partial_file_meta_list) == len(
+        entry_indices_to_download
+    ), f"Expected {len(entry_indices_to_download)} pq files, got {len(partial_file_meta_list)}"
+
+    for index, entry_index in enumerate(entry_indices_to_download):
+        assert (
+            entry_index == partial_file_meta_list[index]["entry_index"]
+        ), "entry_index must match with the associated parquet meta"
+        entry = delta.manifest.entries[entry_index]
         if not entry.meta.content_type_parameters:
             entry.meta.content_type_parameters = []
+        entry.meta.content_type_parameters.append(
+            partial_file_meta_list[index]["partial_parquet_params"]
+        )
+
+    for entry_index, entry in enumerate(delta.manifest.entries):
+        assert _contains_partial_parquet_parameters(
+            entry
+        ), "partial parquet params validation failed."

+    if len(entry_indices_to_download) >= MINIMUM_ENTRIES_TO_CACHE:
+        cache = AppendContentTypeParamsCache.options(
+            name=APPEND_CONTENT_TYPE_PARAMS_CACHE,
+            namespace=ray_namespace,
+            get_if_exists=True,
+        ).remote()
+        logger.info(f"Got cache actor when writing: {cache}")
+        logger.info(
+            f"Caching parquet meta for delta with locator {delta.locator} "
+            f"and digest {delta.locator.hexdigest()}..."
+        )
+        ray.get(cache.put.remote(delta.locator.hexdigest(), delta))
+        assert ray.get(cache.get.remote(delta.locator.hexdigest())) is not None
```
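The caching above leans on Ray's named-actor idiom: `.options(name=..., namespace=..., get_if_exists=True)` creates the actor on first use and returns the existing handle afterwards, so every compaction job in the same non-anonymous namespace shares one cache. A minimal standalone sketch of the same pattern:

```python
import ray


@ray.remote
class Cache:
    def __init__(self):
        self.store = {}

    def get(self, key):
        return self.store.get(key)

    def put(self, key, value):
        self.store[key] = value


# a non-anonymous namespace is required for name-based actor lookup
ray.init(namespace="compaction")

cache = Cache.options(name="my_cache", get_if_exists=True).remote()
ray.get(cache.put.remote("k", 1))

# a second caller with the same name and namespace gets the same actor
same_cache = Cache.options(name="my_cache", get_if_exists=True).remote()
assert ray.get(same_cache.get.remote("k")) == 1
```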
deltacat/compute/compactor_v2/utils/io.py
CHANGED

```diff
@@ -1,6 +1,5 @@
 import logging
 import functools
-from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.storage import (
     PartitionLocator,
     Delta,
@@ -9,11 +8,10 @@ from deltacat.storage import (
 from deltacat import logs
 from deltacat.compute.compactor.utils import io as io_v1
 from deltacat.compute.compactor import DeltaAnnotated
-from typing import Dict, List, Optional, Any
-from deltacat.compute.compactor_v2.constants import (
-    MIN_FILES_IN_BATCH,
-    MIN_DELTA_BYTES_IN_BATCH,
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
 )
+from typing import Dict, List, Optional, Any
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
@@ -25,6 +23,10 @@ from deltacat.compute.compactor_v2.utils.content_type_params import (
 )
 from deltacat.utils.metrics import metrics
 from deltacat.compute.compactor_v2.constants import DISCOVER_DELTAS_METRIC_PREFIX
+from deltacat.compute.resource_estimation.manifest import (
+    does_require_content_type_params,
+)
+from deltacat.compute.resource_estimation.model import OperationType

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -90,10 +92,7 @@ def create_uniform_input_deltas(
     input_deltas: List[Delta],
     hash_bucket_count: int,
     compaction_audit: CompactionSessionAuditInfo,
-    min_delta_bytes: Optional[float] = MIN_DELTA_BYTES_IN_BATCH,
-    min_file_counts: Optional[float] = MIN_FILES_IN_BATCH,
-    previous_inflation: Optional[float] = PYARROW_INFLATION_MULTIPLIER,
-    enable_input_split: Optional[bool] = False,
+    compact_partition_params: CompactPartitionParams,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
 ) -> List[DeltaAnnotated]:
@@ -104,11 +103,21 @@ def create_uniform_input_deltas(
     input_da_list = []

     for delta in input_deltas:
-        if enable_input_split:
+        if (
+            compact_partition_params.enable_input_split
+            or does_require_content_type_params(
+                compact_partition_params.resource_estimation_method
+            )
+        ):
+            logger.debug(
+                f"Delta with locator: {delta.locator} requires content type params..."
+            )
             append_content_type_params(
                 delta=delta,
                 deltacat_storage=deltacat_storage,
                 deltacat_storage_kwargs=deltacat_storage_kwargs,
+                task_max_parallelism=compact_partition_params.task_max_parallelism,
+                max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
             )

         manifest_entries = delta.manifest.entries
@@ -118,7 +127,9 @@ def create_uniform_input_deltas(
             entry = manifest_entries[entry_index]
             delta_bytes += entry.meta.content_length
             estimated_da_bytes += estimate_manifest_entry_size_bytes(
-                entry=entry,
+                entry=entry,
+                operation_type=OperationType.PYARROW_DOWNLOAD,
+                estimate_resources_params=compact_partition_params.estimate_resources_params,
             )

         delta_annotated = DeltaAnnotated.of(delta)
@@ -129,13 +140,16 @@ def create_uniform_input_deltas(
     logger.info(f"Input delta files to compact: {delta_manifest_entries_count}")

     size_estimation_function = functools.partial(
-        estimate_manifest_entry_size_bytes,
+        estimate_manifest_entry_size_bytes,
+        operation_type=OperationType.PYARROW_DOWNLOAD,
+        estimate_resources_params=compact_partition_params.estimate_resources_params,
     )
     rebatched_da_list = DeltaAnnotated.rebatch(
         input_da_list,
-        min_delta_bytes=min_delta_bytes,
-        min_file_counts=min_file_counts,
+        min_delta_bytes=compact_partition_params.min_delta_bytes_in_batch,
+        min_file_counts=compact_partition_params.min_files_in_batch,
         estimation_function=size_estimation_function,
+        enable_input_split=compact_partition_params.enable_input_split,
     )

     compaction_audit.set_input_size_bytes(delta_bytes)
```
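Both call sites above bind the two new required kwargs with `functools.partial` so the result still satisfies the one-entry `estimation_function` shape that `DeltaAnnotated.rebatch` expects. A sketch, assuming `estimate_manifest_entry_size_bytes` is exported from the new `deltacat.compute.resource_estimation.manifest` module:

```python
import functools

from deltacat.compute.resource_estimation.manifest import (
    estimate_manifest_entry_size_bytes,  # assumed export location
)
from deltacat.compute.resource_estimation.model import OperationType


def make_size_estimator(estimate_resources_params):
    # Bind everything except `entry`, so the partial still matches the
    # Callable[[ManifestEntry], float] signature rebatch expects.
    return functools.partial(
        estimate_manifest_entry_size_bytes,
        operation_type=OperationType.PYARROW_DOWNLOAD,
        estimate_resources_params=estimate_resources_params,
    )


# usage: size_fn = make_size_estimator(params.estimate_resources_params)
#        estimated_bytes = size_fn(entry=manifest_entry)
```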
deltacat/compute/compactor_v2/utils/primary_key_index.py
CHANGED

```diff
@@ -27,8 +27,11 @@ def _append_sha1_hash_to_table(table: pa.Table, hash_column: pa.Array) -> pa.Table:

     result = []
     for hash_value in hash_column_np:
-        result.append(hashlib.sha1(hash_value.encode("utf-8")).hexdigest())
+        if hash_value is None:
+            result.append(None)
+            logger.info("A primary key hash is null")
+        else:
+            result.append(hashlib.sha1(hash_value.encode("utf-8")).hexdigest())

     return sc.append_pk_hash_string_column(table, result)

@@ -191,7 +194,7 @@ def generate_pk_hash_column(
         pk_columns.append(sliced_string_cast(table[pk_name]))

         pk_columns.append(PK_DELIMITER)
-        hash_column = pc.binary_join_element_wise(*pk_columns)
+        hash_column = pc.binary_join_element_wise(*pk_columns, null_handling="replace")
         return hash_column

     def _generate_uuid(table: pa.Table) -> pa.Array:
@@ -345,8 +348,10 @@ def hash_group_index_to_hash_bucket_indices(
     return range(hb_group, num_buckets, num_groups)


-def pk_digest_to_hash_bucket_index(digest: str, num_buckets: int) -> int:
+def pk_digest_to_hash_bucket_index(digest: Optional[str], num_buckets: int) -> int:
    """
    Generates the hash bucket index from the given digest.
    """
+    if digest is None:
+        return 0
    return int(digest, 16) % num_buckets
```
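The `null_handling="replace"` switch changes how pyarrow joins primary-key components when one of them is null: the default `emit_null` nulls out the entire joined value, whereas `replace` substitutes `null_replacement` (the empty string by default). A quick demonstration against stock pyarrow:

```python
import pyarrow as pa
import pyarrow.compute as pc

a = pa.array(["x", None])
b = pa.array(["y", "z"])

# old behavior (default null_handling="emit_null"): row 2 becomes null
print(pc.binary_join_element_wise(a, b, "|"))
# -> ["x|y", null]

# new behavior: the null component is replaced with "" before joining
print(pc.binary_join_element_wise(a, b, "|", null_handling="replace"))
# -> ["x|y", "|z"]
```

Together with the `None` guards added to `_append_sha1_hash_to_table` and `pk_digest_to_hash_bucket_index`, null primary-key digests are now tolerated and deterministically map to hash bucket 0 instead of raising.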
|