deltacat-1.1.8-py3-none-any.whl → deltacat-1.1.10-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +6 -0
- deltacat/aws/redshift/model/manifest.py +16 -0
- deltacat/aws/s3u.py +65 -38
- deltacat/compute/compactor/compaction_session.py +5 -1
- deltacat/compute/compactor/model/compact_partition_params.py +12 -1
- deltacat/compute/compactor/model/materialize_result.py +0 -4
- deltacat/compute/compactor/repartition_session.py +1 -0
- deltacat/compute/compactor/utils/round_completion_file.py +39 -9
- deltacat/compute/compactor_v2/compaction_session.py +26 -16
- deltacat/compute/compactor_v2/constants.py +5 -11
- deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
- deltacat/compute/compactor_v2/model/merge_input.py +6 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
- deltacat/compute/compactor_v2/steps/merge.py +12 -12
- deltacat/compute/compactor_v2/utils/merge.py +1 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +2 -12
- deltacat/exceptions.py +342 -7
- deltacat/io/dataset.py +5 -17
- deltacat/io/memcached_object_store.py +7 -4
- deltacat/storage/__init__.py +24 -0
- deltacat/storage/interface.py +56 -6
- deltacat/storage/model/delta.py +23 -3
- deltacat/storage/model/partition.py +6 -7
- deltacat/storage/model/partition_spec.py +71 -0
- deltacat/storage/model/stream.py +38 -1
- deltacat/storage/model/transform.py +127 -0
- deltacat/tests/aws/test_s3u.py +2 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
- deltacat/tests/compute/compact_partition_test_cases.py +4 -2
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
- deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
- deltacat/tests/compute/test_util_common.py +19 -4
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -2
- deltacat/tests/local_deltacat_storage/__init__.py +124 -29
- deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
- deltacat/tests/test_exceptions.py +100 -0
- deltacat/tests/test_logs.py +1 -0
- deltacat/tests/test_utils/pyarrow.py +4 -1
- deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
- deltacat/tests/utils/test_daft.py +0 -1
- deltacat/tests/utils/test_resources.py +0 -28
- deltacat/utils/daft.py +3 -0
- deltacat/utils/numpy.py +3 -3
- deltacat/utils/pandas.py +3 -3
- deltacat/utils/pyarrow.py +11 -8
- deltacat/utils/ray_utils/dataset.py +7 -7
- deltacat/utils/ray_utils/runtime.py +2 -2
- deltacat/utils/resources.py +0 -45
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
- deltacat/io/aws/redshift/redshift_datasource.py +0 -578
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/aws/constants.py
CHANGED
@@ -1,9 +1,14 @@
 import botocore
 from typing import Set
+from daft.exceptions import DaftTransientError
 
 from deltacat.utils.common import env_integer, env_string
 
+
 DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE", 8)
+DEFAULT_FILE_READ_TIMEOUT_MS = env_integer(
+    "DEFAULT_FILE_READ_TIMEOUT_MS", 300_000
+)  # 5 mins
 BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
 BOTO_TIMEOUT_ERROR_CODES: Set[str] = {"ReadTimeoutError", "ConnectTimeoutError"}
 BOTO_THROTTLING_ERROR_CODES: Set[str] = {"Throttling", "SlowDown"}
@@ -14,6 +19,7 @@ RETRYABLE_TRANSIENT_ERRORS = (
     botocore.exceptions.NoCredentialsError,
     botocore.exceptions.ConnectTimeoutError,
     botocore.exceptions.ReadTimeoutError,
+    DaftTransientError,
 )
 AWS_REGION = env_string("AWS_REGION", "us-east-1")
 UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY = env_integer(
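The new DEFAULT_FILE_READ_TIMEOUT_MS constant is resolved through env_integer, so it can be overridden per process. A minimal sketch of that override, assuming env_integer consults os.environ at import time (as the diff above suggests); the 600000 value is only an example:

```python
# Raise the file-read timeout before deltacat's constants module is imported.
import os

os.environ["DEFAULT_FILE_READ_TIMEOUT_MS"] = "600000"  # 10 minutes instead of the 5-minute default

from deltacat.aws.constants import DEFAULT_FILE_READ_TIMEOUT_MS

print(DEFAULT_FILE_READ_TIMEOUT_MS)  # expected: 600000 if the override is picked up
```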
deltacat/aws/redshift/model/manifest.py
CHANGED
@@ -99,6 +99,8 @@ class Manifest(dict):
         total_source_content_length = 0
         content_type = None
         content_encoding = None
+        partition_values_set = set()
+        partition_values = None
         if entries:
             content_type = entries[0].meta.content_type
             content_encoding = entries[0].meta.content_encoding
@@ -127,6 +129,12 @@ class Manifest(dict):
             total_record_count += meta.record_count or 0
             total_content_length += meta.content_length or 0
             total_source_content_length += meta.source_content_length or 0
+            if len(partition_values_set) <= 1:
+                partition_values_set.add(entry.meta.partition_values)
+
+        if len(partition_values_set) == 1:
+            partition_values = partition_values_set.pop()
+
         meta = ManifestMeta.of(
             total_record_count,
             total_content_length,
@@ -134,6 +142,7 @@ class Manifest(dict):
             content_encoding,
             total_source_content_length,
             entry_type=entry_type,
+            partition_values=partition_values,
         )
         manifest = Manifest._build_manifest(meta, entries, author, uuid, entry_type)
         return manifest
@@ -185,6 +194,7 @@ class ManifestMeta(dict):
         credentials: Optional[Dict[str, str]] = None,
         content_type_parameters: Optional[List[Dict[str, str]]] = None,
         entry_type: Optional[EntryType] = None,
+        partition_values: Optional[List[str]] = None,
     ) -> ManifestMeta:
         manifest_meta = ManifestMeta()
         if record_count is not None:
@@ -203,6 +213,8 @@ class ManifestMeta(dict):
             manifest_meta["credentials"] = credentials
         if entry_type is not None:
             manifest_meta["entry_type"] = entry_type.value
+        if partition_values is not None:
+            manifest_meta["partition_values"] = partition_values
         return manifest_meta
 
     @property
@@ -244,6 +256,10 @@ class ManifestMeta(dict):
             return EntryType(self["entry_type"])
         return val
 
+    @property
+    def partition_values(self) -> Optional[List[str]]:
+        return self.get("partition_values")
+
 
 class ManifestAuthor(dict):
     @staticmethod
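The manifest-level partition_values is only populated when every entry reports the same value. An illustrative re-implementation of that consolidation rule (not the deltacat code; this sketch tuple-izes the values so they are hashable, whereas the real code adds entry.meta.partition_values directly):

```python
from typing import List, Optional


def consolidate_partition_values(
    entry_values: List[Optional[List[str]]],
) -> Optional[List[str]]:
    """Return the shared partition values, or None if entries disagree."""
    seen = set()
    for values in entry_values:
        if len(seen) <= 1:
            # tuple-ize so list values are hashable in this sketch
            seen.add(tuple(values) if values is not None else None)
    if len(seen) == 1:
        only = seen.pop()
        return list(only) if only is not None else None
    return None


print(consolidate_partition_values([["2024-01-01"], ["2024-01-01"]]))  # ['2024-01-01']
print(consolidate_partition_values([["2024-01-01"], ["2024-01-02"]]))  # None
```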
deltacat/aws/s3u.py
CHANGED
@@ -21,19 +21,17 @@ from boto3.resources.base import ServiceResource
 from botocore.client import BaseClient
 from botocore.exceptions import ClientError
 from ray.data.block import Block, BlockAccessor, BlockMetadata
-from ray.data.datasource import BlockWritePathProvider
+from ray.data.datasource import FilenameProvider
 from ray.types import ObjectRef
 from tenacity import (
     Retrying,
     retry_if_exception_type,
-    retry_if_not_exception_type,
     stop_after_delay,
     wait_random_exponential,
 )
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
 import deltacat.aws.clients as aws_utils
 from deltacat import logs
-from deltacat.exceptions import NonRetryableError, RetryableError
 from deltacat.storage import (
     DistributedDataset,
     LocalDataset,
@@ -55,14 +53,23 @@ from deltacat.types.tables import (
     DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
     get_table_length,
 )
+from deltacat.exceptions import (
+    RetryableError,
+    RetryableUploadTableError,
+    RetryableDownloadTableError,
+    RetryableDownloadFileError,
+    RetryableUploadFileError,
+    NonRetryableDownloadFileError,
+    NonRetryableUploadFileError,
+    NonRetryableUploadTableError,
+    NonRetryableDownloadTableError,
+)
 from deltacat.types.partial_download import PartialFileDownloadParams
 from deltacat.utils.common import ReadKwargsProvider
+from deltacat.exceptions import categorize_errors
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-# TODO(raghumdani): refactor redshift datasource to reuse the
-# same module for writing output files.
-
 
 class CapturedBlockWritePaths:
     def __init__(self):
@@ -90,12 +97,15 @@ class CapturedBlockWritePaths:
         return self._block_refs
 
 
-class UuidBlockWritePathProvider(BlockWritePathProvider):
+class UuidBlockWritePathProvider(FilenameProvider):
     """Block write path provider implementation that writes each
     dataset block out to a file of the form: {base_path}/{uuid}
     """
 
-    def __init__(self, capture_object: CapturedBlockWritePaths):
+    def __init__(
+        self, capture_object: CapturedBlockWritePaths, base_path: Optional[str] = None
+    ):
+        self.base_path = base_path
         self.write_paths: List[str] = []
         self.block_refs: List[ObjectRef[Block]] = []
         self.capture_object = capture_object
@@ -107,6 +117,19 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
             self.block_refs,
         )
 
+    def get_filename_for_block(
+        self, block: Any, task_index: int, block_index: int
+    ) -> str:
+        if self.base_path is None:
+            raise ValueError(
+                "Base path must be provided to UuidBlockWritePathProvider",
+            )
+        return self._get_write_path_for_block(
+            base_path=self.base_path,
+            block=block,
+            block_index=block_index,
+        )
+
     def _get_write_path_for_block(
         self,
         base_path: str,
@@ -133,13 +156,6 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
         block_index: Optional[int] = None,
         file_format: Optional[str] = None,
     ) -> str:
-        """
-        TODO: BlockWritePathProvider is deprecated as of Ray version 2.20.0. Please use FilenameProvider.
-        See: https://docs.ray.io/en/master/data/api/doc/ray.data.datasource.FilenameProvider.html
-        Also See: https://github.com/ray-project/deltacat/issues/299
-
-        Hence, this class only works with Ray version 2.20.0 or lower when used in Ray Dataset.
-        """
         return self._get_write_path_for_block(
             base_path,
             filesystem=filesystem,
@@ -232,6 +248,7 @@ def filter_objects_by_prefix(
         more_objects_to_list = params["ContinuationToken"] is not None
 
 
+@categorize_errors
 def read_file(
     s3_url: str,
     content_type: ContentType,
@@ -263,15 +280,15 @@ def read_file(
             in BOTO_TIMEOUT_ERROR_CODES | BOTO_THROTTLING_ERROR_CODES
         ):
             # Timeout error not caught by botocore
-            raise RetryableError(
-                f"Retry table download from: {s3_url} after receiving {type(e).__name__}"
+            raise RetryableDownloadTableError(
+                f"Retry table download from: {s3_url} after receiving {type(e).__name__}",
             ) from e
-        raise NonRetryableError(
+        raise NonRetryableDownloadTableError(
             f"Failed table download from: {s3_url} after receiving {type(e).__name__}"
         ) from e
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise RetryableError(
-            f"Retry upload for: {s3_url} after receiving {type(e).__name__}"
+        raise RetryableDownloadTableError(
+            f"Retry download for: {s3_url} after receiving {type(e).__name__}"
         ) from e
     except BaseException as e:
         logger.warn(
@@ -279,7 +296,10 @@ def read_file(
             f"and encoding={content_encoding}. Error: {e}",
             exc_info=True,
         )
-        raise
+        raise NonRetryableDownloadTableError(
+            f"Read has failed for {s3_url} and content_type={content_type} "
+            f"and encoding={content_encoding}",
+        ) from e
 
 
 def upload_sliced_table(
@@ -378,29 +398,31 @@ def upload_table(
     except ClientError as e:
         if e.response["Error"]["Code"] == "NoSuchKey":
             # s3fs may swallow S3 errors - we were probably throttled
-            raise RetryableError(
-                f"Retry table upload from: {s3_url} after receiving {type(e).__name__}"
+            raise RetryableUploadTableError(
+                f"Retry table upload from: {s3_url} after receiving {type(e).__name__}",
             ) from e
         if (
             e.response["Error"]["Code"]
             in BOTO_TIMEOUT_ERROR_CODES | BOTO_THROTTLING_ERROR_CODES
         ):
-            raise RetryableError(
-                f"Retry table upload from: {s3_url} after receiving {type(e).__name__}"
+            raise RetryableUploadTableError(
+                f"Retry table upload from: {s3_url} after receiving {type(e).__name__}",
            ) from e
-        raise NonRetryableError(
-            f"Failed table upload to: {s3_url} after receiving {type(e).__name__}"
+        raise NonRetryableUploadTableError(
+            f"Failed table upload to: {s3_url} after receiving {type(e).__name__}",
         ) from e
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise RetryableError(
-            f"Retry upload for: {s3_url} after receiving {type(e).__name__}"
+        raise RetryableUploadTableError(
+            f"Retry upload for: {s3_url} after receiving {type(e).__name__}",
        ) from e
     except BaseException as e:
         logger.warn(
             f"Upload has failed for {s3_url} and content_type={content_type}. Error: {e}",
             exc_info=True,
         )
-        raise
+        raise NonRetryableUploadTableError(
+            f"Upload has failed for {s3_url} and content_type={content_type} because of {type(e).__name__}",
+        ) from e
     return manifest_entries
 
 
@@ -443,7 +465,7 @@ def download_manifest_entry(
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
         stop=stop_after_delay(DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY),
-        retry=retry_if_not_exception_type(NonRetryableError),
+        retry=retry_if_exception_type(RetryableError),
     )
     table = retrying(
         read_file,
@@ -559,12 +581,15 @@ def _put_object(
         )
     except ClientError as e:
         if e.response["Error"]["Code"] in BOTO_THROTTLING_ERROR_CODES:
-            raise RetryableError(
-                f"Retry upload for: {bucket}/{key} after receiving {type(e).__name__}"
+            error_code = e.response["Error"]["Code"]
+            raise RetryableUploadFileError(
+                f"Retry upload for: {bucket}/{key} after receiving {error_code}",
             ) from e
-        raise
+        raise NonRetryableUploadFileError(
+            f"Failed table upload to: {bucket}/{key}"
+        ) from e
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise RetryableError(
+        raise RetryableUploadFileError(
             f"Retry upload for: {bucket}/{key} after receiving {type(e).__name__}"
         ) from e
     except BaseException as e:
@@ -572,7 +597,9 @@ def _put_object(
             f"Upload has failed for {bucket}/{key}. Error: {type(e).__name__}",
             exc_info=True,
         )
-        raise
+        raise NonRetryableUploadFileError(
+            f"Failed table upload to: {bucket}/{key}"
+        ) from e
 
 
 def download(
@@ -604,12 +631,12 @@ def _get_object(s3_client, bucket: str, key: str, fail_if_not_found: bool = True
     except ClientError as e:
         if e.response["Error"]["Code"] == "NoSuchKey":
             if fail_if_not_found:
-                raise NonRetryableError(
+                raise NonRetryableDownloadFileError(
                     f"Failed get object from: {bucket}/{key}"
                 ) from e
             logger.info(f"file not found: {bucket}/{key}")
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise RetryableError(
+        raise RetryableDownloadFileError(
             f"Retry get object: {bucket}/{key} after receiving {type(e).__name__}"
         ) from e
 
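The pattern these s3u.py changes standardize on: transient failures are re-raised as subclasses of RetryableError so the tenacity Retrying loop (retry=retry_if_exception_type(RetryableError)) retries them, while everything else becomes a NonRetryable* error that surfaces immediately. A small, self-contained sketch of that shape, illustrative only; the exception names mirror the diff, but the helper below is hypothetical:

```python
import random

from tenacity import (
    Retrying,
    retry_if_exception_type,
    stop_after_delay,
    wait_random_exponential,
)


class RetryableError(Exception): ...
class RetryableDownloadTableError(RetryableError): ...
class NonRetryableDownloadTableError(Exception): ...


def flaky_download(s3_url: str) -> str:
    # Hypothetical stand-in for the boto3/s3fs call; times out about half the time.
    if random.random() < 0.5:
        raise TimeoutError("simulated read timeout")
    return "table-bytes"


def read_file(s3_url: str) -> str:
    try:
        return flaky_download(s3_url)
    except TimeoutError as e:
        # transient: wrap in a RetryableError subclass so the caller retries
        raise RetryableDownloadTableError(
            f"Retry table download from: {s3_url} after receiving {type(e).__name__}"
        ) from e
    except BaseException as e:
        # everything else surfaces immediately
        raise NonRetryableDownloadTableError(
            f"Failed table download from: {s3_url}"
        ) from e


retrying = Retrying(
    wait=wait_random_exponential(multiplier=1, max=2),
    stop=stop_after_delay(30),
    retry=retry_if_exception_type(RetryableError),
)
print(retrying(read_file, "s3://example-bucket/example-key"))
```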
deltacat/compute/compactor/compaction_session.py
CHANGED
@@ -193,6 +193,7 @@ def compact_partition(
         round_completion_file_s3_url = rcf.write_round_completion_file(
             compaction_artifact_s3_bucket,
             new_rcf_partition_locator,
+            partition.locator,
             new_rci,
             **s3_client_kwargs,
         )
@@ -312,7 +313,10 @@ def _execute_compaction_round(
     round_completion_info = None
     if not rebase_source_partition_locator:
         round_completion_info = rcf.read_round_completion_file(
-            compaction_artifact_s3_bucket, source_partition_locator, **s3_client_kwargs
+            compaction_artifact_s3_bucket,
+            source_partition_locator,
+            destination_partition_locator,
+            **s3_client_kwargs,
         )
         if not round_completion_info:
             logger.info(
deltacat/compute/compactor/model/compact_partition_params.py
CHANGED
@@ -21,6 +21,7 @@ from deltacat.compute.compactor_v2.constants import (
     TASK_MAX_PARALLELISM,
     DROP_DUPLICATES,
     TOTAL_MEMORY_BUFFER_PERCENTAGE,
+    DEFAULT_DISABLE_COPY_BY_REFERENCE,
 )
 from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -50,7 +51,6 @@ class CompactPartitionParams(dict):
 
         result = CompactPartitionParams(params)
 
-        # TODO: move defaults to single file
         result.records_per_compacted_file = params.get(
             "records_per_compacted_file", MAX_RECORDS_PER_COMPACTED_FILE
         )
@@ -92,6 +92,9 @@ class CompactPartitionParams(dict):
         result.hash_group_count = params.get(
             "hash_group_count", result.hash_bucket_count
         )
+        result.disable_copy_by_reference = params.get(
+            "disable_copy_by_reference", DEFAULT_DISABLE_COPY_BY_REFERENCE
+        )
         result.drop_duplicates = params.get("drop_duplicates", DROP_DUPLICATES)
         result.ray_custom_resources = params.get("ray_custom_resources")
 
@@ -238,6 +241,14 @@ class CompactPartitionParams(dict):
     def enable_profiler(self, value: bool) -> None:
         self["enable_profiler"] = value
 
+    @property
+    def disable_copy_by_reference(self) -> bool:
+        return self["disable_copy_by_reference"]
+
+    @disable_copy_by_reference.setter
+    def disable_copy_by_reference(self, value: bool) -> None:
+        self["disable_copy_by_reference"] = value
+
     @property
     def list_deltas_kwargs(self) -> dict:
         return self["list_deltas_kwargs"]
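CompactPartitionParams keeps every option in the underlying dict while exposing it through a property/setter pair, which is how the new disable_copy_by_reference flag is wired in. A stripped-down illustration of that dict-backed pattern (not the deltacat class; only the disable_copy_by_reference name and its default come from the diff):

```python
from typing import Any, Dict

DEFAULT_DISABLE_COPY_BY_REFERENCE = False  # default taken from the diff


class Params(dict):
    """Dict-backed params object: attribute-style access over plain dict storage."""

    @staticmethod
    def of(params: Dict[str, Any]) -> "Params":
        result = Params(params)
        result.disable_copy_by_reference = params.get(
            "disable_copy_by_reference", DEFAULT_DISABLE_COPY_BY_REFERENCE
        )
        return result

    @property
    def disable_copy_by_reference(self) -> bool:
        return self["disable_copy_by_reference"]

    @disable_copy_by_reference.setter
    def disable_copy_by_reference(self, value: bool) -> None:
        self["disable_copy_by_reference"] = value


params = Params.of({"disable_copy_by_reference": True})
print(params.disable_copy_by_reference, dict(params))
```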
deltacat/compute/compactor/model/materialize_result.py
CHANGED
@@ -55,10 +55,6 @@ class MaterializeResult(dict):
             self["paWriteResult"] = val = PyArrowWriteResult(val)
         return val
 
-    @property
-    def count_of_src_dfl_not_touched(self) -> int:
-        return self["countOfSrcFileNotTouched"]
-
     @property
     def referenced_pyarrow_write_result(self) -> PyArrowWriteResult:
         val: Dict[str, Any] = self.get("referencedPaWriteResult")
deltacat/compute/compactor/utils/round_completion_file.py
CHANGED
@@ -12,10 +12,17 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def get_round_completion_file_s3_url(
-    bucket: str, source_partition_locator: PartitionLocator
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    destination_partition_locator: Optional[PartitionLocator] = None,
 ) -> str:
 
     base_url = source_partition_locator.path(f"s3://{bucket}")
+    if destination_partition_locator:
+        base_url = destination_partition_locator.path(
+            f"s3://{bucket}/{source_partition_locator.hexdigest()}"
+        )
+
     return f"{base_url}.json"
 
 
@@ -23,20 +30,41 @@ def get_round_completion_file_s3_url(
 def read_round_completion_file(
     bucket: str,
     source_partition_locator: PartitionLocator,
+    destination_partition_locator: Optional[PartitionLocator] = None,
     **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> RoundCompletionInfo:
 
-    round_completion_file_url = get_round_completion_file_s3_url(
+    all_uris = []
+    if destination_partition_locator:
+        round_completion_file_url_with_destination = get_round_completion_file_s3_url(
+            bucket,
+            source_partition_locator,
+            destination_partition_locator,
+        )
+        all_uris.append(round_completion_file_url_with_destination)
+
+    # Note: we read from RCF at two different URI for backward
+    # compatibility reasons.
+    round_completion_file_url_prev = get_round_completion_file_s3_url(
         bucket,
         source_partition_locator,
     )
-    logger.info(f"reading round completion file from: {round_completion_file_url}")
+
+    all_uris.append(round_completion_file_url_prev)
+
     round_completion_info = None
-    result = s3_utils.download(round_completion_file_url, False, **s3_client_kwargs)
-    if result:
-        json_str = result["Body"].read().decode("utf-8")
-        round_completion_info = RoundCompletionInfo(json.loads(json_str))
-        logger.info(f"read round completion info: {round_completion_info}")
+
+    for rcf_uri in all_uris:
+        logger.info(f"Reading round completion file from: {rcf_uri}")
+        result = s3_utils.download(rcf_uri, False, **s3_client_kwargs)
+        if result:
+            json_str = result["Body"].read().decode("utf-8")
+            round_completion_info = RoundCompletionInfo(json.loads(json_str))
+            logger.info(f"Read round completion info: {round_completion_info}")
+            break
+        else:
+            logger.warn(f"Round completion file not present at {rcf_uri}")
+
     return round_completion_info
 
 
@@ -44,8 +72,9 @@ def read_round_completion_file(
 def write_round_completion_file(
     bucket: Optional[str],
     source_partition_locator: Optional[PartitionLocator],
+    destination_partition_locator: Optional[PartitionLocator],
     round_completion_info: RoundCompletionInfo,
-    completion_file_s3_url: str = None,
+    completion_file_s3_url: Optional[str] = None,
     **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> str:
     if bucket is None and completion_file_s3_url is None:
@@ -56,6 +85,7 @@ def write_round_completion_file(
         completion_file_s3_url = get_round_completion_file_s3_url(
             bucket,
             source_partition_locator,
+            destination_partition_locator,
         )
     logger.info(f"writing round completion file to: {completion_file_s3_url}")
     s3_utils.upload(
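With the destination-aware naming above, the round completion file for a (source, destination) pair lives under a key derived from both locators, and readers fall back to the legacy source-only key for files written by older versions. A schematic restatement of that lookup order, using hypothetical locator objects; only the URL shapes follow the diff:

```python
from typing import List, Optional


class FakeLocator:
    """Hypothetical stand-in for deltacat's PartitionLocator."""

    def __init__(self, name: str) -> None:
        self.name = name

    def path(self, prefix: str) -> str:
        return f"{prefix}/{self.name}"

    def hexdigest(self) -> str:
        return f"digest-of-{self.name}"


def candidate_rcf_urls(
    bucket: str, source: FakeLocator, destination: Optional[FakeLocator]
) -> List[str]:
    urls = []
    if destination:
        # new scheme: destination path nested under the source locator's digest
        urls.append(destination.path(f"s3://{bucket}/{source.hexdigest()}") + ".json")
    # legacy scheme, kept for backward compatibility: source-only path
    urls.append(source.path(f"s3://{bucket}") + ".json")
    return urls


print(candidate_rcf_urls("artifacts", FakeLocator("src"), FakeLocator("dst")))
# ['s3://artifacts/digest-of-src/dst.json', 's3://artifacts/src.json']
```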
deltacat/compute/compactor_v2/compaction_session.py
CHANGED
@@ -24,7 +24,7 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
-from deltacat.compute.compactor_v2.model.compaction_session import (
+from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
     ExecutionCompactionResult,
 )
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
@@ -77,6 +77,8 @@ from deltacat.compute.compactor_v2.utils.task_options import (
     local_merge_resource_options_provider,
 )
 from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+from deltacat.exceptions import categorize_errors
+from deltacat.compute.compactor_v2.constants import COMPACT_PARTITION_METRIC_PREFIX
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -85,7 +87,8 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-@metrics
+@metrics(prefix=COMPACT_PARTITION_METRIC_PREFIX)
+@categorize_errors
 def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:
     assert (
         params.hash_bucket_count is not None and params.hash_bucket_count >= 1
@@ -107,7 +110,6 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
         f"Partition-{params.source_partition_locator} -> "
         f"{compaction_session_type} Compaction session data processing completed"
     )
-    round_completion_file_s3_url: Optional[str] = None
     if execute_compaction_result.new_compacted_partition:
         previous_partition: Optional[Partition] = None
         if execute_compaction_result.is_inplace_compacted:
@@ -123,25 +125,19 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
             f"Committing compacted partition to: {execute_compaction_result.new_compacted_partition.locator} "
             f"using previous partition: {previous_partition.locator if previous_partition else None}"
         )
-        committed_partition: Partition = params.deltacat_storage.commit_partition(
+        committed_partition: Partition = params.deltacat_storage.commit_partition(
             execute_compaction_result.new_compacted_partition,
             previous_partition,
             **params.deltacat_storage_kwargs,
         )
-        logger.info(f"Committed compacted partition: {committed_partition}")
-        round_completion_file_s3_url = rcf.write_round_completion_file(
-            params.compaction_artifact_s3_bucket,
-            execute_compaction_result.new_round_completion_file_partition_locator,
-            execute_compaction_result.new_round_completion_info,
-            **params.s3_client_kwargs,
-        )
+        logger.info(f"Committed compacted partition: {committed_partition}")
     else:
         logger.warning("No new partition was committed during compaction.")
 
     logger.info(
         f"Completed compaction session for: {params.source_partition_locator}"
     )
-    return round_completion_file_s3_url
+    return execute_compaction_result.round_completion_file_s3_url
 
 
 def _execute_compaction(
@@ -186,6 +182,7 @@ def _execute_compaction(
         round_completion_info = rcf.read_round_completion_file(
             params.compaction_artifact_s3_bucket,
             params.source_partition_locator,
+            params.destination_partition_locator,
             **params.s3_client_kwargs,
         )
         if not round_completion_info:
@@ -479,6 +476,7 @@ def _execute_compaction(
                 delete_strategy=delete_strategy,
                 delete_file_envelopes=delete_file_envelopes,
                 memory_logs_enabled=params.memory_logs_enabled,
+                disable_copy_by_reference=params.disable_copy_by_reference,
             )
         }
 
@@ -662,13 +660,16 @@ def _execute_compaction(
     )
 
     logger.info(
-        f"partition-{params.source_partition_locator.partition_values},"
+        f"Partition-{params.source_partition_locator.partition_values},"
         f"compacted at: {params.last_stream_position_to_compact},"
     )
+    logger.info(
+        f"Checking if partition {rcf_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
+    )
     is_inplace_compacted: bool = (
-        params.source_partition_locator.partition_values
+        rcf_source_partition_locator.partition_values
         == params.destination_partition_locator.partition_values
-        and params.source_partition_locator.stream_id
+        and rcf_source_partition_locator.stream_id
         == params.destination_partition_locator.stream_id
     )
     if is_inplace_compacted:
@@ -678,9 +679,18 @@ def _execute_compaction(
             f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
         )
         rcf_source_partition_locator = compacted_partition.locator
+
+    round_completion_file_s3_url = rcf.write_round_completion_file(
+        params.compaction_artifact_s3_bucket,
+        rcf_source_partition_locator,
+        compacted_partition.locator,
+        new_round_completion_info,
+        **params.s3_client_kwargs,
+    )
+
     return ExecutionCompactionResult(
         compacted_partition,
         new_round_completion_info,
-        rcf_source_partition_locator,
+        round_completion_file_s3_url,
         is_inplace_compacted,
     )
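The in-place check is now evaluated against the locator recorded in the round completion file rather than the session's source locator. Restated on its own (the logic is copied from the hunk above; the Locator stand-in below is a hypothetical minimal type for demonstration):

```python
from collections import namedtuple

# Minimal stand-in for a partition locator; only the two fields the check uses.
Locator = namedtuple("Locator", ["partition_values", "stream_id"])


def is_inplace_compacted(rcf_source: Locator, destination: Locator) -> bool:
    """Same partition values on the same stream means the compaction was in place."""
    return (
        rcf_source.partition_values == destination.partition_values
        and rcf_source.stream_id == destination.stream_id
    )


print(is_inplace_compacted(Locator(("2024-01-01",), "stream-1"), Locator(("2024-01-01",), "stream-1")))  # True
print(is_inplace_compacted(Locator(("2024-01-01",), "stream-1"), Locator(("2024-01-01",), "stream-2")))  # False
```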
deltacat/compute/compactor_v2/constants.py
CHANGED
@@ -1,5 +1,3 @@
-from deltacat.utils.common import env_integer
-
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
 PK_DELIMITER = "L6kl7u5f"
@@ -43,15 +41,8 @@ DROP_DUPLICATES = True
 # size in metadata to pyarrow table size.
 PARQUET_TO_PYARROW_INFLATION = 4
 
-# A merge task will fail after this timeout
-
-# This timeout depends on total data processed per task.
-MERGE_TASK_TIMEOUT_IN_SECONDS = env_integer("MERGE_TASK_TIMEOUT_IN_SECONDS", 25 * 60)
-
-# A hash bucket task will fail after this timeout
-HASH_BUCKET_TASK_TIMEOUT_IN_SECONDS = env_integer(
-    "HASH_BUCKET_TASK_TIMEOUT_IN_SECONDS", 25 * 60
-)
+# By default, copy by reference is enabled
+DEFAULT_DISABLE_COPY_BY_REFERENCE = False
 
 # Metric Names
 # Time taken for a hash bucket task
@@ -77,3 +68,6 @@ DISCOVER_DELTAS_METRIC_PREFIX = "discover_deltas"
 
 # Metric prefix for prepare deletes
 PREPARE_DELETES_METRIC_PREFIX = "prepare_deletes"
+
+# Metric prefix for compact partition method
+COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py}
CHANGED
@@ -2,7 +2,6 @@ from dataclasses import dataclass, fields
 
 from deltacat.storage import (
     Partition,
-    PartitionLocator,
 )
 from deltacat.compute.compactor import (
     RoundCompletionInfo,
@@ -14,7 +13,7 @@ from typing import Optional
 class ExecutionCompactionResult:
     new_compacted_partition: Optional[Partition]
     new_round_completion_info: Optional[RoundCompletionInfo]
-    new_round_completion_file_partition_locator: Optional[PartitionLocator]
+    round_completion_file_s3_url: Optional[str]
     is_inplace_compacted: bool
 
     def __iter__(self):
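For reference, the result object returned by _execute_compaction now carries the round completion file URL directly. A sketch of the shape implied by these hunks, with field names and order taken from the diff; the frozen decorator argument, the Any placeholders for deltacat types, and the __iter__ body are assumptions:

```python
from dataclasses import dataclass, fields
from typing import Any, Optional


@dataclass(frozen=True)
class ExecutionCompactionResult:
    new_compacted_partition: Optional[Any]      # deltacat Partition in the real class
    new_round_completion_info: Optional[Any]    # deltacat RoundCompletionInfo in the real class
    round_completion_file_s3_url: Optional[str]
    is_inplace_compacted: bool

    def __iter__(self):
        # assumed behavior: yield field values in declaration order so the result can be unpacked
        return (getattr(self, f.name) for f in fields(self))


result = ExecutionCompactionResult(None, None, "s3://bucket/rcf.json", False)
partition, rci, rcf_url, inplace = result
print(rcf_url, inplace)
```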
|