deltacat 1.1.8__py3-none-any.whl → 1.1.10__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (59)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +6 -0
  3. deltacat/aws/redshift/model/manifest.py +16 -0
  4. deltacat/aws/s3u.py +65 -38
  5. deltacat/compute/compactor/compaction_session.py +5 -1
  6. deltacat/compute/compactor/model/compact_partition_params.py +12 -1
  7. deltacat/compute/compactor/model/materialize_result.py +0 -4
  8. deltacat/compute/compactor/repartition_session.py +1 -0
  9. deltacat/compute/compactor/utils/round_completion_file.py +39 -9
  10. deltacat/compute/compactor_v2/compaction_session.py +26 -16
  11. deltacat/compute/compactor_v2/constants.py +5 -11
  12. deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
  13. deltacat/compute/compactor_v2/model/merge_input.py +6 -0
  14. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
  15. deltacat/compute/compactor_v2/steps/merge.py +12 -12
  16. deltacat/compute/compactor_v2/utils/merge.py +1 -0
  17. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  18. deltacat/compute/compactor_v2/utils/task_options.py +2 -12
  19. deltacat/exceptions.py +342 -7
  20. deltacat/io/dataset.py +5 -17
  21. deltacat/io/memcached_object_store.py +7 -4
  22. deltacat/storage/__init__.py +24 -0
  23. deltacat/storage/interface.py +56 -6
  24. deltacat/storage/model/delta.py +23 -3
  25. deltacat/storage/model/partition.py +6 -7
  26. deltacat/storage/model/partition_spec.py +71 -0
  27. deltacat/storage/model/stream.py +38 -1
  28. deltacat/storage/model/transform.py +127 -0
  29. deltacat/tests/aws/test_s3u.py +2 -0
  30. deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
  31. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
  32. deltacat/tests/compute/compact_partition_test_cases.py +4 -2
  33. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
  34. deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
  35. deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
  36. deltacat/tests/compute/test_util_common.py +19 -4
  37. deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
  38. deltacat/tests/io/test_memcached_object_store.py +5 -2
  39. deltacat/tests/local_deltacat_storage/__init__.py +124 -29
  40. deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
  41. deltacat/tests/test_exceptions.py +100 -0
  42. deltacat/tests/test_logs.py +1 -0
  43. deltacat/tests/test_utils/pyarrow.py +4 -1
  44. deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
  45. deltacat/tests/utils/test_daft.py +0 -1
  46. deltacat/tests/utils/test_resources.py +0 -28
  47. deltacat/utils/daft.py +3 -0
  48. deltacat/utils/numpy.py +3 -3
  49. deltacat/utils/pandas.py +3 -3
  50. deltacat/utils/pyarrow.py +11 -8
  51. deltacat/utils/ray_utils/dataset.py +7 -7
  52. deltacat/utils/ray_utils/runtime.py +2 -2
  53. deltacat/utils/resources.py +0 -45
  54. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
  55. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
  56. deltacat/io/aws/redshift/redshift_datasource.py +0 -578
  57. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
  58. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
  59. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.1.8"
+__version__ = "1.1.10"
 
 
 __all__ = [
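
A quick way to confirm the upgrade took effect at runtime (illustrative check, not part of the diff):

```python
# Confirm the upgraded wheel is the one actually being imported.
import deltacat

assert deltacat.__version__ == "1.1.10"
```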
deltacat/aws/constants.py CHANGED
@@ -1,9 +1,14 @@
 import botocore
 from typing import Set
+from daft.exceptions import DaftTransientError
 
 from deltacat.utils.common import env_integer, env_string
 
+
 DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE", 8)
+DEFAULT_FILE_READ_TIMEOUT_MS = env_integer(
+    "DEFAULT_FILE_READ_TIMEOUT_MS", 300_000
+)  # 5 mins
 BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
 BOTO_TIMEOUT_ERROR_CODES: Set[str] = {"ReadTimeoutError", "ConnectTimeoutError"}
 BOTO_THROTTLING_ERROR_CODES: Set[str] = {"Throttling", "SlowDown"}
@@ -14,6 +19,7 @@ RETRYABLE_TRANSIENT_ERRORS = (
     botocore.exceptions.NoCredentialsError,
     botocore.exceptions.ConnectTimeoutError,
     botocore.exceptions.ReadTimeoutError,
+    DaftTransientError,
 )
 AWS_REGION = env_string("AWS_REGION", "us-east-1")
 UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY = env_integer(
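
The new DEFAULT_FILE_READ_TIMEOUT_MS follows the same env_integer pattern as the existing BOTO_* settings, so it can be overridden through the environment before deltacat is imported. A minimal sketch, assuming the environment variable name matches the constant as shown in the hunk above (the 600000 value is only an example):

```python
# Override the file read timeout before importing deltacat; env_integer picks
# up the environment variable when the module is first loaded.
import os

os.environ["DEFAULT_FILE_READ_TIMEOUT_MS"] = "600000"  # example: 10 minutes

from deltacat.aws import constants

print(constants.DEFAULT_FILE_READ_TIMEOUT_MS)  # -> 600000
```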
deltacat/aws/redshift/model/manifest.py CHANGED
@@ -99,6 +99,8 @@ class Manifest(dict):
         total_source_content_length = 0
         content_type = None
         content_encoding = None
+        partition_values_set = set()
+        partition_values = None
         if entries:
             content_type = entries[0].meta.content_type
             content_encoding = entries[0].meta.content_encoding
@@ -127,6 +129,12 @@ class Manifest(dict):
                 total_record_count += meta.record_count or 0
                 total_content_length += meta.content_length or 0
                 total_source_content_length += meta.source_content_length or 0
+                if len(partition_values_set) <= 1:
+                    partition_values_set.add(entry.meta.partition_values)
+
+            if len(partition_values_set) == 1:
+                partition_values = partition_values_set.pop()
+
         meta = ManifestMeta.of(
             total_record_count,
             total_content_length,
@@ -134,6 +142,7 @@ class Manifest(dict):
             content_encoding,
             total_source_content_length,
             entry_type=entry_type,
+            partition_values=partition_values,
         )
         manifest = Manifest._build_manifest(meta, entries, author, uuid, entry_type)
         return manifest
@@ -185,6 +194,7 @@ class ManifestMeta(dict):
         credentials: Optional[Dict[str, str]] = None,
         content_type_parameters: Optional[List[Dict[str, str]]] = None,
         entry_type: Optional[EntryType] = None,
+        partition_values: Optional[List[str]] = None,
     ) -> ManifestMeta:
         manifest_meta = ManifestMeta()
         if record_count is not None:
@@ -203,6 +213,8 @@ class ManifestMeta(dict):
             manifest_meta["credentials"] = credentials
         if entry_type is not None:
             manifest_meta["entry_type"] = entry_type.value
+        if partition_values is not None:
+            manifest_meta["partition_values"] = partition_values
         return manifest_meta
 
     @property
@@ -244,6 +256,10 @@ class ManifestMeta(dict):
             return EntryType(self["entry_type"])
         return val
 
+    @property
+    def partition_values(self) -> Optional[List[str]]:
+        return self.get("partition_values")
+
 
 class ManifestAuthor(dict):
     @staticmethod
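
The manifest-level partition_values added above is only populated when every entry reports the same value; with mixed or absent values it stays None. A standalone sketch of that aggregation rule (plain Python stand-in, not the deltacat API):

```python
# Stand-in for the aggregation rule in Manifest.of: keep a common
# partition_values only when all manifest entries agree on it.
from typing import List, Optional, Tuple


def aggregate_partition_values(
    entry_partition_values: List[Optional[Tuple[str, ...]]]
) -> Optional[Tuple[str, ...]]:
    seen = set()
    for values in entry_partition_values:
        if len(seen) <= 1:
            seen.add(values)
    # Exactly one distinct value across all entries -> promote it to the manifest meta.
    return seen.pop() if len(seen) == 1 else None


assert aggregate_partition_values([("2024-01-01",), ("2024-01-01",)]) == ("2024-01-01",)
assert aggregate_partition_values([("2024-01-01",), ("2024-01-02",)]) is None
```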
deltacat/aws/s3u.py CHANGED
@@ -21,19 +21,17 @@ from boto3.resources.base import ServiceResource
 from botocore.client import BaseClient
 from botocore.exceptions import ClientError
 from ray.data.block import Block, BlockAccessor, BlockMetadata
-from ray.data.datasource import BlockWritePathProvider
+from ray.data.datasource import FilenameProvider
 from ray.types import ObjectRef
 from tenacity import (
     Retrying,
     retry_if_exception_type,
-    retry_if_not_exception_type,
     stop_after_delay,
     wait_random_exponential,
 )
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
 import deltacat.aws.clients as aws_utils
 from deltacat import logs
-from deltacat.exceptions import NonRetryableError, RetryableError
 from deltacat.storage import (
     DistributedDataset,
     LocalDataset,
@@ -55,14 +53,23 @@ from deltacat.types.tables import (
     DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
     get_table_length,
 )
+from deltacat.exceptions import (
+    RetryableError,
+    RetryableUploadTableError,
+    RetryableDownloadTableError,
+    RetryableDownloadFileError,
+    RetryableUploadFileError,
+    NonRetryableDownloadFileError,
+    NonRetryableUploadFileError,
+    NonRetryableUploadTableError,
+    NonRetryableDownloadTableError,
+)
 from deltacat.types.partial_download import PartialFileDownloadParams
 from deltacat.utils.common import ReadKwargsProvider
+from deltacat.exceptions import categorize_errors
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-# TODO(raghumdani): refactor redshift datasource to reuse the
-# same module for writing output files.
-
 
 class CapturedBlockWritePaths:
     def __init__(self):
@@ -90,12 +97,15 @@ class CapturedBlockWritePaths:
         return self._block_refs
 
 
-class UuidBlockWritePathProvider(BlockWritePathProvider):
+class UuidBlockWritePathProvider(FilenameProvider):
     """Block write path provider implementation that writes each
     dataset block out to a file of the form: {base_path}/{uuid}
     """
 
-    def __init__(self, capture_object: CapturedBlockWritePaths):
+    def __init__(
+        self, capture_object: CapturedBlockWritePaths, base_path: Optional[str] = None
+    ):
+        self.base_path = base_path
         self.write_paths: List[str] = []
         self.block_refs: List[ObjectRef[Block]] = []
         self.capture_object = capture_object
@@ -107,6 +117,19 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
             self.block_refs,
         )
 
+    def get_filename_for_block(
+        self, block: Any, task_index: int, block_index: int
+    ) -> str:
+        if self.base_path is None:
+            raise ValueError(
+                "Base path must be provided to UuidBlockWritePathProvider",
+            )
+        return self._get_write_path_for_block(
+            base_path=self.base_path,
+            block=block,
+            block_index=block_index,
+        )
+
     def _get_write_path_for_block(
         self,
         base_path: str,
@@ -133,13 +156,6 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
         block_index: Optional[int] = None,
         file_format: Optional[str] = None,
     ) -> str:
-        """
-        TODO: BlockWritePathProvider is deprecated as of Ray version 2.20.0. Please use FilenameProvider.
-        See: https://docs.ray.io/en/master/data/api/doc/ray.data.datasource.FilenameProvider.html
-        Also See: https://github.com/ray-project/deltacat/issues/299
-
-        Hence, this class only works with Ray version 2.20.0 or lower when used in Ray Dataset.
-        """
         return self._get_write_path_for_block(
             base_path,
             filesystem=filesystem,
@@ -232,6 +248,7 @@ def filter_objects_by_prefix(
         more_objects_to_list = params["ContinuationToken"] is not None
 
 
+@categorize_errors
 def read_file(
     s3_url: str,
     content_type: ContentType,
@@ -263,15 +280,15 @@ def read_file(
             in BOTO_TIMEOUT_ERROR_CODES | BOTO_THROTTLING_ERROR_CODES
         ):
             # Timeout error not caught by botocore
-            raise RetryableError(
-                f"Retry table download from: {s3_url} after receiving {type(e).__name__}"
+            raise RetryableDownloadTableError(
+                f"Retry table download from: {s3_url} after receiving {type(e).__name__}",
             ) from e
-        raise NonRetryableError(
+        raise NonRetryableDownloadTableError(
             f"Failed table download from: {s3_url} after receiving {type(e).__name__}"
         ) from e
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise RetryableError(
-            f"Retry upload for: {s3_url} after receiving {type(e).__name__}"
+        raise RetryableDownloadTableError(
+            f"Retry download for: {s3_url} after receiving {type(e).__name__}"
         ) from e
     except BaseException as e:
         logger.warn(
@@ -279,7 +296,10 @@ def read_file(
             f"and encoding={content_encoding}. Error: {e}",
             exc_info=True,
         )
-        raise e
+        raise NonRetryableDownloadTableError(
+            f"Read has failed for {s3_url} and content_type={content_type} "
+            f"and encoding={content_encoding}",
+        ) from e
 
 
 def upload_sliced_table(
@@ -378,29 +398,31 @@ def upload_table(
     except ClientError as e:
         if e.response["Error"]["Code"] == "NoSuchKey":
             # s3fs may swallow S3 errors - we were probably throttled
-            raise RetryableError(
-                f"Retry table download from: {s3_url} after receiving {type(e).__name__}"
+            raise RetryableUploadTableError(
+                f"Retry table upload from: {s3_url} after receiving {type(e).__name__}",
             ) from e
         if (
             e.response["Error"]["Code"]
             in BOTO_TIMEOUT_ERROR_CODES | BOTO_THROTTLING_ERROR_CODES
         ):
-            raise RetryableError(
-                f"Retry table download from: {s3_url} after receiving {type(e).__name__}"
+            raise RetryableUploadTableError(
+                f"Retry table upload from: {s3_url} after receiving {type(e).__name__}",
            ) from e
-        raise NonRetryableError(
-            f"Failed table upload to: {s3_url} after receiving {type(e).__name__}"
+        raise NonRetryableUploadTableError(
+            f"Failed table upload to: {s3_url} after receiving {type(e).__name__}",
         ) from e
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise RetryableError(
-            f"Retry upload for: {s3_url} after receiving {type(e).__name__}"
+        raise RetryableUploadTableError(
+            f"Retry upload for: {s3_url} after receiving {type(e).__name__}",
        ) from e
     except BaseException as e:
         logger.warn(
             f"Upload has failed for {s3_url} and content_type={content_type}. Error: {e}",
             exc_info=True,
         )
-        raise e
+        raise NonRetryableUploadTableError(
+            f"Upload has failed for {s3_url} and content_type={content_type} because of {type(e).__name__}",
+        ) from e
     return manifest_entries
 
 
@@ -443,7 +465,7 @@ def download_manifest_entry(
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
         stop=stop_after_delay(DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY),
-        retry=retry_if_not_exception_type(NonRetryableError),
+        retry=retry_if_exception_type(RetryableError),
     )
     table = retrying(
         read_file,
@@ -559,12 +581,15 @@ def _put_object(
         )
     except ClientError as e:
         if e.response["Error"]["Code"] in BOTO_THROTTLING_ERROR_CODES:
-            raise RetryableError(
-                f"Retry upload for: {bucket}/{key} after receiving {e.response['Error']['Code']}"
+            error_code = e.response["Error"]["Code"]
+            raise RetryableUploadFileError(
+                f"Retry upload for: {bucket}/{key} after receiving {error_code}",
             ) from e
-        raise NonRetryableError(f"Failed table upload to: {bucket}/{key}") from e
+        raise NonRetryableUploadFileError(
+            f"Failed table upload to: {bucket}/{key}"
+        ) from e
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise RetryableError(
+        raise RetryableUploadFileError(
             f"Retry upload for: {bucket}/{key} after receiving {type(e).__name__}"
         ) from e
     except BaseException as e:
@@ -572,7 +597,9 @@ def _put_object(
             f"Upload has failed for {bucket}/{key}. Error: {type(e).__name__}",
             exc_info=True,
         )
-        raise NonRetryableError(f"Failed table upload to: {bucket}/{key}") from e
+        raise NonRetryableUploadFileError(
+            f"Failed table upload to: {bucket}/{key}"
+        ) from e
 
 
 def download(
@@ -604,12 +631,12 @@ def _get_object(s3_client, bucket: str, key: str, fail_if_not_found: bool = True
     except ClientError as e:
         if e.response["Error"]["Code"] == "NoSuchKey":
             if fail_if_not_found:
-                raise NonRetryableError(
+                raise NonRetryableDownloadFileError(
                     f"Failed get object from: {bucket}/{key}"
                 ) from e
             logger.info(f"file not found: {bucket}/{key}")
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise RetryableError(
+        raise RetryableDownloadFileError(
            f"Retry get object: {bucket}/{key} after receiving {type(e).__name__}"
        ) from e
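
The net effect of the s3u.py changes is that every failure path now raises a typed subclass of either RetryableError or a non-retryable counterpart, and download_manifest_entry retries only on RetryableError. A minimal sketch of that retry wiring with tenacity, using stand-in exception classes rather than deltacat.exceptions:

```python
# Stand-in sketch: retry only exceptions derived from RetryableError.
from tenacity import Retrying, retry_if_exception_type, stop_after_attempt


class RetryableError(Exception):
    pass


class RetryableDownloadTableError(RetryableError):
    pass


attempts = {"count": 0}


def flaky_read() -> str:
    attempts["count"] += 1
    if attempts["count"] < 3:
        # Subclasses of RetryableError are retried by the policy below.
        raise RetryableDownloadTableError("transient S3 failure")
    return "table"


retrying = Retrying(
    retry=retry_if_exception_type(RetryableError),
    stop=stop_after_attempt(5),
    reraise=True,
)
print(retrying(flaky_read))  # -> "table" after two retried failures
```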
 
deltacat/compute/compactor/compaction_session.py CHANGED
@@ -193,6 +193,7 @@ def compact_partition(
         round_completion_file_s3_url = rcf.write_round_completion_file(
             compaction_artifact_s3_bucket,
             new_rcf_partition_locator,
+            partition.locator,
             new_rci,
             **s3_client_kwargs,
         )
@@ -312,7 +313,10 @@ def _execute_compaction_round(
     round_completion_info = None
     if not rebase_source_partition_locator:
         round_completion_info = rcf.read_round_completion_file(
-            compaction_artifact_s3_bucket, source_partition_locator, **s3_client_kwargs
+            compaction_artifact_s3_bucket,
+            source_partition_locator,
+            destination_partition_locator,
+            **s3_client_kwargs,
         )
     if not round_completion_info:
         logger.info(
deltacat/compute/compactor/model/compact_partition_params.py CHANGED
@@ -21,6 +21,7 @@ from deltacat.compute.compactor_v2.constants import (
     TASK_MAX_PARALLELISM,
     DROP_DUPLICATES,
     TOTAL_MEMORY_BUFFER_PERCENTAGE,
+    DEFAULT_DISABLE_COPY_BY_REFERENCE,
 )
 from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -50,7 +51,6 @@ class CompactPartitionParams(dict):
 
         result = CompactPartitionParams(params)
 
-        # TODO: move defaults to single file
         result.records_per_compacted_file = params.get(
             "records_per_compacted_file", MAX_RECORDS_PER_COMPACTED_FILE
         )
@@ -92,6 +92,9 @@ class CompactPartitionParams(dict):
         result.hash_group_count = params.get(
             "hash_group_count", result.hash_bucket_count
         )
+        result.disable_copy_by_reference = params.get(
+            "disable_copy_by_reference", DEFAULT_DISABLE_COPY_BY_REFERENCE
+        )
         result.drop_duplicates = params.get("drop_duplicates", DROP_DUPLICATES)
         result.ray_custom_resources = params.get("ray_custom_resources")
 
@@ -238,6 +241,14 @@ class CompactPartitionParams(dict):
     def enable_profiler(self, value: bool) -> None:
         self["enable_profiler"] = value
 
+    @property
+    def disable_copy_by_reference(self) -> bool:
+        return self["disable_copy_by_reference"]
+
+    @disable_copy_by_reference.setter
+    def disable_copy_by_reference(self, value: bool) -> None:
+        self["disable_copy_by_reference"] = value
+
     @property
     def list_deltas_kwargs(self) -> dict:
         return self["list_deltas_kwargs"]
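
CompactPartitionParams stores everything in the underlying dict and exposes typed properties over it; the new disable_copy_by_reference flag follows that pattern, with its default resolved once in of(). A simplified stand-in of the pattern (not the real class):

```python
# Simplified stand-in for the dict-backed property pattern used by
# CompactPartitionParams; names and defaults here are illustrative.
class ParamsDict(dict):
    @classmethod
    def of(cls, params: dict) -> "ParamsDict":
        result = cls(params)
        # Default mirrors DEFAULT_DISABLE_COPY_BY_REFERENCE (False) from the diff above.
        result.disable_copy_by_reference = params.get("disable_copy_by_reference", False)
        return result

    @property
    def disable_copy_by_reference(self) -> bool:
        return self["disable_copy_by_reference"]

    @disable_copy_by_reference.setter
    def disable_copy_by_reference(self, value: bool) -> None:
        self["disable_copy_by_reference"] = value


params = ParamsDict.of({"source_partition_locator": "..."})
assert params.disable_copy_by_reference is False
```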
deltacat/compute/compactor/model/materialize_result.py CHANGED
@@ -55,10 +55,6 @@ class MaterializeResult(dict):
         self["paWriteResult"] = val = PyArrowWriteResult(val)
         return val
 
-    @property
-    def count_of_src_dfl_not_touched(self) -> int:
-        return self["countOfSrcFileNotTouched"]
-
     @property
     def referenced_pyarrow_write_result(self) -> PyArrowWriteResult:
         val: Dict[str, Any] = self.get("referencedPaWriteResult")
deltacat/compute/compactor/repartition_session.py CHANGED
@@ -177,6 +177,7 @@ def repartition(
         s3_client_kwargs = {}
 
     return rcf.write_round_completion_file(
+        None,
         None,
         None,
         repartition_completion_info,
deltacat/compute/compactor/utils/round_completion_file.py CHANGED
@@ -12,10 +12,17 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def get_round_completion_file_s3_url(
-    bucket: str, source_partition_locator: PartitionLocator
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    destination_partition_locator: Optional[PartitionLocator] = None,
 ) -> str:
 
     base_url = source_partition_locator.path(f"s3://{bucket}")
+    if destination_partition_locator:
+        base_url = destination_partition_locator.path(
+            f"s3://{bucket}/{source_partition_locator.hexdigest()}"
+        )
+
     return f"{base_url}.json"
 
 
@@ -23,20 +30,41 @@ def get_round_completion_file_s3_url(
 def read_round_completion_file(
     bucket: str,
     source_partition_locator: PartitionLocator,
+    destination_partition_locator: Optional[PartitionLocator] = None,
     **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> RoundCompletionInfo:
 
-    round_completion_file_url = get_round_completion_file_s3_url(
+    all_uris = []
+    if destination_partition_locator:
+        round_completion_file_url_with_destination = get_round_completion_file_s3_url(
+            bucket,
+            source_partition_locator,
+            destination_partition_locator,
+        )
+        all_uris.append(round_completion_file_url_with_destination)
+
+    # Note: we read from RCF at two different URI for backward
+    # compatibility reasons.
+    round_completion_file_url_prev = get_round_completion_file_s3_url(
         bucket,
         source_partition_locator,
     )
-    logger.info(f"reading round completion file from: {round_completion_file_url}")
+
+    all_uris.append(round_completion_file_url_prev)
+
     round_completion_info = None
-    result = s3_utils.download(round_completion_file_url, False, **s3_client_kwargs)
-    if result:
-        json_str = result["Body"].read().decode("utf-8")
-        round_completion_info = RoundCompletionInfo(json.loads(json_str))
-        logger.info(f"read round completion info: {round_completion_info}")
+
+    for rcf_uri in all_uris:
+        logger.info(f"Reading round completion file from: {rcf_uri}")
+        result = s3_utils.download(rcf_uri, False, **s3_client_kwargs)
+        if result:
+            json_str = result["Body"].read().decode("utf-8")
+            round_completion_info = RoundCompletionInfo(json.loads(json_str))
+            logger.info(f"Read round completion info: {round_completion_info}")
+            break
+        else:
+            logger.warn(f"Round completion file not present at {rcf_uri}")
+
     return round_completion_info
 
 
@@ -44,8 +72,9 @@ def read_round_completion_file(
 def write_round_completion_file(
     bucket: Optional[str],
     source_partition_locator: Optional[PartitionLocator],
+    destination_partition_locator: Optional[PartitionLocator],
     round_completion_info: RoundCompletionInfo,
-    completion_file_s3_url: str = None,
+    completion_file_s3_url: Optional[str] = None,
     **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> str:
     if bucket is None and completion_file_s3_url is None:
@@ -56,6 +85,7 @@ def write_round_completion_file(
         completion_file_s3_url = get_round_completion_file_s3_url(
             bucket,
             source_partition_locator,
+            destination_partition_locator,
         )
     logger.info(f"writing round completion file to: {completion_file_s3_url}")
     s3_utils.upload(
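
With the destination locator folded into the RCF path, readers probe the destination-scoped URL first and fall back to the legacy source-only URL, which is what the loop over all_uris above does. A small stand-in sketch of that lookup order (the URL shapes below are placeholders, not the exact paths deltacat produces):

```python
# Stand-in for the backward-compatible RCF lookup: first hit wins.
from typing import Callable, List, Optional


def read_first_available(
    uris: List[str], download: Callable[[str], Optional[str]]
) -> Optional[str]:
    for uri in uris:
        body = download(uri)
        if body is not None:
            return body  # stop at the first round completion file found
    return None


new_uri = "s3://bucket/<source-hexdigest>/<destination-partition-path>.json"
legacy_uri = "s3://bucket/<source-partition-path>.json"
store = {legacy_uri: '{"compactedDeltaLocator": "..."}'}  # only the legacy RCF exists

assert read_first_available([new_uri, legacy_uri], store.get) == store[legacy_uri]
```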
deltacat/compute/compactor_v2/compaction_session.py CHANGED
@@ -24,7 +24,7 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
-from deltacat.compute.compactor_v2.model.compaction_session import (
+from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
     ExecutionCompactionResult,
 )
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
@@ -77,6 +77,8 @@ from deltacat.compute.compactor_v2.utils.task_options import (
     local_merge_resource_options_provider,
 )
 from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+from deltacat.exceptions import categorize_errors
+from deltacat.compute.compactor_v2.constants import COMPACT_PARTITION_METRIC_PREFIX
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -85,7 +87,8 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-@metrics
+@metrics(prefix=COMPACT_PARTITION_METRIC_PREFIX)
+@categorize_errors
 def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:
     assert (
         params.hash_bucket_count is not None and params.hash_bucket_count >= 1
@@ -107,7 +110,6 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
         f"Partition-{params.source_partition_locator} -> "
         f"{compaction_session_type} Compaction session data processing completed"
     )
-    round_completion_file_s3_url: Optional[str] = None
     if execute_compaction_result.new_compacted_partition:
         previous_partition: Optional[Partition] = None
         if execute_compaction_result.is_inplace_compacted:
@@ -123,25 +125,19 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
             f"Committing compacted partition to: {execute_compaction_result.new_compacted_partition.locator} "
             f"using previous partition: {previous_partition.locator if previous_partition else None}"
         )
-        commited_partition: Partition = params.deltacat_storage.commit_partition(
+        committed_partition: Partition = params.deltacat_storage.commit_partition(
             execute_compaction_result.new_compacted_partition,
             previous_partition,
             **params.deltacat_storage_kwargs,
         )
-        logger.info(f"Committed compacted partition: {commited_partition}")
-        round_completion_file_s3_url = rcf.write_round_completion_file(
-            params.compaction_artifact_s3_bucket,
-            execute_compaction_result.new_round_completion_file_partition_locator,
-            execute_compaction_result.new_round_completion_info,
-            **params.s3_client_kwargs,
-        )
+        logger.info(f"Committed compacted partition: {committed_partition}")
     else:
         logger.warning("No new partition was committed during compaction.")
 
     logger.info(
         f"Completed compaction session for: {params.source_partition_locator}"
     )
-    return round_completion_file_s3_url
+    return execute_compaction_result.round_completion_file_s3_url
 
 
 def _execute_compaction(
@@ -186,6 +182,7 @@ def _execute_compaction(
         round_completion_info = rcf.read_round_completion_file(
             params.compaction_artifact_s3_bucket,
             params.source_partition_locator,
+            params.destination_partition_locator,
             **params.s3_client_kwargs,
         )
         if not round_completion_info:
@@ -479,6 +476,7 @@ def _execute_compaction(
             delete_strategy=delete_strategy,
             delete_file_envelopes=delete_file_envelopes,
             memory_logs_enabled=params.memory_logs_enabled,
+            disable_copy_by_reference=params.disable_copy_by_reference,
         )
     }
 
@@ -662,13 +660,16 @@ def _execute_compaction(
     )
 
     logger.info(
-        f"partition-{params.source_partition_locator.partition_values},"
+        f"Partition-{params.source_partition_locator.partition_values},"
         f"compacted at: {params.last_stream_position_to_compact},"
     )
+    logger.info(
+        f"Checking if partition {rcf_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
+    )
     is_inplace_compacted: bool = (
-        params.source_partition_locator.partition_values
+        rcf_source_partition_locator.partition_values
         == params.destination_partition_locator.partition_values
-        and params.source_partition_locator.stream_id
+        and rcf_source_partition_locator.stream_id
         == params.destination_partition_locator.stream_id
     )
     if is_inplace_compacted:
@@ -678,9 +679,18 @@ def _execute_compaction(
             f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
         )
         rcf_source_partition_locator = compacted_partition.locator
+
+    round_completion_file_s3_url = rcf.write_round_completion_file(
+        params.compaction_artifact_s3_bucket,
+        rcf_source_partition_locator,
+        compacted_partition.locator,
+        new_round_completion_info,
+        **params.s3_client_kwargs,
+    )
+
     return ExecutionCompactionResult(
         compacted_partition,
         new_round_completion_info,
-        rcf_source_partition_locator,
+        round_completion_file_s3_url,
         is_inplace_compacted,
     )
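
compact_partition is now wrapped by @metrics(prefix=...) on the outside and @categorize_errors on the inside, so exceptions are categorized before the metrics wrapper observes the call. A simplified sketch of how that stacking behaves, using stand-in decorators rather than deltacat's implementations:

```python
# Stand-in decorators illustrating the stacking order of
# @metrics(prefix=...) over @categorize_errors.
import functools
import time


def metrics(prefix: str):
    def wrapper(fn):
        @functools.wraps(fn)
        def timed(*args, **kwargs):
            start = time.perf_counter()
            try:
                return fn(*args, **kwargs)
            finally:
                # Emit a prefixed timing metric, e.g. compact_partition_time_taken.
                print(f"{prefix}_time_taken={time.perf_counter() - start:.3f}s")
        return timed
    return wrapper


def categorize_errors(fn):
    @functools.wraps(fn)
    def categorized(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except Exception as e:
            # Re-raise with a category so callers and the metrics wrapper see a typed failure.
            raise RuntimeError(f"categorized: {type(e).__name__}") from e
    return categorized


@metrics(prefix="compact_partition")
@categorize_errors
def compact_partition_stub() -> str:
    return "s3://bucket/rcf.json"


print(compact_partition_stub())  # prints the timing metric, then the RCF URL
```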
deltacat/compute/compactor_v2/constants.py CHANGED
@@ -1,5 +1,3 @@
-from deltacat.utils.common import env_integer
-
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
 PK_DELIMITER = "L6kl7u5f"
@@ -43,15 +41,8 @@ DROP_DUPLICATES = True
 # size in metadata to pyarrow table size.
 PARQUET_TO_PYARROW_INFLATION = 4
 
-# A merge task will fail after this timeout
-# The default is currently double the observed maximum.
-# This timeout depends on total data processed per task.
-MERGE_TASK_TIMEOUT_IN_SECONDS = env_integer("MERGE_TASK_TIMEOUT_IN_SECONDS", 25 * 60)
-
-# A hash bucket task will fail after this timeout
-HASH_BUCKET_TASK_TIMEOUT_IN_SECONDS = env_integer(
-    "HASH_BUCKET_TASK_TIMEOUT_IN_SECONDS", 25 * 60
-)
+# By default, copy by reference is enabled
+DEFAULT_DISABLE_COPY_BY_REFERENCE = False
 
 # Metric Names
 # Time taken for a hash bucket task
@@ -77,3 +68,6 @@ DISCOVER_DELTAS_METRIC_PREFIX = "discover_deltas"
 
 # Metric prefix for prepare deletes
 PREPARE_DELETES_METRIC_PREFIX = "prepare_deletes"
+
+# Metric prefix for compact partition method
+COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} RENAMED
@@ -2,7 +2,6 @@ from dataclasses import dataclass, fields
 
 from deltacat.storage import (
     Partition,
-    PartitionLocator,
 )
 from deltacat.compute.compactor import (
     RoundCompletionInfo,
@@ -14,7 +13,7 @@ from typing import Optional
 class ExecutionCompactionResult:
     new_compacted_partition: Optional[Partition]
     new_round_completion_info: Optional[RoundCompletionInfo]
-    new_round_completion_file_partition_locator: Optional[PartitionLocator]
+    round_completion_file_s3_url: Optional[str]
     is_inplace_compacted: bool
 
     def __iter__(self):
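
ExecutionCompactionResult defines __iter__, so callers can tuple-unpack the result; after the rename above, the third element is the round completion file S3 URL rather than a partition locator. A simplified stand-in of that unpackable-dataclass pattern:

```python
# Stand-in for the unpackable result dataclass (field names mirror the diff above).
from dataclasses import dataclass, fields
from typing import Optional


@dataclass(frozen=True)
class CompactionResultStub:
    new_compacted_partition: Optional[str]
    new_round_completion_info: Optional[dict]
    round_completion_file_s3_url: Optional[str]
    is_inplace_compacted: bool

    def __iter__(self):
        # Yield field values in declaration order so the result can be unpacked.
        return iter(getattr(self, f.name) for f in fields(self))


partition, rci, rcf_url, inplace = CompactionResultStub(
    "compacted-partition", {}, "s3://bucket/rcf.json", False
)
assert rcf_url == "s3://bucket/rcf.json"
```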