deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (61)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +0 -1
  3. deltacat/compute/compactor/model/compact_partition_params.py +76 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
  5. deltacat/compute/compactor/model/delta_annotated.py +16 -9
  6. deltacat/compute/compactor_v2/constants.py +3 -0
  7. deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
  8. deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
  9. deltacat/compute/compactor_v2/utils/io.py +28 -14
  10. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  11. deltacat/compute/compactor_v2/utils/task_options.py +128 -183
  12. deltacat/compute/resource_estimation/__init__.py +27 -0
  13. deltacat/compute/resource_estimation/delta.py +271 -0
  14. deltacat/compute/resource_estimation/manifest.py +394 -0
  15. deltacat/compute/resource_estimation/model.py +165 -0
  16. deltacat/compute/resource_estimation/parquet.py +108 -0
  17. deltacat/constants.py +5 -0
  18. deltacat/exceptions.py +2 -4
  19. deltacat/logs.py +8 -0
  20. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
  21. deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
  22. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
  23. deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
  24. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
  25. deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
  26. deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
  27. deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
  28. deltacat/tests/compute/test_util_common.py +2 -0
  29. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
  30. deltacat/tests/test_logs.py +34 -0
  31. deltacat/tests/test_utils/pyarrow.py +15 -5
  32. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
  33. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
  34. deltacat/compute/metastats/meta_stats.py +0 -479
  35. deltacat/compute/metastats/model/__init__.py +0 -0
  36. deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
  37. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
  38. deltacat/compute/metastats/stats.py +0 -182
  39. deltacat/compute/metastats/utils/__init__.py +0 -0
  40. deltacat/compute/metastats/utils/constants.py +0 -16
  41. deltacat/compute/metastats/utils/io.py +0 -223
  42. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
  43. deltacat/compute/metastats/utils/ray_utils.py +0 -129
  44. deltacat/compute/stats/basic.py +0 -226
  45. deltacat/compute/stats/models/__init__.py +0 -0
  46. deltacat/compute/stats/models/delta_column_stats.py +0 -98
  47. deltacat/compute/stats/models/delta_stats.py +0 -233
  48. deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
  49. deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
  50. deltacat/compute/stats/models/stats_result.py +0 -104
  51. deltacat/compute/stats/utils/__init__.py +0 -0
  52. deltacat/compute/stats/utils/intervals.py +0 -94
  53. deltacat/compute/stats/utils/io.py +0 -230
  54. deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
  55. deltacat/tests/stats/__init__.py +0 -0
  56. deltacat/tests/stats/test_intervals.py +0 -49
  57. /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
  58. /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
  59. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
  60. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
  61. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.1.17"
+__version__ = "1.1.19"
 
 
 __all__ = [
deltacat/aws/constants.py CHANGED
@@ -1,7 +1,6 @@
 import botocore
 from typing import Set
 from daft.exceptions import DaftTransientError
-
 from deltacat.utils.common import env_integer, env_string
 
 
deltacat/compute/compactor/model/compact_partition_params.py CHANGED
@@ -13,6 +13,10 @@ from deltacat.storage import (
     PartitionLocator,
     SortKey,
 )
+from deltacat.compute.resource_estimation import (
+    ResourceEstimationMethod,
+    EstimateResourcesParams,
+)
 from deltacat.compute.compactor_v2.constants import (
     MAX_RECORDS_PER_COMPACTED_FILE,
     MIN_DELTA_BYTES_IN_BATCH,
@@ -23,6 +27,8 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_MEMORY_BUFFER_PERCENTAGE,
     DEFAULT_DISABLE_COPY_BY_REFERENCE,
     DEFAULT_NUM_ROUNDS,
+    PARQUET_TO_PYARROW_INFLATION,
+    MAX_PARQUET_METADATA_SIZE,
 )
 from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -104,6 +110,22 @@ class CompactPartitionParams(dict):
         result.metrics_config = params.get("metrics_config")
 
         result.num_rounds = params.get("num_rounds", DEFAULT_NUM_ROUNDS)
+        result.parquet_to_pyarrow_inflation = params.get(
+            "parquet_to_pyarrow_inflation", PARQUET_TO_PYARROW_INFLATION
+        )
+        result.resource_estimation_method = ResourceEstimationMethod[
+            params.get(
+                "resource_estimation_method", ResourceEstimationMethod.DEFAULT.value
+            )
+        ]
+
+        # disable input split during rebase as the rebase files are already uniform
+        result.enable_input_split = (
+            params.get("rebase_source_partition_locator") is None
+        )
+        result.max_parquet_meta_size_bytes = params.get(
+            "max_parquet_meta_size_bytes", MAX_PARQUET_METADATA_SIZE
+        )
 
         if not importlib.util.find_spec("memray"):
             result.enable_profiler = False
@@ -413,6 +435,60 @@ class CompactPartitionParams(dict):
     def num_rounds(self, num_rounds: int) -> None:
         self["num_rounds"] = num_rounds
 
+    @property
+    def parquet_to_pyarrow_inflation(self) -> float:
+        """
+        The inflation factor for the parquet uncompressed_size_bytes to pyarrow table size.
+        """
+        return self["parquet_to_pyarrow_inflation"]
+
+    @parquet_to_pyarrow_inflation.setter
+    def parquet_to_pyarrow_inflation(self, value: float) -> None:
+        self["parquet_to_pyarrow_inflation"] = value
+
+    @property
+    def enable_input_split(self) -> bool:
+        """
+        When this is True, the input split will be always enabled for parquet files.
+        The input split feature will split the parquet files into individual row groups
+        so that we could process them in different nodes in parallel.
+        By default, input split is enabled for incremental compaction and disabled for rebase or backfill.
+        """
+        return self["enable_input_split"]
+
+    @enable_input_split.setter
+    def enable_input_split(self, value: bool) -> None:
+        self["enable_input_split"] = value
+
+    @property
+    def max_parquet_meta_size_bytes(self) -> int:
+        """
+        The maximum size of the parquet metadata in bytes. Used for allocating tasks
+        to fetch parquet metadata.
+        """
+        return self["max_parquet_meta_size_bytes"]
+
+    @max_parquet_meta_size_bytes.setter
+    def max_parquet_meta_size_bytes(self, value: int) -> None:
+        self["max_parquet_meta_size_bytes"] = value
+
+    @property
+    def resource_estimation_method(self) -> ResourceEstimationMethod:
+        return self["resource_estimation_method"]
+
+    @resource_estimation_method.setter
+    def resource_estimation_method(self, value: ResourceEstimationMethod) -> None:
+        self["resource_estimation_method"] = value
+
+    @property
+    def estimate_resources_params(self) -> EstimateResourcesParams:
+        return EstimateResourcesParams.of(
+            resource_estimation_method=self.resource_estimation_method,
+            previous_inflation=self.previous_inflation,
+            parquet_to_pyarrow_inflation=self.parquet_to_pyarrow_inflation,
+            average_record_size_bytes=self.average_record_size_bytes,
+        )
+
     @staticmethod
     def json_handler_for_compact_partition_params(obj):
         """
deltacat/compute/compactor/model/compaction_session_audit_info.py CHANGED
@@ -436,6 +436,22 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("compactorVersion")
 
+    @property
+    def observed_input_inflation(self) -> float:
+        """
+        The average inflation observed for input files only.
+        This only accounts for files in the source.
+        """
+        return self.get("observedInputInflation")
+
+    @property
+    def observed_input_average_record_size_bytes(self) -> float:
+        """
+        The average record size observed for input files only.
+        This only accounts for files in the source.
+        """
+        return self.get("observedInputAverageRecordSizeBytes")
+
     # Setters follow
 
     def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
@@ -756,6 +772,16 @@ class CompactionSessionAuditInfo(dict):
         self["compactorVersion"] = value
         return self
 
+    def set_observed_input_inflation(self, value: float) -> CompactionSessionAuditInfo:
+        self["observedInputInflation"] = value
+        return self
+
+    def set_observed_input_average_record_size_bytes(
+        self, value: float
+    ) -> CompactionSessionAuditInfo:
+        self["observedInputAverageRecordSizeBytes"] = value
+        return self
+
     # High level methods to save stats
     def save_step_stats(
         self,
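
Note: the two new audit fields pair naturally with the estimation inputs added to CompactPartitionParams above. A hedged sketch of that feedback loop (the JSON key names come from this diff; treating them as inputs to a later run is an assumption, not something this release does automatically):

    # Values a previous round might have recorded in its audit (hypothetical numbers).
    audit = {
        "observedInputInflation": 2.7,
        "observedInputAverageRecordSizeBytes": 512.0,
    }

    # Candidate overrides for the next compaction run's estimation inputs.
    next_run_overrides = {
        "previous_inflation": audit["observedInputInflation"],
        "average_record_size_bytes": audit["observedInputAverageRecordSizeBytes"],
    }
    print(next_run_overrides)
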
deltacat/compute/compactor/model/delta_annotated.py CHANGED
@@ -69,6 +69,7 @@ class DeltaAnnotated(Delta):
         estimation_function: Optional[
             Callable[[ManifestEntry], float]
         ] = lambda entry: entry.meta.content_length,
+        enable_input_split: Optional[bool] = False,
     ) -> List[DeltaAnnotated]:
         """
         Simple greedy algorithm to split/merge 1 or more annotated deltas into
@@ -86,13 +87,19 @@
         new_da_bytes = 0
         da_group_entry_count = 0
 
-        for delta_annotated in annotated_deltas:
-            split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
+        if enable_input_split:
+            for delta_annotated in annotated_deltas:
+                split_annotated_deltas.extend(
+                    DeltaAnnotated._split_single(delta_annotated)
+                )
 
-        logger.info(
-            f"Split the {len(annotated_deltas)} annotated deltas "
-            f"into {len(split_annotated_deltas)} groups."
-        )
+            logger.info(
+                f"Split the {len(annotated_deltas)} annotated deltas "
+                f"into {len(split_annotated_deltas)} groups."
+            )
+        else:
+            logger.info("Skipping input split as it is disabled...")
+            split_annotated_deltas = annotated_deltas
 
         for src_da in split_annotated_deltas:
             src_da_annotations = src_da.annotations
@@ -107,7 +114,7 @@
             # (i.e. the previous compaction round ran a rebase)
             if new_da and src_da.locator != new_da.locator:
                 groups.append(new_da)
-                logger.info(
+                logger.debug(
                     f"Due to different delta locator, Appending group of {da_group_entry_count} elements "
                     f"and {new_da_bytes} bytes"
                 )
@@ -126,12 +133,12 @@
                 or da_group_entry_count >= min_file_counts
             ):
                 if new_da_bytes >= min_delta_bytes:
-                    logger.info(
+                    logger.debug(
                         f"Appending group of {da_group_entry_count} elements "
                         f"and {new_da_bytes} bytes to meet file size limit"
                     )
                 if da_group_entry_count >= min_file_counts:
-                    logger.info(
+                    logger.debug(
                         f"Appending group of {da_group_entry_count} elements "
                         f"and {da_group_entry_count} files to meet file count limit"
                     )
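
Note: the rebatch loop above groups the (optionally row-group-split) annotated deltas greedily until either the byte threshold or the file-count threshold is met. A standalone sketch of that rule — illustrative only, not the deltacat implementation:

    # Accumulate entry sizes until either limit is reached, then start a new group.
    def greedy_rebatch(sizes, min_bytes, min_files):
        groups, current, current_bytes = [], [], 0
        for size in sizes:
            current.append(size)
            current_bytes += size
            if current_bytes >= min_bytes or len(current) >= min_files:
                groups.append(current)
                current, current_bytes = [], 0
        if current:
            groups.append(current)
        return groups

    print(greedy_rebatch([10, 20, 70, 5, 5], min_bytes=100, min_files=3))
    # [[10, 20, 70], [5, 5]]
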
deltacat/compute/compactor_v2/constants.py CHANGED
@@ -41,6 +41,9 @@ DROP_DUPLICATES = True
 # size in metadata to pyarrow table size.
 PARQUET_TO_PYARROW_INFLATION = 4
 
+# Maximum size of the parquet metadata
+MAX_PARQUET_METADATA_SIZE = 100_000_000  # 100 MB
+
 # By default, copy by reference is enabled
 DEFAULT_DISABLE_COPY_BY_REFERENCE = False
 
deltacat/compute/compactor_v2/private/compaction_utils.py CHANGED
@@ -148,12 +148,8 @@ def _build_uniform_deltas(
         input_deltas=input_deltas,
         hash_bucket_count=params.hash_bucket_count,
         compaction_audit=mutable_compaction_audit,
+        compact_partition_params=params,
         deltacat_storage=params.deltacat_storage,
-        previous_inflation=params.previous_inflation,
-        min_delta_bytes=params.min_delta_bytes_in_batch,
-        min_file_counts=params.min_files_in_batch,
-        # disable input split during rebase as the rebase files are already uniform
-        enable_input_split=params.rebase_source_partition_locator is None,
         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
     )
     delta_discovery_end: float = time.monotonic()
@@ -400,6 +396,7 @@ def _merge(
         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
         ray_custom_resources=params.ray_custom_resources,
         memory_logs_enabled=params.memory_logs_enabled,
+        estimate_resources_params=params.estimate_resources_params,
     )
 
     def merge_input_provider(index, item) -> dict[str, MergeInput]:
@@ -463,6 +460,7 @@ def _hash_bucket(
         primary_keys=params.primary_keys,
         ray_custom_resources=params.ray_custom_resources,
         memory_logs_enabled=params.memory_logs_enabled,
+        estimate_resources_params=params.estimate_resources_params,
     )
 
     def hash_bucket_input_provider(index, item) -> dict[str, HashBucketInput]:
@@ -537,6 +535,7 @@ def _run_local_merge(
         ray_custom_resources=params.ray_custom_resources,
         primary_keys=params.primary_keys,
         memory_logs_enabled=params.memory_logs_enabled,
+        estimate_resources_params=params.estimate_resources_params,
     )
     local_merge_result = ray.get(
         mg.merge.options(**local_merge_options).remote(local_merge_input)
@@ -666,6 +665,11 @@ def _write_new_round_completion_file(
         f" and average record size={input_average_record_size_bytes}"
     )
 
+    mutable_compaction_audit.set_observed_input_inflation(input_inflation)
+    mutable_compaction_audit.set_observed_input_average_record_size_bytes(
+        input_average_record_size_bytes
+    )
+
     _update_and_upload_compaction_audit(
         params,
         mutable_compaction_audit,
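
Note: the round-completion path above now persists the observed input inflation and average record size into the audit. A hedged arithmetic sketch of what those two numbers plausibly represent (the exact formula is not shown in this diff and is an assumption):

    # Hypothetical totals gathered over the source (input) files of one compaction round.
    input_on_disk_bytes = 1_000_000_000    # sum of manifest entry content_length
    input_in_memory_bytes = 2_500_000_000  # sum of loaded pyarrow table sizes
    input_record_count = 5_000_000

    input_inflation = input_in_memory_bytes / input_on_disk_bytes                   # 2.5
    input_average_record_size_bytes = input_in_memory_bytes / input_record_count    # 500.0
    print(input_inflation, input_average_record_size_bytes)
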
deltacat/compute/compactor_v2/utils/content_type_params.py CHANGED
@@ -1,66 +1,217 @@
 import logging
+import ray
+import functools
+from deltacat.compute.compactor_v2.constants import (
+    TASK_MAX_PARALLELISM,
+    MAX_PARQUET_METADATA_SIZE,
+)
+from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat import logs
 from deltacat.storage import (
     Delta,
+    ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )
 from typing import Dict, Optional, Any
-from deltacat.types.media import TableType, StorageType
+from deltacat.types.media import TableType
 from deltacat.types.media import ContentType
 from deltacat.types.partial_download import PartialParquetParameters
+from deltacat.exceptions import RetryableError
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+def append_content_type_params_options_provider(
+    index: int, item: Any, max_parquet_meta_size_bytes: int, **kwargs
+) -> Dict:
+    task_opts = {
+        "num_cpus": 0.01,
+        "memory": max_parquet_meta_size_bytes,
+        "scheduling_strategy": "DEFAULT",
+    }
+
+    task_opts["max_retries"] = 3
+    task_opts["retry_exceptions"] = [RetryableError]
+
+    return task_opts
+
+
+def _contains_partial_parquet_parameters(entry: ManifestEntry) -> bool:
+    return (
+        entry.meta
+        and entry.meta.content_type_parameters
+        and any(
+            isinstance(type_params, PartialParquetParameters)
+            for type_params in entry.meta.content_type_parameters
+        )
+    )
+
+
+APPEND_CONTENT_TYPE_PARAMS_CACHE = "append_content_type_params_cache"
+# At this point, it's better to fetch all parquet than to cache and
+# call actor which is not expected to support high throughput.
+MINIMUM_ENTRIES_TO_CACHE = 10
+
+
+@ray.remote
+class AppendContentTypeParamsCache:
+    """
+    This actor caches the delta that contains content type meta.
+    """
+
+    def __init__(self):
+        self.cache = {}
+
+    def get(self, key):
+        return self.cache.get(key)
+
+    def put(self, key, value):
+        self.cache[key] = value
+
+
+@ray.remote
+def _download_parquet_metadata_for_manifest_entry(
+    delta: Delta,
+    entry_index: int,
+    deltacat_storage: unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
+) -> Dict[str, Any]:
+    pq_file = deltacat_storage.download_delta_manifest_entry(
+        delta,
+        entry_index=entry_index,
+        table_type=TableType.PYARROW_PARQUET,
+        **deltacat_storage_kwargs,
+    )
+
+    return {
+        "entry_index": entry_index,
+        "partial_parquet_params": PartialParquetParameters.of(
+            pq_metadata=pq_file.metadata
+        ),
+    }
+
+
 def append_content_type_params(
     delta: Delta,
-    entry_index: Optional[int] = None,
+    task_max_parallelism: int = TASK_MAX_PARALLELISM,
+    max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
 ) -> None:
+    """
+    This operation appends content type params into the delta entry. Note
+    that this operation can be time consuming, hence we cache it in a Ray actor.
+    """
 
-    if delta.meta.content_type != ContentType.PARQUET.value:
-        logger.info(
-            f"Delta with locator {delta.locator} is not a parquet delta, "
-            "skipping appending content type parameters."
-        )
+    if not delta.meta:
+        logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
         return
 
-    manifest_entries = delta.manifest.entries
-    ordered_pq_meta = []
-
-    if entry_index is not None:
-        manifest_entries = [delta.manifest.entries[entry_index]]
+    entry_indices_to_download = []
+    for entry_index, entry in enumerate(delta.manifest.entries):
+        if (
+            not _contains_partial_parquet_parameters(entry)
+            and entry.meta
+            and entry.meta.content_type == ContentType.PARQUET.value
+        ):
+            entry_indices_to_download.append(entry_index)
 
-        pq_file = deltacat_storage.download_delta_manifest_entry(
-            delta,
-            entry_index=entry_index,
-            table_type=TableType.PYARROW_PARQUET,
-            **deltacat_storage_kwargs,
+    if not entry_indices_to_download:
+        logger.info(
+            f"No parquet type params to download for delta with locator {delta.locator}."
         )
+        return None
 
-        partial_file_meta = PartialParquetParameters.of(pq_metadata=pq_file.metadata)
-        ordered_pq_meta.append(partial_file_meta)
+    ray_namespace = ray.get_runtime_context().namespace
+    logger.info(
+        f"Got Ray namespace: {ray_namespace}. "
+        "Note that caching only works with non-anonymous namespace."
+        "To set a non-anonymous namespace, call ray.init(namespace='X')."
+    )
+    if len(entry_indices_to_download) >= MINIMUM_ENTRIES_TO_CACHE:
+        logger.info(
+            f"Checking if cache contains parquet meta in namespace {ray_namespace} for "
+            f"delta locator {delta.locator} and digest {delta.locator.hexdigest()}..."
+        )
+        cache = AppendContentTypeParamsCache.options(
+            name=APPEND_CONTENT_TYPE_PARAMS_CACHE,
+            namespace=ray_namespace,
+            get_if_exists=True,
+        ).remote()
 
-    else:
-        pq_files = deltacat_storage.download_delta(
-            delta,
-            table_type=TableType.PYARROW_PARQUET,
-            storage_type=StorageType.LOCAL,
-            **deltacat_storage_kwargs,
+        logger.info(f"Got cache actor: {cache}")
+        cached_value = ray.get(cache.get.remote(delta.locator.hexdigest()))
+        if cached_value is not None:
+            logger.info(
+                "Using cached parquet meta for delta with locator"
+                f" {delta.locator} and digest {delta.locator.hexdigest()}."
+            )
+            delta.manifest = cached_value.manifest
+            return
+        logger.info(
+            f"Cache doesn't contain parquet meta for delta with locator {delta.locator}."
        )
 
-        assert len(pq_files) == len(
-            manifest_entries
-        ), f"Expected {len(manifest_entries)} pq files, got {len(pq_files)}"
+    options_provider = functools.partial(
+        append_content_type_params_options_provider,
+        max_parquet_meta_size_bytes=max_parquet_meta_size_bytes,
+    )
 
-        ordered_pq_meta = [
-            PartialParquetParameters.of(pq_metadata=pq_file.metadata)
-            for pq_file in pq_files
-        ]
+    def input_provider(index, item) -> Dict:
+        return {
+            "deltacat_storage_kwargs": deltacat_storage_kwargs,
+            "deltacat_storage": deltacat_storage,
+            "delta": delta,
+            "entry_index": item,
+        }
 
-    for entry_index, entry in enumerate(manifest_entries):
+    logger.info(
+        f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
+    )
+    pq_files_promise = invoke_parallel(
+        entry_indices_to_download,
+        ray_task=_download_parquet_metadata_for_manifest_entry,
+        max_parallelism=task_max_parallelism,
+        options_provider=options_provider,
+        kwargs_provider=input_provider,
+    )
+
+    partial_file_meta_list = ray.get(pq_files_promise)
+
+    logger.info(
+        f"Downloaded parquet meta for {len(entry_indices_to_download)} manifest entries"
+    )
+
+    assert len(partial_file_meta_list) == len(
+        entry_indices_to_download
+    ), f"Expected {len(entry_indices_to_download)} pq files, got {len(partial_file_meta_list)}"
+
+    for index, entry_index in enumerate(entry_indices_to_download):
+        assert (
+            entry_index == partial_file_meta_list[index]["entry_index"]
+        ), "entry_index must match with the associated parquet meta"
+        entry = delta.manifest.entries[entry_index]
         if not entry.meta.content_type_parameters:
             entry.meta.content_type_parameters = []
+        entry.meta.content_type_parameters.append(
+            partial_file_meta_list[index]["partial_parquet_params"]
+        )
+
+    for entry_index, entry in enumerate(delta.manifest.entries):
+        assert _contains_partial_parquet_parameters(
+            entry
+        ), "partial parquet params validation failed."
 
-        entry.meta.content_type_parameters.append(ordered_pq_meta[entry_index])
+    if len(entry_indices_to_download) >= MINIMUM_ENTRIES_TO_CACHE:
+        cache = AppendContentTypeParamsCache.options(
+            name=APPEND_CONTENT_TYPE_PARAMS_CACHE,
+            namespace=ray_namespace,
+            get_if_exists=True,
+        ).remote()
+        logger.info(f"Got cache actor when writing: {cache}")
+        logger.info(
+            f"Caching parquet meta for delta with locator {delta.locator} "
+            f"and digest {delta.locator.hexdigest()}..."
+        )
+        ray.get(cache.put.remote(delta.locator.hexdigest(), delta))
+        assert ray.get(cache.get.remote(delta.locator.hexdigest())) is not None
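
Note: the caching above relies on Ray's get-or-create named actor pattern, so every task running in the same (non-anonymous) namespace resolves the same cache instance. A standalone sketch of that pattern, assuming a running Ray cluster (the actor and key names here are illustrative, not deltacat's):

    import ray

    ray.init(namespace="compaction", ignore_reinit_error=True)

    @ray.remote
    class KeyValueCache:
        def __init__(self):
            self.cache = {}

        def get(self, key):
            return self.cache.get(key)

        def put(self, key, value):
            self.cache[key] = value

    # Any worker executing this line gets a handle to the same actor instance:
    # the actor is created on first use and looked up by name afterwards.
    cache = KeyValueCache.options(name="demo_cache", get_if_exists=True).remote()
    ray.get(cache.put.remote("delta-digest", {"rows": 42}))
    print(ray.get(cache.get.remote("delta-digest")))  # {'rows': 42}
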
deltacat/compute/compactor_v2/utils/io.py CHANGED
@@ -1,6 +1,5 @@
 import logging
 import functools
-from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.storage import (
     PartitionLocator,
     Delta,
@@ -9,11 +8,10 @@ from deltacat.storage import (
 from deltacat import logs
 from deltacat.compute.compactor.utils import io as io_v1
 from deltacat.compute.compactor import DeltaAnnotated
-from typing import Dict, List, Optional, Any
-from deltacat.compute.compactor_v2.constants import (
-    MIN_FILES_IN_BATCH,
-    MIN_DELTA_BYTES_IN_BATCH,
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
 )
+from typing import Dict, List, Optional, Any
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
@@ -25,6 +23,10 @@ from deltacat.compute.compactor_v2.utils.content_type_params import (
 )
 from deltacat.utils.metrics import metrics
 from deltacat.compute.compactor_v2.constants import DISCOVER_DELTAS_METRIC_PREFIX
+from deltacat.compute.resource_estimation.manifest import (
+    does_require_content_type_params,
+)
+from deltacat.compute.resource_estimation.model import OperationType
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -90,10 +92,7 @@ def create_uniform_input_deltas(
     input_deltas: List[Delta],
     hash_bucket_count: int,
     compaction_audit: CompactionSessionAuditInfo,
-    min_delta_bytes: Optional[float] = MIN_DELTA_BYTES_IN_BATCH,
-    min_file_counts: Optional[float] = MIN_FILES_IN_BATCH,
-    previous_inflation: Optional[float] = PYARROW_INFLATION_MULTIPLIER,
-    enable_input_split: Optional[bool] = False,
+    compact_partition_params: CompactPartitionParams,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
 ) -> List[DeltaAnnotated]:
@@ -104,11 +103,21 @@
     input_da_list = []
 
     for delta in input_deltas:
-        if enable_input_split:
+        if (
+            compact_partition_params.enable_input_split
+            or does_require_content_type_params(
+                compact_partition_params.resource_estimation_method
+            )
+        ):
+            logger.debug(
+                f"Delta with locator: {delta.locator} requires content type params..."
+            )
             append_content_type_params(
                 delta=delta,
                 deltacat_storage=deltacat_storage,
                 deltacat_storage_kwargs=deltacat_storage_kwargs,
+                task_max_parallelism=compact_partition_params.task_max_parallelism,
+                max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
             )
 
         manifest_entries = delta.manifest.entries
@@ -118,7 +127,9 @@
             entry = manifest_entries[entry_index]
             delta_bytes += entry.meta.content_length
            estimated_da_bytes += estimate_manifest_entry_size_bytes(
-                entry=entry, previous_inflation=previous_inflation
+                entry=entry,
+                operation_type=OperationType.PYARROW_DOWNLOAD,
+                estimate_resources_params=compact_partition_params.estimate_resources_params,
             )
 
         delta_annotated = DeltaAnnotated.of(delta)
@@ -129,13 +140,16 @@
     logger.info(f"Input delta files to compact: {delta_manifest_entries_count}")
 
     size_estimation_function = functools.partial(
-        estimate_manifest_entry_size_bytes, previous_inflation=previous_inflation
+        estimate_manifest_entry_size_bytes,
+        operation_type=OperationType.PYARROW_DOWNLOAD,
+        estimate_resources_params=compact_partition_params.estimate_resources_params,
    )
    rebatched_da_list = DeltaAnnotated.rebatch(
        input_da_list,
-        min_delta_bytes=min_delta_bytes,
-        min_file_counts=min_file_counts,
+        min_delta_bytes=compact_partition_params.min_delta_bytes_in_batch,
+        min_file_counts=compact_partition_params.min_files_in_batch,
        estimation_function=size_estimation_function,
+        enable_input_split=compact_partition_params.enable_input_split,
    )

    compaction_audit.set_input_size_bytes(delta_bytes)
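
Note: `functools.partial` is used above to pre-bind `operation_type` and `estimate_resources_params` so that `DeltaAnnotated.rebatch` still receives a single-argument `estimation_function`. A standalone sketch of that binding with a fake estimator (the real `estimate_manifest_entry_size_bytes` is a deltacat helper not shown in this hunk):

    import functools

    def fake_estimate_entry_size_bytes(entry, operation_type, estimate_resources_params):
        # Stand-in estimator: scale the on-disk size by a previous-inflation factor.
        return entry["content_length"] * estimate_resources_params["previous_inflation"]

    size_estimation_function = functools.partial(
        fake_estimate_entry_size_bytes,
        operation_type="PYARROW_DOWNLOAD",
        estimate_resources_params={"previous_inflation": 2.5},
    )

    # Rebatch-style callers only ever pass the entry itself.
    print(size_estimation_function({"content_length": 1_000}))  # 2500.0
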
deltacat/compute/compactor_v2/utils/primary_key_index.py CHANGED
@@ -27,8 +27,11 @@ def _append_sha1_hash_to_table(table: pa.Table, hash_column: pa.Array) -> pa.Tab
 
     result = []
     for hash_value in hash_column_np:
-        assert hash_value is not None, f"Expected non-null primary key"
-        result.append(hashlib.sha1(hash_value.encode("utf-8")).hexdigest())
+        if hash_value is None:
+            result.append(None)
+            logger.info("A primary key hash is null")
+        else:
+            result.append(hashlib.sha1(hash_value.encode("utf-8")).hexdigest())
 
     return sc.append_pk_hash_string_column(table, result)
 
@@ -191,7 +194,7 @@
             pk_columns.append(sliced_string_cast(table[pk_name]))
 
         pk_columns.append(PK_DELIMITER)
-        hash_column = pc.binary_join_element_wise(*pk_columns)
+        hash_column = pc.binary_join_element_wise(*pk_columns, null_handling="replace")
         return hash_column
 
     def _generate_uuid(table: pa.Table) -> pa.Array:
@@ -345,8 +348,10 @@
     return range(hb_group, num_buckets, num_groups)
 
 
-def pk_digest_to_hash_bucket_index(digest: str, num_buckets: int) -> int:
+def pk_digest_to_hash_bucket_index(digest: Optional[str], num_buckets: int) -> int:
     """
     Generates the hash bucket index from the given digest.
     """
+    if digest is None:
+        return 0
     return int(digest, 16) % num_buckets
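
Note: with the change above, a null primary-key hash now maps to bucket 0 instead of failing an assertion. A standalone sketch of that null-tolerant path (logic mirrored from the hunk, not imported from deltacat):

    import hashlib
    from typing import Optional

    def pk_digest_to_hash_bucket_index(digest: Optional[str], num_buckets: int) -> int:
        # A missing primary-key digest is routed to bucket 0.
        if digest is None:
            return 0
        return int(digest, 16) % num_buckets

    digest = hashlib.sha1("pk1|pk2".encode("utf-8")).hexdigest()
    print(pk_digest_to_hash_bucket_index(digest, num_buckets=8))
    print(pk_digest_to_hash_bucket_index(None, num_buckets=8))  # 0
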