deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,117 @@
1
+ import logging
2
+ from typing import Optional
3
+ from deltacat import logs
4
+ from deltacat.compute.compactor import RoundCompletionInfo
5
+ from deltacat.storage import PartitionLocator
6
+ from deltacat.storage.model.partition import Partition
7
+ from deltacat.utils.metrics import metrics
8
+ from deltacat.exceptions import PartitionNotFoundError
9
+
10
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
11
+
12
+
13
@metrics
def read_round_completion_info(
    source_partition_locator: PartitionLocator,
    destination_partition_locator: PartitionLocator,
    deltacat_storage,
    deltacat_storage_kwargs: Optional[dict] = None,
    destination_partition: Optional[Partition] = None,
) -> Optional[RoundCompletionInfo]:
    """
    Read round completion info from the partition metafile.

    This is a best-effort read: any failure while resolving the partition or
    validating its round completion info is logged (with traceback) and
    reported to the caller as None rather than raised.

    Args:
        source_partition_locator: Source partition locator for validation
        destination_partition_locator: Destination partition locator
        deltacat_storage: Storage implementation
        deltacat_storage_kwargs: Optional storage kwargs
        destination_partition: Optional destination partition to avoid redundant get_partition calls

    Returns:
        RoundCompletionInfo if found in partition and its
        prev_source_partition_locator matches source_partition_locator,
        None otherwise
    """
    if not destination_partition_locator:
        return None

    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}

    try:
        # Use provided partition or get it from storage
        partition = destination_partition or _resolve_round_completion_partition(
            destination_partition_locator,
            deltacat_storage,
            deltacat_storage_kwargs,
        )
        if not partition:
            return None

        round_completion_info = partition.compaction_round_completion_info
        if not round_completion_info:
            return None

        # Validate that prev_source_partition_locator matches current source
        prev_source_locator = round_completion_info.prev_source_partition_locator
        if not source_partition_locator or not prev_source_locator:
            raise ValueError(
                f"Source partition locator ({source_partition_locator}) and "
                f"prev_source_partition_locator ({prev_source_locator}) "
                f"must both be provided."
            )

        expected_source = source_partition_locator.canonical_string()
        found_source = prev_source_locator.canonical_string()
        if found_source != expected_source:
            logger.warning(
                f"Previous source partition locator mismatch: "
                f"expected {expected_source}, "
                f"but found {found_source} "
                f"in round completion info. Ignoring cached round completion info."
            )
            return None

        logger.info(
            f"Read round completion info from partition metafile: {round_completion_info}"
        )
        return round_completion_info

    except Exception as e:
        # Deliberately best-effort: callers receive None on any failure.
        # Log at warning level with the traceback so that storage errors,
        # a missing previous partition, or an invalid locator are not
        # silently hidden behind a debug-level message.
        logger.warning(
            f"Failed to read round completion info from partition metafile: {e}",
            exc_info=True,
        )

    return None


def _resolve_round_completion_partition(
    destination_partition_locator: PartitionLocator,
    deltacat_storage,
    deltacat_storage_kwargs: dict,
) -> Optional[Partition]:
    """
    Find the partition expected to carry the round completion info.

    Looks up the current destination partition and, when it carries no round
    completion info but references a previous partition ID, returns that
    previous partition instead.

    Raises:
        PartitionNotFoundError: if the referenced previous partition cannot
            be found by ID.
    """
    # First get the current partition to access its previous_partition_id
    current_partition = deltacat_storage.get_partition(
        destination_partition_locator.stream_locator,
        destination_partition_locator.partition_values,
        **deltacat_storage_kwargs,
    )
    # NOTE(review): assumes get_partition may return None when no partition
    # exists yet (e.g. first compaction round) - confirm against the storage
    # implementation in use.
    if current_partition is None:
        logger.info(
            f"No partition found for {destination_partition_locator}, "
            f"so there is no round completion info to read."
        )
        return None

    # If current partition has round completion info, use it
    if current_partition.compaction_round_completion_info:
        return current_partition

    previous_partition_id = current_partition.previous_partition_id
    if previous_partition_id is None:
        logger.info("No previous partition ID found, using current partition")
        return current_partition

    # For incremental compaction, the previous committed partition is where
    # the round completion info should be, so fetch it by ID.
    logger.info(
        f"Current partition {destination_partition_locator} does not have round completion info, "
        f"getting previous partition with ID: {previous_partition_id}"
    )
    previous_partition = deltacat_storage.get_partition_by_id(
        destination_partition_locator.stream_locator,
        previous_partition_id,
        **deltacat_storage_kwargs,
    )
    if previous_partition is None:
        raise PartitionNotFoundError(
            f"Previous partition with ID {previous_partition_id} not found"
        )

    logger.info(f"Found previous partition: {previous_partition.locator}")
    return previous_partition
@@ -294,7 +294,9 @@ def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table
294
294
 
295
295
 
296
296
  def delta_type_to_field(delta_type: DeltaType) -> bool:
297
- return True if delta_type is DeltaType.UPSERT else False
297
+ # For deduplication purposes, treat both UPSERT and APPEND as UPSERT (True)
298
+ # Only DELETE should be treated as DELETE (False)
299
+ return delta_type is not DeltaType.DELETE
298
300
 
299
301
 
300
302
  def delta_type_from_field(delta_type_field: bool) -> DeltaType:
@@ -14,7 +14,6 @@ from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
14
14
  ExecutionCompactionResult,
15
15
  )
16
16
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
17
- from deltacat.compute.compactor.utils import round_completion_file as rcf
18
17
  from deltacat.compute.compactor import DeltaAnnotated
19
18
  from deltacat.compute.compactor_v2.deletes.delete_strategy import (
20
19
  DeleteStrategy,
@@ -27,6 +26,7 @@ from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
27
26
  from deltacat.storage import (
28
27
  Delta,
29
28
  DeltaLocator,
29
+ PartitionLocator,
30
30
  )
31
31
  from deltacat.storage.model.manifest import Manifest
32
32
  from deltacat.compute.compactor.model.compact_partition_params import (
@@ -36,13 +36,14 @@ from deltacat.utils.resources import (
36
36
  get_current_process_peak_memory_usage_in_bytes,
37
37
  )
38
38
  from deltacat.compute.compactor_v2.private.compaction_utils import (
39
+ _get_rci_source_partition_locator,
39
40
  _fetch_compaction_metadata,
40
41
  _build_uniform_deltas,
41
42
  _group_uniform_deltas,
42
43
  _stage_new_partition,
43
44
  _run_hash_and_merge,
44
45
  _process_merge_results,
45
- _write_new_round_completion_file,
46
+ _create_round_completion_info,
46
47
  _commit_compaction_result,
47
48
  )
48
49
  from deltacat.utils.metrics import metrics
@@ -64,24 +65,26 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
64
65
 
65
66
  @metrics(prefix=COMPACT_PARTITION_METRIC_PREFIX)
66
67
  @categorize_errors
67
- def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:
68
+ def compact_partition(params: CompactPartitionParams, **kwargs) -> None:
68
69
  assert (
69
70
  params.hash_bucket_count is not None and params.hash_bucket_count >= 1
70
71
  ), "hash_bucket_count is a required arg for compactor v2"
72
+ assert type(params.hash_bucket_count) is int, "Hash bucket count must be an integer"
71
73
  if params.num_rounds > 1:
72
74
  assert (
73
75
  not params.drop_duplicates
74
76
  ), "num_rounds > 1, drop_duplicates must be False but is True"
75
77
 
76
- with memray.Tracker(
77
- "compaction_partition.bin"
78
- ) if params.enable_profiler else nullcontext():
78
+ with (
79
+ memray.Tracker("compaction_partition.bin")
80
+ if params.enable_profiler
81
+ else nullcontext()
82
+ ):
79
83
  execute_compaction_result: ExecutionCompactionResult = _execute_compaction(
80
84
  params,
81
85
  **kwargs,
82
86
  )
83
87
  _commit_compaction_result(params, execute_compaction_result)
84
- return execute_compaction_result.round_completion_file_s3_url
85
88
 
86
89
 
87
90
  def _execute_compaction(
@@ -96,12 +99,12 @@ def _execute_compaction(
96
99
  previous_compacted_delta_manifest,
97
100
  round_completion_info,
98
101
  ) = fetch_compaction_metadata_result
99
- rcf_source_partition_locator: rcf.PartitionLocator = (
100
- params.rebase_source_partition_locator or params.source_partition_locator
102
+ rci_source_partition_locator: PartitionLocator = _get_rci_source_partition_locator(
103
+ params
101
104
  )
102
105
 
103
- base_audit_url: str = rcf_source_partition_locator.path(
104
- f"s3://{params.compaction_artifact_s3_bucket}/compaction-audit"
106
+ base_audit_url: str = rci_source_partition_locator.path(
107
+ f"{params.compaction_artifact_path}/compaction-audit"
105
108
  )
106
109
  audit_url: str = f"{base_audit_url}.json"
107
110
  logger.info(f"Compaction audit will be written to {audit_url}")
@@ -136,7 +139,7 @@ def _execute_compaction(
136
139
  )
137
140
  if not input_deltas:
138
141
  logger.info("No input deltas found to compact.")
139
- return ExecutionCompactionResult(None, None, None, False)
142
+ return ExecutionCompactionResult(None, None, False)
140
143
  build_uniform_deltas_result: tuple[
141
144
  List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
142
145
  ] = _build_uniform_deltas(
@@ -199,13 +202,13 @@ def _execute_compaction(
199
202
 
200
203
  compaction_audit.save_round_completion_stats(mat_results)
201
204
 
202
- compaction_result: ExecutionCompactionResult = _write_new_round_completion_file(
205
+ compaction_result: ExecutionCompactionResult = _create_round_completion_info(
203
206
  params,
204
207
  compaction_audit,
205
208
  compacted_partition,
206
209
  audit_url,
207
210
  hb_id_to_entry_indices_range,
208
- rcf_source_partition_locator,
211
+ rci_source_partition_locator,
209
212
  new_compacted_delta_locator,
210
213
  pyarrow_write_result,
211
214
  round_completion_info,
@@ -1,3 +1,5 @@
1
+ from deltacat.utils.common import env_bool, env_integer, env_string
2
+
1
3
  TOTAL_BYTES_IN_SHA1_HASH = 20
2
4
 
3
5
  PK_DELIMITER = "L6kl7u5f"
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
31
33
  # The total size of records that will be hash bucketed at once
32
34
  # Since sorting is O(n log n), we ensure that it is not performed
33
35
  # on a very large dataset for best performance.
34
- MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
36
+ MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
37
+ "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
38
+ )
35
39
 
36
40
  # Whether to drop duplicates during merge.
37
41
  DROP_DUPLICATES = True
@@ -78,3 +82,28 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
78
82
  # Number of rounds to run hash/merge for a single
79
83
  # partition. (For large table support)
80
84
  DEFAULT_NUM_ROUNDS = 1
85
+
86
+ # Whether to perform sha1 hashing when required to
87
+ # optimize memory. For example, hashing is always
88
+ # required for bucketing where it's not mandatory
89
+ # when dropping duplicates. Setting this to True
90
+ # will disable sha1 hashing in cases where it isn't
91
+ # mandatory. This flag is False by default.
92
+ SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
93
+ "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
94
+ )
95
+
96
+ # This env variable specifies whether to check bucketing spec
97
+ # compliance of the existing compacted table.
98
+ # PRINT_LOG: Enable logging if any partition is found
99
+ # to be non-compliant with the bucketing spec.
100
+ # ASSERT: Fail the job with ValidationError if the
101
+ # current compacted partition is found to be non-compliant
102
+ # with bucketing spec. Note, logging is implicitly enabled
103
+ # in this case.
104
+ BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
105
+ "BUCKETING_SPEC_COMPLIANCE_PROFILE", None
106
+ )
107
+
108
+ BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
109
+ BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
@@ -13,7 +13,6 @@ from typing import Optional
13
13
  class ExecutionCompactionResult:
14
14
  new_compacted_partition: Optional[Partition]
15
15
  new_round_completion_info: Optional[RoundCompletionInfo]
16
- round_completion_file_s3_url: Optional[str]
17
16
  is_inplace_compacted: bool
18
17
 
19
18
  def __iter__(self):
@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Any
4
4
  from deltacat.utils.metrics import MetricsConfig
5
5
  from deltacat.utils.common import ReadKwargsProvider
6
6
  from deltacat.io.object_store import IObjectStore
7
- from deltacat.storage import interface as unimplemented_deltacat_storage
7
+ from deltacat.storage import metastore
8
8
  from deltacat.compute.compactor import DeltaAnnotated
9
9
 
10
10
 
@@ -15,12 +15,13 @@ class HashBucketInput(Dict):
15
15
  primary_keys: List[str],
16
16
  num_hash_buckets: int,
17
17
  num_hash_groups: int,
18
+ all_column_names: List[str],
18
19
  hb_task_index: Optional[int] = 0,
19
20
  enable_profiler: Optional[bool] = False,
20
21
  metrics_config: Optional[MetricsConfig] = None,
21
22
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
22
23
  object_store: Optional[IObjectStore] = None,
23
- deltacat_storage=unimplemented_deltacat_storage,
24
+ deltacat_storage=metastore,
24
25
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
25
26
  memory_logs_enabled: Optional[bool] = None,
26
27
  ) -> HashBucketInput:
@@ -31,6 +32,7 @@ class HashBucketInput(Dict):
31
32
  result["hb_task_index"] = hb_task_index
32
33
  result["num_hash_buckets"] = num_hash_buckets
33
34
  result["num_hash_groups"] = num_hash_groups
35
+ result["all_column_names"] = all_column_names
34
36
  result["enable_profiler"] = enable_profiler
35
37
  result["metrics_config"] = metrics_config
36
38
  result["read_kwargs_provider"] = read_kwargs_provider
@@ -61,6 +63,10 @@ class HashBucketInput(Dict):
61
63
  def num_hash_groups(self) -> int:
62
64
  return self["num_hash_groups"]
63
65
 
66
+ @property
67
+ def all_column_names(self) -> List[str]:
68
+ return self["all_column_names"]
69
+
64
70
  @property
65
71
  def enable_profiler(self) -> Optional[bool]:
66
72
  return self.get("enable_profiler")
@@ -78,7 +84,7 @@ class HashBucketInput(Dict):
78
84
  return self.get("object_store")
79
85
 
80
86
  @property
81
- def deltacat_storage(self) -> unimplemented_deltacat_storage:
87
+ def deltacat_storage(self) -> metastore:
82
88
  return self.get("deltacat_storage")
83
89
 
84
90
  @property
@@ -16,7 +16,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
16
16
  hash_group_index_to_hash_bucket_indices,
17
17
  )
18
18
 
19
- from deltacat.storage import interface as unimplemented_deltacat_storage
19
+ from deltacat.storage import metastore
20
20
 
21
21
  from deltacat.io.object_store import IObjectStore
22
22
 
@@ -87,11 +87,13 @@ class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
87
87
  def __init__(
88
88
  self,
89
89
  uniform_deltas: List[DeltaAnnotated],
90
+ all_column_names: List[str],
90
91
  read_kwargs_provider: Optional[ReadKwargsProvider],
91
- deltacat_storage=unimplemented_deltacat_storage,
92
+ deltacat_storage=metastore,
92
93
  deltacat_storage_kwargs: Optional[dict] = None,
93
94
  ):
94
95
  self._deltas = uniform_deltas
96
+ self._all_column_names = all_column_names
95
97
  self._read_kwargs_provider = read_kwargs_provider
96
98
  self._deltacat_storage = deltacat_storage
97
99
  self._deltacat_storage_kwargs = deltacat_storage_kwargs
@@ -110,6 +112,7 @@ class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
110
112
  total_size_bytes,
111
113
  ) = read_delta_file_envelopes(
112
114
  annotated_delta,
115
+ self._all_column_names,
113
116
  self._read_kwargs_provider,
114
117
  self._deltacat_storage,
115
118
  self._deltacat_storage_kwargs,
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Dict, List, Optional, Any
3
+ from typing import Dict, List, Optional, Any, Set
4
4
 
5
5
  from deltacat.compute.compactor_v2.model.merge_file_group import (
6
6
  MergeFileGroupsProvider,
@@ -12,9 +12,10 @@ from deltacat.utils.metrics import MetricsConfig
12
12
  from deltacat.utils.common import ReadKwargsProvider
13
13
  from deltacat.io.object_store import IObjectStore
14
14
  from deltacat.storage import (
15
+ Manifest,
15
16
  Partition,
16
17
  SortKey,
17
- interface as unimplemented_deltacat_storage,
18
+ metastore,
18
19
  )
19
20
  from deltacat.compute.compactor_v2.constants import (
20
21
  DROP_DUPLICATES,
@@ -32,22 +33,26 @@ class MergeInput(Dict):
32
33
  write_to_partition: Partition,
33
34
  compacted_file_content_type: ContentType,
34
35
  primary_keys: List[str],
36
+ all_column_names: List[str],
35
37
  drop_duplicates: Optional[bool] = DROP_DUPLICATES,
36
38
  sort_keys: Optional[List[SortKey]] = None,
37
39
  merge_task_index: Optional[int] = 0,
38
40
  max_records_per_output_file: Optional[int] = MAX_RECORDS_PER_COMPACTED_FILE,
39
41
  enable_profiler: Optional[bool] = False,
40
42
  metrics_config: Optional[MetricsConfig] = None,
41
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
43
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
42
44
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
43
45
  round_completion_info: Optional[RoundCompletionInfo] = None,
44
46
  object_store: Optional[IObjectStore] = None,
45
47
  delete_strategy: Optional[DeleteStrategy] = None,
46
48
  delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None,
47
- deltacat_storage=unimplemented_deltacat_storage,
49
+ deltacat_storage=metastore,
48
50
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
49
51
  memory_logs_enabled: Optional[bool] = None,
50
52
  disable_copy_by_reference: Optional[bool] = None,
53
+ hash_bucket_count: Optional[int] = None,
54
+ original_fields: Optional[Set[str]] = None,
55
+ compacted_manifest: Optional[Manifest] = None,
51
56
  ) -> MergeInput:
52
57
 
53
58
  result = MergeInput()
@@ -55,13 +60,14 @@ class MergeInput(Dict):
55
60
  result["write_to_partition"] = write_to_partition
56
61
  result["compacted_file_content_type"] = compacted_file_content_type
57
62
  result["primary_keys"] = primary_keys
63
+ result["all_column_names"] = all_column_names
58
64
  result["drop_duplicates"] = drop_duplicates
59
65
  result["sort_keys"] = sort_keys
60
66
  result["merge_task_index"] = merge_task_index
61
67
  result["max_records_per_output_file"] = max_records_per_output_file
62
68
  result["enable_profiler"] = enable_profiler
63
69
  result["metrics_config"] = metrics_config
64
- result["s3_table_writer_kwargs"] = s3_table_writer_kwargs or {}
70
+ result["table_writer_kwargs"] = table_writer_kwargs or {}
65
71
  result["read_kwargs_provider"] = read_kwargs_provider
66
72
  result["round_completion_info"] = round_completion_info
67
73
  result["object_store"] = object_store
@@ -71,6 +77,9 @@ class MergeInput(Dict):
71
77
  result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
72
78
  result["memory_logs_enabled"] = memory_logs_enabled
73
79
  result["disable_copy_by_reference"] = disable_copy_by_reference
80
+ result["hash_bucket_count"] = hash_bucket_count
81
+ result["original_fields"] = original_fields
82
+ result["compacted_manifest"] = compacted_manifest
74
83
  return result
75
84
 
76
85
  @property
@@ -89,6 +98,10 @@ class MergeInput(Dict):
89
98
  def primary_keys(self) -> List[str]:
90
99
  return self["primary_keys"]
91
100
 
101
+ @property
102
+ def all_column_names(self) -> List[str]:
103
+ return self["all_column_names"]
104
+
92
105
  @property
93
106
  def drop_duplicates(self) -> int:
94
107
  return self["drop_duplicates"]
@@ -114,8 +127,8 @@ class MergeInput(Dict):
114
127
  return self.get("metrics_config")
115
128
 
116
129
  @property
117
- def s3_table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
118
- return self.get("s3_table_writer_kwargs")
130
+ def table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
131
+ return self.get("table_writer_kwargs")
119
132
 
120
133
  @property
121
134
  def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
@@ -130,7 +143,7 @@ class MergeInput(Dict):
130
143
  return self.get("object_store")
131
144
 
132
145
  @property
133
- def deltacat_storage(self) -> unimplemented_deltacat_storage:
146
+ def deltacat_storage(self) -> metastore:
134
147
  return self["deltacat_storage"]
135
148
 
136
149
  @property
@@ -154,3 +167,15 @@ class MergeInput(Dict):
154
167
  @property
155
168
  def disable_copy_by_reference(self) -> bool:
156
169
  return self["disable_copy_by_reference"]
170
+
171
+ @property
172
+ def hash_bucket_count(self) -> int:
173
+ return self["hash_bucket_count"]
174
+
175
+ @property
176
+ def original_fields(self) -> Optional[Set[str]]:
177
+ return self.get("original_fields")
178
+
179
+ @property
180
+ def compacted_manifest(self) -> Optional[Manifest]:
181
+ return self.get("compacted_manifest")