deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff compares publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@ import functools
 from deltacat.storage import (
     PartitionLocator,
     Delta,
-    interface as unimplemented_deltacat_storage,
+    metastore,
 )
 from deltacat import logs
 from deltacat.compute.compactor.utils import io as io_v1
@@ -38,7 +38,7 @@ def discover_deltas(
     rebase_source_partition_locator: Optional[PartitionLocator] = None,
     rebase_source_partition_high_watermark: Optional[int] = None,
     rcf_high_watermark: Optional[int] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage=metastore,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
     list_deltas_kwargs: Optional[Dict[str, Any]] = {},
 ) -> List[Delta]:
@@ -67,6 +67,11 @@ def discover_deltas(
         f"Length of input deltas from delta source table is {len(delta_source_incremental_deltas)}"
         f" from ({previous_compacted_high_watermark}, {last_stream_position_to_compact}]"
     )
+    logger.info(f"DEBUG: source_partition_locator = {source_partition_locator}")
+    logger.info(
+        f"DEBUG: source_partition_locator.partition_id = {getattr(source_partition_locator, 'partition_id', 'NO_PARTITION_ID')}"
+    )
+    logger.info(f"DEBUG: total input deltas found = {len(result)}")

     if rebase_source_partition_locator:
         previous_compacted_deltas = io_v1._discover_deltas(
@@ -93,7 +98,8 @@ def create_uniform_input_deltas(
     hash_bucket_count: int,
     compaction_audit: CompactionSessionAuditInfo,
     compact_partition_params: CompactPartitionParams,
-    deltacat_storage=unimplemented_deltacat_storage,
+    all_column_names: List[str],
+    deltacat_storage=metastore,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
 ) -> List[DeltaAnnotated]:

@@ -101,7 +107,6 @@ def create_uniform_input_deltas(
     delta_manifest_entries_count = 0
     estimated_da_bytes = 0
     input_da_list = []
-
     for delta in input_deltas:
         if (
             compact_partition_params.enable_input_split
@@ -114,10 +119,12 @@ def create_uniform_input_deltas(
             )
             append_content_type_params(
                 delta=delta,
+                all_column_names=all_column_names,
                 deltacat_storage=deltacat_storage,
                 deltacat_storage_kwargs=deltacat_storage_kwargs,
                 task_max_parallelism=compact_partition_params.task_max_parallelism,
                 max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
+                file_reader_kwargs_provider=compact_partition_params.read_kwargs_provider,
             )

         manifest_entries = delta.manifest.entries
@@ -23,6 +23,7 @@ from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC

 from deltacat.utils.performance import timed_invocation
 from deltacat.storage import (
+    DeltaType,
     Partition,
 )
 from deltacat.compute.compactor_v2.deletes.delete_strategy import (
@@ -47,13 +48,21 @@ def materialize(
         # TODO (pdames): compare performance to pandas-native materialize path
         df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
         compacted_table = df
+    # Extract schema from table_writer_kwargs to pass as direct parameter
+    # This ensures schema_id is properly set in the manifest
+    schema = None
+    if input.table_writer_kwargs and "schema" in input.table_writer_kwargs:
+        schema = input.table_writer_kwargs["schema"]
+
     delta, stage_delta_time = timed_invocation(
         input.deltacat_storage.stage_delta,
         compacted_table,
         input.write_to_partition,
+        delta_type=DeltaType.APPEND,  # Compaction always produces APPEND deltas
         max_records_per_entry=input.max_records_per_output_file,
         content_type=input.compacted_file_content_type,
-        s3_table_writer_kwargs=input.s3_table_writer_kwargs,
+        schema=schema,  # Pass schema as direct parameter for schema_id extraction
+        table_writer_kwargs=input.table_writer_kwargs,
         **input.deltacat_storage_kwargs,
     )
     compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
@@ -112,6 +121,7 @@ def generate_local_merge_input(
     return MergeInput.of(
         merge_file_groups_provider=LocalMergeFileGroupsProvider(
             annotated_deltas,
+            all_column_names=params.all_column_names,
             read_kwargs_provider=params.read_kwargs_provider,
             deltacat_storage=params.deltacat_storage,
             deltacat_storage_kwargs=params.deltacat_storage_kwargs,
@@ -119,12 +129,13 @@ def generate_local_merge_input(
         write_to_partition=compacted_partition,
         compacted_file_content_type=params.compacted_file_content_type,
         primary_keys=params.primary_keys,
+        all_column_names=params.all_column_names,
         sort_keys=params.sort_keys,
         drop_duplicates=params.drop_duplicates,
         max_records_per_output_file=params.records_per_compacted_file,
         enable_profiler=params.enable_profiler,
         metrics_config=params.metrics_config,
-        s3_table_writer_kwargs=params.s3_table_writer_kwargs,
+        table_writer_kwargs=params.table_writer_kwargs,
         read_kwargs_provider=params.read_kwargs_provider,
         round_completion_info=round_completion_info,
         object_store=params.object_store,
@@ -133,4 +144,6 @@ def generate_local_merge_input(
         delete_strategy=delete_strategy,
         delete_file_envelopes=delete_file_envelopes,
         disable_copy_by_reference=params.disable_copy_by_reference,
+        hash_bucket_count=params.hash_bucket_count,
+        original_fields=params.original_fields,
     )
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_BYTES_IN_SHA1_HASH,
     PK_DELIMITER,
     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+    SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
 )
 import time
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
         f"Found total length of hash column={total_len} and total_size={total_size}"
     )

+    if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
+        logger.info(
+            f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
+            f"Returning False for is_sha1_desired"
+        )
+        return False
+
     return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len


@@ -70,13 +78,25 @@ def _append_table_by_hash_bucket(
         f"Grouping a pki table of length {len(pki_table)} took {groupby_latency}s"
     )

+    hb_pk_grouped_by = hb_pk_grouped_by.sort_by(sc._HASH_BUCKET_IDX_COLUMN_NAME)
     group_count_array = hb_pk_grouped_by[f"{sc._HASH_BUCKET_IDX_COLUMN_NAME}_count"]
     hb_group_array = hb_pk_grouped_by[sc._HASH_BUCKET_IDX_COLUMN_NAME]

     result_len = 0
     for i, group_count in enumerate(group_count_array):
         hb_idx = hb_group_array[i].as_py()
-        pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count.as_py())
+        group_count_py = group_count.as_py()
+        pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count_py)
+        assert group_count_py == len(
+            pyarrow_table
+        ), f"Group count {group_count_py} not equal to {len(pyarrow_table)}"
+        all_buckets = pc.unique(pyarrow_table[sc._HASH_BUCKET_IDX_COLUMN_NAME])
+        assert (
+            len(all_buckets) == 1
+        ), f"Only one hash bucket is allowed but found {len(all_buckets)}"
+        assert (
+            all_buckets[0].as_py() == hb_idx
+        ), f"Hash bucket not equal, {all_buckets[0]} and {hb_idx}"
         pyarrow_table = pyarrow_table.drop(
             [sc._HASH_BUCKET_IDX_COLUMN_NAME, sc._PK_HASH_STRING_COLUMN_NAME]
         )
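
The hunk above tightens _append_table_by_hash_bucket by sorting the grouped result and asserting that each slice holds exactly one hash bucket. As a standalone illustration of that sort-then-slice grouping technique (a sketch over toy data, not the compactor's own columns or system-column helpers):

import pyarrow as pa
import pyarrow.compute as pc

# Sort the table by its bucket column, group-by to get per-bucket row counts,
# then slice contiguous per-bucket chunks in bucket order.
table = pa.table({"bucket": [1, 0, 1, 0, 2], "pk_hash": ["a", "b", "c", "d", "e"]})
table = table.sort_by([("bucket", "ascending")])
grouped = table.group_by("bucket").aggregate([("bucket", "count")]).sort_by("bucket")

offset = 0
for i in range(len(grouped)):
    bucket = grouped["bucket"][i].as_py()
    count = grouped["bucket_count"][i].as_py()
    chunk = table.slice(offset=offset, length=count)
    # Each slice must contain exactly one bucket value, mirroring the new asserts.
    assert len(pc.unique(chunk["bucket"])) == 1
    offset += count
    print(bucket, chunk["pk_hash"].to_pylist())
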
@@ -108,9 +128,10 @@ def _optimized_group_record_batches_by_hash_bucket(
     record_batches = []
     result_len = 0
     for record_batch in table_batches:
-        current_bytes += record_batch.nbytes
-        record_batches.append(record_batch)
-        if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
+        if (
+            record_batches
+            and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
+        ):
             logger.info(
                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
                 f"is {len(record_batches)} and size {current_bytes}"
@@ -128,6 +149,9 @@ def _optimized_group_record_batches_by_hash_bucket(
             current_bytes = 0
             record_batches.clear()

+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+
     if record_batches:
         appended_len, append_latency = timed_invocation(
             _append_table_by_hash_bucket,
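
Together, the two hunks above move the append below the flush, so an accumulated group is emitted before the incoming batch would push it past the byte cap rather than one batch after. A minimal self-contained sketch of that corrected accumulation pattern (the cap and FakeBatch type are illustrative stand-ins):

from typing import Iterable, List

MAX_GROUP_BYTES = 4 * 1024**3  # illustrative cap, analogous to MAX_SIZE_OF_RECORD_BATCH_IN_GIB

class FakeBatch:
    """Stand-in for a pyarrow RecordBatch exposing only nbytes."""
    def __init__(self, nbytes: int):
        self.nbytes = nbytes

def group_batches_by_size(batches: Iterable[FakeBatch]) -> List[List[FakeBatch]]:
    groups: List[List[FakeBatch]] = []
    current: List[FakeBatch] = []
    current_bytes = 0
    for batch in batches:
        # Flush before appending when the incoming batch would exceed the cap,
        # so no emitted group ever crosses MAX_GROUP_BYTES.
        if current and current_bytes + batch.nbytes >= MAX_GROUP_BYTES:
            groups.append(current)
            current, current_bytes = [], 0
        current.append(batch)
        current_bytes += batch.nbytes
    if current:
        groups.append(current)
    return groups

# Two 3 GiB batches now land in separate groups instead of one oversized group.
print([len(g) for g in group_batches_by_size([FakeBatch(3 * 1024**3), FakeBatch(3 * 1024**3)])])
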
@@ -1,12 +1,17 @@
 import logging
 from typing import Dict, Optional, List, Tuple, Any
 from deltacat import logs
+from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
+from deltacat.compute.compactor_v2.constants import (
+    AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
+)
 from deltacat.compute.compactor_v2.model.merge_file_group import (
     LocalMergeFileGroupsProvider,
 )
 from deltacat.storage import (
     Manifest,
-    interface as unimplemented_deltacat_storage,
+    ManifestEntry,
+    metastore,
 )
 from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
 from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
@@ -72,8 +77,6 @@ def _get_merge_task_options(
     round_completion_info: Optional[RoundCompletionInfo] = None,
     compacted_delta_manifest: Optional[Manifest] = None,
     primary_keys: Optional[List[str]] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[Dict] = {},
     memory_logs_enabled: Optional[bool] = None,
 ) -> Dict[str, Any]:
     if (
@@ -81,16 +84,27 @@ def _get_merge_task_options(
         and compacted_delta_manifest
         and round_completion_info.hb_index_to_entry_range
     ):
-
-        previous_inflation = (
-            round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
-            / round_completion_info.compacted_pyarrow_write_result.file_bytes
+        logger.debug_conditional(
+            f"[Merge task {index}]: Using previous compaction rounds to calculate merge memory: {round_completion_info.compacted_pyarrow_write_result}",
+            memory_logs_enabled,
+        )
+        previous_inflation: float = (
+            (
+                round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+                / round_completion_info.compacted_pyarrow_write_result.file_bytes
+            )
+            if round_completion_info.compacted_pyarrow_write_result.file_bytes
+            else PYARROW_INFLATION_MULTIPLIER
         )
         debug_memory_params["previous_inflation"] = previous_inflation

-        average_record_size = (
-            round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
-            / round_completion_info.compacted_pyarrow_write_result.records
+        average_record_size: float = (
+            (
+                round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+                / round_completion_info.compacted_pyarrow_write_result.records
+            )
+            if round_completion_info.compacted_pyarrow_write_result.records
+            else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
         )
         debug_memory_params["average_record_size"] = average_record_size

@@ -106,31 +120,36 @@ def _get_merge_task_options(
                 str(hb_idx)
             ]
             for entry_index in range(entry_start, entry_end):
-                entry = compacted_delta_manifest.entries[entry_index]
-
-                current_entry_size = estimate_manifest_entry_size_bytes(
-                    entry=entry,
-                    operation_type=OperationType.PYARROW_DOWNLOAD,
-                    estimate_resources_params=estimate_resources_params,
+                entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
+                current_entry_size: float = (
+                    estimate_manifest_entry_size_bytes(
+                        entry=entry,
+                        operation_type=OperationType.PYARROW_DOWNLOAD,
+                        estimate_resources_params=estimate_resources_params,
+                    )
+                    or 0.0
                 )
-                current_entry_rows = estimate_manifest_entry_num_rows(
-                    entry=entry,
-                    operation_type=OperationType.PYARROW_DOWNLOAD,
-                    estimate_resources_params=estimate_resources_params,
+                current_entry_rows: int = (
+                    estimate_manifest_entry_num_rows(
+                        entry=entry,
+                        operation_type=OperationType.PYARROW_DOWNLOAD,
+                        estimate_resources_params=estimate_resources_params,
+                    )
+                    or 0
                 )
-
+                # NOTE: We can treat the current_entry_size and current_entry_rows as 0 as a None estimated entry size implies a 0 value
                 data_size += current_entry_size
                 num_rows += current_entry_rows
-
                 if primary_keys:
-                    pk_size = estimate_manifest_entry_column_size_bytes(
+                    pk_size: Optional[
+                        float
+                    ] = estimate_manifest_entry_column_size_bytes(
                         entry=entry,
                         columns=primary_keys,
                         operation_type=OperationType.PYARROW_DOWNLOAD,
                         estimate_resources_params=estimate_resources_params,
                     )
-
-                    if pk_size is None:
+                    if not pk_size:
                         pk_size_bytes += current_entry_size
                     else:
                         pk_size_bytes += pk_size
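
The rewritten merge-memory estimation above falls back to defaults whenever a divisor or an estimate is missing: zero file_bytes or records no longer raises ZeroDivisionError, and a None entry estimate is treated as 0. A hedged sketch of that guarded-fallback pattern (the default values below are placeholders, not DeltaCAT's real constants):

from dataclasses import dataclass
from typing import Optional, Tuple

PYARROW_INFLATION_MULTIPLIER = 2.5          # placeholder default
DEFAULT_AVERAGE_RECORD_SIZE_BYTES = 1000.0  # placeholder default

@dataclass
class WriteResult:
    pyarrow_bytes: int
    file_bytes: int
    records: int

def estimate_merge_memory_params(
    result: WriteResult, entry_size_estimate: Optional[float]
) -> Tuple[float, float, float]:
    # Guard each division so a zero denominator falls back to a default.
    previous_inflation = (
        result.pyarrow_bytes / result.file_bytes
        if result.file_bytes
        else PYARROW_INFLATION_MULTIPLIER
    )
    average_record_size = (
        result.pyarrow_bytes / result.records
        if result.records
        else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
    )
    # A None estimate is treated as 0, as in the NOTE added above.
    entry_size = entry_size_estimate or 0.0
    return previous_inflation, average_record_size, entry_size

print(estimate_merge_memory_params(WriteResult(pyarrow_bytes=100, file_bytes=0, records=0), None))
# -> (2.5, 1000.0, 0.0)
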
@@ -159,7 +178,6 @@ def _get_merge_task_options(
         f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
         memory_logs_enabled,
     )
-
     return _get_task_options(0.01, total_memory, ray_custom_resources)


@@ -255,8 +273,6 @@ def merge_resource_options_provider(
     compacted_delta_manifest: Optional[Manifest] = None,
     ray_custom_resources: Optional[Dict] = None,
     primary_keys: Optional[List[str]] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[Dict] = {},
     memory_logs_enabled: Optional[bool] = None,
     **kwargs,
 ) -> Dict:
@@ -286,8 +302,6 @@ def merge_resource_options_provider(
         round_completion_info=round_completion_info,
         compacted_delta_manifest=compacted_delta_manifest,
         primary_keys=primary_keys,
-        deltacat_storage=deltacat_storage,
-        deltacat_storage_kwargs=deltacat_storage_kwargs,
         memory_logs_enabled=memory_logs_enabled,
         estimate_resources_params=estimate_resources_params,
     )
@@ -302,7 +316,7 @@ def local_merge_resource_options_provider(
     compacted_delta_manifest: Optional[Manifest] = None,
     ray_custom_resources: Optional[Dict] = None,
     primary_keys: Optional[List[str]] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage=metastore,
     deltacat_storage_kwargs: Optional[Dict] = {},
     memory_logs_enabled: Optional[bool] = None,
     **kwargs,
@@ -328,8 +342,6 @@ def local_merge_resource_options_provider(
         round_completion_info=round_completion_info,
         compacted_delta_manifest=compacted_delta_manifest,
         primary_keys=primary_keys,
-        deltacat_storage=deltacat_storage,
-        deltacat_storage_kwargs=deltacat_storage_kwargs,
         memory_logs_enabled=memory_logs_enabled,
         estimate_resources_params=estimate_resources_params,
     )
@@ -1,3 +1,4 @@
+from deltacat.constants import DEFAULT_NAMESPACE
 from deltacat.utils.ray_utils.concurrency import (
     invoke_parallel,
     task_resource_options_provider,
@@ -12,8 +13,7 @@ from deltacat import logs
 from deltacat.compute.converter.model.converter_session_params import (
     ConverterSessionParams,
 )
-
-
+from typing import Dict, List, Any, Callable
 from deltacat.compute.converter.constants import DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
 from deltacat.compute.converter.steps.convert import convert
 from deltacat.compute.converter.model.convert_input import ConvertInput
@@ -31,30 +31,80 @@ from deltacat.compute.converter.pyiceberg.catalog import load_table
 from deltacat.compute.converter.utils.converter_session_utils import (
     group_all_files_to_each_bucket,
 )
+from deltacat.compute.converter.model.convert_result import ConvertResult
+from deltacat.compute.converter.utils.converter_session_utils import (
+    _get_snapshot_action_description,
+    _determine_snapshot_type,
+    SnapshotType,
+)
+
+from pyiceberg.manifest import DataFile
+from pyiceberg.table.metadata import TableMetadata

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


-def converter_session(params: ConverterSessionParams, **kwargs):
+def converter_session(params: ConverterSessionParams, **kwargs: Any) -> TableMetadata:
     """
-    Convert equality delete to position delete.
-    Compute and memory heavy work from downloading equality delete table and compute position deletes
-    will be executed on Ray remote tasks.
+    Convert equality deletes to position deletes with option to enforce primary key uniqueness.
+
+    This function processes Iceberg table files to convert equality delete files to position delete files.
+    It can optionally enforce primary key uniqueness by keeping only the latest version of each
+    primary key across all data files.
+
+    **Memory Requirements:**
+    - Minimum 512MB of free memory is required to run the converter
+
+    **Process Overview:**
+    1. Fetches all bucket files (data files, equality deletes, position deletes)
+    2. Groups files by bucket for parallel processing
+    3. Converts equality deletes to position deletes using Ray parallel tasks
+    4. Enforces primary key uniqueness if enabled
+    5. Commits appropriate snapshot (append, replace, or delete) to the Iceberg table
+
+
+    Args:
+        params: ConverterSessionParams containing all configuration parameters
+            - catalog: Iceberg catalog instance
+            - iceberg_table_name: Name of the target Iceberg table
+            - enforce_primary_key_uniqueness: Whether to enforce PK uniqueness
+            - iceberg_warehouse_bucket_name: S3 bucket for Iceberg warehouse
+            - iceberg_namespace: Iceberg namespace
+            - merge_keys: Optional list of merge key fields (uses table identifier fields if not provided)
+            - compact_previous_position_delete_files: Whether to compact existing position delete files
+            - task_max_parallelism: Maximum number of parallel Ray tasks
+            - s3_client_kwargs: Additional S3 client configuration
+            - s3_file_system: S3 file system instance
+            - location_provider_prefix_override: Optional prefix override for file locations
+            - position_delete_for_multiple_data_files: Whether to generate position deletes for multiple data files
+        **kwargs: Additional keyword arguments (currently unused)
+
+    Raises:
+        Exception: If snapshot commitment fails or other critical errors occur
+
     """

     catalog = params.catalog
     table_name = params.iceberg_table_name
-    iceberg_table = load_table(catalog, table_name)
+    if "." not in table_name:
+        iceberg_namespace = params.iceberg_namespace or DEFAULT_NAMESPACE
+        table_name = params.iceberg_table_name
+        table_identifier = f"{iceberg_namespace}.{table_name}"
+    else:
+        table_identifier = table_name
+        identifier_parts = table_identifier.split(".")
+        iceberg_namespace = identifier_parts[0]
+        table_name = identifier_parts[1]
+    iceberg_table = load_table(catalog, table_identifier)
     enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
     iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
-    iceberg_namespace = params.iceberg_namespace
     merge_keys = params.merge_keys
     compact_previous_position_delete_files = (
         params.compact_previous_position_delete_files
     )
     task_max_parallelism = params.task_max_parallelism
     s3_client_kwargs = params.s3_client_kwargs
-    s3_file_system = params.s3_file_system
+    s3_file_system = params.filesystem
     location_provider_prefix_override = params.location_provider_prefix_override
     position_delete_for_multiple_data_files = (
         params.position_delete_for_multiple_data_files
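
The new table-name handling above accepts either a bare table name or a namespace-qualified identifier. A standalone sketch of that branching (the DEFAULT_NAMESPACE value is assumed here for illustration; the real one comes from deltacat.constants):

from typing import Optional, Tuple

DEFAULT_NAMESPACE = "default"  # assumed value for illustration

def resolve_table_identifier(
    table_name: str, namespace: Optional[str] = None
) -> Tuple[str, str, str]:
    """Return (namespace, table_name, identifier), mirroring the branching above."""
    if "." not in table_name:
        iceberg_namespace = namespace or DEFAULT_NAMESPACE
        table_identifier = f"{iceberg_namespace}.{table_name}"
    else:
        table_identifier = table_name
        iceberg_namespace, table_name = table_identifier.split(".")[:2]
    return iceberg_namespace, table_name, table_identifier

print(resolve_table_identifier("orders"))            # ('default', 'orders', 'default.orders')
print(resolve_table_identifier("analytics.orders"))  # ('analytics', 'orders', 'analytics.orders')
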
@@ -86,7 +136,7 @@ def converter_session(params: ConverterSessionParams, **kwargs):
     else:
         identifier_fields = merge_keys

-    convert_options_provider = functools.partial(
+    convert_options_provider: Callable = functools.partial(
         task_resource_options_provider,
         resource_amount_provider=convert_resource_options_provider,
     )
@@ -98,7 +148,8 @@ def converter_session(params: ConverterSessionParams, **kwargs):
     # Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
     max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD

-    def convert_input_provider(index, item):
+    def convert_input_provider(index: int, item: Any) -> Dict[str, ConvertInput]:
+        task_opts = convert_options_provider(index, item)
         return {
             "convert_input": ConvertInput.of(
                 convert_input_files=item,
@@ -112,7 +163,8 @@ def converter_session(params: ConverterSessionParams, **kwargs):
                 position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
                 max_parallel_data_file_download=max_parallel_data_file_download,
                 s3_client_kwargs=s3_client_kwargs,
-                s3_file_system=s3_file_system,
+                filesystem=s3_file_system,
+                task_memory=task_opts["memory"],
             )
         }

@@ -127,10 +179,10 @@ def converter_session(params: ConverterSessionParams, **kwargs):
         kwargs_provider=convert_input_provider,
     )

-    to_be_deleted_files_list = []
+    to_be_deleted_files_list: List[List[DataFile]] = []
     logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")

-    convert_results = ray.get(convert_tasks_pending)
+    convert_results: List[ConvertResult] = ray.get(convert_tasks_pending)
     logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")

     total_position_delete_record_count = sum(
@@ -153,8 +205,36 @@ def converter_session(params: ConverterSessionParams, **kwargs):
         convert_result.position_delete_on_disk_sizes
         for convert_result in convert_results
     )
+    total_input_data_files_on_disk_size = sum(
+        convert_result.input_data_files_on_disk_size
+        for convert_result in convert_results
+    )
+
+    # Calculate memory usage statistics
+    max_peak_memory_usage = max(
+        convert_result.peak_memory_usage_bytes for convert_result in convert_results
+    )
+    avg_memory_usage_percentage = sum(
+        convert_result.memory_usage_percentage for convert_result in convert_results
+    ) / len(convert_results)
+    max_memory_usage_percentage = max(
+        convert_result.memory_usage_percentage for convert_result in convert_results
+    )
+
+    logger.info(
+        f"Aggregated stats for {table_identifier}: "
+        f"total position delete record count: {total_position_delete_record_count}, "
+        f"total input data file record count: {total_input_data_file_record_count}, "
+        f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
+        f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
+        f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}, "
+        f"total input data files on disk size: {total_input_data_files_on_disk_size}, "
+        f"max peak memory usage: {max_peak_memory_usage} bytes, "
+        f"average memory usage percentage: {avg_memory_usage_percentage:.2f}%, "
+        f"max memory usage percentage: {max_memory_usage_percentage:.2f}%"
+    )

-    to_be_added_files_list = []
+    to_be_added_files_list: List[DataFile] = []
     for convert_result in convert_results:
         to_be_added_files = convert_result.to_be_added_files
         to_be_deleted_files = convert_result.to_be_deleted_files
@@ -162,24 +242,57 @@ def converter_session(params: ConverterSessionParams, **kwargs):
         to_be_deleted_files_list.extend(to_be_deleted_files.values())
         to_be_added_files_list.extend(to_be_added_files)

-    if not to_be_deleted_files_list and to_be_added_files_list:
-        commit_append_snapshot(
-            iceberg_table=iceberg_table,
-            new_position_delete_files=to_be_added_files_list,
-        )
-    else:
-        commit_replace_snapshot(
-            iceberg_table=iceberg_table,
-            to_be_deleted_files_list=to_be_deleted_files_list,
-            new_position_delete_files=to_be_added_files_list,
+    logger.info(f"To be deleted files list length: {len(to_be_deleted_files_list)}")
+    logger.info(f"To be added files list length: {len(to_be_added_files_list)}")
+
+    # Determine snapshot type and commit
+    snapshot_type = _determine_snapshot_type(
+        to_be_deleted_files_list, to_be_added_files_list
+    )
+
+    if snapshot_type == SnapshotType.NONE:
+        logger.info(
+            _get_snapshot_action_description(
+                snapshot_type, to_be_deleted_files_list, to_be_added_files_list
+            )
         )
+        return
+
     logger.info(
-        f"Aggregated stats for {table_name}: "
-        f"total position delete record count: {total_position_delete_record_count}, "
-        f"total input data file record_count: {total_input_data_file_record_count}, "
-        f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
-        f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
-        f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}."
+        f"Snapshot action: {_get_snapshot_action_description(snapshot_type, to_be_deleted_files_list, to_be_added_files_list)}"
     )

-    logger.info(f"Committed new Iceberg snapshot.")
+    try:
+        if snapshot_type == SnapshotType.APPEND:
+            logger.info(f"Committing append snapshot for {table_identifier}.")
+            updated_table_metadata = commit_append_snapshot(
+                iceberg_table=iceberg_table,
+                new_position_delete_files=to_be_added_files_list,
+            )
+        elif snapshot_type == SnapshotType.REPLACE:
+            logger.info(f"Committing replace snapshot for {table_identifier}.")
+            updated_table_metadata = commit_replace_snapshot(
+                iceberg_table=iceberg_table,
+                to_be_deleted_files=to_be_deleted_files_list,
+                new_position_delete_files=to_be_added_files_list,
+            )
+        elif snapshot_type == SnapshotType.DELETE:
+            logger.info(f"Committing delete snapshot for {table_identifier}.")
+            updated_table_metadata = commit_replace_snapshot(
+                iceberg_table=iceberg_table,
+                to_be_deleted_files=to_be_deleted_files_list,
+                new_position_delete_files=[],  # No new files to add
+            )
+        else:
+            logger.warning(f"Unexpected snapshot type: {snapshot_type}")
+            return
+
+        logger.info(
+            f"Committed new Iceberg snapshot for {table_identifier}: {updated_table_metadata.current_snapshot_id}"
+        )
+
+        # Return the updated table metadata with the new snapshot
+        return updated_table_metadata
+    except Exception as e:
+        logger.error(f"Failed to commit snapshot for {table_identifier}: {str(e)}")
+        raise
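
The commit path now branches on a snapshot type derived from the to-be-deleted and to-be-added file lists. _determine_snapshot_type lives in converter_session_utils and is not shown in this diff; the sketch below is an assumed classification consistent with how converter_session branches on it, not the actual helper:

from enum import Enum
from typing import Any, List

class SnapshotType(Enum):
    APPEND = "append"    # only new position delete files to add
    REPLACE = "replace"  # files to delete and files to add
    DELETE = "delete"    # only files to delete
    NONE = "none"        # nothing to commit

def determine_snapshot_type(to_be_deleted: List[Any], to_be_added: List[Any]) -> SnapshotType:
    # Assumed logic: classify the commit by which lists are non-empty.
    if to_be_added and not to_be_deleted:
        return SnapshotType.APPEND
    if to_be_added and to_be_deleted:
        return SnapshotType.REPLACE
    if to_be_deleted:
        return SnapshotType.DELETE
    return SnapshotType.NONE

print(determine_snapshot_type([], ["pos-delete-1.parquet"]))  # SnapshotType.APPEND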