deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,22 @@
1
+ """
2
+ Common utility functions for main storage compaction tests.
3
+
4
+ These functions are shared between incremental and multiple rounds compaction tests.
5
+ """
1
6
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
7
  from __future__ import annotations
3
8
  from enum import Enum
4
- from typing import Any, Dict, List, Optional
9
+ from typing import Any, Dict, List, Optional, Tuple
5
10
  import datetime as dt
6
- from boto3.resources.base import ServiceResource
7
11
  from datetime import timezone
8
12
 
9
- from deltacat.tests.compute.test_util_constant import (
10
- TEST_S3_RCF_BUCKET_NAME,
11
- )
13
+ import tempfile
14
+ import os
15
+ import shutil
16
+
17
+ import pyarrow as pa
18
+
19
+
12
20
  from deltacat.tests.compute.test_util_constant import (
13
21
  BASE_TEST_SOURCE_NAMESPACE,
14
22
  BASE_TEST_SOURCE_TABLE_NAME,
@@ -26,11 +34,10 @@ from deltacat.compute.compactor import (
26
34
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
27
35
  CompactionSessionAuditInfo,
28
36
  )
29
-
30
37
  from deltacat.storage.model.partition import (
31
38
  PartitionLocator,
32
39
  PartitionScheme,
33
- PartitionKey as PartitionSchemeKey,
40
+ PartitionKey as StoragePartitionKey,
34
41
  )
35
42
  from deltacat.storage.model.stream import StreamLocator
36
43
  from deltacat.storage.model.table_version import TableVersionLocator
@@ -39,8 +46,22 @@ from deltacat.storage.model.namespace import NamespaceLocator
39
46
  from deltacat.storage.model.sort_key import (
40
47
  SortScheme,
41
48
  )
49
+ from deltacat.storage.model.delta import (
50
+ Delta,
51
+ DeltaType,
52
+ )
53
+ from deltacat.storage.model.partition import (
54
+ Partition,
55
+ PartitionKeyList,
56
+ )
57
+ from deltacat.storage.model.stream import Stream
58
+ from deltacat.storage.model.transform import IdentityTransform
59
+ from deltacat.storage.model.schema import Schema
42
60
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
43
61
 
62
+ from deltacat.storage import metastore
63
+ from deltacat.catalog.model.properties import CatalogProperties
64
+
44
65
 
45
66
  class PartitionKeyType(str, Enum):
46
67
  INT = "int"
@@ -80,113 +101,741 @@ def get_test_partition_locator(partition_id):
80
101
  return partition_locator
81
102
 
82
103
 
83
- def _create_table(
104
+ def create_main_deltacat_storage_kwargs() -> Dict[str, Any]:
105
+ """
106
+ Helper function to create main deltacat storage kwargs
107
+
108
+ Returns: kwargs to use for main deltacat storage, i.e. {"catalog": CatalogProperties(...)}
109
+ """
110
+ temp_dir = tempfile.mkdtemp()
111
+ catalog = CatalogProperties(root=temp_dir)
112
+ return {"catalog": catalog}
113
+
114
+
115
+ def clean_up_main_deltacat_storage_kwargs(storage_kwargs: Dict[str, Any]):
116
+ """
117
+ Cleans up directory created by create_main_deltacat_storage_kwargs
118
+ """
119
+ catalog = storage_kwargs["catalog"]
120
+ if hasattr(catalog, "root") and os.path.exists(catalog.root):
121
+ shutil.rmtree(catalog.root)
122
+
123
+
124
+ def _create_table_main(
84
125
  namespace: str,
85
126
  table_name: str,
86
127
  table_version: str,
87
128
  sort_keys: Optional[List[Any]],
88
129
  partition_keys: Optional[List[PartitionKey]],
130
+ input_deltas: Optional[pa.Table],
89
131
  ds_mock_kwargs: Optional[Dict[str, Any]],
90
132
  ):
91
- import deltacat.tests.local_deltacat_storage as ds
92
- from deltacat.types.media import ContentType
133
+ """
134
+ Main storage version of _create_table that works for both incremental and multiple rounds tests.
135
+
136
+ For incremental tests, input_deltas is provided to extract schema.
137
+ For multiple rounds tests, input_deltas can be None and we use a simpler approach.
138
+ """
139
+ # Create namespace first
140
+ metastore.create_namespace(namespace=namespace, **ds_mock_kwargs)
141
+
142
+ # Handle schema creation
143
+ if input_deltas is not None:
144
+ # Incremental test approach - extract schema from input deltas
145
+ schema = input_deltas.schema
146
+
147
+ # Add partition key fields to schema if they're not already present
148
+ if partition_keys:
149
+ for pk in partition_keys:
150
+ field_name = pk.key_name
151
+ if field_name not in schema.names:
152
+ # Add partition key field with appropriate type
153
+ if pk.key_type == PartitionKeyType.INT:
154
+ field_type = pa.int32()
155
+ elif pk.key_type == PartitionKeyType.STRING:
156
+ field_type = pa.string()
157
+ elif (
158
+ pk.key_type == PartitionKeyType.TIMESTAMP
159
+ ): # Handle timestamp type properly
160
+ field_type = pa.timestamp("us")
161
+ else:
162
+ field_type = pa.string() # Default to string
163
+
164
+ schema = schema.append(pa.field(field_name, field_type))
165
+
166
+ schema_obj = Schema.of(schema=schema)
167
+ else:
168
+ # Multiple rounds test approach - use None for schema (will be set later)
169
+ schema_obj = None
93
170
 
94
- ds.create_namespace(namespace, {}, **ds_mock_kwargs)
95
- partition_scheme = (
96
- PartitionScheme.of(
97
- [PartitionSchemeKey.of(key.key_name) for key in partition_keys]
98
- )
99
- if partition_keys
100
- else None
101
- )
102
171
  sort_scheme = SortScheme.of(sort_keys) if sort_keys else None
103
- ds.create_table_version(
104
- namespace,
105
- table_name,
106
- table_version,
107
- sort_keys=sort_scheme,
172
+
173
+ # Convert test partition keys to storage partition keys
174
+ storage_partition_keys = []
175
+ if partition_keys:
176
+ for pk in partition_keys:
177
+ storage_partition_key = StoragePartitionKey.of(
178
+ key=[pk.key_name],
179
+ name=pk.key_name,
180
+ transform=IdentityTransform.of(),
181
+ )
182
+ storage_partition_keys.append(storage_partition_key)
183
+
184
+ # Create partition scheme
185
+ partition_scheme = None
186
+ if storage_partition_keys:
187
+ partition_scheme = PartitionScheme.of(
188
+ keys=PartitionKeyList.of(storage_partition_keys),
189
+ scheme_id="default_partition_scheme",
190
+ )
191
+
192
+ # Create table version (which creates table and stream automatically)
193
+ metastore.create_table_version(
194
+ namespace=namespace,
195
+ table_name=table_name,
196
+ table_version=table_version,
197
+ schema=schema_obj,
108
198
  partition_scheme=partition_scheme,
109
- supported_content_types=[ContentType.PARQUET],
199
+ sort_keys=sort_scheme,
110
200
  **ds_mock_kwargs,
111
201
  )
202
+
112
203
  return namespace, table_name, table_version
113
204
 
114
205
 
115
- def create_src_table(
206
+ def create_src_table_main(
116
207
  sort_keys: Optional[List[Any]],
117
208
  partition_keys: Optional[List[PartitionKey]],
209
+ input_deltas: Optional[pa.Table],
118
210
  ds_mock_kwargs: Optional[Dict[str, Any]],
119
211
  ):
212
+ """
213
+ Main storage version of create_src_table
214
+ """
120
215
  source_namespace: str = BASE_TEST_SOURCE_NAMESPACE
121
216
  source_table_name: str = BASE_TEST_SOURCE_TABLE_NAME
122
217
  source_table_version: str = BASE_TEST_SOURCE_TABLE_VERSION
123
- return _create_table(
218
+ return _create_table_main(
124
219
  source_namespace,
125
220
  source_table_name,
126
221
  source_table_version,
127
222
  sort_keys,
128
223
  partition_keys,
224
+ input_deltas,
129
225
  ds_mock_kwargs,
130
226
  )
131
227
 
132
228
 
133
- def create_destination_table(
229
+ def create_destination_table_main(
134
230
  sort_keys: Optional[List[Any]],
135
231
  partition_keys: Optional[List[PartitionKey]],
232
+ input_deltas: Optional[pa.Table],
136
233
  ds_mock_kwargs: Optional[Dict[str, Any]],
137
234
  ):
235
+ """
236
+ Main storage version of create_destination_table
237
+ """
138
238
  destination_namespace: str = BASE_TEST_DESTINATION_NAMESPACE
139
239
  destination_table_name: str = BASE_TEST_DESTINATION_TABLE_NAME
140
240
  destination_table_version: str = BASE_TEST_DESTINATION_TABLE_VERSION
141
- return _create_table(
241
+ return _create_table_main(
142
242
  destination_namespace,
143
243
  destination_table_name,
144
244
  destination_table_version,
145
245
  sort_keys,
146
246
  partition_keys,
247
+ input_deltas,
147
248
  ds_mock_kwargs,
148
249
  )
149
250
 
150
251
 
151
- def create_rebase_table(
252
+ def create_rebase_table_main(
152
253
  sort_keys: Optional[List[Any]],
153
254
  partition_keys: Optional[List[PartitionKey]],
255
+ input_deltas: Optional[pa.Table],
154
256
  ds_mock_kwargs: Optional[Dict[str, Any]],
155
257
  ):
258
+ """
259
+ Main storage version of create_rebase_table
260
+ """
156
261
  rebasing_namespace = REBASING_NAMESPACE
157
262
  rebasing_table_name = REBASING_TABLE_NAME
158
263
  rebasing_table_version = REBASING_TABLE_VERSION
159
- return _create_table(
264
+ return _create_table_main(
160
265
  rebasing_namespace,
161
266
  rebasing_table_name,
162
267
  rebasing_table_version,
163
268
  sort_keys,
164
269
  partition_keys,
270
+ input_deltas,
165
271
  ds_mock_kwargs,
166
272
  )
167
273
 
168
274
 
169
- def get_rcf(s3_resource, rcf_file_s3_uri: str) -> RoundCompletionInfo:
170
- from deltacat.tests.test_utils.utils import read_s3_contents
275
+ def get_rci_from_partition(
276
+ partition_locator: PartitionLocator, deltacat_storage=None, **kwargs
277
+ ) -> RoundCompletionInfo:
278
+ """
279
+ Read RoundCompletionInfo from a partition metafile.
280
+
281
+ Args:
282
+ partition_locator: Locator of the partition containing the RoundCompletionInfo
283
+ deltacat_storage: Storage implementation (defaults to metastore)
284
+ **kwargs: Additional arguments to pass to deltacat_storage.get_partition (e.g., catalog)
285
+
286
+ Returns:
287
+ RoundCompletionInfo object from the partition, or None if not found
288
+ """
289
+ from deltacat.storage import metastore
290
+
291
+ if deltacat_storage is None:
292
+ deltacat_storage = metastore
293
+
294
+ partition = deltacat_storage.get_partition(
295
+ partition_locator.stream_locator, partition_locator.partition_values, **kwargs
296
+ )
297
+
298
+ if partition and partition.compaction_round_completion_info:
299
+ return partition.compaction_round_completion_info
300
+
301
+ return None
171
302
 
172
- _, rcf_object_key = rcf_file_s3_uri.strip("s3://").split("/", 1)
173
- rcf_file_output: Dict[str, Any] = read_s3_contents(
174
- s3_resource, TEST_S3_RCF_BUCKET_NAME, rcf_object_key
303
+
304
+ def _add_deltas_to_partition_main(
305
+ deltas_ingredients: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
306
+ partition: Optional[Partition],
307
+ ds_mock_kwargs: Optional[Dict[str, Any]],
308
+ ) -> Tuple[Optional[Delta], int]:
309
+ """
310
+ Add deltas to a partition using main storage
311
+ """
312
+ all_deltas_length = 0
313
+ incremental_delta = None
314
+ for (delta_data, delta_type, delete_parameters) in deltas_ingredients:
315
+ staged_delta: Delta = metastore.stage_delta(
316
+ delta_data,
317
+ partition,
318
+ delta_type,
319
+ entry_params=delete_parameters,
320
+ **ds_mock_kwargs,
321
+ )
322
+ incremental_delta = metastore.commit_delta(
323
+ staged_delta,
324
+ **ds_mock_kwargs,
325
+ )
326
+ all_deltas_length += len(delta_data) if delta_data else 0
327
+ return incremental_delta, all_deltas_length
328
+
329
+
330
+ def add_late_deltas_to_partition_main(
331
+ late_deltas: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
332
+ source_partition: Optional[Partition],
333
+ ds_mock_kwargs: Optional[Dict[str, Any]],
334
+ ) -> Tuple[Optional[Delta], int]:
335
+ """
336
+ Add late deltas to a partition using main storage
337
+ """
338
+ return _add_deltas_to_partition_main(late_deltas, source_partition, ds_mock_kwargs)
339
+
340
+
341
+ def multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
342
+ sort_keys: Optional[List[Any]],
343
+ partition_keys: Optional[List[PartitionKey]],
344
+ input_deltas: List[pa.Table],
345
+ partition_values: Optional[List[Any]],
346
+ ds_mock_kwargs: Optional[Dict[str, Any]],
347
+ ) -> Tuple[Stream, Stream, Optional[Stream], bool]:
348
+ """
349
+ Main storage version of multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy
350
+ """
351
+ # For multiple rounds, we need to extract the first delta to get schema
352
+ first_delta_table = input_deltas[0][0] if input_deltas else None
353
+ source_namespace, source_table_name, source_table_version = create_src_table_main(
354
+ sort_keys, partition_keys, first_delta_table, ds_mock_kwargs
175
355
  )
176
- return RoundCompletionInfo(**rcf_file_output)
177
356
 
357
+ source_table_stream: Stream = metastore.get_stream(
358
+ namespace=source_namespace,
359
+ table_name=source_table_name,
360
+ table_version=source_table_version,
361
+ **ds_mock_kwargs,
362
+ )
363
+
364
+ # Convert partition values to correct types
365
+ converted_partition_values = []
366
+ if partition_values and partition_keys:
367
+ for i, (value, pk) in enumerate(zip(partition_values, partition_keys)):
368
+ if pk.key_type == PartitionKeyType.INT:
369
+ converted_partition_values.append(int(value))
370
+ else:
371
+ converted_partition_values.append(value)
372
+ else:
373
+ converted_partition_values = partition_values
374
+
375
+ staged_partition: Partition = metastore.stage_partition(
376
+ source_table_stream,
377
+ converted_partition_values,
378
+ partition_scheme_id="default_partition_scheme" if partition_keys else None,
379
+ **ds_mock_kwargs,
380
+ )
381
+
382
+ is_delete = False
383
+ input_delta_length = 0
384
+ for (
385
+ input_delta,
386
+ input_delta_type,
387
+ input_delta_parameters,
388
+ ) in input_deltas:
389
+ if input_delta_type is DeltaType.DELETE:
390
+ is_delete = True
391
+ staged_delta = metastore.stage_delta(
392
+ input_delta,
393
+ staged_partition,
394
+ input_delta_type,
395
+ entry_params=input_delta_parameters,
396
+ **ds_mock_kwargs,
397
+ )
398
+ metastore.commit_delta(staged_delta, **ds_mock_kwargs)
399
+ input_delta_length += len(input_delta)
400
+ metastore.commit_partition(staged_partition, **ds_mock_kwargs)
401
+
402
+ (
403
+ destination_table_namespace,
404
+ destination_table_name,
405
+ destination_table_version,
406
+ ) = create_destination_table_main(
407
+ sort_keys, partition_keys, first_delta_table, ds_mock_kwargs
408
+ )
409
+ destination_table_stream: Stream = metastore.get_stream(
410
+ namespace=destination_table_namespace,
411
+ table_name=destination_table_name,
412
+ table_version=destination_table_version,
413
+ **ds_mock_kwargs,
414
+ )
415
+
416
+ # Always create rebase table for multiple rounds tests
417
+ (
418
+ rebasing_table_namespace,
419
+ rebasing_table_name,
420
+ rebasing_table_version,
421
+ ) = create_rebase_table_main(
422
+ sort_keys, partition_keys, first_delta_table, ds_mock_kwargs
423
+ )
424
+ rebasing_table_stream: Stream = metastore.get_stream(
425
+ namespace=rebasing_table_namespace,
426
+ table_name=rebasing_table_name,
427
+ table_version=rebasing_table_version,
428
+ **ds_mock_kwargs,
429
+ )
430
+
431
+ # Stage partition and add deltas to rebase table
432
+ rebased_staged_partition: Partition = metastore.stage_partition(
433
+ rebasing_table_stream,
434
+ converted_partition_values,
435
+ partition_scheme_id="default_partition_scheme" if partition_keys else None,
436
+ **ds_mock_kwargs,
437
+ )
438
+
439
+ for (
440
+ input_delta,
441
+ input_delta_type,
442
+ input_delta_parameters,
443
+ ) in input_deltas:
444
+ staged_delta = metastore.stage_delta(
445
+ input_delta,
446
+ rebased_staged_partition,
447
+ input_delta_type,
448
+ entry_params=input_delta_parameters,
449
+ **ds_mock_kwargs,
450
+ )
451
+ metastore.commit_delta(staged_delta, **ds_mock_kwargs)
452
+ metastore.commit_partition(rebased_staged_partition, **ds_mock_kwargs)
453
+
454
+ return (
455
+ source_table_stream,
456
+ destination_table_stream,
457
+ rebasing_table_stream,
458
+ is_delete,
459
+ )
460
+
461
+
462
+ def create_src_w_deltas_destination_plus_destination_main(
463
+ sort_keys: Optional[List[Any]],
464
+ partition_keys: Optional[List[PartitionKey]],
465
+ input_deltas: pa.Table,
466
+ input_delta_type: DeltaType,
467
+ partition_values: Optional[List[Any]],
468
+ ds_mock_kwargs: Optional[Dict[str, Any]],
469
+ simulate_is_inplace: bool = False,
470
+ ) -> Tuple[Stream, Stream, Optional[Stream], str, str, str]:
471
+ """
472
+ Create source with deltas and destination tables for incremental compaction testing
473
+ """
474
+ source_namespace, source_table_name, source_table_version = create_src_table_main(
475
+ sort_keys, partition_keys, input_deltas, ds_mock_kwargs
476
+ )
178
477
 
179
- def get_compacted_delta_locator_from_rcf(
180
- s3_resource: ServiceResource, rcf_file_s3_uri: str
478
+ source_table_stream: Stream = metastore.get_stream(
479
+ namespace=source_namespace,
480
+ table_name=source_table_name,
481
+ table_version=source_table_version,
482
+ **ds_mock_kwargs,
483
+ )
484
+
485
+ # Convert partition values to correct types
486
+ converted_partition_values = []
487
+ if partition_values and partition_keys:
488
+ for i, (value, pk) in enumerate(zip(partition_values, partition_keys)):
489
+ if pk.key_type == PartitionKeyType.INT:
490
+ converted_partition_values.append(int(value))
491
+ else:
492
+ converted_partition_values.append(value)
493
+ else:
494
+ converted_partition_values = partition_values
495
+
496
+ staged_partition: Partition = metastore.stage_partition(
497
+ source_table_stream,
498
+ converted_partition_values,
499
+ partition_scheme_id="default_partition_scheme" if partition_keys else None,
500
+ **ds_mock_kwargs,
501
+ )
502
+ metastore.commit_delta(
503
+ metastore.stage_delta(
504
+ input_deltas, staged_partition, input_delta_type, **ds_mock_kwargs
505
+ ),
506
+ **ds_mock_kwargs,
507
+ )
508
+ metastore.commit_partition(staged_partition, **ds_mock_kwargs)
509
+ source_table_stream_after_committed: Stream = metastore.get_stream(
510
+ namespace=source_namespace,
511
+ table_name=source_table_name,
512
+ table_version=source_table_version,
513
+ **ds_mock_kwargs,
514
+ )
515
+
516
+ destination_table_namespace: Optional[str] = None
517
+ destination_table_name: Optional[str] = None
518
+ destination_table_version: Optional[str] = None
519
+ if not simulate_is_inplace:
520
+ (
521
+ destination_table_namespace,
522
+ destination_table_name,
523
+ destination_table_version,
524
+ ) = create_destination_table_main(
525
+ sort_keys, partition_keys, input_deltas, ds_mock_kwargs
526
+ )
527
+ else:
528
+ destination_table_namespace = source_namespace
529
+ destination_table_name = source_table_name
530
+ destination_table_version = source_table_version
531
+
532
+ destination_table_stream: Stream = metastore.get_stream(
533
+ namespace=destination_table_namespace,
534
+ table_name=destination_table_name,
535
+ table_version=destination_table_version,
536
+ **ds_mock_kwargs,
537
+ )
538
+
539
+ return (
540
+ source_table_stream_after_committed,
541
+ destination_table_stream,
542
+ None,
543
+ source_namespace,
544
+ source_table_name,
545
+ source_table_version,
546
+ )
547
+
548
+
549
+ def create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
550
+ sort_keys: Optional[List[Any]],
551
+ partition_keys: Optional[List[PartitionKey]],
552
+ input_deltas: pa.Table,
553
+ input_delta_type: DeltaType,
554
+ partition_values: Optional[List[Any]],
555
+ ds_mock_kwargs: Optional[Dict[str, Any]],
556
+ ) -> Tuple[Stream, Stream, Optional[Stream]]:
557
+ """
558
+ Main storage version of create_src_w_deltas_destination_rebase_w_deltas_strategy
559
+
560
+ Creates source table with deltas, destination table, and rebase table for rebase testing.
561
+ This test scenario sets up different source and rebase partition locators to simulate
562
+ scenarios like hash bucket count changes.
563
+ """
564
+ from deltacat.utils.common import current_time_ms
565
+
566
+ last_stream_position = current_time_ms()
567
+ source_namespace, source_table_name, source_table_version = create_src_table_main(
568
+ sort_keys, partition_keys, input_deltas, ds_mock_kwargs
569
+ )
570
+
571
+ source_table_stream: Stream = metastore.get_stream(
572
+ namespace=source_namespace,
573
+ table_name=source_table_name,
574
+ table_version=source_table_version,
575
+ **ds_mock_kwargs,
576
+ )
577
+
578
+ # Convert partition values to correct types, including timestamp handling
579
+ converted_partition_values = []
580
+ if partition_values and partition_keys:
581
+ for i, (value, pk) in enumerate(zip(partition_values, partition_keys)):
582
+ if pk.key_type == PartitionKeyType.INT:
583
+ converted_partition_values.append(int(value))
584
+ elif pk.key_type == PartitionKeyType.TIMESTAMP:
585
+ # Handle timestamp partition values
586
+ if isinstance(value, str) and "T" in value and value.endswith("Z"):
587
+ import pandas as pd
588
+
589
+ ts = pd.to_datetime(value)
590
+ # Convert to microseconds since epoch for PyArrow timestamp[us]
591
+ converted_partition_values.append(int(ts.timestamp() * 1_000_000))
592
+ else:
593
+ converted_partition_values.append(value)
594
+ else:
595
+ converted_partition_values.append(value)
596
+ else:
597
+ converted_partition_values = partition_values
598
+
599
+ staged_partition: Partition = metastore.stage_partition(
600
+ source_table_stream,
601
+ converted_partition_values,
602
+ partition_scheme_id="default_partition_scheme" if partition_keys else None,
603
+ **ds_mock_kwargs,
604
+ )
605
+ staged_delta: Delta = metastore.stage_delta(
606
+ input_deltas, staged_partition, input_delta_type, **ds_mock_kwargs
607
+ )
608
+ staged_delta.locator.stream_position = last_stream_position
609
+ metastore.commit_delta(staged_delta, **ds_mock_kwargs)
610
+ metastore.commit_partition(staged_partition, **ds_mock_kwargs)
611
+
612
+ source_table_stream_after_committed: Stream = metastore.get_stream(
613
+ namespace=source_namespace,
614
+ table_name=source_table_name,
615
+ table_version=source_table_version,
616
+ **ds_mock_kwargs,
617
+ )
618
+
619
+ # Create the destination table
620
+ (
621
+ destination_table_namespace,
622
+ destination_table_name,
623
+ destination_table_version,
624
+ ) = create_destination_table_main(
625
+ sort_keys, partition_keys, input_deltas, ds_mock_kwargs
626
+ )
627
+
628
+ # Create the rebase table
629
+ (
630
+ rebase_table_namespace,
631
+ rebase_table_name,
632
+ rebase_table_version,
633
+ ) = create_rebase_table_main(
634
+ sort_keys, partition_keys, input_deltas, ds_mock_kwargs
635
+ )
636
+
637
+ rebasing_table_stream: Stream = metastore.get_stream(
638
+ namespace=rebase_table_namespace,
639
+ table_name=rebase_table_name,
640
+ table_version=rebase_table_version,
641
+ **ds_mock_kwargs,
642
+ )
643
+
644
+ staged_partition: Partition = metastore.stage_partition(
645
+ rebasing_table_stream,
646
+ converted_partition_values,
647
+ partition_scheme_id="default_partition_scheme" if partition_keys else None,
648
+ **ds_mock_kwargs,
649
+ )
650
+ staged_delta: Delta = metastore.stage_delta(
651
+ input_deltas, staged_partition, **ds_mock_kwargs
652
+ )
653
+ staged_delta.locator.stream_position = last_stream_position
654
+ metastore.commit_delta(staged_delta, **ds_mock_kwargs)
655
+ metastore.commit_partition(staged_partition, **ds_mock_kwargs)
656
+
657
+ # Get destination stream
658
+ destination_table_stream: Stream = metastore.get_stream(
659
+ namespace=destination_table_namespace,
660
+ table_name=destination_table_name,
661
+ table_version=destination_table_version,
662
+ **ds_mock_kwargs,
663
+ )
664
+
665
+ rebased_stream_after_committed: Stream = metastore.get_stream(
666
+ namespace=rebase_table_namespace,
667
+ table_name=rebase_table_name,
668
+ table_version=rebase_table_version,
669
+ **ds_mock_kwargs,
670
+ )
671
+
672
+ return (
673
+ source_table_stream_after_committed,
674
+ destination_table_stream,
675
+ rebased_stream_after_committed,
676
+ )
677
+
678
+
679
+ def create_incremental_deltas_on_source_table_main(
680
+ source_namespace: str,
681
+ source_table_name: str,
682
+ source_table_version: str,
683
+ source_table_stream: Stream,
684
+ partition_values_param,
685
+ incremental_deltas: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
686
+ ds_mock_kwargs: Optional[Dict[str, Any]] = None,
687
+ ) -> Tuple[PartitionLocator, Delta, int, bool]:
688
+ """
689
+ Main storage version of create_incremental_deltas_on_source_table
690
+ """
691
+ total_records = 0
692
+ has_delete_deltas = False
693
+ new_delta = None
694
+
695
+ # Convert partition values for partition lookup (same as in other helper functions)
696
+ converted_partition_values_for_lookup = partition_values_param
697
+ if (
698
+ partition_values_param
699
+ and source_table_stream.partition_scheme
700
+ and source_table_stream.partition_scheme.keys
701
+ ):
702
+ converted_partition_values_for_lookup = []
703
+
704
+ # Get partition field names from the storage partition scheme
705
+ storage_partition_keys = source_table_stream.partition_scheme.keys
706
+ partition_field_names = []
707
+
708
+ for storage_key in storage_partition_keys:
709
+ # Each storage PartitionKey has a 'key' property that contains FieldLocators
710
+ # Extract the field name from the first FieldLocator
711
+ field_name = storage_key.key[0] if storage_key.key else None
712
+ partition_field_names.append(field_name)
713
+
714
+ for i, value in enumerate(partition_values_param):
715
+ # For timestamp fields like 'region_id', we need to convert the timestamp string
716
+ if i < len(partition_field_names):
717
+ field_name = partition_field_names[i]
718
+
719
+ # Check if this is likely a timestamp field based on the value format
720
+ if isinstance(value, str) and "T" in value and value.endswith("Z"):
721
+ # This looks like a timestamp string - convert it
722
+ import pandas as pd
723
+
724
+ ts = pd.to_datetime(value)
725
+ # Convert to microseconds since epoch for PyArrow timestamp[us]
726
+ converted_partition_values_for_lookup.append(
727
+ int(ts.timestamp() * 1_000_000)
728
+ )
729
+ elif isinstance(value, str) and value.isdigit():
730
+ # This looks like an integer string
731
+ converted_partition_values_for_lookup.append(int(value))
732
+ else:
733
+ # Keep as-is
734
+ converted_partition_values_for_lookup.append(value)
735
+ else:
736
+ converted_partition_values_for_lookup.append(value)
737
+
738
+ # Get the current partition to stage deltas against
739
+ try:
740
+ source_partition: Partition = metastore.get_partition(
741
+ source_table_stream.locator,
742
+ converted_partition_values_for_lookup,
743
+ **ds_mock_kwargs,
744
+ )
745
+ except Exception:
746
+ # If we can't get the partition, it might not exist yet. Try to create it.
747
+ # Stage a new partition if it doesn't exist
748
+ staged_partition: Partition = metastore.stage_partition(
749
+ source_table_stream,
750
+ converted_partition_values_for_lookup,
751
+ partition_scheme_id="default_partition_scheme"
752
+ if source_table_stream.partition_scheme
753
+ else None,
754
+ **ds_mock_kwargs,
755
+ )
756
+ # Commit the empty partition first
757
+ metastore.commit_partition(staged_partition, **ds_mock_kwargs)
758
+
759
+ # Now try to get it again
760
+ source_partition: Partition = metastore.get_partition(
761
+ source_table_stream.locator,
762
+ converted_partition_values_for_lookup,
763
+ **ds_mock_kwargs,
764
+ )
765
+
766
+ if source_partition is None:
767
+ raise ValueError(
768
+ f"Could not create or retrieve partition for values: {converted_partition_values_for_lookup}"
769
+ )
770
+
771
+ for delta_table, delta_type, properties_dict in incremental_deltas:
772
+ # Skip None deltas (empty incremental deltas)
773
+ if delta_table is None:
774
+ continue
775
+
776
+ total_records += len(delta_table)
777
+
778
+ if delta_type == DeltaType.DELETE:
779
+ has_delete_deltas = True
780
+
781
+ # Stage and commit the delta
782
+ staged_delta: Delta = metastore.stage_delta(
783
+ delta_table,
784
+ source_partition,
785
+ delta_type,
786
+ entry_params=properties_dict,
787
+ **ds_mock_kwargs,
788
+ )
789
+ new_delta = metastore.commit_delta(staged_delta, **ds_mock_kwargs)
790
+
791
+ # If all deltas were None, return None for new_delta
792
+ if new_delta is None:
793
+ return None, None, total_records, has_delete_deltas
794
+
795
+ # Get updated stream after deltas were committed
796
+ source_table_stream_after_committed: Stream = metastore.get_stream(
797
+ source_namespace,
798
+ source_table_name,
799
+ source_table_version,
800
+ **ds_mock_kwargs,
801
+ )
802
+
803
+ # Get updated partition after deltas were committed
804
+ source_partition_after_committed: Partition = metastore.get_partition(
805
+ source_table_stream_after_committed.locator,
806
+ converted_partition_values_for_lookup,
807
+ **ds_mock_kwargs,
808
+ )
809
+
810
+ return (
811
+ source_partition_after_committed.locator,
812
+ new_delta,
813
+ total_records,
814
+ has_delete_deltas,
815
+ )
816
+
817
+
818
+ def get_compacted_delta_locator_from_partition(
819
+ partition_locator: PartitionLocator, deltacat_storage=None, **kwargs
181
820
  ):
182
- from deltacat.storage import DeltaLocator
821
+ """
822
+ Get compacted delta locator from partition RoundCompletionInfo.
183
823
 
184
- round_completion_info: RoundCompletionInfo = get_rcf(s3_resource, rcf_file_s3_uri)
824
+ Args:
825
+ partition_locator: Locator of the partition containing the RoundCompletionInfo
826
+ deltacat_storage: Storage implementation (defaults to metastore)
827
+ **kwargs: Additional arguments to pass to get_rci_from_partition (e.g., catalog)
185
828
 
186
- compacted_delta_locator: DeltaLocator = (
187
- round_completion_info.compacted_delta_locator
829
+ Returns:
830
+ DeltaLocator of the compacted delta
831
+ """
832
+ round_completion_info: RoundCompletionInfo = get_rci_from_partition(
833
+ partition_locator, deltacat_storage, **kwargs
188
834
  )
189
- return compacted_delta_locator
835
+
836
+ if round_completion_info:
837
+ return round_completion_info.compacted_delta_locator
838
+ return None
190
839
 
191
840
 
192
841
  def offer_iso8601_timestamp_list(
@@ -325,3 +974,27 @@ def assert_compaction_audit_no_hash_bucket(
325
974
  for entry in audit_entries:
326
975
  assert entry is not None
327
976
  return True
977
+
978
+
979
+ def read_audit_file(audit_file_path: str, catalog_root: str) -> Dict[str, Any]:
980
+ """
981
+ Read audit file from any filesystem.
982
+
983
+ Args:
984
+ audit_file_path: Relative path to the audit file from catalog root
985
+ catalog_root: Absolute path to the catalog root directory
986
+
987
+ Returns:
988
+ Dictionary containing audit data
989
+ """
990
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
991
+ import json
992
+ import posixpath
993
+
994
+ # Resolve absolute path from relative audit path
995
+ absolute_path = posixpath.join(catalog_root, audit_file_path)
996
+
997
+ path, filesystem = resolve_path_and_filesystem(absolute_path)
998
+ with filesystem.open_input_stream(path) as stream:
999
+ content = stream.read().decode("utf-8")
1000
+ return json.loads(content)