deltacat-2.0.0b11-py3-none-any.whl → deltacat-2.0.0.post1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
@@ -2,17 +2,19 @@ from __future__ import annotations
2
2
  import importlib
3
3
  import copy
4
4
  import json
5
- from typing import Any, Dict, List, Optional
5
+ import posixpath
6
+ from typing import Any, Dict, List, Optional, Set
6
7
  from deltacat.io.object_store import IObjectStore
7
8
  from deltacat.utils.common import ReadKwargsProvider
8
9
  from deltacat.types.media import ContentType
9
10
  from deltacat.utils.placement import PlacementGroupConfig
10
11
  from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
11
12
  from deltacat.storage import (
12
- interface as unimplemented_deltacat_storage,
13
+ metastore,
13
14
  PartitionLocator,
14
15
  SortKey,
15
16
  )
17
+ from deltacat.catalog.model.properties import CatalogProperties
16
18
  from deltacat.compute.resource_estimation import (
17
19
  ResourceEstimationMethod,
18
20
  EstimateResourcesParams,
@@ -52,11 +54,22 @@ class CompactPartitionParams(dict):
52
54
  assert (
53
55
  params.get("source_partition_locator") is not None
54
56
  ), "source_partition_locator is a required arg"
57
+ assert params.get("catalog") is not None, "catalog is a required arg"
55
58
  assert (
56
- params.get("compaction_artifact_s3_bucket") is not None
57
- ), "compaction_artifact_s3_bucket is a required arg"
59
+ params.get("all_column_names") is not None
60
+ ), "all_column_names is a required arg"
58
61
 
59
62
  result = CompactPartitionParams(params)
63
+ assert (
64
+ result.destination_partition_locator.partition_id
65
+ ), "destination_partition_locator must have a globally unique partition_id"
66
+ assert (
67
+ result.source_partition_locator.partition_id
68
+ ), "source_partition_locator must have a globally unique partition_id"
69
+ if result.rebase_source_partition_locator:
70
+ assert (
71
+ result.rebase_source_partition_locator.partition_id
72
+ ), "rebase_source_partition_locator must have a globally unique partition_id"
60
73
 
61
74
  result.records_per_compacted_file = params.get(
62
75
  "records_per_compacted_file", MAX_RECORDS_PER_COMPACTED_FILE
@@ -65,15 +78,18 @@ class CompactPartitionParams(dict):
65
78
  "compacted_file_content_type", ContentType.PARQUET
66
79
  )
67
80
  result.object_store = params.get("object_store", RayPlasmaObjectStore())
81
+ result.table_writer_kwargs = params.get("table_writer_kwargs", {})
68
82
 
69
83
  result.enable_profiler = params.get("enable_profiler", False)
70
- result.deltacat_storage = params.get(
71
- "deltacat_storage", unimplemented_deltacat_storage
72
- )
73
- result.s3_client_kwargs = params.get("s3_client_kwargs", {})
84
+ result.deltacat_storage = params.get("deltacat_storage", metastore)
85
+ result.catalog = params.get("catalog")
74
86
  result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
75
87
  result.list_deltas_kwargs = params.get("list_deltas_kwargs", {})
76
- result.s3_table_writer_kwargs = params.get("s3_table_writer_kwargs", {})
88
+ result.all_column_names = params.get("all_column_names")
89
+
90
+ # Add catalog to deltacat_storage_kwargs
91
+ result.deltacat_storage_kwargs["catalog"] = result.catalog
92
+
77
93
  result.bit_width_of_sort_keys = validate_sort_keys(
78
94
  result.source_partition_locator,
79
95
  result.sort_keys,
@@ -133,6 +149,8 @@ class CompactPartitionParams(dict):
133
149
  if result.primary_keys:
134
150
  result.primary_keys = sorted(result.primary_keys)
135
151
 
152
+ result.original_fields = params.get("original_fields")
153
+
136
154
  # assertions
137
155
  assert (
138
156
  result.source_partition_locator.partition_values
@@ -177,21 +195,32 @@ class CompactPartitionParams(dict):
177
195
  self["source_partition_locator"] = locator
178
196
 
179
197
  @property
180
- def compaction_artifact_s3_bucket(self) -> str:
181
- return self["compaction_artifact_s3_bucket"]
182
-
183
- @compaction_artifact_s3_bucket.setter
184
- def compaction_artifact_s3_bucket(self, s3_bucket: str) -> None:
185
- self["compaction_artifact_s3_bucket"] = s3_bucket
198
+ def compaction_artifact_path(self) -> str:
199
+ """
200
+ Returns the compaction artifact path based on catalog root.
201
+ """
202
+ return posixpath.join(self.catalog.root, "compute", "compactor")
186
203
 
187
204
  @property
188
- def deltacat_storage(self) -> unimplemented_deltacat_storage:
205
+ def deltacat_storage(self) -> metastore:
189
206
  return self["deltacat_storage"]
190
207
 
191
208
  @deltacat_storage.setter
192
- def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
209
+ def deltacat_storage(self, storage: metastore) -> None:
193
210
  self["deltacat_storage"] = storage
194
211
 
212
+ @property
213
+ def catalog(self) -> CatalogProperties:
214
+ return self["catalog"]
215
+
216
+ @catalog.setter
217
+ def catalog(self, catalog: CatalogProperties) -> None:
218
+ self["catalog"] = catalog
219
+ # Update deltacat_storage_kwargs when catalog is set
220
+ if "deltacat_storage_kwargs" not in self:
221
+ self["deltacat_storage_kwargs"] = {}
222
+ self["deltacat_storage_kwargs"]["catalog"] = catalog
223
+
195
224
  @property
196
225
  def object_store(self) -> IObjectStore:
197
226
  return self["object_store"]
@@ -286,14 +315,6 @@ class CompactPartitionParams(dict):
286
315
  def list_deltas_kwargs(self, kwargs: dict) -> None:
287
316
  self["list_deltas_kwargs"] = kwargs
288
317
 
289
- @property
290
- def s3_table_writer_kwargs(self) -> dict:
291
- return self["s3_table_writer_kwargs"]
292
-
293
- @s3_table_writer_kwargs.setter
294
- def s3_table_writer_kwargs(self, kwargs: dict) -> None:
295
- self["s3_table_writer_kwargs"] = kwargs
296
-
297
318
  @property
298
319
  def deltacat_storage_kwargs(self) -> dict:
299
320
  return self["deltacat_storage_kwargs"]
@@ -303,12 +324,12 @@ class CompactPartitionParams(dict):
303
324
  self["deltacat_storage_kwargs"] = kwargs
304
325
 
305
326
  @property
306
- def s3_client_kwargs(self) -> dict:
307
- return self["s3_client_kwargs"]
327
+ def all_column_names(self) -> List[str]:
328
+ return self.get("all_column_names")
308
329
 
309
- @s3_client_kwargs.setter
310
- def s3_client_kwargs(self, kwargs: dict) -> None:
311
- self["s3_client_kwargs"] = kwargs
330
+ @all_column_names.setter
331
+ def all_column_names(self, column_names: List[str]) -> None:
332
+ self["all_column_names"] = column_names
312
333
 
313
334
  @property
314
335
  def records_per_compacted_file(self) -> int:
@@ -489,6 +510,30 @@ class CompactPartitionParams(dict):
489
510
  average_record_size_bytes=self.average_record_size_bytes,
490
511
  )
491
512
 
513
+ @property
514
+ def table_writer_kwargs(self) -> dict:
515
+ return self["table_writer_kwargs"]
516
+
517
+ @table_writer_kwargs.setter
518
+ def table_writer_kwargs(self, kwargs: dict) -> None:
519
+ self["table_writer_kwargs"] = kwargs
520
+
521
+ @property
522
+ def expected_previous_partition_id(self) -> Optional[str]:
523
+ return self.get("expected_previous_partition_id")
524
+
525
+ @expected_previous_partition_id.setter
526
+ def expected_previous_partition_id(self, partition_id: Optional[str]) -> None:
527
+ self["expected_previous_partition_id"] = partition_id
528
+
529
+ @property
530
+ def original_fields(self) -> Optional[Set[str]]:
531
+ return self.get("original_fields")
532
+
533
+ @original_fields.setter
534
+ def original_fields(self, fields: Optional[Set[str]]) -> None:
535
+ self["original_fields"] = fields
536
+
492
537
  @staticmethod
493
538
  def json_handler_for_compact_partition_params(obj):
494
539
  """
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  from typing import Optional
4
4
  import pyarrow as pa
5
5
  import logging
6
+ from pathlib import PosixPath
6
7
  from deltacat import logs
7
8
  from typing import List, Union
8
9
  from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
@@ -919,3 +920,19 @@ class CompactionSessionAuditInfo(dict):
919
920
  )
920
921
 
921
922
  self.set_pyarrow_version(pa.__version__)
923
+
924
+ def to_serializable(self, catalog_root: str) -> CompactionSessionAuditInfo:
925
+ root_path = PosixPath(catalog_root)
926
+ target_path = PosixPath(self.audit_url)
927
+ if root_path == target_path:
928
+ raise ValueError(
929
+ "Target and root are identical, but expected target to be a child of root."
930
+ )
931
+ try:
932
+ relative_path = target_path.relative_to(root_path)
933
+ # Create a copy of the audit info with the relative path
934
+ audit_copy = CompactionSessionAuditInfo(**dict(self))
935
+ audit_copy["auditUrl"] = str(relative_path)
936
+ return audit_copy
937
+ except ValueError:
938
+ raise ValueError("Expected target to be a child of root.")
@@ -1,7 +1,7 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from typing import List, Tuple, Union
4
+ from typing import Tuple, Union
5
5
  from deltacat.storage import DeltaLocator, PartitionLocator
6
6
  from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
7
7
  from typing import Any, Dict, Optional
@@ -10,7 +10,7 @@ from typing import Any, Dict, Optional
10
10
  class HighWatermark(dict):
11
11
  """
12
12
  Inherit from dict to make it easy for serialization/deserialization.
13
- Keep both partition locator and high watermark as a tuple to be persisted in the rcf
13
+ Keep both partition locator and high watermark as a tuple to be persisted in the rci
14
14
  """
15
15
 
16
16
  def set(self, partition_locator: PartitionLocator, delta_stream_position: int):
@@ -46,6 +46,7 @@ class RoundCompletionInfo(dict):
46
46
  compactor_version: Optional[str] = None,
47
47
  input_inflation: Optional[float] = None,
48
48
  input_average_record_size_bytes: Optional[float] = None,
49
+ prev_source_partition_locator: Optional[PartitionLocator] = None,
49
50
  ) -> RoundCompletionInfo:
50
51
 
51
52
  rci = RoundCompletionInfo()
@@ -63,6 +64,7 @@ class RoundCompletionInfo(dict):
63
64
  rci["compactorVersion"] = compactor_version
64
65
  rci["inputInflation"] = input_inflation
65
66
  rci["inputAverageRecordSizeBytes"] = input_average_record_size_bytes
67
+ rci["prevSourcePartitionLocator"] = prev_source_partition_locator
66
68
  return rci
67
69
 
68
70
  @property
@@ -100,7 +102,11 @@ class RoundCompletionInfo(dict):
100
102
 
101
103
  @property
102
104
  def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
103
- return self.get("rebaseSourcePartitionLocator")
105
+ val = self.get("rebaseSourcePartitionLocator")
106
+ if val is not None and not isinstance(val, PartitionLocator):
107
+ val = PartitionLocator(val)
108
+ self["rebaseSourcePartitionLocator"] = val # Cache the converted value
109
+ return val
104
110
 
105
111
  @property
106
112
  def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
@@ -129,6 +135,10 @@ class RoundCompletionInfo(dict):
129
135
  def input_average_record_size_bytes(self) -> Optional[float]:
130
136
  return self.get("inputAverageRecordSizeBytes")
131
137
 
132
- @staticmethod
133
- def get_audit_bucket_name_and_key(compaction_audit_url: str) -> List[str]:
134
- return compaction_audit_url.replace("s3://", "").split("/", 1)
138
+ @property
139
+ def prev_source_partition_locator(self) -> Optional[PartitionLocator]:
140
+ val = self.get("prevSourcePartitionLocator")
141
+ if val is not None and not isinstance(val, PartitionLocator):
142
+ val = PartitionLocator(val)
143
+ self["prevSourcePartitionLocator"] = val # Cache the converted value
144
+ return val
@@ -21,14 +21,13 @@ from deltacat.utils.placement import PlacementGroupConfig
21
21
  from typing import List, Optional, Dict, Any
22
22
  from deltacat.utils.ray_utils.runtime import live_node_resource_keys
23
23
  from deltacat.compute.compactor.utils import io
24
- from deltacat.compute.compactor.utils import round_completion_file as rcf
25
24
  from deltacat.compute.compactor.steps import repartition as repar
26
25
  from deltacat.compute.compactor.steps.repartition import RepartitionType
27
26
  from deltacat.storage import (
28
27
  Delta,
29
28
  DeltaLocator,
30
29
  PartitionLocator,
31
- interface as unimplemented_deltacat_storage,
30
+ metastore,
32
31
  )
33
32
  from deltacat.utils.metrics import MetricsConfig
34
33
  from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -41,7 +40,6 @@ def repartition(
41
40
  source_partition_locator: PartitionLocator,
42
41
  destination_partition_locator: PartitionLocator,
43
42
  repartition_args: Any,
44
- repartition_completion_file_s3_url: str,
45
43
  last_stream_position_to_compact: int,
46
44
  repartition_type: RepartitionType = RepartitionType.RANGE,
47
45
  sort_keys: List[SortKey] = None,
@@ -54,9 +52,8 @@ def repartition(
54
52
  pg_config: Optional[PlacementGroupConfig] = None,
55
53
  list_deltas_kwargs: Optional[Dict[str, Any]] = None,
56
54
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
57
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
58
- s3_client_kwargs: Optional[Dict[str, Any]] = None,
59
- deltacat_storage=unimplemented_deltacat_storage,
55
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
56
+ deltacat_storage=metastore,
60
57
  **kwargs,
61
58
  ) -> Optional[str]:
62
59
 
@@ -132,7 +129,7 @@ def repartition(
132
129
  enable_profiler=enable_profiler,
133
130
  metrics_config=metrics_config,
134
131
  read_kwargs_provider=read_kwargs_provider,
135
- s3_table_writer_kwargs=s3_table_writer_kwargs,
132
+ table_writer_kwargs=table_writer_kwargs,
136
133
  repartitioned_file_content_type=repartitioned_file_content_type,
137
134
  deltacat_storage=deltacat_storage,
138
135
  )
@@ -153,9 +150,6 @@ def repartition(
153
150
  compacted_delta = deltacat_storage.commit_delta(
154
151
  merged_delta, properties=kwargs.get("properties", {})
155
152
  )
156
- deltacat_storage.commit_partition(partition)
157
- logger.info(f"Committed final delta: {compacted_delta}")
158
- logger.info(f"Job run completed successfully!")
159
153
  new_compacted_delta_locator = DeltaLocator.of(
160
154
  new_compacted_partition_locator,
161
155
  compacted_delta.stream_position,
@@ -173,14 +167,7 @@ def repartition(
173
167
  bit_width_of_sort_keys,
174
168
  None,
175
169
  )
176
- if s3_client_kwargs is None:
177
- s3_client_kwargs = {}
178
-
179
- return rcf.write_round_completion_file(
180
- None,
181
- None,
182
- None,
183
- repartition_completion_info,
184
- repartition_completion_file_s3_url,
185
- **s3_client_kwargs,
186
- )
170
+ partition.compaction_round_completion_info = repartition_completion_info
171
+ deltacat_storage.commit_partition(partition)
172
+ logger.info(f"Committed final delta: {compacted_delta}")
173
+ logger.info(f"Job run completed successfully!")
@@ -21,7 +21,7 @@ from deltacat.compute.compactor.utils.primary_key_index import (
21
21
  group_hash_bucket_indices,
22
22
  group_record_indices_by_hash_bucket,
23
23
  )
24
- from deltacat.storage import interface as unimplemented_deltacat_storage
24
+ from deltacat.storage import metastore
25
25
  from deltacat.types.media import StorageType
26
26
  from deltacat.utils.common import sha1_digest
27
27
  from deltacat.utils.ray_utils.runtime import (
@@ -90,7 +90,7 @@ def _group_file_records_by_pk_hash_bucket(
90
90
  sort_key_names: List[str],
91
91
  is_src_delta: np.bool_ = True,
92
92
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
93
- deltacat_storage=unimplemented_deltacat_storage,
93
+ deltacat_storage=metastore,
94
94
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
95
95
  **kwargs,
96
96
  ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int]:
@@ -139,7 +139,7 @@ def _read_delta_file_envelopes(
139
139
  primary_keys: List[str],
140
140
  sort_key_names: List[str],
141
141
  read_kwargs_provider: Optional[ReadKwargsProvider],
142
- deltacat_storage=unimplemented_deltacat_storage,
142
+ deltacat_storage=metastore,
143
143
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
144
144
  **kwargs,
145
145
  ) -> Tuple[Optional[List[DeltaFileEnvelope]], int]:
@@ -190,7 +190,7 @@ def _timed_hash_bucket(
190
190
  enable_profiler: bool,
191
191
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
192
192
  object_store: Optional[IObjectStore] = None,
193
- deltacat_storage=unimplemented_deltacat_storage,
193
+ deltacat_storage=metastore,
194
194
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
195
195
  **kwargs,
196
196
  ):
@@ -249,7 +249,7 @@ def hash_bucket(
249
249
  metrics_config: MetricsConfig,
250
250
  read_kwargs_provider: Optional[ReadKwargsProvider],
251
251
  object_store: Optional[IObjectStore],
252
- deltacat_storage=unimplemented_deltacat_storage,
252
+ deltacat_storage=metastore,
253
253
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
254
254
  **kwargs,
255
255
  ) -> HashBucketResult:
@@ -29,7 +29,7 @@ from deltacat.storage import (
29
29
  ManifestEntryList,
30
30
  )
31
31
  from deltacat.storage.model.manifest import Manifest
32
- from deltacat.storage import interface as unimplemented_deltacat_storage
32
+
33
33
  from deltacat.utils.common import ReadKwargsProvider
34
34
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
35
35
  from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
@@ -46,6 +46,7 @@ from deltacat.utils.ray_utils.runtime import (
46
46
  )
47
47
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
48
48
  from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
49
+ from deltacat.storage import metastore
49
50
 
50
51
  if importlib.util.find_spec("memray"):
51
52
  import memray
@@ -67,9 +68,9 @@ def materialize(
67
68
  metrics_config: MetricsConfig,
68
69
  schema: Optional[pa.Schema] = None,
69
70
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
70
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
71
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
71
72
  object_store: Optional[IObjectStore] = None,
72
- deltacat_storage=unimplemented_deltacat_storage,
73
+ deltacat_storage=metastore,
73
74
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
74
75
  ):
75
76
  if deltacat_storage_kwargs is None:
@@ -78,11 +79,11 @@ def materialize(
78
79
  def _stage_delta_from_manifest_entry_reference_list(
79
80
  manifest_entry_list_reference: List[ManifestEntry],
80
81
  partition: Partition,
81
- delta_type: DeltaType = DeltaType.UPSERT,
82
+ delta_type: DeltaType = DeltaType.APPEND,
82
83
  ) -> Delta:
83
84
  assert (
84
- delta_type == DeltaType.UPSERT
85
- ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
85
+ delta_type == DeltaType.APPEND
86
+ ), "Compaction should always produce APPEND deltas for consistent read operations!"
86
87
  manifest = Manifest.of(
87
88
  entries=ManifestEntryList.of(manifest_entry_list_reference),
88
89
  uuid=str(uuid4()),
@@ -110,9 +111,10 @@ def materialize(
110
111
  deltacat_storage.stage_delta,
111
112
  compacted_table,
112
113
  partition,
114
+ delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
113
115
  max_records_per_entry=max_records_per_output_file,
114
116
  content_type=compacted_file_content_type,
115
- s3_table_writer_kwargs=s3_table_writer_kwargs,
117
+ table_writer_kwargs=table_writer_kwargs,
116
118
  **deltacat_storage_kwargs,
117
119
  )
118
120
  compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
@@ -10,7 +10,7 @@ import ray
10
10
  from deltacat import logs
11
11
  from deltacat.compute.compactor import DeltaAnnotated
12
12
  from deltacat.compute.compactor.model.repartition_result import RepartitionResult
13
- from deltacat.storage import interface as unimplemented_deltacat_storage
13
+ from deltacat.storage import metastore
14
14
  from deltacat.storage import Partition
15
15
  from deltacat.utils.ray_utils.runtime import (
16
16
  get_current_ray_task_id,
@@ -19,7 +19,7 @@ from deltacat.utils.ray_utils.runtime import (
19
19
  from deltacat.utils.common import ReadKwargsProvider
20
20
  from deltacat.utils.performance import timed_invocation
21
21
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
22
- from deltacat.storage import Delta
22
+ from deltacat.storage import Delta, DeltaType
23
23
  from enum import Enum
24
24
 
25
25
  if importlib.util.find_spec("memray"):
@@ -56,9 +56,9 @@ def repartition_range(
56
56
  destination_partition: Partition,
57
57
  repartition_args: dict,
58
58
  max_records_per_output_file: int,
59
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
59
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
60
60
  repartitioned_file_content_type: ContentType = ContentType.PARQUET,
61
- deltacat_storage=unimplemented_deltacat_storage,
61
+ deltacat_storage=metastore,
62
62
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
63
63
  **kwargs,
64
64
  ):
@@ -144,9 +144,10 @@ def repartition_range(
144
144
  partition_delta: Delta = deltacat_storage.stage_delta(
145
145
  partition_table,
146
146
  destination_partition,
147
+ delta_type=DeltaType.APPEND, # Repartition always produces APPEND deltas
147
148
  max_records_per_entry=max_records_per_output_file,
148
149
  content_type=repartitioned_file_content_type,
149
- s3_table_writer_kwargs=s3_table_writer_kwargs,
150
+ table_writer_kwargs=table_writer_kwargs,
150
151
  **deltacat_storage_kwargs,
151
152
  )
152
153
  partition_deltas.append(partition_delta)
@@ -168,9 +169,9 @@ def _timed_repartition(
168
169
  max_records_per_output_file: int,
169
170
  enable_profiler: bool,
170
171
  read_kwargs_provider: Optional[ReadKwargsProvider],
171
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
172
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
172
173
  repartitioned_file_content_type: ContentType = ContentType.PARQUET,
173
- deltacat_storage=unimplemented_deltacat_storage,
174
+ deltacat_storage=metastore,
174
175
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
175
176
  **kwargs,
176
177
  ) -> RepartitionResult:
@@ -192,7 +193,7 @@ def _timed_repartition(
192
193
  destination_partition=destination_partition,
193
194
  repartition_args=repartition_args,
194
195
  max_records_per_output_file=max_records_per_output_file,
195
- s3_table_writer_kwargs=s3_table_writer_kwargs,
196
+ table_writer_kwargs=table_writer_kwargs,
196
197
  repartitioned_file_content_type=repartitioned_file_content_type,
197
198
  deltacat_storage=deltacat_storage,
198
199
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -213,9 +214,9 @@ def repartition(
213
214
  enable_profiler: bool,
214
215
  metrics_config: Optional[MetricsConfig],
215
216
  read_kwargs_provider: Optional[ReadKwargsProvider],
216
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
217
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
217
218
  repartitioned_file_content_type: ContentType = ContentType.PARQUET,
218
- deltacat_storage=unimplemented_deltacat_storage,
219
+ deltacat_storage=metastore,
219
220
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
220
221
  **kwargs,
221
222
  ) -> RepartitionResult:
@@ -231,7 +232,7 @@ def repartition(
231
232
  max_records_per_output_file=max_records_per_output_file,
232
233
  enable_profiler=enable_profiler,
233
234
  read_kwargs_provider=read_kwargs_provider,
234
- s3_table_writer_kwargs=s3_table_writer_kwargs,
235
+ table_writer_kwargs=table_writer_kwargs,
235
236
  repartitioned_file_content_type=repartitioned_file_content_type,
236
237
  deltacat_storage=deltacat_storage,
237
238
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -11,7 +11,7 @@ from deltacat.storage import (
11
11
  PartitionLocator,
12
12
  Delta,
13
13
  ManifestEntry,
14
- interface as unimplemented_deltacat_storage,
14
+ metastore,
15
15
  )
16
16
  from deltacat import logs
17
17
  from deltacat.compute.compactor import DeltaAnnotated
@@ -31,12 +31,13 @@ def discover_deltas(
31
31
  compacted_partition_locator: Optional[PartitionLocator],
32
32
  rebase_source_partition_locator: Optional[PartitionLocator],
33
33
  rebase_source_partition_high_watermark: Optional[int],
34
- deltacat_storage=unimplemented_deltacat_storage,
34
+ deltacat_storage=metastore,
35
35
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
36
36
  list_deltas_kwargs: Optional[Dict[str, Any]] = {},
37
37
  ) -> Tuple[List[Delta], int]:
38
38
  if deltacat_storage_kwargs is None:
39
39
  deltacat_storage_kwargs = {}
40
+
40
41
  # Source One: new deltas from uncompacted table for incremental compaction or deltas from compacted table for rebase
41
42
  start_position_exclusive = (
42
43
  high_watermark.get(source_partition_locator)
@@ -109,7 +110,7 @@ def limit_input_deltas(
109
110
  user_hash_bucket_chunk_size: int,
110
111
  input_deltas_stats: Dict[int, DeltaStats],
111
112
  compaction_audit: CompactionSessionAuditInfo,
112
- deltacat_storage=unimplemented_deltacat_storage,
113
+ deltacat_storage=metastore,
113
114
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
114
115
  **kwargs,
115
116
  ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
@@ -272,7 +273,7 @@ def fit_input_deltas(
272
273
  cluster_resources: Dict[str, float],
273
274
  compaction_audit: CompactionSessionAuditInfo,
274
275
  hash_bucket_count: Optional[int],
275
- deltacat_storage=unimplemented_deltacat_storage,
276
+ deltacat_storage=metastore,
276
277
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
277
278
  **kwargs,
278
279
  ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
@@ -359,7 +360,7 @@ def _discover_deltas(
359
360
  source_partition_locator: PartitionLocator,
360
361
  start_position_exclusive: Optional[int],
361
362
  end_position_inclusive: Optional[int],
362
- deltacat_storage=unimplemented_deltacat_storage,
363
+ deltacat_storage=metastore,
363
364
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
364
365
  list_deltas_kwargs: Optional[Dict[str, Any]] = {},
365
366
  ) -> List[Delta]: