deltacat-2.0.0b11-py3-none-any.whl → deltacat-2.0.0.post1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
@@ -127,7 +127,7 @@ class Catalogs:
127
127
 
128
128
  def put(self, name: str, catalog: Catalog, set_default: bool = False) -> None:
129
129
  self._catalogs[name] = catalog
130
- if set_default:
130
+ if set_default or len(self._catalogs) == 1:
131
131
  self._default_catalog = catalog
132
132
 
133
133
  def get(self, name) -> Optional[Catalog]:
@@ -182,7 +182,7 @@ def init(
182
182
  ray_init_args: Dict[str, Any] = {},
183
183
  *,
184
184
  force=False,
185
- ) -> None:
185
+ ) -> Optional[ray.runtime.BaseContext]:
186
186
  """
187
187
  Initialize DeltaCAT catalogs.
188
188
 
@@ -194,16 +194,17 @@ def init(
194
194
  :param force: Whether to force DeltaCAT reinitialization. If True, reruns
195
195
  ray.init(**ray_init_args) and overwrites all previously registered
196
196
  catalogs.
197
+ :returns: The Ray context object if Ray was initialized, otherwise None.
197
198
  """
198
199
  global all_catalogs
199
200
 
200
201
  if is_initialized() and not force:
201
202
  logger.warning("DeltaCAT already initialized.")
202
- return
203
+ return None
203
204
 
204
205
  # initialize ray (and ignore reinitialization errors)
205
206
  ray_init_args["ignore_reinit_error"] = True
206
- ray.init(**ray_init_args)
207
+ context = ray.init(**ray_init_args)
207
208
 
208
209
  # register custom serializer for catalogs since these may contain
209
210
  # unserializable objects like boto3 clients with SSLContext
@@ -213,6 +214,39 @@ def init(
213
214
  # TODO(pdames): If no catalogs are provided then re-initialize DeltaCAT
214
215
  # with all catalogs from the last session
215
216
  all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)
217
+ return context
218
+
219
+
220
+ def init_local(
221
+ path: Optional[str] = None,
222
+ ray_init_args: Dict[str, Any] = {},
223
+ *,
224
+ force=False,
225
+ ) -> Optional[ray.runtime.BaseContext]:
226
+ """
227
+ Initialize DeltaCAT with a default local catalog.
228
+
229
+ This is a convenience function that creates a default catalog for local usage.
230
+ Equivalent to calling init(catalogs={"default": Catalog()}).
231
+
232
+ :param path: Optional path for catalog root directory. If not provided, uses
233
+ the default behavior of CatalogProperties (DELTACAT_ROOT env var or
234
+ "./.deltacat/").
235
+ :param ray_init_args: Keyword arguments to pass to `ray.init()`.
236
+ :param force: Whether to force DeltaCAT reinitialization. If True, reruns
237
+ ray.init(**ray_init_args) and overwrites all previously registered
238
+ catalogs.
239
+ :returns: The Ray context object if Ray was initialized, otherwise None.
240
+ """
241
+ from deltacat.catalog.model.properties import CatalogProperties
242
+
243
+ config = CatalogProperties(root=path) if path is not None else None
244
+ return init(
245
+ catalogs={"default": Catalog(config=config)},
246
+ default="default",
247
+ ray_init_args=ray_init_args,
248
+ force=force,
249
+ )
216
250
 
217
251
 
218
252
  def get_catalog(name: Optional[str] = None) -> Catalog:
@@ -244,7 +278,7 @@ def get_catalog(name: Optional[str] = None) -> Catalog:
244
278
  else:
245
279
  catalog = ray.get(all_catalogs.default.remote())
246
280
  if not catalog:
247
- available_catalogs = ray.get(all_catalogs.all.remote()).values()
281
+ available_catalogs = list(ray.get(all_catalogs.all.remote()).keys())
248
282
  raise ValueError(
249
283
  f"Call to get_catalog without name set failed because there "
250
284
  f"is no default Catalog set. Available catalogs: "
@@ -334,12 +368,17 @@ def put_catalog(
334
368
  if fail_if_exists:
335
369
  try:
336
370
  get_catalog(name)
337
- except ValueError:
371
+ # If we get here, catalog exists - raise error
372
+ raise ValueError(
373
+ f"Failed to put catalog {name} because it already exists and "
374
+ f"fail_if_exists={fail_if_exists}"
375
+ )
376
+ except ValueError as e:
377
+ if "not found" not in str(e):
378
+ # Re-raise if it's not a "catalog not found" error
379
+ raise
380
+ # If catalog doesn't exist, continue normally
338
381
  pass
339
- raise ValueError(
340
- f"Failed to put catalog {name} because it already exists and "
341
- f"fail_if_exists={fail_if_exists}"
342
- )
343
382
 
344
383
  # Add the catalog (which may overwrite existing if fail_if_exists=False)
345
384
  ray.get(all_catalogs.put.remote(name, catalog, default))
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from typing import Optional, Any
4
+ import urllib.parse
4
5
 
5
6
  import os
6
7
 
@@ -76,6 +77,7 @@ class CatalogProperties:
76
77
  reading these files. If None, a filesystem will be inferred.
77
78
  If provided, this will be validated for compatibility with the
78
79
  catalog root path.
80
+ storage: DeltaCAT storage implementation override.
79
81
  """
80
82
  # set root, using precedence rules described in pydoc
81
83
  if root is None:
@@ -85,6 +87,10 @@ class CatalogProperties:
85
87
  # Default to "./.deltacat/"
86
88
  root = os.path.join(os.getcwd(), ".deltacat")
87
89
 
90
+ # Store the original root with its scheme for reconstruction later
91
+ self._original_root = root
92
+ self._original_scheme = urllib.parse.urlparse(root).scheme
93
+
88
94
  resolved_root, resolved_filesystem = resolve_path_and_filesystem(
89
95
  path=root,
90
96
  filesystem=filesystem,
@@ -108,6 +114,38 @@ class CatalogProperties:
108
114
  """
109
115
  return self._storage
110
116
 
117
+ def reconstruct_full_path(self, path: str) -> str:
118
+ """
119
+ Reconstruct a full path with the original scheme for external readers.
120
+
121
+ This addresses GitHub issue #567 by ensuring that cloud storage URIs
122
+ include the relevant scheme prefix (e.g., s3://) that some file readers
123
+ require regardless of the filesystem being used to read the file
124
+ (e.g., Daft).
125
+
126
+ Args:
127
+ path: A path relative to the catalog root or absolute path
128
+
129
+ Returns:
130
+ Full path with appropriate scheme prefix for external readers
131
+ """
132
+ # If the path already has a scheme, return it as-is
133
+ if urllib.parse.urlparse(path).scheme:
134
+ return path
135
+
136
+ # If we don't have an original scheme (local filesystem), return as-is
137
+ if not self._original_scheme:
138
+ return path
139
+
140
+ # Reconstruct the full path with the original scheme
141
+ # Handle both absolute and relative paths
142
+ if path.startswith("/"):
143
+ # Absolute path - this shouldn't happen normally but handle it
144
+ return f"{self._original_scheme}:/{path}"
145
+ else:
146
+ # Relative path - prepend the s3:// scheme
147
+ return f"{self._original_scheme}://{path}"
148
+
111
149
  def __str__(self):
112
150
  return (
113
151
  f"{self.__class__.__name__}(root={self.root}, filesystem={self.filesystem})"
@@ -5,7 +5,8 @@ import logging
5
5
  import ray
6
6
  import time
7
7
  import json
8
- from deltacat.aws import s3u as s3_utils
8
+ import posixpath
9
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
9
10
  import deltacat
10
11
  from deltacat import logs
11
12
  import pyarrow as pa
@@ -25,7 +26,7 @@ from deltacat.storage import (
25
26
  DeltaLocator,
26
27
  Partition,
27
28
  PartitionLocator,
28
- interface as unimplemented_deltacat_storage,
29
+ metastore,
29
30
  )
30
31
  from deltacat.compute.compactor.model.compact_partition_params import (
31
32
  CompactPartitionParams,
@@ -40,7 +41,7 @@ from deltacat.compute.compactor.steps import dedupe as dd
40
41
  from deltacat.compute.compactor.steps import hash_bucket as hb
41
42
  from deltacat.compute.compactor.steps import materialize as mat
42
43
  from deltacat.compute.compactor.utils import io
43
- from deltacat.compute.compactor.utils import round_completion_file as rcf
44
+ from deltacat.compute.compactor.utils import round_completion_reader as rci
44
45
 
45
46
  from deltacat.types.media import ContentType
46
47
  from deltacat.utils.placement import PlacementGroupConfig
@@ -65,13 +66,37 @@ DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG: int = 1
65
66
  DEFAULT_PROPERTIES_ARG: Dict[str, Any] = {}
66
67
 
67
68
 
69
+ def _upload_audit_data(url: str, content: str, **kwargs) -> None:
70
+ """
71
+ Upload audit data to the specified URL using filesystem-agnostic operations.
72
+ """
73
+ try:
74
+ path, filesystem = resolve_path_and_filesystem(url)
75
+
76
+ # Create parent directories if they don't exist
77
+ parent_dir = posixpath.dirname(path)
78
+ if parent_dir:
79
+ try:
80
+ filesystem.create_dir(parent_dir, recursive=True)
81
+ except Exception as dir_error:
82
+ # Directory might already exist, which is fine
83
+ logger.debug(
84
+ f"Directory creation warning for {parent_dir}: {dir_error}"
85
+ )
86
+
87
+ with filesystem.open_output_stream(path) as stream:
88
+ stream.write(content.encode("utf-8"))
89
+ except Exception as e:
90
+ logger.warning(f"Failed to upload audit data to {url}: {e}")
91
+
92
+
68
93
  def check_preconditions(
69
94
  source_partition_locator: PartitionLocator,
70
95
  destination_partition_locator: PartitionLocator,
71
96
  sort_keys: List[SortKey],
72
97
  max_records_per_output_file: int,
73
98
  new_hash_bucket_count: Optional[int],
74
- deltacat_storage=unimplemented_deltacat_storage,
99
+ deltacat_storage=metastore,
75
100
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
76
101
  **kwargs,
77
102
  ) -> int:
@@ -104,7 +129,7 @@ def compact_partition(
104
129
  source_partition_locator: PartitionLocator,
105
130
  destination_partition_locator: PartitionLocator,
106
131
  primary_keys: Set[str],
107
- compaction_artifact_s3_bucket: str,
132
+ compaction_artifact_path: str,
108
133
  last_stream_position_to_compact: int,
109
134
  *,
110
135
  hash_bucket_count: Optional[int] = None,
@@ -123,37 +148,29 @@ def compact_partition(
123
148
  metrics_config: Optional[MetricsConfig] = None,
124
149
  list_deltas_kwargs: Optional[Dict[str, Any]] = None,
125
150
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
126
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
151
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
127
152
  object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
128
- s3_client_kwargs: Optional[Dict[str, Any]] = None,
129
- deltacat_storage=unimplemented_deltacat_storage,
153
+ deltacat_storage=metastore,
130
154
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
131
155
  **kwargs,
132
- ) -> Optional[str]:
156
+ ) -> None:
133
157
  if deltacat_storage_kwargs is None:
134
158
  deltacat_storage_kwargs = {}
135
159
  if not importlib.util.find_spec("memray"):
136
160
  logger.info(f"memray profiler not available, disabling all profiling")
137
161
  enable_profiler = False
138
162
 
139
- if s3_client_kwargs is None:
140
- s3_client_kwargs = {}
141
-
142
163
  # memray official documentation link:
143
164
  # https://bloomberg.github.io/memray/getting_started.html
144
165
  with memray.Tracker(
145
166
  f"compaction_partition.bin"
146
167
  ) if enable_profiler else nullcontext():
147
168
  partition = None
148
- (
149
- new_partition,
150
- new_rci,
151
- new_rcf_partition_locator,
152
- ) = _execute_compaction_round(
169
+ (new_partition, new_rci,) = _execute_compaction_round(
153
170
  source_partition_locator,
154
171
  destination_partition_locator,
155
172
  primary_keys,
156
- compaction_artifact_s3_bucket,
173
+ compaction_artifact_path,
157
174
  last_stream_position_to_compact,
158
175
  hash_bucket_count,
159
176
  sort_keys,
@@ -169,9 +186,8 @@ def compact_partition(
169
186
  metrics_config,
170
187
  list_deltas_kwargs,
171
188
  read_kwargs_provider,
172
- s3_table_writer_kwargs,
189
+ table_writer_kwargs,
173
190
  object_store,
174
- s3_client_kwargs,
175
191
  deltacat_storage,
176
192
  deltacat_storage_kwargs,
177
193
  **kwargs,
@@ -182,30 +198,23 @@ def compact_partition(
182
198
  logger.info(
183
199
  f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed"
184
200
  )
185
- round_completion_file_s3_url = None
186
201
  if partition:
187
202
  logger.info(f"Committing compacted partition to: {partition.locator}")
203
+ # Set the round completion info on the partition before committing
204
+ partition.compaction_round_completion_info = new_rci
188
205
  partition = deltacat_storage.commit_partition(
189
- partition, **deltacat_storage_kwargs
206
+ partition,
207
+ **deltacat_storage_kwargs,
190
208
  )
191
209
  logger.info(f"Committed compacted partition: {partition}")
192
-
193
- round_completion_file_s3_url = rcf.write_round_completion_file(
194
- compaction_artifact_s3_bucket,
195
- new_rcf_partition_locator,
196
- partition.locator,
197
- new_rci,
198
- **s3_client_kwargs,
199
- )
200
210
  logger.info(f"Completed compaction session for: {source_partition_locator}")
201
- return round_completion_file_s3_url
202
211
 
203
212
 
204
213
  def _execute_compaction_round(
205
214
  source_partition_locator: PartitionLocator,
206
215
  destination_partition_locator: PartitionLocator,
207
216
  primary_keys: Set[str],
208
- compaction_artifact_s3_bucket: str,
217
+ compaction_artifact_path: str,
209
218
  last_stream_position_to_compact: int,
210
219
  hash_bucket_count: Optional[int],
211
220
  sort_keys: List[SortKey],
@@ -221,24 +230,25 @@ def _execute_compaction_round(
221
230
  metrics_config: Optional[MetricsConfig],
222
231
  list_deltas_kwargs: Optional[Dict[str, Any]],
223
232
  read_kwargs_provider: Optional[ReadKwargsProvider],
224
- s3_table_writer_kwargs: Optional[Dict[str, Any]],
233
+ table_writer_kwargs: Optional[Dict[str, Any]],
225
234
  object_store: Optional[IObjectStore],
226
- s3_client_kwargs: Optional[Dict[str, Any]],
227
- deltacat_storage=unimplemented_deltacat_storage,
235
+ deltacat_storage=metastore,
228
236
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
229
237
  **kwargs,
230
- ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
238
+ ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo]]:
231
239
  if deltacat_storage_kwargs is None:
232
240
  deltacat_storage_kwargs = {}
233
- rcf_source_partition_locator = (
241
+ rci_source_partition_locator = (
234
242
  rebase_source_partition_locator
235
243
  if rebase_source_partition_locator
236
244
  else source_partition_locator
237
245
  )
238
- base_audit_url = rcf_source_partition_locator.path(
239
- f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
246
+ # Construct audit URL using filesystem-agnostic path joining
247
+ audit_url = posixpath.join(
248
+ compaction_artifact_path,
249
+ "compaction-audit.json",
250
+ f"{rci_source_partition_locator.hexdigest()}.json",
240
251
  )
241
- audit_url = f"{base_audit_url}.json"
242
252
 
243
253
  logger.info(f"Compaction audit will be written to {audit_url}")
244
254
 
@@ -312,11 +322,11 @@ def _execute_compaction_round(
312
322
  # read the results from any previously completed compaction round
313
323
  round_completion_info = None
314
324
  if not rebase_source_partition_locator:
315
- round_completion_info = rcf.read_round_completion_file(
316
- compaction_artifact_s3_bucket,
317
- source_partition_locator,
318
- destination_partition_locator,
319
- **s3_client_kwargs,
325
+ round_completion_info = rci.read_round_completion_info(
326
+ source_partition_locator=source_partition_locator,
327
+ destination_partition_locator=destination_partition_locator,
328
+ deltacat_storage=deltacat_storage,
329
+ deltacat_storage_kwargs=deltacat_storage_kwargs,
320
330
  )
321
331
  if not round_completion_info:
322
332
  logger.info(
@@ -363,15 +373,11 @@ def _execute_compaction_round(
363
373
  delta_discovery_end - delta_discovery_start
364
374
  )
365
375
 
366
- s3_utils.upload(
367
- compaction_audit.audit_url,
368
- str(json.dumps(compaction_audit)),
369
- **s3_client_kwargs,
370
- )
376
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
371
377
 
372
378
  if not input_deltas:
373
379
  logger.info("No input deltas found to compact.")
374
- return None, None, None
380
+ return None, None
375
381
 
376
382
  # limit the input deltas to fit on this cluster and convert them to
377
383
  # annotated deltas of equivalent size for easy parallel distribution
@@ -464,11 +470,7 @@ def _execute_compaction_round(
464
470
  hb_end - hb_start,
465
471
  )
466
472
 
467
- s3_utils.upload(
468
- compaction_audit.audit_url,
469
- str(json.dumps(compaction_audit)),
470
- **s3_client_kwargs,
471
- )
473
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
472
474
 
473
475
  all_hash_group_idx_to_obj_id = defaultdict(list)
474
476
  for hb_result in hb_results:
@@ -485,9 +487,9 @@ def _execute_compaction_round(
485
487
  )
486
488
 
487
489
  compaction_audit.set_input_records(total_hb_record_count.item())
488
- # TODO (pdames): when resources are freed during the last round of hash
489
- # bucketing, start running dedupe tasks that read existing dedupe
490
- # output from S3 then wait for hash bucketing to finish before continuing
490
+ # TODO(pdames): when resources are freed during the last round of hash bucketing,
491
+ # start running dedupe tasks that read hash bucket output from storage then
492
+ # wait for hash bucketing to finish before continuing
491
493
 
492
494
  # create a new stream for this round
493
495
  compacted_stream_locator = destination_partition_locator.stream_locator
@@ -497,6 +499,7 @@ def _execute_compaction_round(
497
499
  compacted_stream_locator.table_version,
498
500
  **deltacat_storage_kwargs,
499
501
  )
502
+
500
503
  partition = deltacat_storage.stage_partition(
501
504
  stream,
502
505
  destination_partition_locator.partition_values,
@@ -571,9 +574,9 @@ def _execute_compaction_round(
571
574
  logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
572
575
 
573
576
  compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
574
- # TODO(pdames): when resources are freed during the last round of deduping
577
+ # TODO(pdames): when resources are freed during the last round of deduping,
575
578
  # start running materialize tasks that read materialization source file
576
- # tables from S3 then wait for deduping to finish before continuing
579
+ # tables from storage then wait for deduping to finish before continuing
577
580
 
578
581
  # TODO(pdames): balance inputs to materialization tasks to ensure that each
579
582
  # task has an approximately equal amount of input to materialize
@@ -584,11 +587,7 @@ def _execute_compaction_round(
584
587
  # parallel step 3:
585
588
  # materialize records to keep by index
586
589
 
587
- s3_utils.upload(
588
- compaction_audit.audit_url,
589
- str(json.dumps(compaction_audit)),
590
- **s3_client_kwargs,
591
- )
590
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
592
591
 
593
592
  materialize_start = time.monotonic()
594
593
  mat_tasks_pending = invoke_parallel(
@@ -610,7 +609,7 @@ def _execute_compaction_round(
610
609
  enable_profiler=enable_profiler,
611
610
  metrics_config=metrics_config,
612
611
  read_kwargs_provider=read_kwargs_provider,
613
- s3_table_writer_kwargs=s3_table_writer_kwargs,
612
+ table_writer_kwargs=table_writer_kwargs,
614
613
  object_store=object_store,
615
614
  deltacat_storage=deltacat_storage,
616
615
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -693,11 +692,7 @@ def _execute_compaction_round(
693
692
  telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
694
693
  )
695
694
 
696
- s3_utils.upload(
697
- compaction_audit.audit_url,
698
- str(json.dumps(compaction_audit)),
699
- **s3_client_kwargs,
700
- )
695
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
701
696
 
702
697
  new_round_completion_info = RoundCompletionInfo.of(
703
698
  last_stream_position_compacted,
@@ -710,6 +705,7 @@ def _execute_compaction_round(
710
705
  hash_bucket_count,
711
706
  None,
712
707
  CompactorVersion.V1.value,
708
+ prev_source_partition_locator=rci_source_partition_locator,
713
709
  )
714
710
 
715
711
  logger.info(
@@ -721,17 +717,43 @@ def _execute_compaction_round(
721
717
  return (
722
718
  partition,
723
719
  new_round_completion_info,
724
- rcf_source_partition_locator,
725
720
  )
726
721
 
727
722
 
728
723
  def compact_partition_from_request(
729
724
  compact_partition_params: CompactPartitionParams,
730
725
  *compact_partition_pos_args,
731
- ) -> Optional[str]:
726
+ ) -> None:
732
727
  """
733
728
  Wrapper for compact_partition that allows for the compact_partition parameters to be
734
729
  passed in as a custom dictionary-like CompactPartitionParams object along with any compact_partition positional arguments.
735
730
  :param compact_partition_params:
736
731
  """
737
- return compact_partition(*compact_partition_pos_args, **compact_partition_params)
732
+ # Extract required positional arguments
733
+ source_partition_locator = compact_partition_params.source_partition_locator
734
+ destination_partition_locator = (
735
+ compact_partition_params.destination_partition_locator
736
+ )
737
+ primary_keys = compact_partition_params.primary_keys
738
+ compaction_artifact_path = compact_partition_params.compaction_artifact_path
739
+ last_stream_position_to_compact = (
740
+ compact_partition_params.last_stream_position_to_compact
741
+ )
742
+
743
+ # Create a copy of params without the positional arguments
744
+ kwargs_params = dict(compact_partition_params)
745
+ kwargs_params.pop("source_partition_locator", None)
746
+ kwargs_params.pop("destination_partition_locator", None)
747
+ kwargs_params.pop("primary_keys", None)
748
+ kwargs_params.pop("last_stream_position_to_compact", None)
749
+ # Don't pop compaction_artifact_path as it's a computed property, not stored in the dict
750
+
751
+ compact_partition(
752
+ source_partition_locator,
753
+ destination_partition_locator,
754
+ primary_keys,
755
+ compaction_artifact_path,
756
+ last_stream_position_to_compact,
757
+ *compact_partition_pos_args,
758
+ **kwargs_params,
759
+ )