deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -127,7 +127,7 @@ class Catalogs:
|
|
127
127
|
|
128
128
|
def put(self, name: str, catalog: Catalog, set_default: bool = False) -> None:
    """
    Store `catalog` under `name`, replacing any entry already stored there.

    The catalog also becomes the default catalog when `set_default` is
    true, or when it is the only catalog stored after the insertion.

    :param name: Key under which the catalog is stored.
    :param catalog: Catalog to store.
    :param set_default: Whether to make this catalog the default one.
    """
    registry = self._catalogs
    registry[name] = catalog
    # Leave the current default untouched unless explicitly asked to change
    # it, or unless this is the sole catalog stored.
    if not (set_default or len(registry) == 1):
        return
    self._default_catalog = catalog
|
132
132
|
|
133
133
|
def get(self, name) -> Optional[Catalog]:
|
@@ -182,7 +182,7 @@ def init(
|
|
182
182
|
ray_init_args: Dict[str, Any] = {},
|
183
183
|
*,
|
184
184
|
force=False,
|
185
|
-
) ->
|
185
|
+
) -> Optional[ray.runtime.BaseContext]:
|
186
186
|
"""
|
187
187
|
Initialize DeltaCAT catalogs.
|
188
188
|
|
@@ -194,16 +194,17 @@ def init(
|
|
194
194
|
:param force: Whether to force DeltaCAT reinitialization. If True, reruns
|
195
195
|
ray.init(**ray_init_args) and overwrites all previously registered
|
196
196
|
catalogs.
|
197
|
+
:returns: The Ray context object if Ray was initialized, otherwise None.
|
197
198
|
"""
|
198
199
|
global all_catalogs
|
199
200
|
|
200
201
|
if is_initialized() and not force:
|
201
202
|
logger.warning("DeltaCAT already initialized.")
|
202
|
-
return
|
203
|
+
return None
|
203
204
|
|
204
205
|
# initialize ray (and ignore reinitialization errors)
|
205
206
|
ray_init_args["ignore_reinit_error"] = True
|
206
|
-
ray.init(**ray_init_args)
|
207
|
+
context = ray.init(**ray_init_args)
|
207
208
|
|
208
209
|
# register custom serializer for catalogs since these may contain
|
209
210
|
# unserializable objects like boto3 clients with SSLContext
|
@@ -213,6 +214,39 @@ def init(
|
|
213
214
|
# TODO(pdames): If no catalogs are provided then re-initialize DeltaCAT
|
214
215
|
# with all catalogs from the last session
|
215
216
|
all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)
|
217
|
+
return context
|
218
|
+
|
219
|
+
|
220
|
+
def init_local(
    path: Optional[str] = None,
    ray_init_args: Dict[str, Any] = {},
    *,
    force=False,
) -> Optional[ray.runtime.BaseContext]:
    """
    Initialize DeltaCAT with a default local catalog.

    This is a convenience function that creates one catalog, registers it
    under the name "default", and marks it as the default catalog by
    delegating to init().

    :param path: Optional path for catalog root directory. If not provided, uses
        the default behavior of CatalogProperties (DELTACAT_ROOT env var or
        "./.deltacat/").
    :param ray_init_args: Keyword arguments to pass to `ray.init()`. The
        mapping is copied before it is forwarded, so neither the caller's
        dictionary nor the shared default is modified. None is treated as
        an empty mapping.
    :param force: Whether to force DeltaCAT reinitialization. If True, reruns
        ray.init(**ray_init_args) and overwrites all previously registered
        catalogs.
    :returns: The Ray context object if Ray was initialized, otherwise None.
    """
    from deltacat.catalog.model.properties import CatalogProperties

    # Only build explicit properties when a root was requested; otherwise let
    # Catalog fall back to its own defaults.
    config = CatalogProperties(root=path) if path is not None else None
    return init(
        catalogs={"default": Catalog(config=config)},
        default="default",
        # init() writes "ignore_reinit_error" into the mapping it receives.
        # Forward a private copy so that write never reaches the caller's
        # dictionary or the mutable default shared across calls.
        ray_init_args=dict(ray_init_args or {}),
        force=force,
    )
|
216
250
|
|
217
251
|
|
218
252
|
def get_catalog(name: Optional[str] = None) -> Catalog:
|
@@ -244,7 +278,7 @@ def get_catalog(name: Optional[str] = None) -> Catalog:
|
|
244
278
|
else:
|
245
279
|
catalog = ray.get(all_catalogs.default.remote())
|
246
280
|
if not catalog:
|
247
|
-
available_catalogs = ray.get(all_catalogs.all.remote()).
|
281
|
+
available_catalogs = list(ray.get(all_catalogs.all.remote()).keys())
|
248
282
|
raise ValueError(
|
249
283
|
f"Call to get_catalog without name set failed because there "
|
250
284
|
f"is no default Catalog set. Available catalogs: "
|
@@ -334,12 +368,17 @@ def put_catalog(
|
|
334
368
|
if fail_if_exists:
|
335
369
|
try:
|
336
370
|
get_catalog(name)
|
337
|
-
|
371
|
+
# If we get here, catalog exists - raise error
|
372
|
+
raise ValueError(
|
373
|
+
f"Failed to put catalog {name} because it already exists and "
|
374
|
+
f"fail_if_exists={fail_if_exists}"
|
375
|
+
)
|
376
|
+
except ValueError as e:
|
377
|
+
if "not found" not in str(e):
|
378
|
+
# Re-raise if it's not a "catalog not found" error
|
379
|
+
raise
|
380
|
+
# If catalog doesn't exist, continue normally
|
338
381
|
pass
|
339
|
-
raise ValueError(
|
340
|
-
f"Failed to put catalog {name} because it already exists and "
|
341
|
-
f"fail_if_exists={fail_if_exists}"
|
342
|
-
)
|
343
382
|
|
344
383
|
# Add the catalog (which may overwrite existing if fail_if_exists=False)
|
345
384
|
ray.get(all_catalogs.put.remote(name, catalog, default))
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
from typing import Optional, Any
|
4
|
+
import urllib.parse
|
4
5
|
|
5
6
|
import os
|
6
7
|
|
@@ -76,6 +77,7 @@ class CatalogProperties:
|
|
76
77
|
reading these files. If None, a filesystem will be inferred.
|
77
78
|
If provided, this will be validated for compatibility with the
|
78
79
|
catalog root path.
|
80
|
+
storage: DeltaCAT storage implementation override.
|
79
81
|
"""
|
80
82
|
# set root, using precedence rules described in pydoc
|
81
83
|
if root is None:
|
@@ -85,6 +87,10 @@ class CatalogProperties:
|
|
85
87
|
# Default to "./.deltacat/"
|
86
88
|
root = os.path.join(os.getcwd(), ".deltacat")
|
87
89
|
|
90
|
+
# Store the original root with its scheme for reconstruction later
|
91
|
+
self._original_root = root
|
92
|
+
self._original_scheme = urllib.parse.urlparse(root).scheme
|
93
|
+
|
88
94
|
resolved_root, resolved_filesystem = resolve_path_and_filesystem(
|
89
95
|
path=root,
|
90
96
|
filesystem=filesystem,
|
@@ -108,6 +114,38 @@ class CatalogProperties:
|
|
108
114
|
"""
|
109
115
|
return self._storage
|
110
116
|
|
117
|
+
def reconstruct_full_path(self, path: str) -> str:
    """
    Return `path` prefixed with the scheme of the original catalog root.

    Some file readers (e.g., Daft) require cloud storage URIs to carry their
    scheme prefix (e.g., s3://) regardless of the filesystem used to read
    the file; see GitHub issue #567.

    Args:
        path: A path relative to the catalog root or absolute path

    Returns:
        `path` unchanged when it already has a scheme or when the original
        root had none (local filesystem); otherwise `path` prefixed with
        the original scheme.
    """
    already_qualified = bool(urllib.parse.urlparse(path).scheme)
    if already_qualified or not self._original_scheme:
        return path

    # A leading "/" on the path supplies one of the two slashes that follow
    # the scheme, so only ":/" is added in that case. The scheme is whatever
    # the catalog root used, not necessarily s3.
    joiner = ":/" if path.startswith("/") else "://"
    return f"{self._original_scheme}{joiner}{path}"
|
148
|
+
|
111
149
|
def __str__(self):
|
112
150
|
return (
|
113
151
|
f"{self.__class__.__name__}(root={self.root}, filesystem={self.filesystem})"
|
@@ -5,7 +5,8 @@ import logging
|
|
5
5
|
import ray
|
6
6
|
import time
|
7
7
|
import json
|
8
|
-
|
8
|
+
import posixpath
|
9
|
+
from deltacat.utils.filesystem import resolve_path_and_filesystem
|
9
10
|
import deltacat
|
10
11
|
from deltacat import logs
|
11
12
|
import pyarrow as pa
|
@@ -25,7 +26,7 @@ from deltacat.storage import (
|
|
25
26
|
DeltaLocator,
|
26
27
|
Partition,
|
27
28
|
PartitionLocator,
|
28
|
-
|
29
|
+
metastore,
|
29
30
|
)
|
30
31
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
31
32
|
CompactPartitionParams,
|
@@ -40,7 +41,7 @@ from deltacat.compute.compactor.steps import dedupe as dd
|
|
40
41
|
from deltacat.compute.compactor.steps import hash_bucket as hb
|
41
42
|
from deltacat.compute.compactor.steps import materialize as mat
|
42
43
|
from deltacat.compute.compactor.utils import io
|
43
|
-
from deltacat.compute.compactor.utils import
|
44
|
+
from deltacat.compute.compactor.utils import round_completion_reader as rci
|
44
45
|
|
45
46
|
from deltacat.types.media import ContentType
|
46
47
|
from deltacat.utils.placement import PlacementGroupConfig
|
@@ -65,13 +66,37 @@ DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG: int = 1
|
|
65
66
|
DEFAULT_PROPERTIES_ARG: Dict[str, Any] = {}
|
66
67
|
|
67
68
|
|
69
|
+
def _upload_audit_data(url: str, content: str, **kwargs) -> None:
    """
    Best-effort write of `content`, UTF-8 encoded, to the location named by `url`.

    The URL is resolved to a (path, filesystem) pair. The parent directory
    of the path is created recursively when the path has one, and the
    encoded content is then written to an output stream opened on the path.
    Failures are logged rather than raised: a directory creation failure is
    logged at debug level and the write is still attempted, while any other
    failure is logged as a warning.

    :param url: Destination to write to.
    :param content: Text to write.
    :param kwargs: Accepted but not used by this function.
    """
    try:
        target_path, fs = resolve_path_and_filesystem(url)

        directory = posixpath.dirname(target_path)
        if directory:
            try:
                fs.create_dir(directory, recursive=True)
            except Exception as mkdir_failure:
                # Not fatal: the directory may already exist, in which case
                # the write below can still succeed.
                logger.debug(
                    f"Directory creation warning for {directory}: {mkdir_failure}"
                )

        with fs.open_output_stream(target_path) as sink:
            sink.write(content.encode("utf-8"))
    except Exception as failure:
        logger.warning(f"Failed to upload audit data to {url}: {failure}")
|
91
|
+
|
92
|
+
|
68
93
|
def check_preconditions(
|
69
94
|
source_partition_locator: PartitionLocator,
|
70
95
|
destination_partition_locator: PartitionLocator,
|
71
96
|
sort_keys: List[SortKey],
|
72
97
|
max_records_per_output_file: int,
|
73
98
|
new_hash_bucket_count: Optional[int],
|
74
|
-
deltacat_storage=
|
99
|
+
deltacat_storage=metastore,
|
75
100
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
76
101
|
**kwargs,
|
77
102
|
) -> int:
|
@@ -104,7 +129,7 @@ def compact_partition(
|
|
104
129
|
source_partition_locator: PartitionLocator,
|
105
130
|
destination_partition_locator: PartitionLocator,
|
106
131
|
primary_keys: Set[str],
|
107
|
-
|
132
|
+
compaction_artifact_path: str,
|
108
133
|
last_stream_position_to_compact: int,
|
109
134
|
*,
|
110
135
|
hash_bucket_count: Optional[int] = None,
|
@@ -123,37 +148,29 @@ def compact_partition(
|
|
123
148
|
metrics_config: Optional[MetricsConfig] = None,
|
124
149
|
list_deltas_kwargs: Optional[Dict[str, Any]] = None,
|
125
150
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
126
|
-
|
151
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
127
152
|
object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
|
128
|
-
|
129
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
153
|
+
deltacat_storage=metastore,
|
130
154
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
131
155
|
**kwargs,
|
132
|
-
) ->
|
156
|
+
) -> None:
|
133
157
|
if deltacat_storage_kwargs is None:
|
134
158
|
deltacat_storage_kwargs = {}
|
135
159
|
if not importlib.util.find_spec("memray"):
|
136
160
|
logger.info(f"memray profiler not available, disabling all profiling")
|
137
161
|
enable_profiler = False
|
138
162
|
|
139
|
-
if s3_client_kwargs is None:
|
140
|
-
s3_client_kwargs = {}
|
141
|
-
|
142
163
|
# memray official documentation link:
|
143
164
|
# https://bloomberg.github.io/memray/getting_started.html
|
144
165
|
with memray.Tracker(
|
145
166
|
f"compaction_partition.bin"
|
146
167
|
) if enable_profiler else nullcontext():
|
147
168
|
partition = None
|
148
|
-
(
|
149
|
-
new_partition,
|
150
|
-
new_rci,
|
151
|
-
new_rcf_partition_locator,
|
152
|
-
) = _execute_compaction_round(
|
169
|
+
(new_partition, new_rci,) = _execute_compaction_round(
|
153
170
|
source_partition_locator,
|
154
171
|
destination_partition_locator,
|
155
172
|
primary_keys,
|
156
|
-
|
173
|
+
compaction_artifact_path,
|
157
174
|
last_stream_position_to_compact,
|
158
175
|
hash_bucket_count,
|
159
176
|
sort_keys,
|
@@ -169,9 +186,8 @@ def compact_partition(
|
|
169
186
|
metrics_config,
|
170
187
|
list_deltas_kwargs,
|
171
188
|
read_kwargs_provider,
|
172
|
-
|
189
|
+
table_writer_kwargs,
|
173
190
|
object_store,
|
174
|
-
s3_client_kwargs,
|
175
191
|
deltacat_storage,
|
176
192
|
deltacat_storage_kwargs,
|
177
193
|
**kwargs,
|
@@ -182,30 +198,23 @@ def compact_partition(
|
|
182
198
|
logger.info(
|
183
199
|
f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed"
|
184
200
|
)
|
185
|
-
round_completion_file_s3_url = None
|
186
201
|
if partition:
|
187
202
|
logger.info(f"Committing compacted partition to: {partition.locator}")
|
203
|
+
# Set the round completion info on the partition before committing
|
204
|
+
partition.compaction_round_completion_info = new_rci
|
188
205
|
partition = deltacat_storage.commit_partition(
|
189
|
-
partition,
|
206
|
+
partition,
|
207
|
+
**deltacat_storage_kwargs,
|
190
208
|
)
|
191
209
|
logger.info(f"Committed compacted partition: {partition}")
|
192
|
-
|
193
|
-
round_completion_file_s3_url = rcf.write_round_completion_file(
|
194
|
-
compaction_artifact_s3_bucket,
|
195
|
-
new_rcf_partition_locator,
|
196
|
-
partition.locator,
|
197
|
-
new_rci,
|
198
|
-
**s3_client_kwargs,
|
199
|
-
)
|
200
210
|
logger.info(f"Completed compaction session for: {source_partition_locator}")
|
201
|
-
return round_completion_file_s3_url
|
202
211
|
|
203
212
|
|
204
213
|
def _execute_compaction_round(
|
205
214
|
source_partition_locator: PartitionLocator,
|
206
215
|
destination_partition_locator: PartitionLocator,
|
207
216
|
primary_keys: Set[str],
|
208
|
-
|
217
|
+
compaction_artifact_path: str,
|
209
218
|
last_stream_position_to_compact: int,
|
210
219
|
hash_bucket_count: Optional[int],
|
211
220
|
sort_keys: List[SortKey],
|
@@ -221,24 +230,25 @@ def _execute_compaction_round(
|
|
221
230
|
metrics_config: Optional[MetricsConfig],
|
222
231
|
list_deltas_kwargs: Optional[Dict[str, Any]],
|
223
232
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
224
|
-
|
233
|
+
table_writer_kwargs: Optional[Dict[str, Any]],
|
225
234
|
object_store: Optional[IObjectStore],
|
226
|
-
|
227
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
235
|
+
deltacat_storage=metastore,
|
228
236
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
229
237
|
**kwargs,
|
230
|
-
) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo]
|
238
|
+
) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo]]:
|
231
239
|
if deltacat_storage_kwargs is None:
|
232
240
|
deltacat_storage_kwargs = {}
|
233
|
-
|
241
|
+
rci_source_partition_locator = (
|
234
242
|
rebase_source_partition_locator
|
235
243
|
if rebase_source_partition_locator
|
236
244
|
else source_partition_locator
|
237
245
|
)
|
238
|
-
|
239
|
-
|
246
|
+
# Construct audit URL using filesystem-agnostic path joining
|
247
|
+
audit_url = posixpath.join(
|
248
|
+
compaction_artifact_path,
|
249
|
+
"compaction-audit.json",
|
250
|
+
f"{rci_source_partition_locator.hexdigest()}.json",
|
240
251
|
)
|
241
|
-
audit_url = f"{base_audit_url}.json"
|
242
252
|
|
243
253
|
logger.info(f"Compaction audit will be written to {audit_url}")
|
244
254
|
|
@@ -312,11 +322,11 @@ def _execute_compaction_round(
|
|
312
322
|
# read the results from any previously completed compaction round
|
313
323
|
round_completion_info = None
|
314
324
|
if not rebase_source_partition_locator:
|
315
|
-
round_completion_info =
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
325
|
+
round_completion_info = rci.read_round_completion_info(
|
326
|
+
source_partition_locator=source_partition_locator,
|
327
|
+
destination_partition_locator=destination_partition_locator,
|
328
|
+
deltacat_storage=deltacat_storage,
|
329
|
+
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
320
330
|
)
|
321
331
|
if not round_completion_info:
|
322
332
|
logger.info(
|
@@ -363,15 +373,11 @@ def _execute_compaction_round(
|
|
363
373
|
delta_discovery_end - delta_discovery_start
|
364
374
|
)
|
365
375
|
|
366
|
-
|
367
|
-
compaction_audit.audit_url,
|
368
|
-
str(json.dumps(compaction_audit)),
|
369
|
-
**s3_client_kwargs,
|
370
|
-
)
|
376
|
+
_upload_audit_data(audit_url, json.dumps(compaction_audit))
|
371
377
|
|
372
378
|
if not input_deltas:
|
373
379
|
logger.info("No input deltas found to compact.")
|
374
|
-
return None, None
|
380
|
+
return None, None
|
375
381
|
|
376
382
|
# limit the input deltas to fit on this cluster and convert them to
|
377
383
|
# annotated deltas of equivalent size for easy parallel distribution
|
@@ -464,11 +470,7 @@ def _execute_compaction_round(
|
|
464
470
|
hb_end - hb_start,
|
465
471
|
)
|
466
472
|
|
467
|
-
|
468
|
-
compaction_audit.audit_url,
|
469
|
-
str(json.dumps(compaction_audit)),
|
470
|
-
**s3_client_kwargs,
|
471
|
-
)
|
473
|
+
_upload_audit_data(audit_url, json.dumps(compaction_audit))
|
472
474
|
|
473
475
|
all_hash_group_idx_to_obj_id = defaultdict(list)
|
474
476
|
for hb_result in hb_results:
|
@@ -485,9 +487,9 @@ def _execute_compaction_round(
|
|
485
487
|
)
|
486
488
|
|
487
489
|
compaction_audit.set_input_records(total_hb_record_count.item())
|
488
|
-
# TODO
|
489
|
-
#
|
490
|
-
#
|
490
|
+
# TODO(pdames): when resources are freed during the last round of hash bucketing,
|
491
|
+
# start running dedupe tasks that read hash bucket output from storage then
|
492
|
+
# wait for hash bucketing to finish before continuing
|
491
493
|
|
492
494
|
# create a new stream for this round
|
493
495
|
compacted_stream_locator = destination_partition_locator.stream_locator
|
@@ -497,6 +499,7 @@ def _execute_compaction_round(
|
|
497
499
|
compacted_stream_locator.table_version,
|
498
500
|
**deltacat_storage_kwargs,
|
499
501
|
)
|
502
|
+
|
500
503
|
partition = deltacat_storage.stage_partition(
|
501
504
|
stream,
|
502
505
|
destination_partition_locator.partition_values,
|
@@ -571,9 +574,9 @@ def _execute_compaction_round(
|
|
571
574
|
logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
|
572
575
|
|
573
576
|
compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
|
574
|
-
# TODO(pdames): when resources are freed during the last round of deduping
|
577
|
+
# TODO(pdames): when resources are freed during the last round of deduping,
|
575
578
|
# start running materialize tasks that read materialization source file
|
576
|
-
# tables from
|
579
|
+
# tables from storage then wait for deduping to finish before continuing
|
577
580
|
|
578
581
|
# TODO(pdames): balance inputs to materialization tasks to ensure that each
|
579
582
|
# task has an approximately equal amount of input to materialize
|
@@ -584,11 +587,7 @@ def _execute_compaction_round(
|
|
584
587
|
# parallel step 3:
|
585
588
|
# materialize records to keep by index
|
586
589
|
|
587
|
-
|
588
|
-
compaction_audit.audit_url,
|
589
|
-
str(json.dumps(compaction_audit)),
|
590
|
-
**s3_client_kwargs,
|
591
|
-
)
|
590
|
+
_upload_audit_data(audit_url, json.dumps(compaction_audit))
|
592
591
|
|
593
592
|
materialize_start = time.monotonic()
|
594
593
|
mat_tasks_pending = invoke_parallel(
|
@@ -610,7 +609,7 @@ def _execute_compaction_round(
|
|
610
609
|
enable_profiler=enable_profiler,
|
611
610
|
metrics_config=metrics_config,
|
612
611
|
read_kwargs_provider=read_kwargs_provider,
|
613
|
-
|
612
|
+
table_writer_kwargs=table_writer_kwargs,
|
614
613
|
object_store=object_store,
|
615
614
|
deltacat_storage=deltacat_storage,
|
616
615
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
@@ -693,11 +692,7 @@ def _execute_compaction_round(
|
|
693
692
|
telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
|
694
693
|
)
|
695
694
|
|
696
|
-
|
697
|
-
compaction_audit.audit_url,
|
698
|
-
str(json.dumps(compaction_audit)),
|
699
|
-
**s3_client_kwargs,
|
700
|
-
)
|
695
|
+
_upload_audit_data(audit_url, json.dumps(compaction_audit))
|
701
696
|
|
702
697
|
new_round_completion_info = RoundCompletionInfo.of(
|
703
698
|
last_stream_position_compacted,
|
@@ -710,6 +705,7 @@ def _execute_compaction_round(
|
|
710
705
|
hash_bucket_count,
|
711
706
|
None,
|
712
707
|
CompactorVersion.V1.value,
|
708
|
+
prev_source_partition_locator=rci_source_partition_locator,
|
713
709
|
)
|
714
710
|
|
715
711
|
logger.info(
|
@@ -721,17 +717,43 @@ def _execute_compaction_round(
|
|
721
717
|
return (
|
722
718
|
partition,
|
723
719
|
new_round_completion_info,
|
724
|
-
rcf_source_partition_locator,
|
725
720
|
)
|
726
721
|
|
727
722
|
|
728
723
|
def compact_partition_from_request(
    compact_partition_params: CompactPartitionParams,
    *compact_partition_pos_args,
) -> None:
    """
    Run compact_partition with arguments taken from a CompactPartitionParams object.

    Five values are read from the params object as attributes and passed
    positionally, in this order: source partition locator, destination
    partition locator, primary keys, compaction artifact path, and last
    stream position to compact. Any extra positional arguments follow them.
    The remaining entries of the params mapping are forwarded as keyword
    arguments.

    :param compact_partition_params: Dictionary-like object holding the
        compact_partition arguments.
    :param compact_partition_pos_args: Additional positional arguments
        forwarded to compact_partition after the five listed above.
    """
    params = compact_partition_params
    leading_args = (
        params.source_partition_locator,
        params.destination_partition_locator,
        params.primary_keys,
        params.compaction_artifact_path,
        params.last_stream_position_to_compact,
    )

    # Entries that are already passed positionally must not also be passed by
    # keyword. compaction_artifact_path is deliberately not listed: per the
    # original author's note it is a computed property, not an entry stored
    # in the mapping (not verifiable from this view).
    passed_positionally = (
        "source_partition_locator",
        "destination_partition_locator",
        "primary_keys",
        "last_stream_position_to_compact",
    )
    keyword_args = dict(params)
    for key in passed_positionally:
        keyword_args.pop(key, None)

    compact_partition(*leading_args, *compact_partition_pos_args, **keyword_args)
|