deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +297 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
- deltacat/compute/compactor/model/delta_annotated.py +95 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +4 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +22 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +509 -0
- deltacat/compute/compactor_v2/constants.py +37 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +143 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +469 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
- deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
- deltacat/compute/compactor_v2/utils/io.py +152 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
- deltacat/compute/compactor_v2/utils/task_options.py +221 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
- deltacat/tests/compute/testcases.py +395 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +49 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +83 -0
- deltacat/types/tables.py +6 -0
- deltacat/utils/arguments.py +25 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +218 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/aws/clients.py
CHANGED
```diff
@@ -1,6 +1,6 @@
 import logging
 from functools import lru_cache
-from typing import Optional
+from typing import Optional, FrozenSet
 from http import HTTPStatus
 
 import boto3
@@ -38,7 +38,7 @@ RETRYABLE_HTTP_STATUS_CODES = [
 ]
 
 
-class
+class RetryIfRetryableHTTPStatusCode(retry_if_exception):
     """
     Retry strategy that retries if the exception is an ``HTTPError`` with
     a status code in the retryable errors list.
@@ -72,6 +72,7 @@ def retrying_get(
     retry_strategy,
     wait_strategy,
     stop_strategy,
+    short_circuit_on_status: FrozenSet[int] = {HTTPStatus.OK},
 ) -> Optional[Response]:
     """Retries a request to the given URL until it succeeds.
 
@@ -86,6 +87,9 @@ def retrying_get(
         failed after the maximum number of retries.
     """
     try:
+        resp = _get_url(url)
+        if resp.status_code in short_circuit_on_status:
+            return resp
         for attempt in Retrying(
             retry=retry_strategy(),
             wait=wait_strategy,
@@ -103,7 +107,7 @@ def retrying_get(
 
 def block_until_instance_metadata_service_returns_success(
     url=INSTANCE_METADATA_SERVICE_IPV4_URI,
-    retry_strategy=
+    retry_strategy=RetryIfRetryableHTTPStatusCode,
     wait_strategy=wait_fixed(2),  # wait 2 seconds before retrying,
     stop_strategy=stop_after_delay(60 * 10),  # stop trying after 10 minutes
 ) -> Optional[Response]:
@@ -121,7 +125,14 @@ def block_until_instance_metadata_service_returns_success(
 
     https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
     """
-
+    # We will get a 403 HTTP status code if running deltacat not in an EC2 instance. In that case we won't want to block.
+    return retrying_get(
+        url,
+        retry_strategy,
+        wait_strategy,
+        stop_strategy,
+        short_circuit_on_status={HTTPStatus.OK, HTTPStatus.FORBIDDEN},
+    )
 
 
 def _get_session_from_kwargs(input_kwargs):
```
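Taken together, these hunks add a pre-retry probe: `retrying_get` now issues one unretried request and returns immediately if the status code is in `short_circuit_on_status`, and the instance-metadata check treats 403 (not running on EC2) as terminal instead of retrying for the full ten-minute stop strategy. A minimal standalone sketch of the same pattern, using `requests` and `tenacity` directly; the function name and the 60-second stop are illustrative, not deltacat's API:

```python
# Minimal sketch of the short-circuit-then-retry pattern, with illustrative names.
from http import HTTPStatus
from typing import FrozenSet, Optional

import requests
from tenacity import Retrying, stop_after_delay, wait_fixed


def fetch_with_short_circuit(
    url: str,
    short_circuit_on_status: FrozenSet[int] = frozenset({HTTPStatus.OK}),
) -> Optional[requests.Response]:
    # One unretried probe: if the status is already terminal (e.g. 403 when
    # not running on EC2), return immediately instead of blocking on retries.
    resp = requests.get(url)
    if resp.status_code in short_circuit_on_status:
        return resp
    # Otherwise retry every 2 seconds for up to 60 seconds.
    for attempt in Retrying(wait=wait_fixed(2), stop=stop_after_delay(60)):
        with attempt:
            resp = requests.get(url)
            resp.raise_for_status()
    return resp
```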
```diff
@@ -139,7 +150,7 @@ def _get_session_from_kwargs(input_kwargs):
 def _resource(name: str, region: Optional[str], **kwargs) -> ServiceResource:
     boto3_session = _get_session_from_kwargs(kwargs)
 
-    boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "
+    boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
     return boto3_session.resource(
         name,
         region,
@@ -156,7 +167,7 @@ def _client(name: str, region: Optional[str], **kwargs) -> BaseClient:
     # fall back for clients without an associated resource
     boto3_session = _get_session_from_kwargs(kwargs)
     boto_config = Config(
-        retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "
+        retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"}
     )
     return boto3_session.client(
         name,
```
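Both `_resource` and `_client` now build their botocore `Config` with the `adaptive` retry mode, which layers client-side rate limiting on top of the standard retry behavior. Constructing such a client outside deltacat looks like this (the `max_attempts` value shown is illustrative):

```python
import boto3
from botocore.config import Config

# "adaptive" adds client-side rate limiting over "standard" retries;
# max_attempts caps the total number of attempts per request.
config = Config(retries={"max_attempts": 10, "mode": "adaptive"})
s3_client = boto3.client("s3", config=config)
```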
deltacat/aws/redshift/model/manifest.py
CHANGED
```diff
@@ -170,6 +170,10 @@ class ManifestMeta(dict):
     def content_type_parameters(self) -> Optional[List[Dict[str, str]]]:
         return self.get("content_type_parameters")
 
+    @content_type_parameters.setter
+    def content_type_parameters(self, params: List[Dict[str, str]]) -> None:
+        self["content_type_parameters"] = params
+
     @property
     def credentials(self) -> Optional[Dict[str, str]]:
         return self.get("credentials")
```
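The new setter mirrors the existing read-only property: because `ManifestMeta` subclasses `dict`, attribute-style assignment just writes the backing key. A self-contained sketch of the same property/setter pattern (this `Meta` class is a stand-in, not deltacat's full model):

```python
from typing import Dict, List, Optional


class Meta(dict):
    @property
    def content_type_parameters(self) -> Optional[List[Dict[str, str]]]:
        return self.get("content_type_parameters")

    @content_type_parameters.setter
    def content_type_parameters(self, params: List[Dict[str, str]]) -> None:
        self["content_type_parameters"] = params


meta = Meta()
meta.content_type_parameters = [{"delimiter": ","}]
assert meta["content_type_parameters"] == [{"delimiter": ","}]  # plain dict key
```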
deltacat/aws/s3u.py
CHANGED
```diff
@@ -3,6 +3,8 @@ import multiprocessing
 from functools import partial
 from typing import Any, Callable, Dict, Generator, List, Optional, Union
 from uuid import uuid4
+from botocore.config import Config
+from deltacat.aws.constants import BOTO_MAX_RETRIES
 
 import pyarrow as pa
 import ray
@@ -39,6 +41,7 @@ from deltacat.types.tables import (
     TABLE_TYPE_TO_READER_FUNC,
     get_table_length,
 )
+from deltacat.types.partial_download import PartialFileDownloadParams
 from deltacat.utils.common import ReadKwargsProvider
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -197,6 +200,7 @@ def read_file(
     column_names: Optional[List[str]] = None,
     include_columns: Optional[List[str]] = None,
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
     **s3_client_kwargs,
 ) -> LocalTable:
 
@@ -209,6 +213,7 @@ def read_file(
             column_names,
             include_columns,
             file_reader_kwargs_provider,
+            partial_file_download_params,
             **s3_client_kwargs,
         )
         return table
@@ -217,6 +222,13 @@ def read_file(
             # Timeout error not caught by botocore
             raise RetryableError(f"Retry table download from: {s3_url}") from e
         raise NonRetryableError(f"Failed table download from: {s3_url}") from e
+    except BaseException as e:
+        logger.warn(
+            f"Read has failed for {s3_url} and content_type={content_type} "
+            f"and encoding={content_encoding}. Error: {e}",
+            exc_info=True,
+        )
+        raise e
 
 
 def upload_sliced_table(
@@ -385,14 +397,16 @@ def download_manifest_entry(
     content_encoding: Optional[ContentEncoding] = None,
 ) -> LocalTable:
 
+    conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
     s3_client_kwargs = (
         {
             "aws_access_key_id": token_holder["accessKeyId"],
             "aws_secret_access_key": token_holder["secretAccessKey"],
             "aws_session_token": token_holder["sessionToken"],
+            "config": conf,
         }
         if token_holder
-        else {}
+        else {"config": conf}
     )
     if not content_type:
         content_type = manifest_entry.meta.content_type
@@ -409,6 +423,14 @@ def download_manifest_entry(
     s3_url = manifest_entry.uri
     if s3_url is None:
         s3_url = manifest_entry.url
+
+    partial_file_download_params = None
+    if manifest_entry.meta and manifest_entry.meta.content_type_parameters:
+        for type_params in manifest_entry.meta.content_type_parameters:
+            if isinstance(type_params, PartialFileDownloadParams):
+                partial_file_download_params = type_params
+                break
+
     # @retry decorator can't be pickled by Ray, so wrap download in Retrying
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
@@ -424,6 +446,7 @@ def download_manifest_entry(
         column_names,
         include_columns,
         file_reader_kwargs_provider,
+        partial_file_download_params,
         **s3_client_kwargs,
     )
     return table
```
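`download_manifest_entry` now scans the manifest entry's `content_type_parameters` for the first `PartialFileDownloadParams` instance and threads it through `read_file` down to the reader. A sketch of that type-based selection, with stand-in classes rather than deltacat's real models:

```python
from typing import List, Optional


class ContentParams(dict):
    pass


class PartialParams(ContentParams):
    pass


def select_partial_params(
    params: Optional[List[ContentParams]],
) -> Optional[PartialParams]:
    # Return the first entry carrying partial-download hints, if any.
    for type_params in params or []:
        if isinstance(type_params, PartialParams):
            return type_params
    return None


assert select_partial_params(None) is None
assert select_partial_params([ContentParams(), PartialParams()]) is not None
```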
deltacat/compute/compactor/compaction_session.py
CHANGED
```diff
@@ -60,6 +60,9 @@ if importlib.util.find_spec("memray"):
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
+DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG: int = 1
+DEFAULT_PROPERTIES_ARG: Dict[str, Any] = {}
+
 
 def check_preconditions(
     source_partition_locator: PartitionLocator,
@@ -68,8 +71,11 @@ def check_preconditions(
     max_records_per_output_file: int,
     new_hash_bucket_count: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> int:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     assert (
         source_partition_locator.partition_values
         == destination_partition_locator.partition_values
@@ -88,6 +94,8 @@ def check_preconditions(
         source_partition_locator,
         sort_keys,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )
 
 
```
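Note the shape of the new parameter: `deltacat_storage_kwargs` defaults to `None` and is replaced with a fresh `{}` inside the function body, avoiding Python's shared-mutable-default pitfall. A small demonstration of the difference:

```python
# A mutable default is created once and shared across calls, so state leaks.
from typing import Any, Dict, Optional


def bad(kwargs: Dict[str, Any] = {}) -> Dict[str, Any]:
    kwargs["calls"] = kwargs.get("calls", 0) + 1
    return kwargs


def good(kwargs: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    if kwargs is None:
        kwargs = {}  # fresh dict on every call, as in the hunks above
    kwargs["calls"] = kwargs.get("calls", 0) + 1
    return kwargs


assert bad() == {"calls": 1}
assert bad() == {"calls": 2}  # leaked from the first call
assert good() == {"calls": 1}
assert good() == {"calls": 1}  # no leakage
```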
```diff
@@ -118,9 +126,11 @@ def compact_partition(
     object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
     s3_client_kwargs: Optional[Dict[str, Any]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     **kwargs,
 ) -> Optional[str]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     if not importlib.util.find_spec("memray"):
         logger.info(f"memray profiler not available, disabling all profiling")
         enable_profiler = False
@@ -162,6 +172,7 @@ def compact_partition(
             object_store,
             s3_client_kwargs,
             deltacat_storage,
+            deltacat_storage_kwargs,
             **kwargs,
         )
         if new_partition:
@@ -173,7 +184,9 @@ def compact_partition(
     round_completion_file_s3_url = None
     if partition:
         logger.info(f"Committing compacted partition to: {partition.locator}")
-        partition = deltacat_storage.commit_partition(
+        partition = deltacat_storage.commit_partition(
+            partition, **deltacat_storage_kwargs
+        )
         logger.info(f"Committed compacted partition: {partition}")
 
         round_completion_file_s3_url = rcf.write_round_completion_file(
```
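The same `deltacat_storage_kwargs` dict is now forwarded to every storage call in the session (`commit_partition` above, and `get_stream`, `stage_partition`, and `commit_delta` below), so pluggable storage backends can receive backend-specific arguments. A reduced sketch of the threading pattern, with an illustrative `Storage` stand-in rather than deltacat's storage interface:

```python
from typing import Any, Dict, Optional


class Storage:
    def commit_partition(self, partition: str, **kwargs: Any) -> str:
        return f"committed {partition} with {kwargs}"


def compact(
    partition: str,
    storage: Storage,
    storage_kwargs: Optional[Dict[str, Any]] = None,
) -> str:
    if storage_kwargs is None:
        storage_kwargs = {}
    # Every storage call in the session receives the same caller-supplied kwargs.
    return storage.commit_partition(partition, **storage_kwargs)


print(compact("2023-07-01", Storage(), {"db_file_path": "/tmp/local.db"}))
```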
```diff
@@ -210,15 +223,16 @@ def _execute_compaction_round(
     object_store: Optional[IObjectStore],
     s3_client_kwargs: Optional[Dict[str, Any]],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     **kwargs,
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     rcf_source_partition_locator = (
         rebase_source_partition_locator
         if rebase_source_partition_locator
         else source_partition_locator
     )
-
     base_audit_url = rcf_source_partition_locator.path(
         f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
     )
@@ -251,6 +265,8 @@ def _execute_compaction_round(
         records_per_compacted_file,
         hash_bucket_count,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )
 
     # sort primary keys to produce the same pk digest regardless of input order
@@ -330,7 +346,8 @@ def _execute_compaction_round(
         rebase_source_partition_locator,
         rebase_source_partition_high_watermark,
         deltacat_storage,
-
+        deltacat_storage_kwargs,
+        list_deltas_kwargs,
     )
 
     delta_discovery_end = time.monotonic()
@@ -363,6 +380,8 @@ def _execute_compaction_round(
             compaction_audit,
             hash_bucket_count,
             deltacat_storage=deltacat_storage,
+            deltacat_storage_kwargs=deltacat_storage_kwargs,
+            **kwargs,
         )
         if input_deltas_stats is None
         else io.limit_input_deltas(
@@ -373,6 +392,8 @@ def _execute_compaction_round(
             compaction_audit=compaction_audit,
             input_deltas_stats=input_deltas_stats,
             deltacat_storage=deltacat_storage,
+            deltacat_storage_kwargs=deltacat_storage_kwargs,
+            **kwargs,
         )
     )
 
@@ -400,9 +421,7 @@ def _execute_compaction_round(
         raise AssertionError(
             "Multiple rounds are not supported. Please increase the cluster size and run again."
         )
-
     hb_start = time.monotonic()
-
     hb_tasks_pending = invoke_parallel(
         items=uniform_deltas,
         ray_task=hb.hash_bucket,
@@ -418,8 +437,9 @@ def _execute_compaction_round(
         read_kwargs_provider=read_kwargs_provider,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        **kwargs,
     )
-
     hb_invoke_end = time.monotonic()
 
     logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
@@ -457,7 +477,6 @@ def _execute_compaction_round(
     )
 
     compaction_audit.set_input_records(total_hb_record_count.item())
-
     # TODO (pdames): when resources are freed during the last round of hash
     # bucketing, start running dedupe tasks that read existing dedupe
     # output from S3 then wait for hash bucketing to finish before continuing
@@ -468,13 +487,14 @@ def _execute_compaction_round(
         compacted_stream_locator.namespace,
         compacted_stream_locator.table_name,
         compacted_stream_locator.table_version,
+        **deltacat_storage_kwargs,
     )
     partition = deltacat_storage.stage_partition(
         stream,
         destination_partition_locator.partition_values,
+        **deltacat_storage_kwargs,
     )
     new_compacted_partition_locator = partition.locator
-
     # parallel step 2:
     # discover records with duplicate primary keys in each hash bucket, and
     # identify the index of records to keep or drop based on sort keys
@@ -483,7 +503,10 @@ def _execute_compaction_round(
 
     dedupe_start = time.monotonic()
     dd_max_parallelism = int(
-        max_parallelism
+        max_parallelism
+        * kwargs.get(
+            "dd_max_parallelism_ratio", DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG
+        )
     )
     logger.info(
         f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
@@ -527,7 +550,6 @@ def _execute_compaction_round(
     )
 
     compaction_audit.set_records_deduped(total_dd_record_count.item())
-
     all_mat_buckets_to_obj_id = defaultdict(list)
     for dd_result in dd_results:
         for (
@@ -541,7 +563,6 @@ def _execute_compaction_round(
     logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
 
     compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
-
     # TODO(pdames): when resources are freed during the last round of deduping
     # start running materialize tasks that read materialization source file
     # tables from S3 then wait for deduping to finish before continuing
@@ -562,7 +583,6 @@ def _execute_compaction_round(
     )
 
     materialize_start = time.monotonic()
-
    mat_tasks_pending = invoke_parallel(
         items=all_mat_buckets_to_obj_id.items(),
         ray_task=mat.materialize,
@@ -585,6 +605,7 @@ def _execute_compaction_round(
         s3_table_writer_kwargs=s3_table_writer_kwargs,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
     )
 
     materialize_invoke_end = time.monotonic()
@@ -630,7 +651,9 @@ def _execute_compaction_round(
         f" {record_info_msg}"
     )
     compacted_delta = deltacat_storage.commit_delta(
-        merged_delta,
+        merged_delta,
+        properties=kwargs.get("properties", DEFAULT_PROPERTIES_ARG),
+        **deltacat_storage_kwargs,
     )
     logger.info(f"Committed compacted delta: {compacted_delta}")
 
@@ -692,10 +715,11 @@ def _execute_compaction_round(
 
 def compact_partition_from_request(
     compact_partition_params: CompactPartitionParams,
+    *compact_partition_pos_args,
 ) -> Optional[str]:
     """
     Wrapper for compact_partition that allows for the compact_partition parameters to be
-    passed in as a custom dictionary-like CompactPartitionParams object.
+    passed in as a custom dictionary-like CompactPartitionParams object along with any compact_partition positional arguments.
     :param compact_partition_params:
     """
-    return compact_partition(**compact_partition_params)
+    return compact_partition(*compact_partition_pos_args, **compact_partition_params)
```