deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -28,7 +28,6 @@ from deltacat.catalog.model.catalog import ( # noqa: F401
     init,
 )
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.compute.compactor import SortKey, SortOrder
 from deltacat.storage import (
     DistributedDataset,
     LifecycleState,
@@ -37,13 +36,15 @@ from deltacat.storage import (
     LocalTable,
     Namespace,
     SchemaConsistencyType,
+    SortKey,
+    SortOrder,
 )
 from deltacat.types.media import ContentEncoding, ContentType, TableType
 from deltacat.types.tables import TableWriteMode

 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))

-__version__ = "0.1.18b13"
+__version__ = "0.1.18b15"


 __all__ = [
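The same relocation of `SortKey`/`SortOrder` appears throughout this release (the file list above shows `deltacat/storage/model/sort_key.py` added and `deltacat/compute/compactor/model/sort_key.py` removed). A minimal, hedged migration sketch for downstream imports, assuming only the old and new locations shown in this diff:

```python
# Hedged sketch: prefer the new deltacat.storage export (0.1.18b15+), falling back
# to the compactor export that 0.1.18b13 still provided.
try:
    from deltacat.storage import SortKey, SortOrder  # new location in this release
except ImportError:
    from deltacat.compute.compactor import SortKey, SortOrder  # pre-0.1.18b15 location

print(SortKey, SortOrder)
```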
deltacat/aws/clients.py CHANGED
@@ -1,22 +1,142 @@
 import logging
 from functools import lru_cache
-from typing import Optional
+from typing import Optional, FrozenSet
+from http import HTTPStatus

 import boto3
 from boto3.exceptions import ResourceNotExistsError
 from boto3.resources.base import ServiceResource
 from botocore.client import BaseClient
 from botocore.config import Config
+from requests.adapters import Response
+from tenacity import (
+    RetryError,
+    Retrying,
+    wait_fixed,
+    retry_if_exception,
+    stop_after_delay,
+)

 from deltacat import logs
 from deltacat.aws.constants import BOTO_MAX_RETRIES
+import requests
+

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

 BOTO3_PROFILE_NAME_KWARG_KEY = "boto3_profile_name"
+INSTANCE_METADATA_SERVICE_IPV4_URI = "http://169.254.169.254/latest/meta-data/"  # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+RETRYABLE_HTTP_STATUS_CODES = [
+    # 429
+    HTTPStatus.TOO_MANY_REQUESTS,
+    # 5xx
+    HTTPStatus.INTERNAL_SERVER_ERROR,
+    HTTPStatus.NOT_IMPLEMENTED,
+    HTTPStatus.BAD_GATEWAY,
+    HTTPStatus.SERVICE_UNAVAILABLE,
+    HTTPStatus.GATEWAY_TIMEOUT,
+]
+
+
+class RetryIfRetryableHTTPStatusCode(retry_if_exception):
+    """
+    Retry strategy that retries if the exception is an ``HTTPError`` with
+    a status code in the retryable errors list.
+    """
+
+    def __init__(self):
+        def is_retryable_error(exception):
+            return (
+                isinstance(exception, requests.exceptions.HTTPError)
+                and exception.response.status_code in RETRYABLE_HTTP_STATUS_CODES
+            )
+
+        super().__init__(predicate=is_retryable_error)
+
+
+def _log_attempt_number(retry_state):
+    """return the result of the last call attempt"""
+    logger.warning(f"Retrying: {retry_state.attempt_number}...")
+
+
+def _get_url(url: str, get_url_kwargs=None):
+    if get_url_kwargs is None:
+        get_url_kwargs = {}
+    resp = requests.get(url, **get_url_kwargs)
+    resp.raise_for_status()
+    return resp
+
+
+def retrying_get(
+    url: str,
+    retry_strategy,
+    wait_strategy,
+    stop_strategy,
+    short_circuit_on_status: FrozenSet[int] = {HTTPStatus.OK},
+) -> Optional[Response]:
+    """Retries a request to the given URL until it succeeds.
+
+    Args:
+        retry_strategy (Callable): A function that returns a retry strategy.
+        wait_strategy (Callable): A function that returns a wait strategy.
+        stop_strategy (Callable): A function that returns a stop strategy.
+        url (str): The URL to retry.
+
+    Returns:
+        Optional[Response]: The response from the URL, or None if the request
+        failed after the maximum number of retries.
+    """
+    try:
+        resp = _get_url(url)
+        if resp.status_code in short_circuit_on_status:
+            return resp
+        for attempt in Retrying(
+            retry=retry_strategy(),
+            wait=wait_strategy,
+            stop=stop_strategy,
+            after=_log_attempt_number,
+        ):
+            with attempt:
+                resp = _get_url(url)
+        return resp
+    except RetryError as re:
+        logger.error(f"Failed to retry URL: {url} - {re}")
+    logger.info(f"Unable to get from URL: {url}")
+    return None
+
+
+def block_until_instance_metadata_service_returns_success(
+    url=INSTANCE_METADATA_SERVICE_IPV4_URI,
+    retry_strategy=RetryIfRetryableHTTPStatusCode,
+    wait_strategy=wait_fixed(2),  # wait 2 seconds before retrying,
+    stop_strategy=stop_after_delay(60 * 10),  # stop trying after 10 minutes
+) -> Optional[Response]:
+    """Blocks until the instance metadata service returns a successful response.
+
+    Args:
+        retry_strategy (Callable): A function that returns a retry strategy.
+        wait_strategy (Callable): A function that returns a wait strategy.
+        stop_strategy (Callable): A function that returns a stop strategy.
+        url (str): The URL of the instance metadata service.
+
+    Returns:
+        Optional[Response]: The response from the instance metadata service,
+        or None if the request failed after the maximum number of retries.
+
+    https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+    """
+    # We will get a 403 HTTP status code if running deltacat not in an EC2 instance. In that case we won't want to block.
+    return retrying_get(
+        url,
+        retry_strategy,
+        wait_strategy,
+        stop_strategy,
+        short_circuit_on_status={HTTPStatus.OK, HTTPStatus.FORBIDDEN},
+    )


 def _get_session_from_kwargs(input_kwargs):
+    block_until_instance_metadata_service_returns_success()
     if input_kwargs.get(BOTO3_PROFILE_NAME_KWARG_KEY) is not None:
         boto3_session = boto3.Session(
             profile_name=input_kwargs.get(BOTO3_PROFILE_NAME_KWARG_KEY)
@@ -30,7 +150,7 @@ def _get_session_from_kwargs(input_kwargs):
 def _resource(name: str, region: Optional[str], **kwargs) -> ServiceResource:
     boto3_session = _get_session_from_kwargs(kwargs)

-    boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "
+    boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
     return boto3_session.resource(
         name,
         region,
@@ -47,7 +167,7 @@ def _client(name: str, region: Optional[str], **kwargs) -> BaseClient:
     # fall back for clients without an associated resource
     boto3_session = _get_session_from_kwargs(kwargs)
     boto_config = Config(
-        retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "
+        retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"}
     )
     return boto3_session.client(
         name,

deltacat/aws/redshift/model/manifest.py CHANGED
@@ -170,6 +170,10 @@ class ManifestMeta(dict):
     def content_type_parameters(self) -> Optional[List[Dict[str, str]]]:
         return self.get("content_type_parameters")

+    @content_type_parameters.setter
+    def content_type_parameters(self, params: List[Dict[str, str]]) -> None:
+        self["content_type_parameters"] = params
+
     @property
     def credentials(self) -> Optional[Dict[str, str]]:
         return self.get("credentials")
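A hedged usage sketch of the retry gate added to deltacat/aws/clients.py above; it assumes the helpers are importable exactly as introduced in this diff and uses the same tenacity strategy objects the new code uses. This is illustrative only, not part of the package:

```python
# Hedged sketch: exercise the new instance-metadata gate with tighter limits than
# the defaults shown above (2s fixed wait, 10 minute stop).
from http import HTTPStatus

from tenacity import stop_after_delay, wait_fixed

from deltacat.aws.clients import (
    RetryIfRetryableHTTPStatusCode,
    block_until_instance_metadata_service_returns_success,
    retrying_get,
)

# Per the comment in the diff, a 403 from the metadata service (e.g. when not on
# EC2) is treated as a short-circuit status rather than a retryable failure.
# Note: on hosts where 169.254.169.254 is unreachable, the underlying
# requests.get has no timeout, so this call may block on the initial connect.
resp = block_until_instance_metadata_service_returns_success(
    wait_strategy=wait_fixed(1),
    stop_strategy=stop_after_delay(30),
)
print(None if resp is None else resp.status_code)

# retrying_get is the general-purpose helper underneath; it retries only on the
# status codes listed in RETRYABLE_HTTP_STATUS_CODES and returns None on RetryError.
resp = retrying_get(
    "http://169.254.169.254/latest/meta-data/",
    RetryIfRetryableHTTPStatusCode,
    wait_fixed(2),
    stop_after_delay(60),
    short_circuit_on_status={HTTPStatus.OK, HTTPStatus.FORBIDDEN},
)
```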
deltacat/aws/s3u.py CHANGED
@@ -3,6 +3,8 @@ import multiprocessing
 from functools import partial
 from typing import Any, Callable, Dict, Generator, List, Optional, Union
 from uuid import uuid4
+from botocore.config import Config
+from deltacat.aws.constants import BOTO_MAX_RETRIES

 import pyarrow as pa
 import ray
@@ -39,6 +41,7 @@ from deltacat.types.tables import (
     TABLE_TYPE_TO_READER_FUNC,
     get_table_length,
 )
+from deltacat.types.partial_download import PartialFileDownloadParams
 from deltacat.utils.common import ReadKwargsProvider

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -197,6 +200,7 @@ def read_file(
     column_names: Optional[List[str]] = None,
     include_columns: Optional[List[str]] = None,
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
     **s3_client_kwargs,
 ) -> LocalTable:

@@ -209,6 +213,7 @@ def read_file(
             column_names,
             include_columns,
             file_reader_kwargs_provider,
+            partial_file_download_params,
             **s3_client_kwargs,
         )
         return table
@@ -217,6 +222,13 @@ def read_file(
             # Timeout error not caught by botocore
             raise RetryableError(f"Retry table download from: {s3_url}") from e
         raise NonRetryableError(f"Failed table download from: {s3_url}") from e
+    except BaseException as e:
+        logger.warn(
+            f"Read has failed for {s3_url} and content_type={content_type} "
+            f"and encoding={content_encoding}. Error: {e}",
+            exc_info=True,
+        )
+        raise e


 def upload_sliced_table(
@@ -385,14 +397,16 @@ def download_manifest_entry(
     content_encoding: Optional[ContentEncoding] = None,
 ) -> LocalTable:

+    conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
     s3_client_kwargs = (
         {
             "aws_access_key_id": token_holder["accessKeyId"],
             "aws_secret_access_key": token_holder["secretAccessKey"],
             "aws_session_token": token_holder["sessionToken"],
+            "config": conf,
         }
         if token_holder
-        else {}
+        else {"config": conf}
     )
     if not content_type:
         content_type = manifest_entry.meta.content_type
@@ -409,6 +423,14 @@ def download_manifest_entry(
     s3_url = manifest_entry.uri
     if s3_url is None:
         s3_url = manifest_entry.url
+
+    partial_file_download_params = None
+    if manifest_entry.meta and manifest_entry.meta.content_type_parameters:
+        for type_params in manifest_entry.meta.content_type_parameters:
+            if isinstance(type_params, PartialFileDownloadParams):
+                partial_file_download_params = type_params
+                break
+
     # @retry decorator can't be pickled by Ray, so wrap download in Retrying
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
@@ -424,6 +446,7 @@ def download_manifest_entry(
         column_names,
         include_columns,
         file_reader_kwargs_provider,
+        partial_file_download_params,
         **s3_client_kwargs,
     )
     return table

deltacat/benchmarking/benchmark_parquet_reads.py ADDED
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import pytest
+
+
+# Benchmarks for retrieving a single column in the Parquet file
+SINGLE_COLUMN_BENCHMARKS = {
+    "mvp": ("s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", ["a"]),
+    "TPCH-lineitems-200MB-2RG": (
+        "s3://daft-public-data/test_fixtures/parquet-dev/daft_200MB_lineitem_chunk.RG-2.parquet",
+        ["L_ORDERKEY"],
+    ),
+}
+
+# Benchmarks for retrieving all columns in the Parquet file
+ALL_COLUMN_BENCHMARKS = {
+    "mvp": ("s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", None),
+    "TPCH-lineitems-200MB-2RG": (
+        "s3://daft-public-data/test_fixtures/parquet-dev/daft_200MB_lineitem_chunk.RG-2.parquet",
+        None,
+    ),
+}
+
+
+@pytest.mark.benchmark(group="num_rowgroups_single_column")
+@pytest.mark.parametrize(
+    ["name", "path", "columns"],
+    [
+        (name, path, columns)
+        for name, (path, columns) in SINGLE_COLUMN_BENCHMARKS.items()
+    ],
+    ids=[name for name in SINGLE_COLUMN_BENCHMARKS],
+)
+def test_read_parquet_num_rowgroups_single_column(
+    name, path, columns, read_fn, benchmark
+):
+    data = benchmark(read_fn, path, columns=columns)
+    if columns is not None:
+        assert data.column_names == columns
+
+
+@pytest.mark.benchmark(group="num_rowgroups_all_columns")
+@pytest.mark.parametrize(
+    ["name", "path", "columns"],
+    [(name, path, columns) for name, (path, columns) in ALL_COLUMN_BENCHMARKS.items()],
+    ids=[name for name in ALL_COLUMN_BENCHMARKS],
)
+def test_read_parquet_num_rowgroups_all_columns(
+    name, path, columns, read_fn, benchmark
+):
+    data = benchmark(read_fn, path, columns=columns)
+    if columns is not None:
+        assert data.column_names == columns

deltacat/benchmarking/conftest.py ADDED
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import pyarrow as pa
+import pyarrow.fs as pafs
+import pyarrow.parquet as papq
+import pytest
+
+from deltacat.utils.pyarrow import s3_file_to_table
+from deltacat.types.media import (
+    ContentEncoding,
+    ContentType,
+)
+
+
+def pyarrow_read(path: str, columns: list[str] | None = None) -> pa.Table:
+    assert path.startswith(
+        "s3://"
+    ), f"Expected file path to start with 's3://', but got {path}."
+    fs = pafs.S3FileSystem()
+    path = path.replace("s3://", "")
+    return papq.read_table(path, columns=columns, filesystem=fs)
+
+
+def deltacat_read(path: str, columns: list[str] | None = None) -> pa.Table:
+    assert path.startswith("s3://")
+    return s3_file_to_table(
+        path,
+        content_type=ContentType.PARQUET,
+        content_encoding=ContentEncoding.IDENTITY,
+        column_names=None,  # Parquet files are schemaful
+        include_columns=columns,
+    )
+
+
+def daft_table_read(path: str, columns: list[str] | None = None) -> pa.Table:
+    try:
+        import daft
+    except ImportError:
+        raise ImportError(
+            "Daft not installed. Install Daft using pip to run these benchmarks: `pip install getdaft`"
+        )
+
+    tbl = daft.table.Table.read_parquet(path, columns=columns)
+    return tbl.to_arrow()
+
+
+@pytest.fixture(
+    params=[
+        daft_table_read,
+        pyarrow_read,
+        deltacat_read,
+    ],
+    ids=[
+        "daft_table",
+        "pyarrow",
+        "deltacat",
+    ],
+)
+def read_fn(request):
+    """Fixture which returns the function to read a PyArrow table from a path"""
+    return request.param
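A hedged sketch of how the two new benchmarking modules fit together: the `read_fn` fixture in conftest.py parametrizes each test in benchmark_parquet_reads.py across the three readers. It assumes pytest with the pytest-benchmark plugin and access to the public S3 fixtures; the readers can also be called directly:

```python
# Illustrative only; not part of the package. Requires S3 access to the
# daft-public-data fixtures referenced in benchmark_parquet_reads.py.
import pytest

from deltacat.benchmarking.conftest import pyarrow_read

tbl = pyarrow_read(
    "s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", columns=["a"]
)
print(tbl.num_rows, tbl.column_names)

# Run the benchmarks themselves (path is relative to the source checkout);
# --benchmark-only and --benchmark-group-by are pytest-benchmark options.
pytest.main(
    [
        "deltacat/benchmarking/benchmark_parquet_reads.py",
        "--benchmark-only",
        "--benchmark-group-by=group",
    ]
)
```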
deltacat/catalog/delegate.py CHANGED
@@ -5,7 +5,7 @@ import ray

 from deltacat.catalog.model.catalog import Catalog, all_catalogs
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.namespace import Namespace
 from deltacat.storage.model.types import (
deltacat/catalog/interface.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Set, Union
 import pyarrow as pa

 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.namespace import Namespace
 from deltacat.storage.model.types import (

deltacat/compute/compactor/__init__.py CHANGED
@@ -13,7 +13,6 @@ from deltacat.compute.compactor.model.round_completion_info import (
     RoundCompletionInfo,
     HighWatermark,
 )
-from deltacat.compute.compactor.model.sort_key import SortKey, SortOrder

 __all__ = [
     "DeltaAnnotated",
@@ -27,6 +26,4 @@ __all__ = [
     "PyArrowWriteResult",
     "RoundCompletionInfo",
     "HighWatermark",
-    "SortKey",
-    "SortOrder",
 ]

deltacat/compute/compactor/compaction_session.py CHANGED
@@ -12,8 +12,8 @@ import pyarrow as pa
 from deltacat.compute.compactor import (
     PyArrowWriteResult,
     RoundCompletionInfo,
-    SortKey,
 )
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
 from deltacat.io.object_store import IObjectStore
@@ -50,6 +50,7 @@ from deltacat.utils.metrics import MetricsConfig
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
+from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes


@@ -59,6 +60,9 @@ if importlib.util.find_spec("memray"):

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

+DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG: int = 1
+DEFAULT_PROPERTIES_ARG: Dict[str, Any] = {}
+

 def check_preconditions(
     source_partition_locator: PartitionLocator,
@@ -67,8 +71,11 @@ def check_preconditions(
     max_records_per_output_file: int,
     new_hash_bucket_count: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> int:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     assert (
         source_partition_locator.partition_values
         == destination_partition_locator.partition_values
@@ -83,10 +90,12 @@ def check_preconditions(
     assert (
         new_hash_bucket_count >= 1
     ), "New hash bucket count must be a positive value"
-    return
+    return validate_sort_keys(
         source_partition_locator,
         sort_keys,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )


@@ -117,9 +126,11 @@ def compact_partition(
     object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
     s3_client_kwargs: Optional[Dict[str, Any]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     **kwargs,
 ) -> Optional[str]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     if not importlib.util.find_spec("memray"):
         logger.info(f"memray profiler not available, disabling all profiling")
         enable_profiler = False
@@ -161,6 +172,7 @@ def compact_partition(
             object_store,
             s3_client_kwargs,
             deltacat_storage,
+            deltacat_storage_kwargs,
             **kwargs,
         )
         if new_partition:
@@ -172,7 +184,9 @@ def compact_partition(
     round_completion_file_s3_url = None
     if partition:
         logger.info(f"Committing compacted partition to: {partition.locator}")
-        partition = deltacat_storage.commit_partition(
+        partition = deltacat_storage.commit_partition(
+            partition, **deltacat_storage_kwargs
+        )
         logger.info(f"Committed compacted partition: {partition}")

         round_completion_file_s3_url = rcf.write_round_completion_file(
@@ -209,15 +223,16 @@ def _execute_compaction_round(
     object_store: Optional[IObjectStore],
     s3_client_kwargs: Optional[Dict[str, Any]],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     **kwargs,
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     rcf_source_partition_locator = (
         rebase_source_partition_locator
         if rebase_source_partition_locator
         else source_partition_locator
     )
-
     base_audit_url = rcf_source_partition_locator.path(
         f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
     )
@@ -250,6 +265,8 @@ def _execute_compaction_round(
         records_per_compacted_file,
         hash_bucket_count,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )

     # sort primary keys to produce the same pk digest regardless of input order
@@ -329,7 +346,8 @@ def _execute_compaction_round(
         rebase_source_partition_locator,
         rebase_source_partition_high_watermark,
         deltacat_storage,
-
+        deltacat_storage_kwargs,
+        list_deltas_kwargs,
     )

     delta_discovery_end = time.monotonic()
@@ -362,6 +380,8 @@ def _execute_compaction_round(
             compaction_audit,
             hash_bucket_count,
             deltacat_storage=deltacat_storage,
+            deltacat_storage_kwargs=deltacat_storage_kwargs,
+            **kwargs,
         )
         if input_deltas_stats is None
         else io.limit_input_deltas(
@@ -372,6 +392,8 @@ def _execute_compaction_round(
             compaction_audit=compaction_audit,
             input_deltas_stats=input_deltas_stats,
             deltacat_storage=deltacat_storage,
+            deltacat_storage_kwargs=deltacat_storage_kwargs,
+            **kwargs,
         )
     )

@@ -399,9 +421,7 @@ def _execute_compaction_round(
         raise AssertionError(
             "Multiple rounds are not supported. Please increase the cluster size and run again."
         )
-
     hb_start = time.monotonic()
-
     hb_tasks_pending = invoke_parallel(
         items=uniform_deltas,
         ray_task=hb.hash_bucket,
@@ -417,8 +437,9 @@ def _execute_compaction_round(
         read_kwargs_provider=read_kwargs_provider,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        **kwargs,
     )
-
     hb_invoke_end = time.monotonic()

     logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
@@ -456,7 +477,6 @@ def _execute_compaction_round(
     )

     compaction_audit.set_input_records(total_hb_record_count.item())
-
     # TODO (pdames): when resources are freed during the last round of hash
     #  bucketing, start running dedupe tasks that read existing dedupe
     #  output from S3 then wait for hash bucketing to finish before continuing
@@ -467,13 +487,14 @@ def _execute_compaction_round(
         compacted_stream_locator.namespace,
         compacted_stream_locator.table_name,
         compacted_stream_locator.table_version,
+        **deltacat_storage_kwargs,
     )
     partition = deltacat_storage.stage_partition(
         stream,
         destination_partition_locator.partition_values,
+        **deltacat_storage_kwargs,
     )
     new_compacted_partition_locator = partition.locator
-
     # parallel step 2:
     # discover records with duplicate primary keys in each hash bucket, and
     # identify the index of records to keep or drop based on sort keys
@@ -482,7 +503,10 @@ def _execute_compaction_round(

     dedupe_start = time.monotonic()
     dd_max_parallelism = int(
-        max_parallelism
+        max_parallelism
+        * kwargs.get(
+            "dd_max_parallelism_ratio", DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG
+        )
     )
     logger.info(
         f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
@@ -526,7 +550,6 @@ def _execute_compaction_round(
     )

     compaction_audit.set_records_deduped(total_dd_record_count.item())
-
     all_mat_buckets_to_obj_id = defaultdict(list)
     for dd_result in dd_results:
         for (
@@ -540,7 +563,6 @@ def _execute_compaction_round(
     logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")

     compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
-
     # TODO(pdames): when resources are freed during the last round of deduping
     #  start running materialize tasks that read materialization source file
     #  tables from S3 then wait for deduping to finish before continuing
@@ -561,7 +583,6 @@ def _execute_compaction_round(
     )

     materialize_start = time.monotonic()
-
     mat_tasks_pending = invoke_parallel(
         items=all_mat_buckets_to_obj_id.items(),
         ray_task=mat.materialize,
@@ -584,6 +605,7 @@ def _execute_compaction_round(
         s3_table_writer_kwargs=s3_table_writer_kwargs,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
     )

     materialize_invoke_end = time.monotonic()
@@ -629,7 +651,9 @@ def _execute_compaction_round(
         f" {record_info_msg}"
     )
     compacted_delta = deltacat_storage.commit_delta(
-        merged_delta,
+        merged_delta,
+        properties=kwargs.get("properties", DEFAULT_PROPERTIES_ARG),
+        **deltacat_storage_kwargs,
     )
     logger.info(f"Committed compacted delta: {compacted_delta}")

@@ -691,10 +715,11 @@ def _execute_compaction_round(

 def compact_partition_from_request(
     compact_partition_params: CompactPartitionParams,
+    *compact_partition_pos_args,
 ) -> Optional[str]:
     """
     Wrapper for compact_partition that allows for the compact_partition parameters to be
-    passed in as a custom dictionary-like CompactPartitionParams object.
+    passed in as a custom dictionary-like CompactPartitionParams object along with any compact_partition positional arguments.
     :param compact_partition_params:
     """
-    return compact_partition(**compact_partition_params)
+    return compact_partition(*compact_partition_pos_args, **compact_partition_params)
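Among the keyword arguments now threaded through the compactor above, `dd_max_parallelism_ratio` and `properties` are new caller-facing knobs: the former scales dedupe parallelism relative to `max_parallelism`, the latter is forwarded to `commit_delta`. A small standalone sketch of the dedupe-parallelism arithmetic exactly as it appears in the diff; the helper function is illustrative, not a deltacat API:

```python
# Reproduces the dd_max_parallelism computation from compaction_session.py above.
DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG: int = 1


def dedupe_parallelism(max_parallelism: int, **kwargs) -> int:
    return int(
        max_parallelism
        * kwargs.get(
            "dd_max_parallelism_ratio", DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG
        )
    )


assert dedupe_parallelism(100) == 100  # default ratio of 1 leaves parallelism unchanged
assert dedupe_parallelism(100, dd_max_parallelism_ratio=0.5) == 50  # run half as many dedupe tasks
```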