deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. deltacat/__init__.py +3 -2
  2. deltacat/aws/clients.py +123 -3
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
  6. deltacat/benchmarking/conftest.py +61 -0
  7. deltacat/catalog/delegate.py +1 -1
  8. deltacat/catalog/interface.py +1 -1
  9. deltacat/compute/compactor/__init__.py +0 -3
  10. deltacat/compute/compactor/compaction_session.py +45 -20
  11. deltacat/compute/compactor/model/compact_partition_params.py +287 -58
  12. deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
  13. deltacat/compute/compactor/model/delta_annotated.py +91 -9
  14. deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
  15. deltacat/compute/compactor/model/primary_key_index.py +1 -1
  16. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  17. deltacat/compute/compactor/repartition_session.py +5 -3
  18. deltacat/compute/compactor/steps/dedupe.py +10 -8
  19. deltacat/compute/compactor/steps/hash_bucket.py +25 -4
  20. deltacat/compute/compactor/steps/materialize.py +11 -6
  21. deltacat/compute/compactor/steps/repartition.py +16 -1
  22. deltacat/compute/compactor/utils/io.py +40 -23
  23. deltacat/compute/compactor/utils/primary_key_index.py +1 -15
  24. deltacat/compute/compactor/utils/sort_key.py +57 -0
  25. deltacat/compute/compactor/utils/system_columns.py +43 -0
  26. deltacat/compute/compactor_v2/compaction_session.py +506 -0
  27. deltacat/compute/compactor_v2/constants.py +34 -0
  28. deltacat/compute/compactor_v2/model/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  30. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  31. deltacat/compute/compactor_v2/model/merge_input.py +127 -0
  32. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  33. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  34. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  35. deltacat/compute/compactor_v2/steps/merge.py +41 -0
  36. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  37. deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
  38. deltacat/compute/compactor_v2/utils/io.py +149 -0
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
  40. deltacat/compute/compactor_v2/utils/task_options.py +228 -0
  41. deltacat/compute/metastats/meta_stats.py +4 -2
  42. deltacat/compute/metastats/stats.py +1 -0
  43. deltacat/compute/metastats/utils/io.py +4 -0
  44. deltacat/compute/stats/utils/io.py +20 -5
  45. deltacat/exceptions.py +4 -0
  46. deltacat/io/memcached_object_store.py +37 -14
  47. deltacat/logs.py +4 -3
  48. deltacat/storage/__init__.py +3 -0
  49. deltacat/storage/interface.py +11 -2
  50. deltacat/storage/model/sort_key.py +33 -0
  51. deltacat/storage/model/table_version.py +11 -0
  52. deltacat/storage/model/types.py +2 -1
  53. deltacat/tests/aws/__init__.py +0 -0
  54. deltacat/tests/aws/test_clients.py +80 -0
  55. deltacat/tests/compute/__init__.py +0 -0
  56. deltacat/tests/compute/common.py +96 -0
  57. deltacat/tests/compute/compactor/__init__.py +0 -0
  58. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  59. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
  60. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  61. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  62. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  63. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  64. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
  65. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  66. deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
  67. deltacat/tests/compute/testcases.py +390 -0
  68. deltacat/tests/io/test_memcached_object_store.py +5 -4
  69. deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
  70. deltacat/tests/test_utils/pyarrow.py +32 -0
  71. deltacat/tests/test_utils/utils.py +13 -0
  72. deltacat/tests/utils/data/__init__.py +0 -0
  73. deltacat/tests/utils/test_daft.py +76 -0
  74. deltacat/tests/utils/test_pyarrow.py +133 -0
  75. deltacat/tests/utils/test_resources.py +23 -20
  76. deltacat/types/media.py +1 -0
  77. deltacat/types/partial_download.py +82 -0
  78. deltacat/types/tables.py +1 -0
  79. deltacat/utils/arguments.py +26 -0
  80. deltacat/utils/daft.py +87 -0
  81. deltacat/utils/performance.py +4 -2
  82. deltacat/utils/placement.py +20 -3
  83. deltacat/utils/pyarrow.py +213 -1
  84. deltacat/utils/ray_utils/concurrency.py +26 -1
  85. deltacat/utils/resources.py +72 -1
  86. deltacat/utils/s3fs.py +21 -0
  87. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
  88. deltacat-0.1.18b15.dist-info/RECORD +176 -0
  89. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
  90. deltacat/compute/compactor/model/sort_key.py +0 -98
  91. deltacat-0.1.18b13.dist-info/RECORD +0 -136
  92. /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
  93. /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
  94. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
  95. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -28,7 +28,6 @@ from deltacat.catalog.model.catalog import ( # noqa: F401
     init,
 )
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.compute.compactor import SortKey, SortOrder
 from deltacat.storage import (
     DistributedDataset,
     LifecycleState,
@@ -37,13 +36,15 @@ from deltacat.storage import (
     LocalTable,
     Namespace,
     SchemaConsistencyType,
+    SortKey,
+    SortOrder,
 )
 from deltacat.types.media import ContentEncoding, ContentType, TableType
 from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "0.1.18b13"
+__version__ = "0.1.18b15"
 
 
 __all__ = [
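The net effect of this hunk is that SortKey and SortOrder now reach the top-level deltacat namespace through deltacat.storage instead of deltacat.compute.compactor. A minimal import sketch for callers upgrading to 0.1.18b15 (the SortKey.of signature and the SortOrder member below are assumed to match the 0.1.18b13 interface, which this diff does not show):

    # 0.1.18b13: from deltacat.compute.compactor import SortKey, SortOrder
    # 0.1.18b15: both names now come from the storage package
    from deltacat.storage import SortKey, SortOrder

    # assumed to keep the previous SortKey.of(key_name, sort_order) interface
    sort_key = SortKey.of("last_updated_ts", SortOrder.DESCENDING)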
deltacat/aws/clients.py CHANGED
@@ -1,22 +1,142 @@
 import logging
 from functools import lru_cache
-from typing import Optional
+from typing import Optional, FrozenSet
+from http import HTTPStatus
 
 import boto3
 from boto3.exceptions import ResourceNotExistsError
 from boto3.resources.base import ServiceResource
 from botocore.client import BaseClient
 from botocore.config import Config
+from requests.adapters import Response
+from tenacity import (
+    RetryError,
+    Retrying,
+    wait_fixed,
+    retry_if_exception,
+    stop_after_delay,
+)
 
 from deltacat import logs
 from deltacat.aws.constants import BOTO_MAX_RETRIES
+import requests
+
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 BOTO3_PROFILE_NAME_KWARG_KEY = "boto3_profile_name"
+INSTANCE_METADATA_SERVICE_IPV4_URI = "http://169.254.169.254/latest/meta-data/"  # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+RETRYABLE_HTTP_STATUS_CODES = [
+    # 429
+    HTTPStatus.TOO_MANY_REQUESTS,
+    # 5xx
+    HTTPStatus.INTERNAL_SERVER_ERROR,
+    HTTPStatus.NOT_IMPLEMENTED,
+    HTTPStatus.BAD_GATEWAY,
+    HTTPStatus.SERVICE_UNAVAILABLE,
+    HTTPStatus.GATEWAY_TIMEOUT,
+]
+
+
+class RetryIfRetryableHTTPStatusCode(retry_if_exception):
+    """
+    Retry strategy that retries if the exception is an ``HTTPError`` with
+    a status code in the retryable errors list.
+    """
+
+    def __init__(self):
+        def is_retryable_error(exception):
+            return (
+                isinstance(exception, requests.exceptions.HTTPError)
+                and exception.response.status_code in RETRYABLE_HTTP_STATUS_CODES
+            )
+
+        super().__init__(predicate=is_retryable_error)
+
+
+def _log_attempt_number(retry_state):
+    """return the result of the last call attempt"""
+    logger.warning(f"Retrying: {retry_state.attempt_number}...")
+
+
+def _get_url(url: str, get_url_kwargs=None):
+    if get_url_kwargs is None:
+        get_url_kwargs = {}
+    resp = requests.get(url, **get_url_kwargs)
+    resp.raise_for_status()
+    return resp
+
+
+def retrying_get(
+    url: str,
+    retry_strategy,
+    wait_strategy,
+    stop_strategy,
+    short_circuit_on_status: FrozenSet[int] = {HTTPStatus.OK},
+) -> Optional[Response]:
+    """Retries a request to the given URL until it succeeds.
+
+    Args:
+        retry_strategy (Callable): A function that returns a retry strategy.
+        wait_strategy (Callable): A function that returns a wait strategy.
+        stop_strategy (Callable): A function that returns a stop strategy.
+        url (str): The URL to retry.
+
+    Returns:
+        Optional[Response]: The response from the URL, or None if the request
+            failed after the maximum number of retries.
+    """
+    try:
+        resp = _get_url(url)
+        if resp.status_code in short_circuit_on_status:
+            return resp
+        for attempt in Retrying(
+            retry=retry_strategy(),
+            wait=wait_strategy,
+            stop=stop_strategy,
+            after=_log_attempt_number,
+        ):
+            with attempt:
+                resp = _get_url(url)
+        return resp
+    except RetryError as re:
+        logger.error(f"Failed to retry URL: {url} - {re}")
+    logger.info(f"Unable to get from URL: {url}")
+    return None
+
+
+def block_until_instance_metadata_service_returns_success(
+    url=INSTANCE_METADATA_SERVICE_IPV4_URI,
+    retry_strategy=RetryIfRetryableHTTPStatusCode,
+    wait_strategy=wait_fixed(2),  # wait 2 seconds before retrying
+    stop_strategy=stop_after_delay(60 * 10),  # stop trying after 10 minutes
+) -> Optional[Response]:
+    """Blocks until the instance metadata service returns a successful response.
+
+    Args:
+        retry_strategy (Callable): A function that returns a retry strategy.
+        wait_strategy (Callable): A function that returns a wait strategy.
+        stop_strategy (Callable): A function that returns a stop strategy.
+        url (str): The URL of the instance metadata service.
+
+    Returns:
+        Optional[Response]: The response from the instance metadata service,
+            or None if the request failed after the maximum number of retries.
+
+    https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+    """
+    # We will get a 403 HTTP status code if deltacat is not running on an EC2 instance. In that case we don't want to block.
+    return retrying_get(
+        url,
+        retry_strategy,
+        wait_strategy,
+        stop_strategy,
+        short_circuit_on_status={HTTPStatus.OK, HTTPStatus.FORBIDDEN},
+    )
 
 
 def _get_session_from_kwargs(input_kwargs):
+    block_until_instance_metadata_service_returns_success()
     if input_kwargs.get(BOTO3_PROFILE_NAME_KWARG_KEY) is not None:
         boto3_session = boto3.Session(
             profile_name=input_kwargs.get(BOTO3_PROFILE_NAME_KWARG_KEY)
@@ -30,7 +150,7 @@ def _get_session_from_kwargs(input_kwargs):
 def _resource(name: str, region: Optional[str], **kwargs) -> ServiceResource:
     boto3_session = _get_session_from_kwargs(kwargs)
 
-    boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "standard"})
+    boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
     return boto3_session.resource(
         name,
         region,
@@ -47,7 +167,7 @@ def _client(name: str, region: Optional[str], **kwargs) -> BaseClient:
     # fall back for clients without an associated resource
     boto3_session = _get_session_from_kwargs(kwargs)
     boto_config = Config(
-        retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "standard"}
+        retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"}
    )
     return boto3_session.client(
         name,
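For readers tracing the new behavior: every boto3 session created through deltacat.aws.clients now calls block_until_instance_metadata_service_returns_success() first, which polls the EC2 instance metadata endpoint with the tenacity retry, wait, and stop strategies defined above and short-circuits on HTTP 200 or 403 (403 being the expected answer on hosts that are not EC2 instances). A usage sketch relying only on the defaults defined in this diff:

    from deltacat.aws.clients import (
        block_until_instance_metadata_service_returns_success,
    )

    # Polls http://169.254.169.254/latest/meta-data/ every 2 seconds for up to
    # 10 minutes; returns the Response on HTTP 200 or 403, or None if every
    # retry is exhausted without a success.
    resp = block_until_instance_metadata_service_returns_success()
    if resp is None:
        print("instance metadata service never became reachable")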
deltacat/aws/redshift/model/manifest.py CHANGED
@@ -170,6 +170,10 @@ class ManifestMeta(dict):
     def content_type_parameters(self) -> Optional[List[Dict[str, str]]]:
         return self.get("content_type_parameters")
 
+    @content_type_parameters.setter
+    def content_type_parameters(self, params: List[Dict[str, str]]) -> None:
+        self["content_type_parameters"] = params
+
     @property
     def credentials(self) -> Optional[Dict[str, str]]:
         return self.get("credentials")
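The new setter makes ManifestMeta.content_type_parameters writable after construction, letting callers attach content-type parameters (for example, partial-download hints) to existing manifest metadata. A small illustrative sketch; the parameter payload below is hypothetical:

    meta = ManifestMeta()  # ManifestMeta subclasses dict, per the hunk header above
    meta.content_type_parameters = [{"example_param": "example_value"}]  # hypothetical payload
    assert meta["content_type_parameters"] == [{"example_param": "example_value"}]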
deltacat/aws/s3u.py CHANGED
@@ -3,6 +3,8 @@ import multiprocessing
 from functools import partial
 from typing import Any, Callable, Dict, Generator, List, Optional, Union
 from uuid import uuid4
+from botocore.config import Config
+from deltacat.aws.constants import BOTO_MAX_RETRIES
 
 import pyarrow as pa
 import ray
@@ -39,6 +41,7 @@ from deltacat.types.tables import (
     TABLE_TYPE_TO_READER_FUNC,
     get_table_length,
 )
+from deltacat.types.partial_download import PartialFileDownloadParams
 from deltacat.utils.common import ReadKwargsProvider
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -197,6 +200,7 @@ def read_file(
     column_names: Optional[List[str]] = None,
     include_columns: Optional[List[str]] = None,
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
     **s3_client_kwargs,
 ) -> LocalTable:
 
@@ -209,6 +213,7 @@
             column_names,
             include_columns,
             file_reader_kwargs_provider,
+            partial_file_download_params,
             **s3_client_kwargs,
         )
         return table
@@ -217,6 +222,13 @@
             # Timeout error not caught by botocore
             raise RetryableError(f"Retry table download from: {s3_url}") from e
         raise NonRetryableError(f"Failed table download from: {s3_url}") from e
+    except BaseException as e:
+        logger.warn(
+            f"Read has failed for {s3_url} and content_type={content_type} "
+            f"and encoding={content_encoding}. Error: {e}",
+            exc_info=True,
+        )
+        raise e
 
 
 def upload_sliced_table(
@@ -385,14 +397,16 @@ def download_manifest_entry(
     content_encoding: Optional[ContentEncoding] = None,
 ) -> LocalTable:
 
+    conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
     s3_client_kwargs = (
         {
             "aws_access_key_id": token_holder["accessKeyId"],
             "aws_secret_access_key": token_holder["secretAccessKey"],
             "aws_session_token": token_holder["sessionToken"],
+            "config": conf,
         }
         if token_holder
-        else {}
+        else {"config": conf}
     )
     if not content_type:
         content_type = manifest_entry.meta.content_type
@@ -409,6 +423,14 @@
     s3_url = manifest_entry.uri
     if s3_url is None:
         s3_url = manifest_entry.url
+
+    partial_file_download_params = None
+    if manifest_entry.meta and manifest_entry.meta.content_type_parameters:
+        for type_params in manifest_entry.meta.content_type_parameters:
+            if isinstance(type_params, PartialFileDownloadParams):
+                partial_file_download_params = type_params
+                break
+
     # @retry decorator can't be pickled by Ray, so wrap download in Retrying
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
@@ -424,6 +446,7 @@
         column_names,
         include_columns,
         file_reader_kwargs_provider,
+        partial_file_download_params,
         **s3_client_kwargs,
     )
     return table
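Two behavioral notes on this file: manifest-entry downloads now build their S3 clients with botocore's adaptive retry mode, and download_manifest_entry scans manifest_entry.meta.content_type_parameters for a PartialFileDownloadParams instance to forward into read_file. A brief standalone sketch of the equivalent client configuration (plain boto3, not deltacat API; the retry count shown is illustrative, the real value comes from deltacat.aws.constants.BOTO_MAX_RETRIES):

    import boto3
    from botocore.config import Config

    # mirrors the Config built in download_manifest_entry above
    conf = Config(retries={"max_attempts": 15, "mode": "adaptive"})  # 15 is illustrative
    s3 = boto3.client("s3", config=conf)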
deltacat/benchmarking/benchmark_parquet_reads.py ADDED
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import pytest
+
+
+# Benchmarks for retrieving a single column in the Parquet file
+SINGLE_COLUMN_BENCHMARKS = {
+    "mvp": ("s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", ["a"]),
+    "TPCH-lineitems-200MB-2RG": (
+        "s3://daft-public-data/test_fixtures/parquet-dev/daft_200MB_lineitem_chunk.RG-2.parquet",
+        ["L_ORDERKEY"],
+    ),
+}
+
+# Benchmarks for retrieving all columns in the Parquet file
+ALL_COLUMN_BENCHMARKS = {
+    "mvp": ("s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", None),
+    "TPCH-lineitems-200MB-2RG": (
+        "s3://daft-public-data/test_fixtures/parquet-dev/daft_200MB_lineitem_chunk.RG-2.parquet",
+        None,
+    ),
+}
+
+
+@pytest.mark.benchmark(group="num_rowgroups_single_column")
+@pytest.mark.parametrize(
+    ["name", "path", "columns"],
+    [
+        (name, path, columns)
+        for name, (path, columns) in SINGLE_COLUMN_BENCHMARKS.items()
+    ],
+    ids=[name for name in SINGLE_COLUMN_BENCHMARKS],
+)
+def test_read_parquet_num_rowgroups_single_column(
+    name, path, columns, read_fn, benchmark
+):
+    data = benchmark(read_fn, path, columns=columns)
+    if columns is not None:
+        assert data.column_names == columns
+
+
+@pytest.mark.benchmark(group="num_rowgroups_all_columns")
+@pytest.mark.parametrize(
+    ["name", "path", "columns"],
+    [(name, path, columns) for name, (path, columns) in ALL_COLUMN_BENCHMARKS.items()],
+    ids=[name for name in ALL_COLUMN_BENCHMARKS],
+)
+def test_read_parquet_num_rowgroups_all_columns(
+    name, path, columns, read_fn, benchmark
+):
+    data = benchmark(read_fn, path, columns=columns)
+    if columns is not None:
+        assert data.column_names == columns
deltacat/benchmarking/conftest.py ADDED
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import pyarrow as pa
+import pyarrow.fs as pafs
+import pyarrow.parquet as papq
+import pytest
+
+from deltacat.utils.pyarrow import s3_file_to_table
+from deltacat.types.media import (
+    ContentEncoding,
+    ContentType,
+)
+
+
+def pyarrow_read(path: str, columns: list[str] | None = None) -> pa.Table:
+    assert path.startswith(
+        "s3://"
+    ), f"Expected file path to start with 's3://', but got {path}."
+    fs = pafs.S3FileSystem()
+    path = path.replace("s3://", "")
+    return papq.read_table(path, columns=columns, filesystem=fs)
+
+
+def deltacat_read(path: str, columns: list[str] | None = None) -> pa.Table:
+    assert path.startswith("s3://")
+    return s3_file_to_table(
+        path,
+        content_type=ContentType.PARQUET,
+        content_encoding=ContentEncoding.IDENTITY,
+        column_names=None,  # Parquet files are schemaful
+        include_columns=columns,
+    )
+
+
+def daft_table_read(path: str, columns: list[str] | None = None) -> pa.Table:
+    try:
+        import daft
+    except ImportError:
+        raise ImportError(
+            "Daft not installed. Install Daft using pip to run these benchmarks: `pip install getdaft`"
+        )
+
+    tbl = daft.table.Table.read_parquet(path, columns=columns)
+    return tbl.to_arrow()
+
+
+@pytest.fixture(
+    params=[
+        daft_table_read,
+        pyarrow_read,
+        deltacat_read,
+    ],
+    ids=[
+        "daft_table",
+        "pyarrow",
+        "deltacat",
+    ],
+)
+def read_fn(request):
+    """Fixture which returns the function to read a PyArrow table from a path"""
+    return request.param
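The conftest above supplies a read_fn fixture parametrized over three readers (Daft, plain PyArrow, and deltacat's s3_file_to_table), so each benchmark in benchmark_parquet_reads.py runs once per reader. As a rough illustration of what a single parametrized case does, with the path and column name taken from the benchmark tables above:

    # equivalent to the "pyarrow" reader on the "mvp" single-column case
    table = pyarrow_read(
        "s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet",
        columns=["a"],
    )
    assert table.column_names == ["a"]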
deltacat/catalog/delegate.py CHANGED
@@ -5,7 +5,7 @@ import ray
 
 from deltacat.catalog.model.catalog import Catalog, all_catalogs
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.compute.compactor.model.sort_key import SortKey
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.namespace import Namespace
 from deltacat.storage.model.types import (
deltacat/catalog/interface.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Set, Union
 import pyarrow as pa
 
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.compute.compactor.model.sort_key import SortKey
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.namespace import Namespace
 from deltacat.storage.model.types import (
deltacat/compute/compactor/__init__.py CHANGED
@@ -13,7 +13,6 @@ from deltacat.compute.compactor.model.round_completion_info import (
     RoundCompletionInfo,
     HighWatermark,
 )
-from deltacat.compute.compactor.model.sort_key import SortKey, SortOrder
 
 __all__ = [
     "DeltaAnnotated",
@@ -27,6 +26,4 @@ __all__ = [
     "PyArrowWriteResult",
     "RoundCompletionInfo",
     "HighWatermark",
-    "SortKey",
-    "SortOrder",
 ]
deltacat/compute/compactor/compaction_session.py CHANGED
@@ -12,8 +12,8 @@ import pyarrow as pa
 from deltacat.compute.compactor import (
     PyArrowWriteResult,
     RoundCompletionInfo,
-    SortKey,
 )
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
 from deltacat.io.object_store import IObjectStore
@@ -50,6 +50,7 @@ from deltacat.utils.metrics import MetricsConfig
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
+from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
 
 
@@ -59,6 +60,9 @@ if importlib.util.find_spec("memray"):
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
+DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG: int = 1
+DEFAULT_PROPERTIES_ARG: Dict[str, Any] = {}
+
 
 def check_preconditions(
     source_partition_locator: PartitionLocator,
@@ -67,8 +71,11 @@
     max_records_per_output_file: int,
     new_hash_bucket_count: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> int:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     assert (
         source_partition_locator.partition_values
         == destination_partition_locator.partition_values
@@ -83,10 +90,12 @@
     assert (
         new_hash_bucket_count >= 1
     ), "New hash bucket count must be a positive value"
-    return SortKey.validate_sort_keys(
+    return validate_sort_keys(
         source_partition_locator,
         sort_keys,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )
 
 
@@ -117,9 +126,11 @@ def compact_partition(
     object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
     s3_client_kwargs: Optional[Dict[str, Any]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     **kwargs,
 ) -> Optional[str]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     if not importlib.util.find_spec("memray"):
         logger.info(f"memray profiler not available, disabling all profiling")
         enable_profiler = False
@@ -161,6 +172,7 @@
             object_store,
             s3_client_kwargs,
             deltacat_storage,
+            deltacat_storage_kwargs,
             **kwargs,
         )
         if new_partition:
@@ -172,7 +184,9 @@
     round_completion_file_s3_url = None
     if partition:
         logger.info(f"Committing compacted partition to: {partition.locator}")
-        partition = deltacat_storage.commit_partition(partition)
+        partition = deltacat_storage.commit_partition(
+            partition, **deltacat_storage_kwargs
+        )
         logger.info(f"Committed compacted partition: {partition}")
 
     round_completion_file_s3_url = rcf.write_round_completion_file(
@@ -209,15 +223,16 @@ def _execute_compaction_round(
     object_store: Optional[IObjectStore],
     s3_client_kwargs: Optional[Dict[str, Any]],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     **kwargs,
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     rcf_source_partition_locator = (
         rebase_source_partition_locator
         if rebase_source_partition_locator
        else source_partition_locator
     )
-
     base_audit_url = rcf_source_partition_locator.path(
         f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
     )
@@ -250,6 +265,8 @@
         records_per_compacted_file,
         hash_bucket_count,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )
 
     # sort primary keys to produce the same pk digest regardless of input order
@@ -329,7 +346,8 @@
         rebase_source_partition_locator,
         rebase_source_partition_high_watermark,
         deltacat_storage,
-        **list_deltas_kwargs,
+        deltacat_storage_kwargs,
+        list_deltas_kwargs,
     )
 
     delta_discovery_end = time.monotonic()
@@ -362,6 +380,8 @@
             compaction_audit,
             hash_bucket_count,
             deltacat_storage=deltacat_storage,
+            deltacat_storage_kwargs=deltacat_storage_kwargs,
+            **kwargs,
         )
         if input_deltas_stats is None
         else io.limit_input_deltas(
@@ -372,6 +392,8 @@
            compaction_audit=compaction_audit,
            input_deltas_stats=input_deltas_stats,
            deltacat_storage=deltacat_storage,
+           deltacat_storage_kwargs=deltacat_storage_kwargs,
+           **kwargs,
        )
     )
 
@@ -399,9 +421,7 @@
         raise AssertionError(
             "Multiple rounds are not supported. Please increase the cluster size and run again."
         )
-
     hb_start = time.monotonic()
-
     hb_tasks_pending = invoke_parallel(
         items=uniform_deltas,
         ray_task=hb.hash_bucket,
@@ -417,8 +437,9 @@
         read_kwargs_provider=read_kwargs_provider,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        **kwargs,
     )
-
     hb_invoke_end = time.monotonic()
 
     logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
@@ -456,7 +477,6 @@
     )
 
     compaction_audit.set_input_records(total_hb_record_count.item())
-
     # TODO (pdames): when resources are freed during the last round of hash
     # bucketing, start running dedupe tasks that read existing dedupe
     # output from S3 then wait for hash bucketing to finish before continuing
@@ -467,13 +487,14 @@
         compacted_stream_locator.namespace,
         compacted_stream_locator.table_name,
         compacted_stream_locator.table_version,
+        **deltacat_storage_kwargs,
     )
     partition = deltacat_storage.stage_partition(
         stream,
         destination_partition_locator.partition_values,
+        **deltacat_storage_kwargs,
     )
     new_compacted_partition_locator = partition.locator
-
     # parallel step 2:
     # discover records with duplicate primary keys in each hash bucket, and
     # identify the index of records to keep or drop based on sort keys
@@ -482,7 +503,10 @@
 
     dedupe_start = time.monotonic()
     dd_max_parallelism = int(
-        max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
+        max_parallelism
+        * kwargs.get(
+            "dd_max_parallelism_ratio", DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG
+        )
     )
     logger.info(
         f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
@@ -526,7 +550,6 @@
     )
 
     compaction_audit.set_records_deduped(total_dd_record_count.item())
-
     all_mat_buckets_to_obj_id = defaultdict(list)
     for dd_result in dd_results:
         for (
@@ -540,7 +563,6 @@
     logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
 
     compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
-
     # TODO(pdames): when resources are freed during the last round of deduping
     # start running materialize tasks that read materialization source file
     # tables from S3 then wait for deduping to finish before continuing
@@ -561,7 +583,6 @@
     )
 
     materialize_start = time.monotonic()
-
     mat_tasks_pending = invoke_parallel(
         items=all_mat_buckets_to_obj_id.items(),
         ray_task=mat.materialize,
@@ -584,6 +605,7 @@
         s3_table_writer_kwargs=s3_table_writer_kwargs,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
     )
 
     materialize_invoke_end = time.monotonic()
@@ -629,7 +651,9 @@
         f" {record_info_msg}"
     )
     compacted_delta = deltacat_storage.commit_delta(
-        merged_delta, properties=kwargs.get("properties", {})
+        merged_delta,
+        properties=kwargs.get("properties", DEFAULT_PROPERTIES_ARG),
+        **deltacat_storage_kwargs,
     )
     logger.info(f"Committed compacted delta: {compacted_delta}")
 
@@ -691,10 +715,11 @@
 
 def compact_partition_from_request(
     compact_partition_params: CompactPartitionParams,
+    *compact_partition_pos_args,
 ) -> Optional[str]:
     """
     Wrapper for compact_partition that allows for the compact_partition parameters to be
-    passed in as a custom dictionary-like CompactPartitionParams object.
+    passed in as a custom dictionary-like CompactPartitionParams object along with any compact_partition positional arguments.
     :param compact_partition_params:
     """
-    return compact_partition(**compact_partition_params)
+    return compact_partition(*compact_partition_pos_args, **compact_partition_params)
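The recurring theme in this file is the new deltacat_storage_kwargs parameter, which is threaded from compact_partition through delta discovery, hash bucketing, dedupe, materialization, and the final commit calls so that pluggable storage backends (such as the new deltacat/tests/local_deltacat_storage module) can receive per-call keyword arguments. A hypothetical invocation sketch; the locator arguments stand in for whatever the caller already uses, and the bucket name and storage kwargs shown are made up for illustration:

    import deltacat.tests.local_deltacat_storage as ds
    from deltacat.compute.compactor.compaction_session import compact_partition

    def run_local_compaction(source_locator, destination_locator):
        # source/destination locators are whatever PartitionLocators the caller
        # already has; the bucket name and storage kwargs below are hypothetical.
        return compact_partition(
            source_partition_locator=source_locator,
            destination_partition_locator=destination_locator,
            compaction_artifact_s3_bucket="my-compaction-bucket",
            # ... remaining compact_partition arguments unchanged from 0.1.18b13 ...
            deltacat_storage=ds,
            deltacat_storage_kwargs={"db_file_path": "/tmp/deltacat.db"},
        )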