deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +297 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
- deltacat/compute/compactor/model/delta_annotated.py +95 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +4 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +22 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +509 -0
- deltacat/compute/compactor_v2/constants.py +37 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +143 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +469 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
- deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
- deltacat/compute/compactor_v2/utils/io.py +152 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
- deltacat/compute/compactor_v2/utils/task_options.py +221 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
- deltacat/tests/compute/testcases.py +395 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +49 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +83 -0
- deltacat/types/tables.py +6 -0
- deltacat/utils/arguments.py +25 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +218 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/compute/metastats/meta_stats.py CHANGED

@@ -5,7 +5,7 @@ import functools
 import logging
 import os
 import pathlib
-from typing import Dict, List, Optional, Set
+from typing import Any, Dict, List, Optional, Set

 import ray
 from ray.types import ObjectRef
@@ -118,10 +118,12 @@ def collect_from_partition(
     stat_results_s3_bucket: Optional[str] = None,
     metastats_results_s3_bucket: Optional[str] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     *args,
     **kwargs,
 ) -> ObjectRef[Dict[int, DeltaStats]]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     if not columns:
         columns = deltacat_storage.get_table_version_column_names(
             source_partition_locator.namespace,
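The `deltacat_storage_kwargs` parameter introduced above (and threaded through the stats modules below) defaults to `None` and is swapped for an empty dict inside the function. A minimal sketch of why that pattern is preferred over a `= {}` default; the function name and kwarg below are illustrative and not from deltacat:

from typing import Any, Dict, Optional


def fetch_table(path: str, storage_kwargs: Optional[Dict[str, Any]] = None) -> None:
    # A literal {} default would be created once and shared by every call,
    # so mutations would leak across calls; None plus a per-call dict avoids that.
    if storage_kwargs is None:
        storage_kwargs = {}
    storage_kwargs.setdefault("region", "us-east-1")  # illustrative kwarg only
    print(f"downloading {path} with {storage_kwargs}")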
deltacat/compute/metastats/stats.py CHANGED

@@ -33,6 +33,7 @@ def start_stats_collection(
     stat_results_s3_bucket: Optional[str] = None,
     metastats_results_s3_bucket: Optional[str] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    **kwargs,
 ) -> Dict[str, List[DeltaStats]]:
     """Collects statistics on deltas, given a set of delta stream position ranges.
     Example:
deltacat/compute/metastats/utils/io.py CHANGED

@@ -171,6 +171,7 @@ def collect_stats_by_columns(
     delta_annotated: DeltaAnnotated,
     columns_to_compute: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Dict[str, Any]:
     """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.

@@ -182,6 +183,8 @@ def collect_stats_by_columns(
     Returns:
         A delta wide stats container
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     total_tables_size = 0

     # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
@@ -198,6 +201,7 @@ def collect_stats_by_columns(
                 TableType.PYARROW,
                 columns_to_compute,
                 equivalent_table_types="uncompacted",
+                **deltacat_storage_kwargs,
             )
         )
         assert isinstance(entry_pyarrow_table, pyarrow.Table), (
deltacat/compute/stats/utils/io.py CHANGED

@@ -1,6 +1,6 @@
 import logging
 from collections import defaultdict
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional

 import pyarrow
 import ray
@@ -83,6 +83,7 @@ def get_delta_stats(
     delta_locator: DeltaLocator,
     columns: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> DeltaStats:
     """Ray distributed task to compute and collect stats for a requested delta.
     If no columns are requested, stats will be computed for all columns.
@@ -93,10 +94,15 @@ def get_delta_stats(
     Returns:
         A delta wide stats container
     """
-
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+    manifest = deltacat_storage.get_delta_manifest(
+        delta_locator, **deltacat_storage_kwargs
+    )
     delta = Delta.of(delta_locator, None, None, None, manifest)
-    return _collect_stats_by_columns(
+    return _collect_stats_by_columns(
+        delta, columns, deltacat_storage, deltacat_storage_kwargs
+    )


 @ray.remote
@@ -105,6 +111,7 @@ def get_deltas_from_range(
     start_position_inclusive: DeltaRange,
     end_position_inclusive: DeltaRange,
     deltacat_storage=unimplemented_deltacat_storage,
+    **kwargs,
 ) -> List[Delta]:
     """Looks up deltas in the specified partition using Ray, given both starting and ending delta stream positions.

@@ -137,6 +144,7 @@ def get_deltas_from_range(
         end_position_inclusive,
         ascending_order=True,
         include_manifest=False,
+        **kwargs,
     )
     return deltas_list_result.all_items()

@@ -145,6 +153,7 @@ def _collect_stats_by_columns(
     delta: Delta,
     columns_to_compute: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> DeltaStats:
     """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.
     Args:
@@ -154,6 +163,8 @@ def _collect_stats_by_columns(
     Returns:
         A delta wide stats container
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     assert (
         delta.manifest is not None
     ), f"Manifest should not be missing from delta for stats calculation: {delta}"
@@ -167,7 +178,11 @@ def _collect_stats_by_columns(
     for file_idx, manifest in enumerate(delta.manifest.entries):
         entry_pyarrow_table: LocalTable = (
             deltacat_storage.download_delta_manifest_entry(
-                delta,
+                delta,
+                file_idx,
+                TableType.PYARROW,
+                columns_to_compute,
+                **deltacat_storage_kwargs,
             )
         )
         assert isinstance(entry_pyarrow_table, pyarrow.Table), (
deltacat/exceptions.py CHANGED

deltacat/io/memcached_object_store.py CHANGED

@@ -3,13 +3,14 @@ from ray import cloudpickle
 from collections import defaultdict
 import time
 from deltacat.io.object_store import IObjectStore
-from typing import Any, List
+from typing import Any, List, Optional
 from deltacat import logs
 import uuid
 import socket
 from pymemcache.client.base import Client
 from pymemcache.client.retrying import RetryingClient
 from pymemcache.exceptions import MemcacheUnexpectedCloseError
+from pymemcache.client.rendezvous import RendezvousHash

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -19,36 +20,46 @@ class MemcachedObjectStore(IObjectStore):
     An implementation of object store that uses Memcached.
     """

-    def __init__(
+    def __init__(
+        self, storage_node_ips: Optional[List[str]] = None, port: Optional[int] = 11212
+    ) -> None:
         self.client_cache = {}
         self.current_ip = None
         self.SEPARATOR = "_"
         self.port = port
+        self.storage_node_ips = storage_node_ips
+        self.hasher = None
         super().__init__()

+    def initialize_hasher(self):
+        if not self.hasher and self.storage_node_ips:
+            self.hasher = RendezvousHash()
+            for n in self.storage_node_ips:
+                self.hasher.add_node(n)
+
     def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
-        input =
+        input = defaultdict(dict)
         result = []
-        current_ip = self._get_current_ip()
         for obj in objects:
             serialized = cloudpickle.dumps(obj)
             uid = uuid.uuid4()
-
-
+            create_ref_ip = self._get_create_ref_ip(uid.__str__())
+            ref = self._create_ref(uid, create_ref_ip)
+            input[create_ref_ip][uid.__str__()] = serialized
             result.append(ref)
-
-
-
-
+        for create_ref_ip, uid_to_object in input.items():
+            client = self._get_client_by_ip(create_ref_ip)
+            if client.set_many(uid_to_object, noreply=False):
+                raise RuntimeError("Unable to write few keys to cache")

         return result

     def put(self, obj: object, *args, **kwargs) -> Any:
         serialized = cloudpickle.dumps(obj)
         uid = uuid.uuid4()
-
-        ref = self._create_ref(uid,
-        client = self._get_client_by_ip(
+        create_ref_ip = self._get_create_ref_ip(uid.__str__())
+        ref = self._create_ref(uid, create_ref_ip)
+        client = self._get_client_by_ip(create_ref_ip)

         if client.set(uid.__str__(), serialized):
             return ref
@@ -99,6 +110,18 @@ class MemcachedObjectStore(IObjectStore):
     def _create_ref(self, uid, ip) -> str:
         return f"{uid}{self.SEPARATOR}{ip}"

+    def _get_storage_node_ip(self, key: str):
+        self.initialize_hasher()
+        storage_node_ip = self.hasher.get_node(key)
+        return storage_node_ip
+
+    def _get_create_ref_ip(self, uid: str):
+        if self.storage_node_ips:
+            create_ref_ip = self._get_storage_node_ip(uid)
+        else:
+            create_ref_ip = self._get_current_ip()
+        return create_ref_ip
+
     def _get_client_by_ip(self, ip_address: str):
         if ip_address in self.client_cache:
             return self.client_cache[ip_address]
@@ -108,7 +131,7 @@ class MemcachedObjectStore(IObjectStore):
             base_client,
             attempts=3,
             retry_delay=0.01,
-            retry_for=[MemcacheUnexpectedCloseError],
+            retry_for=[MemcacheUnexpectedCloseError, ConnectionResetError],
         )

         self.client_cache[ip_address] = client
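With `storage_node_ips` supplied, object placement now uses rendezvous hashing, so a given key is always routed to the same node. A rough standalone sketch of that routing behavior, using pymemcache's `RendezvousHash` exactly as the diff above does; the node IPs and key are made-up values:

from pymemcache.client.rendezvous import RendezvousHash

# Hypothetical storage nodes; in deltacat these come from storage_node_ips.
nodes = ["10.0.0.1", "10.0.0.2", "10.0.0.3"]

hasher = RendezvousHash()
for node in nodes:
    hasher.add_node(node)

# The same UUID always maps to the same node, which is why the node IP can be
# embedded in the object reference and recovered later on get().
key = "2f9b0d5c-2e47-4d8a-9c39-0d5b8f2f4a10"
print(hasher.get_node(key))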
deltacat/logs.py CHANGED

@@ -143,9 +143,10 @@ def _configure_logger(
         log_dir, log_base_file_name, primary_log_level
     )
     _add_logger_handler(logger, handler)
-
-
-
+    if ray.is_initialized():
+        ray_runtime_ctx = ray.get_runtime_context()
+        if ray_runtime_ctx.worker.connected:
+            logger = RayRuntimeContextLoggerAdapter(logger, ray_runtime_ctx)

     return logger

deltacat/storage/interface.py CHANGED

@@ -21,6 +21,7 @@ from deltacat.storage import (
     Table,
     TableVersion,
     SortKey,
+    PartitionLocator,
 )
 from deltacat.types.media import ContentType, StorageType, TableType
 from deltacat.utils.common import ReadKwargsProvider
@@ -105,7 +106,13 @@ def list_deltas(


 def list_partition_deltas(
-
+    partition_like: Union[Partition, PartitionLocator],
+    first_stream_position: Optional[int] = None,
+    last_stream_position: Optional[int] = None,
+    ascending_order: bool = False,
+    include_manifest: bool = False,
+    *args,
+    **kwargs
 ) -> ListResult[Delta]:
     """
     Lists a page of deltas committed to the given partition.
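A brief usage sketch of the revised `list_partition_deltas` signature. The storage object and partition below are stand-ins (MagicMock, as in the tests elsewhere in this diff), and `.all_items()` mirrors how the stats utilities above consume a `ListResult`:

from unittest.mock import MagicMock

# Hypothetical storage implementation conforming to deltacat.storage.interface;
# a MagicMock stands in here so the call shape can be shown end to end.
storage = MagicMock()
my_partition = MagicMock()  # a Partition or PartitionLocator in real use

deltas = storage.list_partition_deltas(
    my_partition,
    first_stream_position=100,
    last_stream_position=200,
    ascending_order=True,
    include_manifest=False,
).all_items()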
deltacat/storage/model/types.py CHANGED

@@ -1,6 +1,7 @@
 from enum import Enum
 from typing import List, Union, Any

+from pyarrow.parquet import ParquetFile
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -8,7 +9,7 @@ import pkg_resources
 from ray.data._internal.arrow_block import ArrowRow
 from ray.data.dataset import Dataset

-LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray]
+LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray, ParquetFile]
 LocalDataset = List[LocalTable]
 # Starting Ray 2.5.0, Dataset follows a strict mode (https://docs.ray.io/en/latest/data/faq.html#migrating-to-strict-mode),
 # and generic annotation is removed. So add a version checker to determine whether to use the old or new definition.
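Since `LocalTable` can now also be a `pyarrow.parquet.ParquetFile`, downstream readers may need to materialize it before treating it as a table. A minimal sketch of one way to normalize the union; this helper is illustrative and not part of deltacat:

import numpy as np
import pandas as pd
import pyarrow as pa
from pyarrow.parquet import ParquetFile


def to_arrow_table(table) -> pa.Table:
    # ParquetFile is lazily opened; read() materializes it into a pa.Table.
    if isinstance(table, ParquetFile):
        return table.read()
    if isinstance(table, pd.DataFrame):
        return pa.Table.from_pandas(table)
    if isinstance(table, np.ndarray):
        # Assumes a 1-D array for the sake of the sketch.
        return pa.table({"values": pa.array(table)})
    return table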
deltacat/tests/aws/test_clients.py CHANGED

@@ -1,7 +1,7 @@
 from unittest.mock import patch
 import unittest
-import json
 from http import HTTPStatus
+import requests

 HAPPY_RESPONSE = {
     "AccessKeyId": "ASIA123456789",
@@ -20,7 +20,7 @@ class MockResponse:
     """

     def __init__(self, status_code: int, text: str, reason: str = "") -> None:
-        self.status_code = status_code
+        self.status_code: requests.Response.status_code = status_code
         self.text = text
         self.reason = reason

@@ -55,7 +55,7 @@ class TestBlockUntilInstanceMetadataServiceReturnsSuccess(unittest.TestCase):
         )

         requests_mock.get.side_effect = [
-            MockResponse(HTTPStatus.OK,
+            MockResponse(HTTPStatus.OK, "foo"),
             MockResponse(HTTPStatus.TOO_MANY_REQUESTS, "foo"),
             MockResponse(HTTPStatus.INTERNAL_SERVER_ERROR, "foo"),
             MockResponse(HTTPStatus.NOT_IMPLEMENTED, "bar"),
@@ -65,3 +65,16 @@ class TestBlockUntilInstanceMetadataServiceReturnsSuccess(unittest.TestCase):
         self.assertEqual(
             block_until_instance_metadata_service_returns_success().status_code, 200
         )
+
+    @patch("deltacat.aws.clients.requests")
+    def test_retrying_status_on_shortlist_returns_early(self, requests_mock):
+        from deltacat.aws.clients import (
+            block_until_instance_metadata_service_returns_success,
+        )
+
+        requests_mock.get.side_effect = [
+            MockResponse(HTTPStatus.FORBIDDEN, "foo"),
+        ]
+        self.assertEqual(
+            block_until_instance_metadata_service_returns_success().status_code, 403
+        )
File without changes
deltacat/tests/compute/common.py ADDED

@@ -0,0 +1,96 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+from enum import Enum
+from typing import List
+import datetime as dt
+from datetime import timezone
+
+TEST_S3_RCF_BUCKET_NAME = "test-compaction-artifacts-bucket"
+# REBASE src = spark compacted table to create an initial version of ray compacted table
+BASE_TEST_SOURCE_NAMESPACE = "source_test_namespace"
+BASE_TEST_SOURCE_TABLE_NAME = "test_table"
+BASE_TEST_SOURCE_TABLE_VERSION = "1"
+
+BASE_TEST_DESTINATION_NAMESPACE = "destination_test_namespace"
+BASE_TEST_DESTINATION_TABLE_NAME = "destination_test_table_RAY"
+BASE_TEST_DESTINATION_TABLE_VERSION = "1"
+
+HASH_BUCKET_COUNT: int = 3
+
+MAX_RECORDS_PER_FILE: int = 1
+
+UTC_ISO_8601_FORMAT_WITHOUT_MILLIS = "%Y-%m-%dT%H:%M:%SZ"  # '2018-09-05T14:09:03Z'
+
+
+class PartitionKeyType(str, Enum):
+    INT = "int"
+    STRING = "string"
+    TIMESTAMP = "timestamp"
+
+
+class PartitionKey(dict):
+    @staticmethod
+    def of(key_name: str, key_type: PartitionKeyType) -> PartitionKey:
+        return PartitionKey({"keyName": key_name, "keyType": key_type.value})
+
+    @property
+    def key_name(self) -> str:
+        return self["keyName"]
+
+    @property
+    def key_type(self) -> PartitionKeyType:
+        key_type = self["keyType"]
+        return None if key_type is None else PartitionKeyType(key_type)
+
+
+def setup_sort_and_partition_keys(sort_keys_param, partition_keys_param):
+    from deltacat.storage.model.sort_key import SortKey
+
+    sort_keys, partition_keys = None, None
+    if sort_keys_param is not None:
+        sort_keys = [SortKey.of(sort_key["key_name"]) for sort_key in sort_keys_param]
+    if partition_keys_param is not None:
+        partition_keys = [
+            PartitionKey.of(
+                partition_key["key_name"], PartitionKeyType(partition_key["key_type"])
+            )
+            for partition_key in partition_keys_param
+        ]
+    return sort_keys, partition_keys
+
+
+def offer_iso8601_timestamp_list(
+    periods: int,
+    unit_of_time: str,
+    end_time=dt.datetime(2023, 5, 3, 10, 0, 0, 0, tzinfo=timezone.utc),
+) -> List[str]:
+    """
+    Returns a list of ISO 8601 timestamps, each periods units of time before the start time.
+
+    Args:
+        periods: The number of timestamps to return.
+        unit_of_time: The unit of time to use for the timestamps. Must be one of "seconds", "minutes", "hours", "days", or "weeks".
+        end_time: The end time for the timestamps. Defaults to 2023-05-03T10:00:00Z.
+
+    Returns:
+        A list of ISO 8601 timestamps, each periods units of time before the start time.
+
+    Raises:
+        ValueError: If the unit_of_time argument is not one of "seconds", "minutes", "hours", "days", or "weeks".
+    """
+    import datetime as dt
+
+    acceptable_units_of_time = ["seconds", "minutes", "hours", "days", "weeks"]
+    if unit_of_time not in acceptable_units_of_time:
+        raise ValueError(
+            f"unit_of_time {unit_of_time} is not supported. Please use one of these time units: {acceptable_units_of_time}"
+        )
+    res = []
+    for i in range(periods):
+        kwarg = {unit_of_time: i}
+        res.append(
+            (end_time - dt.timedelta(**kwarg)).strftime(
+                UTC_ISO_8601_FORMAT_WITHOUT_MILLIS
+            )
+        )
+    return res
File without changes

File without changes
deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} RENAMED

@@ -49,8 +49,10 @@ class TestRepartitionRange(unittest.TestCase):
         self.destination_partition: PartitionLocator = MagicMock()
         self.repartition_args = {"column": "last_updated", "ranges": [1678665487112747]}
         self.max_records_per_output_file = 2
+        self.s3_table_writer_kwargs = {}
         self.repartitioned_file_content_type = ContentType.PARQUET
         self.deltacat_storage = MagicMock()
+        self.deltacat_storage_kwargs = MagicMock()

     def test_repartition_range(self):
         result = repartition_range(
@@ -58,8 +60,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         # Assert that a RepartitionResult object is returned
         self.assertIsInstance(result, RepartitionResult)
@@ -83,8 +87,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )

     def test_empty_ranges(self):
@@ -95,8 +101,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )

     def test_one_value_in_ranges(self):
@@ -106,8 +114,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         self.assertEqual(len(result.range_deltas), 2)

@@ -118,8 +128,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         self.assertEqual(len(result.range_deltas), 3)

@@ -131,8 +143,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         self.assertLess(len(result.range_deltas), 2)

@@ -144,8 +158,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )

     def test_unsorted_ranges(self):
@@ -159,8 +175,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         self.assertEqual(len(result.range_deltas), 4)

@@ -171,22 +189,28 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         self.assertEqual(len(result.range_deltas), 2)

     def test_ranges_with_inf(self):
         self.repartition_args["ranges"] = [1678665487112747, float("inf")]
-
-
-
-
-
-
-
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: repartition_range(
+                self.tables,
+                self.destination_partition,
+                self.repartition_args,
+                self.max_records_per_output_file,
+                self.s3_table_writer_kwargs,
+                self.repartitioned_file_content_type,
+                self.deltacat_storage,
+            ),
         )
-        self.assertEqual(len(result.range_deltas), 2)

     def test_null_rows_are_not_dropped(self):
         # Add null value to the first table
@@ -209,8 +233,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )

         # Assuming range_deltas is a list of DataFrames,
File without changes