deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +2 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/RECORD +80 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
@@ -3,13 +3,14 @@ from ray import cloudpickle
|
|
3
3
|
from collections import defaultdict
|
4
4
|
import time
|
5
5
|
from deltacat.io.object_store import IObjectStore
|
6
|
-
from typing import Any, List
|
6
|
+
from typing import Any, List, Optional
|
7
7
|
from deltacat import logs
|
8
8
|
import uuid
|
9
9
|
import socket
|
10
10
|
from pymemcache.client.base import Client
|
11
11
|
from pymemcache.client.retrying import RetryingClient
|
12
12
|
from pymemcache.exceptions import MemcacheUnexpectedCloseError
|
13
|
+
from pymemcache.client.rendezvous import RendezvousHash
|
13
14
|
|
14
15
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
15
16
|
|
@@ -19,36 +20,46 @@ class MemcachedObjectStore(IObjectStore):
|
|
19
20
|
An implementation of object store that uses Memcached.
|
20
21
|
"""
|
21
22
|
|
22
|
-
def __init__(
|
23
|
+
def __init__(
|
24
|
+
self, storage_node_ips: Optional[List[str]] = None, port: Optional[int] = 11212
|
25
|
+
) -> None:
|
23
26
|
self.client_cache = {}
|
24
27
|
self.current_ip = None
|
25
28
|
self.SEPARATOR = "_"
|
26
29
|
self.port = port
|
30
|
+
self.storage_node_ips = storage_node_ips
|
31
|
+
self.hasher = None
|
27
32
|
super().__init__()
|
28
33
|
|
34
|
+
def initialize_hasher(self):
|
35
|
+
if not self.hasher and self.storage_node_ips:
|
36
|
+
self.hasher = RendezvousHash()
|
37
|
+
for n in self.storage_node_ips:
|
38
|
+
self.hasher.add_node(n)
|
39
|
+
|
29
40
|
def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
|
30
|
-
input =
|
41
|
+
input = defaultdict(dict)
|
31
42
|
result = []
|
32
|
-
current_ip = self._get_current_ip()
|
33
43
|
for obj in objects:
|
34
44
|
serialized = cloudpickle.dumps(obj)
|
35
45
|
uid = uuid.uuid4()
|
36
|
-
|
37
|
-
|
46
|
+
create_ref_ip = self._get_create_ref_ip(uid.__str__())
|
47
|
+
ref = self._create_ref(uid, create_ref_ip)
|
48
|
+
input[create_ref_ip][uid.__str__()] = serialized
|
38
49
|
result.append(ref)
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
50
|
+
for create_ref_ip, uid_to_object in input.items():
|
51
|
+
client = self._get_client_by_ip(create_ref_ip)
|
52
|
+
if client.set_many(uid_to_object, noreply=False):
|
53
|
+
raise RuntimeError("Unable to write few keys to cache")
|
43
54
|
|
44
55
|
return result
|
45
56
|
|
46
57
|
def put(self, obj: object, *args, **kwargs) -> Any:
|
47
58
|
serialized = cloudpickle.dumps(obj)
|
48
59
|
uid = uuid.uuid4()
|
49
|
-
|
50
|
-
ref = self._create_ref(uid,
|
51
|
-
client = self._get_client_by_ip(
|
60
|
+
create_ref_ip = self._get_create_ref_ip(uid.__str__())
|
61
|
+
ref = self._create_ref(uid, create_ref_ip)
|
62
|
+
client = self._get_client_by_ip(create_ref_ip)
|
52
63
|
|
53
64
|
if client.set(uid.__str__(), serialized):
|
54
65
|
return ref
|
@@ -99,6 +110,18 @@ class MemcachedObjectStore(IObjectStore):
|
|
99
110
|
def _create_ref(self, uid, ip) -> str:
|
100
111
|
return f"{uid}{self.SEPARATOR}{ip}"
|
101
112
|
|
113
|
+
def _get_storage_node_ip(self, key: str):
|
114
|
+
self.initialize_hasher()
|
115
|
+
storage_node_ip = self.hasher.get_node(key)
|
116
|
+
return storage_node_ip
|
117
|
+
|
118
|
+
def _get_create_ref_ip(self, uid: str):
|
119
|
+
if self.storage_node_ips:
|
120
|
+
create_ref_ip = self._get_storage_node_ip(uid)
|
121
|
+
else:
|
122
|
+
create_ref_ip = self._get_current_ip()
|
123
|
+
return create_ref_ip
|
124
|
+
|
102
125
|
def _get_client_by_ip(self, ip_address: str):
|
103
126
|
if ip_address in self.client_cache:
|
104
127
|
return self.client_cache[ip_address]
|
@@ -108,7 +131,7 @@ class MemcachedObjectStore(IObjectStore):
|
|
108
131
|
base_client,
|
109
132
|
attempts=3,
|
110
133
|
retry_delay=0.01,
|
111
|
-
retry_for=[MemcacheUnexpectedCloseError],
|
134
|
+
retry_for=[MemcacheUnexpectedCloseError, ConnectionResetError],
|
112
135
|
)
|
113
136
|
|
114
137
|
self.client_cache[ip_address] = client
|
deltacat/logs.py
CHANGED
@@ -143,9 +143,10 @@ def _configure_logger(
|
|
143
143
|
log_dir, log_base_file_name, primary_log_level
|
144
144
|
)
|
145
145
|
_add_logger_handler(logger, handler)
|
146
|
-
|
147
|
-
|
148
|
-
|
146
|
+
if ray.is_initialized():
|
147
|
+
ray_runtime_ctx = ray.get_runtime_context()
|
148
|
+
if ray_runtime_ctx.worker.connected:
|
149
|
+
logger = RayRuntimeContextLoggerAdapter(logger, ray_runtime_ctx)
|
149
150
|
|
150
151
|
return logger
|
151
152
|
|
deltacat/storage/interface.py
CHANGED
@@ -21,6 +21,7 @@ from deltacat.storage import (
|
|
21
21
|
Table,
|
22
22
|
TableVersion,
|
23
23
|
SortKey,
|
24
|
+
PartitionLocator,
|
24
25
|
)
|
25
26
|
from deltacat.types.media import ContentType, StorageType, TableType
|
26
27
|
from deltacat.utils.common import ReadKwargsProvider
|
@@ -105,7 +106,13 @@ def list_deltas(
|
|
105
106
|
|
106
107
|
|
107
108
|
def list_partition_deltas(
|
108
|
-
|
109
|
+
partition_like: Union[Partition, PartitionLocator],
|
110
|
+
first_stream_position: Optional[int] = None,
|
111
|
+
last_stream_position: Optional[int] = None,
|
112
|
+
ascending_order: bool = False,
|
113
|
+
include_manifest: bool = False,
|
114
|
+
*args,
|
115
|
+
**kwargs
|
109
116
|
) -> ListResult[Delta]:
|
110
117
|
"""
|
111
118
|
Lists a page of deltas committed to the given partition.
|
deltacat/storage/model/types.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
from enum import Enum
|
2
2
|
from typing import List, Union, Any
|
3
3
|
|
4
|
+
from pyarrow.parquet import ParquetFile
|
4
5
|
import numpy as np
|
5
6
|
import pandas as pd
|
6
7
|
import pyarrow as pa
|
@@ -8,7 +9,7 @@ import pkg_resources
|
|
8
9
|
from ray.data._internal.arrow_block import ArrowRow
|
9
10
|
from ray.data.dataset import Dataset
|
10
11
|
|
11
|
-
LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray]
|
12
|
+
LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray, ParquetFile]
|
12
13
|
LocalDataset = List[LocalTable]
|
13
14
|
# Starting Ray 2.5.0, Dataset follows a strict mode (https://docs.ray.io/en/latest/data/faq.html#migrating-to-strict-mode),
|
14
15
|
# and generic annotation is removed. So add a version checker to determine whether to use the old or new definition.
|
@@ -1,7 +1,7 @@
|
|
1
1
|
from unittest.mock import patch
|
2
2
|
import unittest
|
3
|
-
import json
|
4
3
|
from http import HTTPStatus
|
4
|
+
import requests
|
5
5
|
|
6
6
|
HAPPY_RESPONSE = {
|
7
7
|
"AccessKeyId": "ASIA123456789",
|
@@ -20,7 +20,7 @@ class MockResponse:
|
|
20
20
|
"""
|
21
21
|
|
22
22
|
def __init__(self, status_code: int, text: str, reason: str = "") -> None:
|
23
|
-
self.status_code = status_code
|
23
|
+
self.status_code: requests.Response.status_code = status_code
|
24
24
|
self.text = text
|
25
25
|
self.reason = reason
|
26
26
|
|
@@ -55,7 +55,7 @@ class TestBlockUntilInstanceMetadataServiceReturnsSuccess(unittest.TestCase):
|
|
55
55
|
)
|
56
56
|
|
57
57
|
requests_mock.get.side_effect = [
|
58
|
-
MockResponse(HTTPStatus.OK,
|
58
|
+
MockResponse(HTTPStatus.OK, "foo"),
|
59
59
|
MockResponse(HTTPStatus.TOO_MANY_REQUESTS, "foo"),
|
60
60
|
MockResponse(HTTPStatus.INTERNAL_SERVER_ERROR, "foo"),
|
61
61
|
MockResponse(HTTPStatus.NOT_IMPLEMENTED, "bar"),
|
@@ -65,3 +65,16 @@ class TestBlockUntilInstanceMetadataServiceReturnsSuccess(unittest.TestCase):
|
|
65
65
|
self.assertEqual(
|
66
66
|
block_until_instance_metadata_service_returns_success().status_code, 200
|
67
67
|
)
|
68
|
+
|
69
|
+
@patch("deltacat.aws.clients.requests")
|
70
|
+
def test_retrying_status_on_shortlist_returns_early(self, requests_mock):
|
71
|
+
from deltacat.aws.clients import (
|
72
|
+
block_until_instance_metadata_service_returns_success,
|
73
|
+
)
|
74
|
+
|
75
|
+
requests_mock.get.side_effect = [
|
76
|
+
MockResponse(HTTPStatus.FORBIDDEN, "foo"),
|
77
|
+
]
|
78
|
+
self.assertEqual(
|
79
|
+
block_until_instance_metadata_service_returns_success().status_code, 403
|
80
|
+
)
|
File without changes
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
+
from __future__ import annotations
|
3
|
+
from enum import Enum
|
4
|
+
from typing import List
|
5
|
+
import datetime as dt
|
6
|
+
from datetime import timezone
|
7
|
+
|
8
|
+
TEST_S3_RCF_BUCKET_NAME = "test-compaction-artifacts-bucket"
|
9
|
+
# REBASE src = spark compacted table to create an initial version of ray compacted table
|
10
|
+
BASE_TEST_SOURCE_NAMESPACE = "source_test_namespace"
|
11
|
+
BASE_TEST_SOURCE_TABLE_NAME = "test_table"
|
12
|
+
BASE_TEST_SOURCE_TABLE_VERSION = "1"
|
13
|
+
|
14
|
+
BASE_TEST_DESTINATION_NAMESPACE = "destination_test_namespace"
|
15
|
+
BASE_TEST_DESTINATION_TABLE_NAME = "destination_test_table_RAY"
|
16
|
+
BASE_TEST_DESTINATION_TABLE_VERSION = "1"
|
17
|
+
|
18
|
+
HASH_BUCKET_COUNT: int = 1
|
19
|
+
|
20
|
+
MAX_RECORDS_PER_FILE: int = 1
|
21
|
+
|
22
|
+
UTC_ISO_8601_FORMAT_WITHOUT_MILLIS = "%Y-%m-%dT%H:%M:%SZ" # '2018-09-05T14:09:03Z'
|
23
|
+
|
24
|
+
|
25
|
+
class PartitionKeyType(str, Enum):
|
26
|
+
INT = "int"
|
27
|
+
STRING = "string"
|
28
|
+
TIMESTAMP = "timestamp"
|
29
|
+
|
30
|
+
|
31
|
+
class PartitionKey(dict):
|
32
|
+
@staticmethod
|
33
|
+
def of(key_name: str, key_type: PartitionKeyType) -> PartitionKey:
|
34
|
+
return PartitionKey({"keyName": key_name, "keyType": key_type.value})
|
35
|
+
|
36
|
+
@property
|
37
|
+
def key_name(self) -> str:
|
38
|
+
return self["keyName"]
|
39
|
+
|
40
|
+
@property
|
41
|
+
def key_type(self) -> PartitionKeyType:
|
42
|
+
key_type = self["keyType"]
|
43
|
+
return None if key_type is None else PartitionKeyType(key_type)
|
44
|
+
|
45
|
+
|
46
|
+
def setup_sort_and_partition_keys(sort_keys_param, partition_keys_param):
|
47
|
+
from deltacat.storage.model.sort_key import SortKey
|
48
|
+
|
49
|
+
sort_keys, partition_keys = None, None
|
50
|
+
if sort_keys_param is not None:
|
51
|
+
sort_keys = [SortKey.of(sort_key["key_name"]) for sort_key in sort_keys_param]
|
52
|
+
if partition_keys_param is not None:
|
53
|
+
partition_keys = [
|
54
|
+
PartitionKey.of(
|
55
|
+
partition_key["key_name"], PartitionKeyType(partition_key["key_type"])
|
56
|
+
)
|
57
|
+
for partition_key in partition_keys_param
|
58
|
+
]
|
59
|
+
return sort_keys, partition_keys
|
60
|
+
|
61
|
+
|
62
|
+
def offer_iso8601_timestamp_list(
|
63
|
+
periods: int,
|
64
|
+
unit_of_time: str,
|
65
|
+
end_time=dt.datetime(2023, 5, 3, 10, 0, 0, 0, tzinfo=timezone.utc),
|
66
|
+
) -> List[str]:
|
67
|
+
"""
|
68
|
+
Returns a list of ISO 8601 timestamps, each periods units of time before the start time.
|
69
|
+
|
70
|
+
Args:
|
71
|
+
periods: The number of timestamps to return.
|
72
|
+
unit_of_time: The unit of time to use for the timestamps. Must be one of "seconds", "minutes", "hours", "days", or "weeks".
|
73
|
+
end_time: The end time for the timestamps. Defaults to 2023-05-03T10:00:00Z.
|
74
|
+
|
75
|
+
Returns:
|
76
|
+
A list of ISO 8601 timestamps, each periods units of time before the start time.
|
77
|
+
|
78
|
+
Raises:
|
79
|
+
ValueError: If the unit_of_time argument is not one of "seconds", "minutes", "hours", "days", or "weeks".
|
80
|
+
"""
|
81
|
+
import datetime as dt
|
82
|
+
|
83
|
+
acceptable_units_of_time = ["seconds", "minutes", "hours", "days", "weeks"]
|
84
|
+
if unit_of_time not in acceptable_units_of_time:
|
85
|
+
raise ValueError(
|
86
|
+
f"unit_of_time {unit_of_time} is not supported. Please use one of these time units: {acceptable_units_of_time}"
|
87
|
+
)
|
88
|
+
res = []
|
89
|
+
for i in range(periods):
|
90
|
+
kwarg = {unit_of_time: i}
|
91
|
+
res.append(
|
92
|
+
(end_time - dt.timedelta(**kwarg)).strftime(
|
93
|
+
UTC_ISO_8601_FORMAT_WITHOUT_MILLIS
|
94
|
+
)
|
95
|
+
)
|
96
|
+
return res
|
File without changes
|
File without changes
|
@@ -51,6 +51,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
51
51
|
self.max_records_per_output_file = 2
|
52
52
|
self.repartitioned_file_content_type = ContentType.PARQUET
|
53
53
|
self.deltacat_storage = MagicMock()
|
54
|
+
self.deltacat_storage_kwargs = MagicMock()
|
54
55
|
|
55
56
|
def test_repartition_range(self):
|
56
57
|
result = repartition_range(
|
@@ -60,6 +61,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
60
61
|
self.max_records_per_output_file,
|
61
62
|
self.repartitioned_file_content_type,
|
62
63
|
self.deltacat_storage,
|
64
|
+
self.deltacat_storage_kwargs,
|
63
65
|
)
|
64
66
|
# Assert that a RepartitionResult object is returned
|
65
67
|
self.assertIsInstance(result, RepartitionResult)
|
@@ -85,6 +87,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
85
87
|
self.max_records_per_output_file,
|
86
88
|
self.repartitioned_file_content_type,
|
87
89
|
self.deltacat_storage,
|
90
|
+
self.deltacat_storage_kwargs,
|
88
91
|
)
|
89
92
|
|
90
93
|
def test_empty_ranges(self):
|
@@ -97,6 +100,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
97
100
|
self.max_records_per_output_file,
|
98
101
|
self.repartitioned_file_content_type,
|
99
102
|
self.deltacat_storage,
|
103
|
+
self.deltacat_storage_kwargs,
|
100
104
|
)
|
101
105
|
|
102
106
|
def test_one_value_in_ranges(self):
|
@@ -108,6 +112,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
108
112
|
self.max_records_per_output_file,
|
109
113
|
self.repartitioned_file_content_type,
|
110
114
|
self.deltacat_storage,
|
115
|
+
self.deltacat_storage_kwargs,
|
111
116
|
)
|
112
117
|
self.assertEqual(len(result.range_deltas), 2)
|
113
118
|
|
@@ -120,6 +125,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
120
125
|
self.max_records_per_output_file,
|
121
126
|
self.repartitioned_file_content_type,
|
122
127
|
self.deltacat_storage,
|
128
|
+
self.deltacat_storage_kwargs,
|
123
129
|
)
|
124
130
|
self.assertEqual(len(result.range_deltas), 3)
|
125
131
|
|
@@ -133,6 +139,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
133
139
|
self.max_records_per_output_file,
|
134
140
|
self.repartitioned_file_content_type,
|
135
141
|
self.deltacat_storage,
|
142
|
+
self.deltacat_storage_kwargs,
|
136
143
|
)
|
137
144
|
self.assertLess(len(result.range_deltas), 2)
|
138
145
|
|
@@ -146,6 +153,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
146
153
|
self.max_records_per_output_file,
|
147
154
|
self.repartitioned_file_content_type,
|
148
155
|
self.deltacat_storage,
|
156
|
+
self.deltacat_storage_kwargs,
|
149
157
|
)
|
150
158
|
|
151
159
|
def test_unsorted_ranges(self):
|
@@ -161,6 +169,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
161
169
|
self.max_records_per_output_file,
|
162
170
|
self.repartitioned_file_content_type,
|
163
171
|
self.deltacat_storage,
|
172
|
+
self.deltacat_storage_kwargs,
|
164
173
|
)
|
165
174
|
self.assertEqual(len(result.range_deltas), 4)
|
166
175
|
|
@@ -173,20 +182,24 @@ class TestRepartitionRange(unittest.TestCase):
|
|
173
182
|
self.max_records_per_output_file,
|
174
183
|
self.repartitioned_file_content_type,
|
175
184
|
self.deltacat_storage,
|
185
|
+
self.deltacat_storage_kwargs,
|
176
186
|
)
|
177
187
|
self.assertEqual(len(result.range_deltas), 2)
|
178
188
|
|
179
189
|
def test_ranges_with_inf(self):
|
180
190
|
self.repartition_args["ranges"] = [1678665487112747, float("inf")]
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
191
|
+
|
192
|
+
self.assertRaises(
|
193
|
+
pa.lib.ArrowInvalid,
|
194
|
+
lambda: repartition_range(
|
195
|
+
self.tables,
|
196
|
+
self.destination_partition,
|
197
|
+
self.repartition_args,
|
198
|
+
self.max_records_per_output_file,
|
199
|
+
self.repartitioned_file_content_type,
|
200
|
+
self.deltacat_storage,
|
201
|
+
),
|
188
202
|
)
|
189
|
-
self.assertEqual(len(result.range_deltas), 2)
|
190
203
|
|
191
204
|
def test_null_rows_are_not_dropped(self):
|
192
205
|
# Add null value to the first table
|
@@ -211,6 +224,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
211
224
|
self.max_records_per_output_file,
|
212
225
|
self.repartitioned_file_content_type,
|
213
226
|
self.deltacat_storage,
|
227
|
+
self.deltacat_storage_kwargs,
|
214
228
|
)
|
215
229
|
|
216
230
|
# Assuming range_deltas is a list of DataFrames,
|
File without changes
|
@@ -1,6 +1,12 @@
|
|
1
1
|
import unittest
|
2
2
|
from unittest import mock
|
3
3
|
from deltacat.tests.test_utils.constants import TEST_DELTA
|
4
|
+
from typing import Any, Dict
|
5
|
+
|
6
|
+
DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
|
7
|
+
"db_file_path",
|
8
|
+
"deltacat/tests/local_deltacat_storage/db_test.sqlite",
|
9
|
+
)
|
4
10
|
|
5
11
|
|
6
12
|
class TestFitInputDeltas(unittest.TestCase):
|
@@ -13,6 +19,10 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
13
19
|
CompactionSessionAuditInfo,
|
14
20
|
)
|
15
21
|
|
22
|
+
cls.kwargs_for_local_deltacat_storage: Dict[str, Any] = {
|
23
|
+
DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
|
24
|
+
}
|
25
|
+
|
16
26
|
cls.COMPACTION_AUDIT = CompactionSessionAuditInfo("1.0", "test")
|
17
27
|
|
18
28
|
super().setUpClass()
|
@@ -23,6 +33,7 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
23
33
|
|
24
34
|
def test_sanity(self):
|
25
35
|
from deltacat.compute.compactor.utils import io
|
36
|
+
import deltacat.tests.local_deltacat_storage as ds
|
26
37
|
|
27
38
|
(
|
28
39
|
delta_list,
|
@@ -30,7 +41,12 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
30
41
|
high_watermark,
|
31
42
|
require_multiple_rounds,
|
32
43
|
) = io.fit_input_deltas(
|
33
|
-
[TEST_DELTA],
|
44
|
+
[TEST_DELTA],
|
45
|
+
{"CPU": 1, "memory": 20000000},
|
46
|
+
self.COMPACTION_AUDIT,
|
47
|
+
None,
|
48
|
+
ds,
|
49
|
+
self.kwargs_for_local_deltacat_storage,
|
34
50
|
)
|
35
51
|
|
36
52
|
self.assertIsNotNone(hash_bucket_count)
|
@@ -44,6 +60,7 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
44
60
|
|
45
61
|
def test_when_hash_bucket_count_overridden(self):
|
46
62
|
from deltacat.compute.compactor.utils import io
|
63
|
+
import deltacat.tests.local_deltacat_storage as ds
|
47
64
|
|
48
65
|
(
|
49
66
|
delta_list,
|
@@ -51,7 +68,12 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
51
68
|
high_watermark,
|
52
69
|
require_multiple_rounds,
|
53
70
|
) = io.fit_input_deltas(
|
54
|
-
[TEST_DELTA],
|
71
|
+
[TEST_DELTA],
|
72
|
+
{"CPU": 1, "memory": 20000000},
|
73
|
+
self.COMPACTION_AUDIT,
|
74
|
+
20,
|
75
|
+
ds,
|
76
|
+
self.kwargs_for_local_deltacat_storage,
|
55
77
|
)
|
56
78
|
|
57
79
|
self.assertEqual(20, hash_bucket_count)
|
@@ -61,6 +83,7 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
61
83
|
|
62
84
|
def test_when_not_enough_memory_splits_manifest_entries(self):
|
63
85
|
from deltacat.compute.compactor.utils import io
|
86
|
+
import deltacat.tests.local_deltacat_storage as ds
|
64
87
|
|
65
88
|
(
|
66
89
|
delta_list,
|
@@ -68,7 +91,12 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
68
91
|
high_watermark,
|
69
92
|
require_multiple_rounds,
|
70
93
|
) = io.fit_input_deltas(
|
71
|
-
[TEST_DELTA],
|
94
|
+
[TEST_DELTA],
|
95
|
+
{"CPU": 2, "memory": 10},
|
96
|
+
self.COMPACTION_AUDIT,
|
97
|
+
20,
|
98
|
+
ds,
|
99
|
+
self.kwargs_for_local_deltacat_storage,
|
72
100
|
)
|
73
101
|
|
74
102
|
self.assertIsNotNone(hash_bucket_count)
|
@@ -78,14 +106,28 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
78
106
|
|
79
107
|
def test_when_no_input_deltas(self):
|
80
108
|
from deltacat.compute.compactor.utils import io
|
109
|
+
import deltacat.tests.local_deltacat_storage as ds
|
81
110
|
|
82
111
|
with self.assertRaises(AssertionError):
|
83
112
|
io.fit_input_deltas(
|
84
|
-
[],
|
113
|
+
[],
|
114
|
+
{"CPU": 100, "memory": 20000.0},
|
115
|
+
self.COMPACTION_AUDIT,
|
116
|
+
None,
|
117
|
+
ds,
|
118
|
+
self.kwargs_for_local_deltacat_storage,
|
85
119
|
)
|
86
120
|
|
87
121
|
def test_when_cpu_resources_is_not_passed(self):
|
88
122
|
from deltacat.compute.compactor.utils import io
|
123
|
+
import deltacat.tests.local_deltacat_storage as ds
|
89
124
|
|
90
125
|
with self.assertRaises(KeyError):
|
91
|
-
io.fit_input_deltas(
|
126
|
+
io.fit_input_deltas(
|
127
|
+
[],
|
128
|
+
{},
|
129
|
+
self.COMPACTION_AUDIT,
|
130
|
+
None,
|
131
|
+
ds,
|
132
|
+
self.kwargs_for_local_deltacat_storage,
|
133
|
+
)
|
File without changes
|
File without changes
|