deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +188 -218
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +259 -316
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +152 -259
- deltacat/compute/compactor/steps/hash_bucket.py +57 -73
- deltacat/compute/compactor/steps/materialize.py +138 -99
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +131 -90
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -42
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +8 -10
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +276 -231
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +38 -32
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
- deltacat-0.1.11.dist-info/RECORD +110 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
- deltacat/autoscaler/events/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/cluster.py +0 -82
- deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
- deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
- deltacat/autoscaler/events/compaction/input.py +0 -27
- deltacat/autoscaler/events/compaction/process.py +0 -25
- deltacat/autoscaler/events/compaction/session_manager.py +0 -13
- deltacat/autoscaler/events/compaction/utils.py +0 -216
- deltacat/autoscaler/events/compaction/workflow.py +0 -303
- deltacat/autoscaler/events/dispatcher.py +0 -95
- deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
- deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
- deltacat/autoscaler/events/event_store.py +0 -55
- deltacat/autoscaler/events/exceptions.py +0 -6
- deltacat/autoscaler/events/processor.py +0 -177
- deltacat/autoscaler/events/session_manager.py +0 -25
- deltacat/autoscaler/events/states.py +0 -88
- deltacat/autoscaler/events/workflow.py +0 -54
- deltacat/autoscaler/node_group.py +0 -230
- deltacat/autoscaler/utils.py +0 -69
- deltacat-0.1.8.dist-info/RECORD +0 -131
- /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
@@ -1,26 +1,25 @@
|
|
1
1
|
import logging
|
2
|
+
from typing import Callable, Dict, List, Optional
|
2
3
|
|
3
|
-
from pyarrow import csv as pacsv
|
4
4
|
from fsspec import AbstractFileSystem
|
5
|
-
|
5
|
+
from pyarrow import csv as pacsv
|
6
6
|
from ray.data import Dataset
|
7
7
|
from ray.data.datasource import BlockWritePathProvider
|
8
8
|
|
9
9
|
from deltacat import logs
|
10
|
-
from deltacat.types.media import
|
11
|
-
|
12
|
-
from typing import Callable, Dict, List, Optional
|
10
|
+
from deltacat.types.media import ContentEncoding, ContentType
|
13
11
|
|
14
12
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
15
13
|
|
16
14
|
|
17
15
|
def write_parquet(
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
16
|
+
dataset: Dataset,
|
17
|
+
base_path: str,
|
18
|
+
*,
|
19
|
+
filesystem: AbstractFileSystem,
|
20
|
+
block_path_provider: BlockWritePathProvider,
|
21
|
+
**kwargs,
|
22
|
+
) -> None:
|
24
23
|
|
25
24
|
dataset.write_parquet(
|
26
25
|
base_path,
|
@@ -32,12 +31,13 @@ def write_parquet(
|
|
32
31
|
|
33
32
|
|
34
33
|
def write_csv(
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
34
|
+
dataset: Dataset,
|
35
|
+
base_path: str,
|
36
|
+
*,
|
37
|
+
filesystem: AbstractFileSystem,
|
38
|
+
block_path_provider: BlockWritePathProvider,
|
39
|
+
**kwargs,
|
40
|
+
) -> None:
|
41
41
|
|
42
42
|
# column names are kept in table metadata, so omit header
|
43
43
|
arrow_csv_args_fn = lambda: {
|
@@ -61,9 +61,7 @@ CONTENT_TYPE_TO_DATASET_WRITE_FUNC: Dict[str, Callable] = {
|
|
61
61
|
}
|
62
62
|
|
63
63
|
|
64
|
-
def slice_dataset(
|
65
|
-
dataset: Dataset,
|
66
|
-
max_len: Optional[int]) -> List[Dataset]:
|
64
|
+
def slice_dataset(dataset: Dataset, max_len: Optional[int]) -> List[Dataset]:
|
67
65
|
"""
|
68
66
|
Returns equally-sized dataset slices of up to `max_len` records each.
|
69
67
|
"""
|
@@ -88,12 +86,13 @@ def dataset_size(dataset: Dataset) -> int:
|
|
88
86
|
|
89
87
|
|
90
88
|
def dataset_to_file(
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
89
|
+
table: Dataset,
|
90
|
+
base_path: str,
|
91
|
+
file_system: AbstractFileSystem,
|
92
|
+
block_path_provider: BlockWritePathProvider,
|
93
|
+
content_type: str = ContentType.PARQUET.value,
|
94
|
+
**kwargs,
|
95
|
+
) -> None:
|
97
96
|
"""
|
98
97
|
Writes the given Distributed Dataset to one or more files.
|
99
98
|
"""
|
@@ -102,11 +101,12 @@ def dataset_to_file(
|
|
102
101
|
raise NotImplementedError(
|
103
102
|
f"Distributed Dataset writer for content type '{content_type}' not"
|
104
103
|
f" implemented. Known content types: "
|
105
|
-
f"{CONTENT_TYPE_TO_DATASET_WRITE_FUNC.keys}"
|
104
|
+
f"{CONTENT_TYPE_TO_DATASET_WRITE_FUNC.keys}"
|
105
|
+
)
|
106
106
|
writer(
|
107
107
|
table,
|
108
108
|
base_path,
|
109
109
|
filesystem=file_system,
|
110
110
|
block_path_provider=block_path_provider,
|
111
|
-
**kwargs
|
111
|
+
**kwargs,
|
112
112
|
)
|
@@ -1,20 +1,16 @@
|
|
1
1
|
import time
|
2
|
-
from deltacat.utils.ray_utils.collections import DistributedCounter
|
3
2
|
from typing import Any, Callable, Tuple
|
4
3
|
|
4
|
+
from deltacat.utils.ray_utils.collections import DistributedCounter
|
5
|
+
|
5
6
|
|
6
7
|
def invoke_with_perf_counter(
|
7
|
-
|
8
|
-
|
9
|
-
func: Callable,
|
10
|
-
*args,
|
11
|
-
**kwargs) -> Tuple[Any, float]:
|
8
|
+
counter: DistributedCounter, counter_key: Any, func: Callable, *args, **kwargs
|
9
|
+
) -> Tuple[Any, float]:
|
12
10
|
|
13
11
|
start = time.perf_counter()
|
14
12
|
result = func(*args, **kwargs)
|
15
13
|
stop = time.perf_counter()
|
16
14
|
latency = stop - start
|
17
|
-
counter.increment.remote(
|
18
|
-
counter_key,
|
19
|
-
latency)
|
15
|
+
counter.increment.remote(counter_key, latency)
|
20
16
|
return result, latency
|
@@ -1,17 +1,17 @@
|
|
1
|
-
import ray
|
2
1
|
import logging
|
3
2
|
import time
|
3
|
+
from typing import Any, Callable, Dict, List
|
4
4
|
|
5
|
-
|
5
|
+
import ray
|
6
6
|
|
7
|
-
from
|
7
|
+
from deltacat import logs
|
8
8
|
|
9
9
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
10
10
|
|
11
11
|
|
12
12
|
def node_resource_keys(
|
13
|
-
|
14
|
-
|
13
|
+
filter_fn: Callable[[Dict[str, Any]], bool] = lambda n: True
|
14
|
+
) -> List[str]:
|
15
15
|
"""Get all Ray resource keys for cluster nodes that pass the given filter
|
16
16
|
as a list of strings of the form: "node:{node_resource_name}". The returned
|
17
17
|
keys can be used to place tasks or actors on that node via:
|
@@ -39,8 +39,9 @@ def current_node_resource_key() -> str:
|
|
39
39
|
"""
|
40
40
|
current_node_id = ray.get_runtime_context().node_id.hex()
|
41
41
|
keys = node_resource_keys(lambda n: n["NodeID"] == current_node_id)
|
42
|
-
assert
|
43
|
-
|
42
|
+
assert (
|
43
|
+
len(keys) <= 1
|
44
|
+
), f"Expected <= 1 keys for the current node, but found {len(keys)}"
|
44
45
|
return keys[0] if len(keys) == 1 else None
|
45
46
|
|
46
47
|
|
@@ -55,9 +56,7 @@ def live_node_count() -> int:
|
|
55
56
|
return sum(1 for n in ray.nodes() if is_node_alive(n))
|
56
57
|
|
57
58
|
|
58
|
-
def live_node_waiter(
|
59
|
-
min_live_nodes: int,
|
60
|
-
poll_interval_seconds: float = 0.5) -> None:
|
59
|
+
def live_node_waiter(min_live_nodes: int, poll_interval_seconds: float = 0.5) -> None:
|
61
60
|
"""Waits until the given minimum number of live nodes are present in the
|
62
61
|
cluster. Checks the current number of live nodes every
|
63
62
|
`poll_interval_seconds`."""
|
@@ -1,9 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: deltacat
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.11
|
4
4
|
Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
|
5
5
|
Home-page: https://github.com/ray-project/deltacat
|
6
6
|
Author: Ray Team
|
7
|
+
License: UNKNOWN
|
8
|
+
Platform: UNKNOWN
|
7
9
|
Classifier: Development Status :: 4 - Beta
|
8
10
|
Classifier: Intended Audience :: Developers
|
9
11
|
Classifier: Programming Language :: Python :: 3 :: Only
|
@@ -13,25 +15,33 @@ Classifier: Programming Language :: Python :: 3.9
|
|
13
15
|
Classifier: Operating System :: OS Independent
|
14
16
|
Requires-Python: >=3.7
|
15
17
|
Description-Content-Type: text/markdown
|
16
|
-
License-File: LICENSE
|
17
|
-
Requires-Dist: s3fs (==2022.1.0)
|
18
|
-
Requires-Dist: tenacity (==8.0.1)
|
19
|
-
Requires-Dist: ray[default] (==2.0.0)
|
20
|
-
Requires-Dist: pandas (>=1.3.1)
|
21
|
-
Requires-Dist: pyarrow (==8.0.0)
|
22
|
-
Requires-Dist: pydantic (==1.10.2)
|
23
|
-
Requires-Dist: numpy (>=1.21.1)
|
24
18
|
Requires-Dist: boto3 (==1.20.24)
|
19
|
+
Requires-Dist: numpy (==1.21.5)
|
20
|
+
Requires-Dist: pandas (==1.3.5)
|
21
|
+
Requires-Dist: pyarrow (==10.0.1)
|
22
|
+
Requires-Dist: pydantic (==1.10.4)
|
23
|
+
Requires-Dist: ray[default] (==2.0.0)
|
24
|
+
Requires-Dist: s3fs (==2022.2.0)
|
25
|
+
Requires-Dist: tenacity (==8.1.0)
|
25
26
|
Requires-Dist: typing-extensions (==4.4.0)
|
26
27
|
|
27
28
|
# DeltaCAT
|
28
29
|
|
29
30
|
DeltaCAT is a Pythonic Data Catalog powered by Ray.
|
30
31
|
|
31
|
-
Its data storage model allows you to define and manage fast, scalable,
|
32
|
-
ACID-compliant data catalogs through git-like stage/commit APIs, and has been
|
32
|
+
Its data storage model allows you to define and manage fast, scalable,
|
33
|
+
ACID-compliant data catalogs through git-like stage/commit APIs, and has been
|
33
34
|
used to successfully host exabyte-scale enterprise data lakes.
|
34
35
|
|
35
36
|
DeltaCAT uses the Ray distributed compute framework together with Apache Arrow
|
36
|
-
for common table management tasks, including petabyte-scale
|
37
|
+
for common table management tasks, including petabyte-scale
|
37
38
|
change-data-capture, data consistency checks, and table repair.
|
39
|
+
|
40
|
+
## Getting Started
|
41
|
+
---
|
42
|
+
### Install
|
43
|
+
```
|
44
|
+
pip install deltacat
|
45
|
+
```
|
46
|
+
|
47
|
+
|
@@ -0,0 +1,110 @@
|
|
1
|
+
deltacat/__init__.py,sha256=gF2hBR7_JIL4gQRa1JN-fDraLLLmwXisIOcZLpbwTCM,1808
|
2
|
+
deltacat/constants.py,sha256=E_1hOQolyvJCWB8eIVWtlAMgk2dmXGyXBhW05czilwQ,1173
|
3
|
+
deltacat/exceptions.py,sha256=x7qem7FLujXf-DzPsNcQ-XYkW3cF3A0YGIbxkcpz0Mw,146
|
4
|
+
deltacat/logs.py,sha256=T_-_JwOZFRSV64-KvmhMf-dInvYvuO4CSN-1EDrJJsU,5808
|
5
|
+
deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
deltacat/aws/clients.py,sha256=gBwSjCUfzQyiq-empApD685S-FhVlTjdJtDIWpR_alg,1743
|
7
|
+
deltacat/aws/constants.py,sha256=4OnwC1H12FPs2bpVN7tXkxn-DAEJS4MYRrFh5HKFv7s,204
|
8
|
+
deltacat/aws/s3u.py,sha256=KTAG9uNCYpANG-rWNAByhF062bpEnCqFP-_Crp1y6dA,17371
|
9
|
+
deltacat/aws/redshift/__init__.py,sha256=fjuv3jWdPE8IgF4uSrL0YEqV3XUfqDULX3xV27ICceo,266
|
10
|
+
deltacat/aws/redshift/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
+
deltacat/aws/redshift/model/manifest.py,sha256=E71avhRHQGEZ6It3-axCB5FdiieQhSMu9Wt8oZAdXro,9519
|
12
|
+
deltacat/catalog/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
|
+
deltacat/catalog/delegate.py,sha256=yuMh8vcXwYBcaMO9HYasbj4DHJIl6Y5xZ5Qd2kTT278,8755
|
14
|
+
deltacat/catalog/interface.py,sha256=A3Mr5tOBEG4VgDJuzrt5XEwrbNxYZPWHq34TAJvnX5M,6566
|
15
|
+
deltacat/catalog/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
+
deltacat/catalog/model/catalog.py,sha256=-Ho7a3rV1hiOS9cSRCAor9AtXV9nJn9t_MDVql9pIxo,2212
|
17
|
+
deltacat/catalog/model/table_definition.py,sha256=tKrM1mmaQlvxqXrLt3QJVZK5BZfaJnhjTZ6KjybYlhE,727
|
18
|
+
deltacat/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
+
deltacat/compute/compactor/__init__.py,sha256=5wjMMS0-22weCtKZe76dQwT6YGFCYgLtvhsb2gd5a8M,1078
|
20
|
+
deltacat/compute/compactor/compaction_session.py,sha256=JJHZZmsQ4bOXYnmazomlnd64kcDCEdsfN7TMXRIqYvs,21182
|
21
|
+
deltacat/compute/compactor/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
+
deltacat/compute/compactor/model/delta_annotated.py,sha256=BFnyeoeLkSwCDsfzipENgRBU8x8GiudSqZj946H8agY,7703
|
23
|
+
deltacat/compute/compactor/model/delta_file_envelope.py,sha256=SfmdEPbfCfUekV_NvlWcyQOYo1CEHcRCGXHWHqmYQYg,1835
|
24
|
+
deltacat/compute/compactor/model/delta_file_locator.py,sha256=Cc-YzxxyrXK6FlY8ek2L92XzfT0qkMCxs6yrC_FsEwU,1766
|
25
|
+
deltacat/compute/compactor/model/materialize_result.py,sha256=b1Pwa89fgvr7rX3uSWwIt2ld-ElmqOSu-BXkZ1wwXdA,1253
|
26
|
+
deltacat/compute/compactor/model/primary_key_index.py,sha256=MT4zqwhzh3e9qZotWvZavT_MtWXm_81ojfcOCv1t17w,10459
|
27
|
+
deltacat/compute/compactor/model/pyarrow_write_result.py,sha256=WYIa0DRcyaemR6yUS8_8RLQ2voTmCVNFUL99qxPmt70,1324
|
28
|
+
deltacat/compute/compactor/model/round_completion_info.py,sha256=3s0rAjJoV_IZ9OBe6KxopOijte2cS4khS2Nuw-Q2NQ8,3041
|
29
|
+
deltacat/compute/compactor/model/sort_key.py,sha256=XDIoYrV18FciomV5yWxu1OaDsD78trmUUtseyRurIKo,4124
|
30
|
+
deltacat/compute/compactor/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
+
deltacat/compute/compactor/steps/dedupe.py,sha256=cSmgeTYm0lQvteNZhvRFJjX0zeBJd2-gvnY9smsVmzk,11044
|
32
|
+
deltacat/compute/compactor/steps/hash_bucket.py,sha256=XIPudRw4a9l7cCjfIQhYz_szktaY4tD6pcHRNBIh-HM,5809
|
33
|
+
deltacat/compute/compactor/steps/materialize.py,sha256=8xsKBhBxankFLas6Ay97KeYcwyFlCs82UieBTLneTTE,8786
|
34
|
+
deltacat/compute/compactor/steps/rehash/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
|
+
deltacat/compute/compactor/steps/rehash/rehash_bucket.py,sha256=yh-sBuUI3hqw2vk_nK9o-KDrgSww4oSvAz2hBxTkv8s,1765
|
36
|
+
deltacat/compute/compactor/steps/rehash/rewrite_index.py,sha256=-HVM08pk5ROHEgDP-FVty55-a_0dsGRiSnPlNJw7C6Q,1838
|
37
|
+
deltacat/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
|
+
deltacat/compute/compactor/utils/io.py,sha256=QMyh-oh4EqpN-lnurxJAS50IvY5LNoWHT6o6KhEPDiw,9637
|
39
|
+
deltacat/compute/compactor/utils/primary_key_index.py,sha256=taYw1AjGIFlD9c8OXyj9ps816a15B61aoV4I00EAUyo,12072
|
40
|
+
deltacat/compute/compactor/utils/round_completion_file.py,sha256=F4HwEG3egg59w4eiSvwrYImcDWgk56oXOyprnuYZYdE,2078
|
41
|
+
deltacat/compute/compactor/utils/system_columns.py,sha256=ge4cL0RVsZ-9vTyU0xErnB-ClVASF5_CxsOTnAXnpfc,7106
|
42
|
+
deltacat/compute/metastats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
43
|
+
deltacat/compute/metastats/meta_stats.py,sha256=-Fb0yQAdUUgm2IShcWlPZto-qdivF-nK05sQqJu7K5s,18588
|
44
|
+
deltacat/compute/metastats/stats.py,sha256=-aFFrh7b--PzvQWNJG5_PgdN7ZM1bmGMeha5khwxhNw,7285
|
45
|
+
deltacat/compute/metastats/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
46
|
+
deltacat/compute/metastats/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
47
|
+
deltacat/compute/metastats/model/partition_stats_dict.py,sha256=FbfoOxmTZfjRT7iHwc_96gHmB_r6iUvVM9BoTldD5mY,1123
|
48
|
+
deltacat/compute/metastats/model/stats_cluster_size_estimator.py,sha256=AfH2rsC1DdJ2R_CwOPgjGJ04h-yWROsMfTw83GdpGXM,2849
|
49
|
+
deltacat/compute/metastats/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
50
|
+
deltacat/compute/metastats/utils/constants.py,sha256=bFUPKmR3FkjEnwpHuToQYZ9QcHqYpd4OMMSwVwnJcaA,869
|
51
|
+
deltacat/compute/metastats/utils/io.py,sha256=F9bY0Wo-qeokBLn5eXN9zIV2duLTXO5aNUMbL3_Ae2U,8825
|
52
|
+
deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py,sha256=-3utoiC9fP2UFiJ-u7KbESNiHCRVzh5NGtSld0xRXX0,1143
|
53
|
+
deltacat/compute/metastats/utils/ray_utils.py,sha256=sEDzcA0K8DMbQ_i8axBCQiPRrySPM14piaTqzKqhkss,4516
|
54
|
+
deltacat/compute/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
55
|
+
deltacat/compute/stats/basic.py,sha256=m_tDdtLbsyyky-UJ0UULBZDoAAjYr02O0sSvFCKyHGk,8837
|
56
|
+
deltacat/compute/stats/types.py,sha256=cp0lT8nITTKbnkc03OysRjXfcfXzQml9a4wqCnR6kqs,215
|
57
|
+
deltacat/compute/stats/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
58
|
+
deltacat/compute/stats/models/delta_column_stats.py,sha256=-wXjB2c0BC1RDheumjL_j5-DfRNql4WsK9GpMFQI1cg,3300
|
59
|
+
deltacat/compute/stats/models/delta_stats.py,sha256=hBith8_hbF9TVr6HocLAt6RJ_kZZKO4zrGP8VOP05vA,8556
|
60
|
+
deltacat/compute/stats/models/delta_stats_cache_result.py,sha256=mbJYxpZd5jaER_BWrCD2hROFy3p1nNdBrj66nUpc6io,1624
|
61
|
+
deltacat/compute/stats/models/manifest_entry_stats.py,sha256=NCDAe2nPDEI4kOkuwNkRFgGPS-rqQaQqLuaLoKk20KQ,2419
|
62
|
+
deltacat/compute/stats/models/stats_result.py,sha256=XQAlmzhUqRmg4jzEMUAOqcYn1HUOBTMryBH1CCVlet8,3820
|
63
|
+
deltacat/compute/stats/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
64
|
+
deltacat/compute/stats/utils/intervals.py,sha256=9ezOzIrBGU1fWBuAn1CorJ3uX5COU7vxrfA7kI1cB7I,3094
|
65
|
+
deltacat/compute/stats/utils/io.py,sha256=ZXpntXqa41l5bxxAa2vcTW5mVpWeBIvd3QA9VWnX-aw,8573
|
66
|
+
deltacat/compute/stats/utils/manifest_stats_file.py,sha256=PtqW5Zc5e09HcfiAgvoZHVMJ2gamGdwmynMXOJNJUaY,3693
|
67
|
+
deltacat/io/__init__.py,sha256=5Al7BPSaQghEp1K3PfiKIJJ0HR6MUuaN7HTMyM_9lf4,154
|
68
|
+
deltacat/io/dataset.py,sha256=8w9sPVDpGnjjGVDWB39YSKWxq4zRv9VEfDtj7PYwjqM,3755
|
69
|
+
deltacat/io/read_api.py,sha256=BhkjL3xjY-fsa62AA9Yv20_88uTskn4_Bv2W6VmMXVA,7023
|
70
|
+
deltacat/io/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
71
|
+
deltacat/io/aws/redshift/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
72
|
+
deltacat/io/aws/redshift/redshift_datasource.py,sha256=X183O4tgBqtaZOSFmMFvp-9mv8NX5kGvRvX0eoSX8rA,22599
|
73
|
+
deltacat/storage/__init__.py,sha256=ElzZuG5zrX9nUIe7f0Sp21WDX7yBoclclq3TIL-doag,1371
|
74
|
+
deltacat/storage/interface.py,sha256=pw8t0jCqPakw13wDpg_cW0eBGBpqG1GO0djg-ZNNW6Q,21133
|
75
|
+
deltacat/storage/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
76
|
+
deltacat/storage/model/delta.py,sha256=bmcG1rF6mwUdM3YHh6M9gLV6uqxbwHZVNS3WHkXFeDw,13734
|
77
|
+
deltacat/storage/model/list_result.py,sha256=FgD6oYeKo0EPe8z7jC8T4pAFjBOuBwd4axxGrnYyBG4,2466
|
78
|
+
deltacat/storage/model/locator.py,sha256=1S7szmDSx-R4Z3arFNILOvS4t7dF7_rJNV9fHyRc3G4,1296
|
79
|
+
deltacat/storage/model/namespace.py,sha256=KI2umYWShXFTx1ykLwsQjuce078WYo_Hmavn3DDeBzE,2086
|
80
|
+
deltacat/storage/model/partition.py,sha256=6Sknqi2GhtaSpkM--3oMjR9agRLHS4i7izFWM4iiGmY,11068
|
81
|
+
deltacat/storage/model/stream.py,sha256=XZ-c4EQR89NWydEOEG5GCaT8zST10OmjLZBKHZPdrzA,7738
|
82
|
+
deltacat/storage/model/table.py,sha256=IOu1ZOrdRkVDB-FOxYMRvnNf5TukIDfbdHWTqHYN_OY,4225
|
83
|
+
deltacat/storage/model/table_version.py,sha256=j57er3zlN0_2kwVMpWZ3iouABO-Kl8_Txi0UWIZ0dtk,7034
|
84
|
+
deltacat/storage/model/types.py,sha256=LQPe_CxcoW2N67Leu3fNbnSXhbl9ubDa8LVvBY0JUiY,1580
|
85
|
+
deltacat/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
86
|
+
deltacat/tests/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
87
|
+
deltacat/tests/stats/test_intervals.py,sha256=S92DgkALQ1WmbLWcxtvS7RlVGvL-XoPJKUUbkdn9_CQ,1955
|
88
|
+
deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
89
|
+
deltacat/tests/utils/test_record_batch_tables.py,sha256=yLExx5jZfi65uSjkdhOCGnP7Km6zWqKCzmULf1PEKA0,11322
|
90
|
+
deltacat/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
91
|
+
deltacat/types/media.py,sha256=py1BnfMqNpJlW1RKzHWwB0NmQ33oCk9qg1fz7alvi3E,2187
|
92
|
+
deltacat/types/tables.py,sha256=yUzkzmUij8kssEYI_dfVDSLXf8HfMm_jpgWkPxDHAas,3893
|
93
|
+
deltacat/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
94
|
+
deltacat/utils/common.py,sha256=RG_-enXNpLKaYrqyx1ne2lL10lxN9vK7F631oJP6SE8,1375
|
95
|
+
deltacat/utils/numpy.py,sha256=ZiGREobTVT6IZXgPxkSUpLJFN2Hn8KEZcrqybLDXCIA,2027
|
96
|
+
deltacat/utils/pandas.py,sha256=eGOpiZE1zLznTtuwoN80j4PBp1_bUV8SE4c951r0a3o,9561
|
97
|
+
deltacat/utils/performance.py,sha256=rC3CPfroZP3T5TbRNZXB9GRBr0F9i2KUeZYL45JBgCU,610
|
98
|
+
deltacat/utils/placement.py,sha256=JE6OsW16VonlMhdH5B2IYuLJxItoYguaKpZNgbpMNLw,11066
|
99
|
+
deltacat/utils/pyarrow.py,sha256=Xf7KKTlA6wPJXcd_Uopm6iTSM9IlZ0M6Ajr4tWJP8OU,18230
|
100
|
+
deltacat/utils/ray_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
101
|
+
deltacat/utils/ray_utils/collections.py,sha256=hj20s4D2RF2jZETU_44r6mFbsczA0JI_I_4kWKTmqes,1951
|
102
|
+
deltacat/utils/ray_utils/concurrency.py,sha256=AyL7hpvYjkmsz-KcpYjVgPpNsmu-x8-rlLyG0qXoV_c,5123
|
103
|
+
deltacat/utils/ray_utils/dataset.py,sha256=SIljK3UkSqQ6Ntit_iSiYt9yYjN_gGrCTX6_72XdQ3w,3244
|
104
|
+
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
105
|
+
deltacat/utils/ray_utils/runtime.py,sha256=pUCSt2Fo8iMrvjmbkQdFtgSxZW9PA05H1molItzr5Rc,4786
|
106
|
+
deltacat-0.1.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
107
|
+
deltacat-0.1.11.dist-info/METADATA,sha256=Ce07iEuy13s5C0jOpFDUkcL8jhDWy8BvWTP4GHPWDdw,1493
|
108
|
+
deltacat-0.1.11.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
|
109
|
+
deltacat-0.1.11.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
110
|
+
deltacat-0.1.11.dist-info/RECORD,,
|
File without changes
|
File without changes
|
@@ -1,82 +0,0 @@
|
|
1
|
-
from typing import Dict, Any
|
2
|
-
|
3
|
-
import math
|
4
|
-
|
5
|
-
|
6
|
-
MB_IN_BYTES = 1_000_000
|
7
|
-
|
8
|
-
|
9
|
-
class ClusterSizeSuggester:
|
10
|
-
def __init__(self,
|
11
|
-
cluster_memory_bytes: float = None,
|
12
|
-
heap_memory_alloc_ratio: float = 0.7, # Ray defaults
|
13
|
-
object_store_memory_alloc_ratio: float = 0.3, # Ray defaults
|
14
|
-
instance_type: str = "r5.8xlarge"):
|
15
|
-
"""Given the total required memory constraints, constructs a helper class that
|
16
|
-
recommends an instance type, number of instance nodes, max usable heap and object store memory,
|
17
|
-
|
18
|
-
Args:
|
19
|
-
cluster_memory_bytes: Total memory needed for the cluster.
|
20
|
-
If not provided, defaults to memory size of instance type.
|
21
|
-
heap_memory_alloc_ratio: Optional. Set to 0.7 by default.
|
22
|
-
object_store_memory_alloc_ratio: Optional. Set to 0.3 by default.
|
23
|
-
instance_type: Optional. Set to r5.8xlarge by default, to allow for up to 8GB of memory per vCPU.
|
24
|
-
# TODO: suggest various r5 instance types based on memory input
|
25
|
-
"""
|
26
|
-
self._instance_type = instance_type
|
27
|
-
self.cluster_memory_bytes = cluster_memory_bytes if cluster_memory_bytes else self.get_node_memory_size()
|
28
|
-
self.heap_memory_alloc_ratio = heap_memory_alloc_ratio
|
29
|
-
self.object_store_memory_alloc_ratio = object_store_memory_alloc_ratio
|
30
|
-
|
31
|
-
@property
|
32
|
-
def instance_type(self):
|
33
|
-
return self._instance_type
|
34
|
-
|
35
|
-
def get_instance_type_specifications(self) -> Dict[str, Any]:
|
36
|
-
"""Assumes r5.8xlarge instances (for now)
|
37
|
-
|
38
|
-
Returns: a dict of hardware details
|
39
|
-
|
40
|
-
"""
|
41
|
-
# TODO: call ec2 describe-instance-types to extract hardware details (vCPUs, memory, network bandwidth).
|
42
|
-
# Current implementation assumes we only serve r5.8xlarge node types.
|
43
|
-
if self.instance_type == "r5.8xlarge":
|
44
|
-
return {
|
45
|
-
# Intentionally mimic the output format of describe-instance-types API
|
46
|
-
"VCpuInfo": {
|
47
|
-
"DefaultVCpus": 32,
|
48
|
-
"DefaultCores": 16,
|
49
|
-
"DefaultThreadsPerCore": 2,
|
50
|
-
},
|
51
|
-
"MemoryInfo": {
|
52
|
-
"SizeInMiB": 262144
|
53
|
-
}
|
54
|
-
}
|
55
|
-
|
56
|
-
def get_num_vcpu_per_node(self):
|
57
|
-
spec = self.get_instance_type_specifications()
|
58
|
-
return spec["VCpuInfo"]["DefaultVCpus"]
|
59
|
-
|
60
|
-
def get_node_memory_size(self):
|
61
|
-
spec = self.get_instance_type_specifications()
|
62
|
-
return spec["MemoryInfo"]["SizeInMiB"] * MB_IN_BYTES
|
63
|
-
|
64
|
-
def get_max_memory_per_vcpu(self):
|
65
|
-
return self.get_node_memory_size() / self.get_num_vcpu_per_node()
|
66
|
-
|
67
|
-
def get_node_max_object_store_memory(self):
|
68
|
-
return self.object_store_memory_alloc_ratio * self.get_node_memory_size()
|
69
|
-
|
70
|
-
def get_node_max_heap_memory(self):
|
71
|
-
return self.heap_memory_alloc_ratio * self.get_node_memory_size()
|
72
|
-
|
73
|
-
def get_suggested_vcpu_count(self):
|
74
|
-
return self.cluster_memory_bytes / self.get_max_memory_per_vcpu()
|
75
|
-
|
76
|
-
def get_suggested_node_size(self):
|
77
|
-
return math.ceil(self.cluster_memory_bytes / (self.get_num_vcpu_per_node() * self.get_max_memory_per_vcpu()))
|
78
|
-
|
79
|
-
|
80
|
-
class InstanceTypeSuggester:
|
81
|
-
def __init__(self):
|
82
|
-
raise NotImplementedError("Instance Type Suggester is not implemented.")
|
File without changes
|
@@ -1,36 +0,0 @@
|
|
1
|
-
from typing import List, Iterable
|
2
|
-
|
3
|
-
|
4
|
-
class PartitionKeyValue(dict):
|
5
|
-
@staticmethod
|
6
|
-
def of(key_name: str,
|
7
|
-
key_type: str,
|
8
|
-
value: str):
|
9
|
-
pkv = PartitionKeyValue()
|
10
|
-
pkv["key"] = {}
|
11
|
-
pkv["key"]["keyName"] = key_name
|
12
|
-
pkv["key"]["keyType"] = key_type
|
13
|
-
pkv["value"] = value
|
14
|
-
return pkv
|
15
|
-
|
16
|
-
@property
|
17
|
-
def key(self):
|
18
|
-
return self["key"]
|
19
|
-
|
20
|
-
@property
|
21
|
-
def key_name(self):
|
22
|
-
return self["key"]["keyName"]
|
23
|
-
|
24
|
-
@property
|
25
|
-
def key_type(self):
|
26
|
-
return self["key"]["keyType"]
|
27
|
-
|
28
|
-
@property
|
29
|
-
def value(self):
|
30
|
-
return self["value"]
|
31
|
-
|
32
|
-
|
33
|
-
class PartitionKeyValues(tuple):
|
34
|
-
@property
|
35
|
-
def id(self):
|
36
|
-
return ".".join([pkv.value for pkv in self if isinstance(pkv, PartitionKeyValue)])
|
@@ -1,28 +0,0 @@
|
|
1
|
-
import logging
|
2
|
-
|
3
|
-
from deltacat.autoscaler.events.compaction.session_manager import CompactionSessionManager
|
4
|
-
from deltacat.autoscaler.events.dispatcher import EventDispatcher
|
5
|
-
from ray.autoscaler._private.aws.events import EventPublisher
|
6
|
-
|
7
|
-
from deltacat import logs
|
8
|
-
from deltacat.storage import interface as unimplemented_deltacat_storage
|
9
|
-
|
10
|
-
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
11
|
-
|
12
|
-
|
13
|
-
class CompactionEventDispatcher(EventDispatcher):
|
14
|
-
def __init__(self,
|
15
|
-
events_publisher: EventPublisher,
|
16
|
-
deltacat_storage: unimplemented_deltacat_storage,
|
17
|
-
session_manager: CompactionSessionManager = None):
|
18
|
-
"""Constructor for the event dispatcher.
|
19
|
-
|
20
|
-
Intended for usage by Ray parent and child clusters running compaction jobs.
|
21
|
-
|
22
|
-
Args:
|
23
|
-
events_publisher: Events manager for publishing events through a cloud provider
|
24
|
-
deltacat_storage: Storage interface for deltacat
|
25
|
-
session_manager: Manager for launching child Ray sessions
|
26
|
-
"""
|
27
|
-
super().__init__(events_publisher, deltacat_storage, session_manager)
|
28
|
-
self.session_manager = session_manager
|
@@ -1,27 +0,0 @@
|
|
1
|
-
from typing import Set, NamedTuple, List, Optional, Dict
|
2
|
-
|
3
|
-
from deltacat import ContentType, SortKey
|
4
|
-
from deltacat.autoscaler.events.compaction.collections.partition_key_value import PartitionKeyValues
|
5
|
-
from deltacat.compute.stats.models.delta_stats import DeltaStats
|
6
|
-
from deltacat.storage import PartitionLocator, interface as unimplemented_deltacat_storage
|
7
|
-
|
8
|
-
import pyarrow as pa
|
9
|
-
|
10
|
-
|
11
|
-
class CompactionInput(NamedTuple):
|
12
|
-
source_partition_locator: PartitionLocator
|
13
|
-
compacted_partition_locator: PartitionLocator
|
14
|
-
primary_keys: Set[str]
|
15
|
-
compaction_artifact_s3_bucket: str
|
16
|
-
last_stream_position_to_compact: int
|
17
|
-
hash_bucket_count: Optional[int] = None
|
18
|
-
sort_keys: List[SortKey] = None
|
19
|
-
records_per_primary_key_index_file: int = 38_000_000
|
20
|
-
records_per_compacted_file: int = 4_000_000
|
21
|
-
input_deltas_stats: Optional[Dict[int, DeltaStats]] = None
|
22
|
-
min_hash_bucket_chunk_size: int = 0
|
23
|
-
compacted_file_content_type: ContentType = ContentType.PARQUET
|
24
|
-
delete_prev_primary_key_index: bool = False
|
25
|
-
schema_on_read: Optional[pa.schema] = None
|
26
|
-
deltacat_storage = unimplemented_deltacat_storage
|
27
|
-
partition_key_values: PartitionKeyValues = None
|
@@ -1,25 +0,0 @@
|
|
1
|
-
from typing import Optional
|
2
|
-
|
3
|
-
from deltacat.autoscaler.events.compaction.collections.partition_key_value import PartitionKeyValues
|
4
|
-
from deltacat.storage import PartitionLocator
|
5
|
-
|
6
|
-
|
7
|
-
class CompactionProcess:
|
8
|
-
def __init__(self,
|
9
|
-
partition_locator: PartitionLocator,
|
10
|
-
compaction_cluster_config_path: str,
|
11
|
-
hash_bucket_count: Optional[int] = None,
|
12
|
-
last_stream_position_to_compact: Optional[int] = None,
|
13
|
-
partition_key_values: PartitionKeyValues = None,
|
14
|
-
cluster_memory_bytes: Optional[int] = None,
|
15
|
-
input_delta_total_bytes: Optional[int] = None):
|
16
|
-
self.partition_locator = partition_locator
|
17
|
-
self.compaction_cluster_config_path = compaction_cluster_config_path
|
18
|
-
self.hash_bucket_count = hash_bucket_count
|
19
|
-
self.last_stream_position_to_compact = last_stream_position_to_compact
|
20
|
-
self.partition_values = partition_key_values
|
21
|
-
self.cluster_memory_bytes = cluster_memory_bytes
|
22
|
-
self.input_delta_total_bytes = input_delta_total_bytes
|
23
|
-
self.id = ".".join([pkv.value for pkv in partition_key_values])
|
24
|
-
|
25
|
-
|
@@ -1,13 +0,0 @@
|
|
1
|
-
from typing import List
|
2
|
-
|
3
|
-
from deltacat.autoscaler.events.compaction.process import CompactionProcess
|
4
|
-
from deltacat.autoscaler.events.session_manager import SessionManager
|
5
|
-
from deltacat.storage import PartitionLocator
|
6
|
-
|
7
|
-
|
8
|
-
class CompactionSessionManager(SessionManager):
|
9
|
-
def launch_stats_metadata_collection(self, source_partition_locators: List[PartitionLocator]):
|
10
|
-
raise NotImplementedError("Stats Metadata Collection is not implemented.")
|
11
|
-
|
12
|
-
def launch_compaction(self, compaction_processes: List[CompactionProcess]):
|
13
|
-
raise NotImplementedError("Compaction is not implemented.")
|