deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +2 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/RECORD +80 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/tests/utils/test_pyarrow.py ADDED
@@ -0,0 +1,133 @@
+from unittest import TestCase
+from deltacat.utils.pyarrow import (
+    s3_parquet_file_to_table,
+    s3_partial_parquet_file_to_table,
+)
+from deltacat.types.media import ContentEncoding, ContentType
+from deltacat.types.partial_download import PartialParquetParameters
+from pyarrow.parquet import ParquetFile
+import pyarrow as pa
+
+PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
+
+
+class TestS3ParquetFileToTable(TestCase):
+    def test_s3_parquet_file_to_table_sanity(self):
+
+        result = s3_parquet_file_to_table(
+            PARQUET_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            ["n_legs", "animal"],
+            ["n_legs"],
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.column_names), 1)
+        schema = result.schema
+        schema_index = schema.get_field_index("n_legs")
+        self.assertEqual(schema.field(schema_index).type, "int64")
+
+    def test_s3_parquet_file_to_table_when_schema_overridden(self):
+
+        schema = pa.schema(
+            [pa.field("animal", pa.string()), pa.field("n_legs", pa.string())]
+        )
+
+        pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
+
+        result = s3_parquet_file_to_table(
+            PARQUET_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            ["n_legs", "animal"],
+            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.column_names), 2)
+
+        result_schema = result.schema
+        for index, field in enumerate(result_schema):
+            self.assertEqual(field.name, schema.field(index).name)
+
+        self.assertEqual(result.schema.field(1).type, "string")
+
+
+class TestS3PartialParquetFileToTable(TestCase):
+    def test_s3_partial_parquet_file_to_table_sanity(self):
+
+        pq_file = ParquetFile(PARQUET_FILE_PATH)
+        partial_parquet_params = PartialParquetParameters.of(
+            pq_metadata=pq_file.metadata
+        )
+
+        self.assertEqual(
+            partial_parquet_params.num_row_groups, 2, "test_file.parquet has changed."
+        )
+
+        # only first row group to be downloaded
+        partial_parquet_params.row_groups_to_download.pop()
+
+        result = s3_partial_parquet_file_to_table(
+            PARQUET_FILE_PATH,
+            include_columns=["n_legs"],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            partial_file_download_params=partial_parquet_params,
+        )
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(len(result.columns), 1)
+
+    def test_s3_partial_parquet_file_to_table_when_schema_passed(self):
+
+        pq_file = ParquetFile(PARQUET_FILE_PATH)
+        partial_parquet_params = PartialParquetParameters.of(
+            pq_metadata=pq_file.metadata
+        )
+        # only first row group to be downloaded
+        partial_parquet_params.row_groups_to_download.pop()
+
+        schema = pa.schema(
+            [pa.field("animal", pa.string()), pa.field("n_legs", pa.string())]
+        )
+
+        pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
+
+        result = s3_partial_parquet_file_to_table(
+            PARQUET_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            ["n_legs", "animal"],
+            pa_read_func_kwargs_provider=pa_kwargs_provider,
+            partial_file_download_params=partial_parquet_params,
+        )
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(len(result.column_names), 2)
+
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(0).name, "n_legs")  # order doesn't change
+
+    def test_s3_partial_parquet_file_to_table_when_multiple_row_groups(self):
+
+        pq_file = ParquetFile(PARQUET_FILE_PATH)
+        partial_parquet_params = PartialParquetParameters.of(
+            pq_metadata=pq_file.metadata
+        )
+
+        self.assertEqual(
+            partial_parquet_params.num_row_groups, 2, "test_file.parquet has changed."
+        )
+
+        result = s3_partial_parquet_file_to_table(
+            PARQUET_FILE_PATH,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            partial_file_download_params=partial_parquet_params,
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.columns), 2)
deltacat/tests/utils/test_resources.py CHANGED
@@ -1,37 +1,22 @@
 import unittest
 from unittest import mock
-import sys
+import time


 class TestGetCurrentClusterUtilization(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls) -> None:
-        cls.ray_mock = mock.MagicMock()
-        cls.ray_mock.cluster_resources.return_value = {
+    @mock.patch("deltacat.utils.resources.ray")
+    def test_sanity(self, ray_mock):
+        ray_mock.cluster_resources.return_value = {
             "CPU": 10,
             "memory": 10,
             "object_store_memory": 5,
         }
-        cls.ray_mock.available_resources.return_value = {
+        ray_mock.available_resources.return_value = {
             "CPU": 6,
             "memory": 4,
             "object_store_memory": 5,
         }

-        cls.module_patcher = mock.patch.dict("sys.modules", {"ray": cls.ray_mock})
-        cls.module_patcher.start()
-
-        # delete reference to reload from mocked ray
-        if "deltacat.utils.resources" in sys.modules:
-            del sys.modules["deltacat.utils.resources"]
-
-        super().setUpClass()
-
-    @classmethod
-    def tearDownClass(cls) -> None:
-        cls.module_patcher.stop()
-
-    def test_sanity(self):
         from deltacat.utils.resources import ClusterUtilization

         result = ClusterUtilization.get_current_cluster_utilization()
@@ -43,3 +28,21 @@ class TestGetCurrentClusterUtilization(unittest.TestCase):
         self.assertEqual(0, result.used_object_store_memory_bytes)
         self.assertEqual(6, result.used_memory_bytes)
         self.assertIsNotNone(result.used_resources)
+
+
+class TestClusterUtilizationOverTimeRange(unittest.TestCase):
+    @mock.patch("deltacat.utils.resources.ray")
+    def test_sanity(self, ray_mock):
+        from deltacat.utils.resources import ClusterUtilizationOverTimeRange
+
+        ray_mock.cluster_resources.side_effect = [{"CPU": 32} for _ in range(5)]
+        ray_mock.available_resources.side_effect = [
+            {"CPU": 2 ** (i + 1)} for i in range(5)
+        ]
+
+        with ClusterUtilizationOverTimeRange() as cu:
+            time.sleep(3)
+        self.assertTrue(cu.used_vcpu_seconds <= 82)  # 30 + 28 + 24
+        self.assertTrue(
+            cu.total_vcpu_seconds >= cu.used_vcpu_seconds
+        )  # total is at least used
deltacat/types/partial_download.py ADDED
@@ -0,0 +1,82 @@
+from __future__ import annotations
+
+from typing import Dict, Any, Optional, List
+from pyarrow.parquet import FileMetaData
+
+
+class PartialFileDownloadParams(Dict[str, Any]):
+    """
+    A content type params class used to represent the arguments required
+    to partially download a file. This is useful specifically in cases
+    where you'd like to instruct the downloader to partially download a
+    manifest entry.
+    """
+
+    pass
+
+
+class PartialParquetParameters(PartialFileDownloadParams):
+    @staticmethod
+    def of(
+        row_groups_to_download: Optional[List[int]] = None,
+        num_row_groups: Optional[int] = None,
+        num_rows: Optional[int] = None,
+        in_memory_size_bytes: Optional[float] = None,
+        pq_metadata: Optional[FileMetaData] = None,
+    ) -> PartialParquetParameters:
+
+        if (
+            row_groups_to_download is None
+            or num_row_groups is None
+            or num_rows is None
+            or in_memory_size_bytes is None
+        ):
+            assert (
+                pq_metadata is not None
+            ), "Parquet file metadata must be passed explicitly"
+
+            num_row_groups = pq_metadata.num_row_groups
+            row_groups_to_download = [rg for rg in range(num_row_groups)]
+            in_memory_size_bytes = 0.0
+
+            for rg in row_groups_to_download:
+                row_group_meta = pq_metadata.row_group(rg)
+                in_memory_size_bytes += row_group_meta.total_byte_size
+
+        result = PartialParquetParameters(
+            {
+                "row_groups_to_download": row_groups_to_download,
+                "num_row_groups": num_row_groups,
+                "num_rows": num_rows,
+                "in_memory_size_bytes": in_memory_size_bytes,
+            }
+        )
+
+        if pq_metadata:
+            result["pq_metadata"] = pq_metadata
+
+        return result
+
+    @property
+    def row_groups_to_download(self) -> List[int]:
+        return self["row_groups_to_download"]
+
+    @property
+    def num_row_groups(self) -> List[int]:
+        return self["num_row_groups"]
+
+    @property
+    def num_rows(self) -> int:
+        return self["num_rows"]
+
+    @property
+    def in_memory_size_bytes(self) -> float:
+        return self["in_memory_size_bytes"]
+
+    @property
+    def pq_metadata(self) -> Optional[FileMetaData]:
+        return self.get("pq_metadata")
+
+    @pq_metadata.setter
+    def pq_metadata(self, metadata: FileMetaData) -> None:
+        self["pq_metadata"] = metadata
deltacat/types/tables.py CHANGED
@@ -21,6 +21,7 @@ from deltacat.utils import pyarrow as pa_utils
 from deltacat.utils.ray_utils import dataset as ds_utils

 TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
+    TableType.PYARROW_PARQUET.value: pa_utils.s3_file_to_parquet,
     TableType.PYARROW.value: pa_utils.s3_file_to_table,
     TableType.PANDAS.value: pd_utils.s3_file_to_dataframe,
     TableType.NUMPY.value: np_utils.s3_file_to_ndarray,
deltacat/utils/arguments.py ADDED
@@ -0,0 +1,26 @@
+import inspect
+import copy
+from typing import Any, Dict
+
+
+def sanitize_kwargs_to_callable(callable: Any, kwargs: Dict) -> Dict:
+    """
+    This method removes any unsupported keyword arguments if variable
+    kwargs are not allowed in the method signature.
+
+    Returns: a sanitized dict of kwargs.
+    """
+    signature = inspect.signature(callable)
+    params = signature.parameters
+
+    new_kwargs = copy.copy(kwargs)
+
+    for key in params:
+        if params[key].kind == inspect.Parameter.VAR_KEYWORD:
+            return kwargs
+
+    for key in kwargs.keys():
+        if key not in params:
+            new_kwargs.pop(key)
+
+    return new_kwargs
deltacat/utils/daft.py ADDED
@@ -0,0 +1,87 @@
+import logging
+from typing import Optional, List
+
+from daft.table import Table
+from daft.logical.schema import Schema
+from daft import TimeUnit
+from daft.io import IOConfig, S3Config
+import pyarrow as pa
+
+from deltacat import logs
+from deltacat.utils.common import ReadKwargsProvider
+
+from deltacat.types.media import ContentType, ContentEncoding
+from deltacat.aws.constants import BOTO_MAX_RETRIES
+from deltacat.utils.performance import timed_invocation
+
+from deltacat.types.partial_download import (
+    PartialFileDownloadParams,
+)
+
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def daft_s3_file_to_table(
+    s3_url: str,
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    **s3_client_kwargs,
+):
+    assert (
+        content_type == ContentType.PARQUET.value
+    ), f"daft native reader currently only supports parquet, got {content_type}"
+
+    assert (
+        content_encoding == ContentEncoding.IDENTITY.value
+    ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
+
+    kwargs = {}
+    if pa_read_func_kwargs_provider is not None:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+    coerce_int96_timestamp_unit = TimeUnit.from_str(
+        kwargs.get("coerce_int96_timestamp_unit", "ms")
+    )
+
+    row_groups = None
+    if (
+        partial_file_download_params is not None
+        and partial_file_download_params.row_groups_to_download is not None
+    ):
+        row_groups = partial_file_download_params.row_groups_to_download
+
+    io_config = IOConfig(
+        s3=S3Config(
+            key_id=s3_client_kwargs.get("aws_access_key_id"),
+            access_key=s3_client_kwargs.get("aws_secret_access_key"),
+            session_token=s3_client_kwargs.get("aws_session_token"),
+            retry_mode="adaptive",
+            num_tries=BOTO_MAX_RETRIES,
+        )
+    )
+
+    table, latency = timed_invocation(
+        Table.read_parquet,
+        path=s3_url,
+        columns=include_columns or column_names,
+        row_groups=row_groups,
+        io_config=io_config,
+        coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+        multithreaded_io=False,
+    )
+
+    logger.debug(f"Time to read S3 object from {s3_url} into daft table: {latency}s")
+
+    if kwargs.get("schema") is not None:
+        schema = kwargs["schema"]
+        if include_columns is not None:
+            schema = pa.schema([schema.field(col) for col in include_columns])
+        daft_schema = Schema.from_pyarrow_schema(schema)
+        return table.cast_to_schema(daft_schema).to_arrow()
+    else:
+        return table.to_arrow()
deltacat/utils/placement.py CHANGED
@@ -21,9 +21,10 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

 @dataclass
 class PlacementGroupConfig:
-    def __init__(self, opts, resource):
+    def __init__(self, opts, resource, node_ips):
         self.opts = opts
         self.resource = resource
+        self.node_ips = node_ips


 class NodeGroupManager:
@@ -207,6 +208,7 @@ class PlacementGroupManager:
         cpu_per_bundle: int,
         strategy="SPREAD",
         capture_child_tasks=True,
+        memory_per_bundle=None,
     ):
         head_res_key = self.get_current_node_resource_key()
         # run the task on head and consume a fractional cpu, so that pg can be created on non-head node
@@ -216,7 +218,11 @@ class PlacementGroupManager:
         self._pg_configs = ray.get(
             [
                 _config.options(resources={head_res_key: 0.01}).remote(
-                    total_cpus_per_pg,
+                    total_cpus_per_pg,
+                    cpu_per_bundle,
+                    strategy,
+                    capture_child_tasks,
+                    memory_per_bundle=memory_per_bundle,
                 )
                 for i in range(num_pgs)
             ]
@@ -251,12 +257,18 @@ def _config(
     strategy="SPREAD",
     capture_child_tasks=True,
     time_out: Optional[float] = None,
+    memory_per_bundle: Optional[float] = None,
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     pg_config = None
     opts = {}
     cluster_resources = {}
     num_bundles = (int)(total_cpus_per_pg / cpu_per_node)
     bundles = [{"CPU": cpu_per_node} for i in range(num_bundles)]
+
+    if memory_per_bundle:
+        for bundle in bundles:
+            bundle["memory"] = memory_per_bundle
+
     pg = placement_group(bundles, strategy=strategy)
     ray.get(pg.ready(), timeout=time_out)
     if not pg:
@@ -275,6 +287,7 @@ def _config(
     # query available resources given list of node id
     all_nodes_available_res = ray._private.state.state._available_resources_per_node()
     pg_res = {"CPU": 0, "memory": 0, "object_store_memory": 0}
+    node_ips = []
     for node_id in node_ids:
         if node_id in all_nodes_available_res:
             v = all_nodes_available_res[node_id]
@@ -282,10 +295,14 @@ def _config(
             pg_res["CPU"] += node_detail["resources_total"]["CPU"]
             pg_res["memory"] += v["memory"]
             pg_res["object_store_memory"] += v["object_store_memory"]
+            node_ips.append(node_detail["node_ip"])
     cluster_resources["CPU"] = int(pg_res["CPU"])
     cluster_resources["memory"] = float(pg_res["memory"])
     cluster_resources["object_store_memory"] = float(pg_res["object_store_memory"])
-    pg_config = PlacementGroupConfig(opts, cluster_resources)
+    pg_config = PlacementGroupConfig(
+        opts=opts, resource=cluster_resources, node_ips=node_ips
+    )
     logger.info(f"pg has resources:{cluster_resources}")
+    logger.debug(f"pg has node ips:{node_ips}")

     return pg_config