deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/utils/placement.py
CHANGED
@@ -1,257 +1,286 @@
+import logging
 import re
 import time
-import yaml
-import logging
 from dataclasses import dataclass
-from ray.util.placement_group import (
-    placement_group,
-    placement_group_table,
-    get_current_placement_group
-)
+from typing import Any, Dict, List, Optional, Tuple, Union

+import ray
+import yaml
 from ray.experimental.state.api import get_node, get_placement_group
+from ray.util.placement_group import placement_group, placement_group_table
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

 from deltacat import logs

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

-#Limitation of current node group or placement group manager
-#Must run on driver or head node bc state.api needs to query dashboard api server at 127.0.0.1.
-#Issue: https://github.com/ray-project/ray/issues/29959
+# Limitation of current node group or placement group manager
+# Must run on driver or head node bc state.api needs to query dashboard api server at 127.0.0.1.
+# Issue: https://github.com/ray-project/ray/issues/29959
+

 @dataclass
+class PlacementGroupConfig:
+    def __init__(self, opts, resource):
+        self.opts = opts
+        self.resource = resource

-class NodeGroupManager():

+class NodeGroupManager:
+    def __init__(self, path: str, gname: str):
+        """Node Group Manager
+        Args:
+            path: cluster yaml file
+            gname: node group prefix, e.g., 'partition'
+        """
+        # cluster init status:
+        self.NODE_GROUP_PREFIX = gname
+        self.cluster_config = self._read_yaml(path)
+        self.init_groups = self._cluster_node_groups(self.cluster_config)
+        self.init_group_res = self._parse_node_resources()

-    def _cluster_node_groups(self, config: Dict[str, Any]) -> Dict[str, Any]:
-        """Get Worker Groups
-        Args:
-            config: cluster yaml data
-        Returns:
-            worker groups: a dict of worker node group
-        #in future, update with fleet resource
-        if len(worker_node_types)>0:
-            self.INSTANCE_TYPE = worker_node_types[0][1]['node_config']['InstanceType']
-        return worker_node_types
+    def _cluster_node_groups(self, config: Dict[str, Any]) -> Dict[str, Any]:
+        """Get Worker Groups
+        Args:
+            config: cluster yaml data
+        Returns:
+            worker groups: a dict of worker node group

+        """
+        avail_node_types = list(config["available_node_types"].items())
+        # exclude head node type
+        head_node_types = [
+            nt
+            for nt in avail_node_types
+            if "resources" in nt[1]
+            and "CPU" in nt[1]["resources"]
+            and nt[1]["resources"]["CPU"] == 0
+        ][0]
+        worker_node_types = [x for x in avail_node_types if x != head_node_types]
+        # assuming homogenous cluster
+        # in future, update with fleet resource
+        if len(worker_node_types) > 0:
+            self.INSTANCE_TYPE = worker_node_types[0][1]["node_config"]["InstanceType"]
+        return worker_node_types

+    def _read_yaml(self, path: str) -> Dict[str, Any]:
+        with open(path, "rt") as f:
+            return yaml.safe_load(f)

+    def _update_groups(self) -> List[Tuple[str, float]]:
+        """
+        Node groups can come and go during runtime, whenever a node group is needed, we need to check the current available groups
+        Returns:
+            current_groups: dict of custom resource groups
+        """
+        # Add 1.1 second latency to avoid inconsistency issue between raylet and head
+        time.sleep(1.1)
+        all_available_res = ray.available_resources()
+        current_groups = [
+            (k, all_available_res[k])
+            for k in all_available_res.keys()
+            if self.NODE_GROUP_PREFIX in k
+        ]
+        return current_groups

+    def _parse_node_resources(self) -> Dict[str, Dict[str, float]]:
+        """
+        Parse resources per node to get detailed resource tighted to each node group
+        Returns:
+            group_res: a dict of resources, e.g., {'CPU':0,'memory':0,'object_store_memory':0}
+        """
+        all_available_resources = (
+            ray._private.state.state._available_resources_per_node()
+        )
+        group_keys = [x[0] for x in self.init_groups]
+        group_res = {}
+        for k in group_keys:
+            group_res[k] = {
+                "CPU": 0,
+                "memory": 0,
+                "object_store_memory": 0,
+                "node_id": [],
+            }
+        for v in all_available_resources.values():
+            keys = v.keys()
+            r = re.compile(self.NODE_GROUP_PREFIX)
+            partition = list(filter(r.match, list(keys)))
+            r = re.compile("node:")
+            node_id = list(filter(r.match, list(keys)))
+            if len(partition) > 0:
+                partition = partition[0]
+            if len(node_id) > 0:
+                node_id = node_id[0]
+            if self.NODE_GROUP_PREFIX in partition:
+                group_res[partition]["CPU"] += v["CPU"]
+                group_res[partition]["memory"] += v["memory"]
+                group_res[partition]["object_store_memory"] += v["object_store_memory"]
+                group_res[partition]["node_id"].append(node_id)
+        return group_res

+    def _update_group_res(self, gname: str) -> Dict[str, Union[str, float]]:
+        """
+        Get the realtime resource of a node group
+        Args:
+            gname: name of node group
+        Returns:
+            group_res: dict of updated resource(cpu, memory, object store memory) for a given group
+        """
+        all_available_resources = (
+            ray._private.state.state._available_resources_per_node()
+        )
+        group_res = {"CPU": 0, "memory": 0, "object_store_memory": 0, "node_id": []}
+        for v in all_available_resources.values():
+            keys = v.keys()
+            r = re.compile("node:")
+            node_id = list(filter(r.match, list(keys)))
+            if len(node_id) > 0:
+                node_id = node_id[0]
+            if gname in v.keys():
+                group_res["CPU"] += v["CPU"]
+                group_res["memory"] += v["memory"]
+                group_res["object_store_memory"] += v["object_store_memory"]
+                group_res["node_id"].append(node_id)
+        return group_res

+    def get_one_group(self) -> Optional[Dict[str, Union[str, float]]]:
+        """
+        Pop up one node group
+        Returns:
+            group_res: dict of node group resource, {"group":"partition_1","CPU":2,...}
+        """
+        current_groups = self._update_groups()
+        if len(current_groups) > 0:
+            gname = current_groups[-1][0]
+            group_res = self._update_group_res(gname)
+            group_res["group"] = gname
+            try:
+                group_res["group_res"] = ray.available_resources()[gname]
+            except Exception as e:
+                logger.info(f"Error: {e}. There is no available resources for {gname}")
+                return None
+            return group_res
+        else:
+            return None

+    def get_group_by_name(self, gname: str) -> Optional[Dict[str, Union[str, float]]]:
+        """
+        Get the specific node group given its pre-filled name
+        Args:
+            gname: name of the node group
+        Returns:
+            group_res: dict of node group resource

+        """
+        group_res = self._update_group_res(gname)
+        group_res["group"] = gname
+        try:
+            group_res["group_res"] = ray.available_resources()[gname]
+        except Exception as e:
+            logger.info(f"Error: {e}. There is no available resources for {gname}")
+            return None
+        return group_res

-class PlacementGroupManager():
-    """Placement Group Manager
-    Create a list of placement group with the desired number of cpus
-    e.g., create a pg with 32 cpus, then this class will look for a node that has 32 cpus, and collect all
-    resources, including cpu, memory, and object store;
-    How to use:
-    ```
-    from deltacat.utils.placement import PlacementGroupManager as pgm
-    pgm = pgm(10, 32)
-    pg_configs = pgm.pgs
-    opts = pg_configs[0][0]
-    fun.options(**opts).remote()
-    ```
-    Args:
-        num_pgs: number of placement groups to be created
-        instance_cpus: number of cpus per instance
-    """
-    def __init__(self, num_pgs: int,
-            total_cpus_per_pg: int,
-            cpu_per_bundle: int,
-            strategy="SPREAD",
-            capture_child_tasks=True):
-        head_res_key = self.get_current_node_resource_key()
-        #run the task on head and consume a fractional cpu, so that pg can be created on non-head node
-        #if cpu_per_bundle is less than the cpus per node, the pg can still be created on head
-        #curent assumption is that the cpu_per_bundle = cpus per node
-        #TODO: figure out how to create pg on non-head explicitly
-        self._pg_configs = ray.get([_config.options(resources={head_res_key:0.01}).remote(total_cpus_per_pg, \
-            cpu_per_bundle, strategy, capture_child_tasks) for i in range(num_pgs)])
-        #TODO: handle the cases where cpu_per_bundle is larger than max cpus per node, support it on ec2/flex/manta
-
-    @property
-    def pgs(self):
-        return self._pg_configs

+class PlacementGroupManager:
+    """Placement Group Manager
+    Create a list of placement group with the desired number of cpus
+    e.g., create a pg with 32 cpus, then this class will look for a node that has 32 cpus, and collect all
+    resources, including cpu, memory, and object store;
+    How to use:
+    ```
+    from deltacat.utils.placement import PlacementGroupManager as pgm
+    pgm = pgm(10, 32)
+    pg_configs = pgm.pgs
+    opts = pg_configs[0][0]
+    fun.options(**opts).remote()
+    ```
+    Args:
+        num_pgs: number of placement groups to be created
+        instance_cpus: number of cpus per instance
+    """

-    for bd in bundles:
-        node_ids.append(bd['node_id'])
-    #query available resources given list of node id
-    all_nodes_available_res = ray._private.state.state._available_resources_per_node()
-    pg_res = {'CPU':0,'memory':0,'object_store_memory':0}
-    for node_id in node_ids:
-        if node_id in all_nodes_available_res:
-            v = all_nodes_available_res[node_id]
-            node_detail = get_node(node_id)
-            pg_res['CPU']+=node_detail['resources_total']['CPU']
-            pg_res['memory']+=v['memory']
-            pg_res['object_store_memory']+=v['object_store_memory']
-    cluster_resources['CPU'] = int(pg_res['CPU'])
-    cluster_resources['memory'] = float(pg_res['memory'])
-    cluster_resources['object_store_memory'] = float(pg_res['object_store_memory'])
-    pg_config=PlacementGroupConfig(opts,cluster_resources)
-    logger.info(f"pg has resources:{cluster_resources}")
+    def __init__(
+        self,
+        num_pgs: int,
+        total_cpus_per_pg: int,
+        cpu_per_bundle: int,
+        strategy="SPREAD",
+        capture_child_tasks=True,
+    ):
+        head_res_key = self.get_current_node_resource_key()
+        # run the task on head and consume a fractional cpu, so that pg can be created on non-head node
+        # if cpu_per_bundle is less than the cpus per node, the pg can still be created on head
+        # curent assumption is that the cpu_per_bundle = cpus per node
+        # TODO: figure out how to create pg on non-head explicitly
+        self._pg_configs = ray.get(
+            [
+                _config.options(resources={head_res_key: 0.01}).remote(
+                    total_cpus_per_pg, cpu_per_bundle, strategy, capture_child_tasks
+                )
+                for i in range(num_pgs)
+            ]
+        )
+        # TODO: handle the cases where cpu_per_bundle is larger than max cpus per node, support it on ec2/flex/manta

+    @property
+    def pgs(self):
+        return self._pg_configs
+
+    def get_current_node_resource_key(self) -> str:
+        # on ec2: address="172.31.34.51:6379"
+        # on manta: address = "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379"
+        current_node_name = ray.experimental.internal_kv.global_gcs_client.address[:-5]
+        for node in ray.nodes():
+            if node["NodeName"] == current_node_name:
+                # Found the node.
+                for key in node["Resources"].keys():
+                    if key.startswith("node:"):
+                        return key
+
+
+@ray.remote(num_cpus=0.01)
+def _config(
+    total_cpus_per_pg: int,
+    cpu_per_node: int,
+    strategy="SPREAD",
+    capture_child_tasks=True,
+    time_out: Optional[float] = None,
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    pg_config = None
+    opts = {}
+    cluster_resources = {}
+    num_bundles = (int)(total_cpus_per_pg / cpu_per_node)
+    bundles = [{"CPU": cpu_per_node} for i in range(num_bundles)]
+    pg = placement_group(bundles, strategy=strategy)
+    ray.get(pg.ready(), timeout=time_out)
+    if not pg:
+        return None
+    opts = {
+        "scheduling_strategy": PlacementGroupSchedulingStrategy(
+            placement_group=pg, placement_group_capture_child_tasks=capture_child_tasks
+        )
+    }
+    pg_id = placement_group_table(pg)["placement_group_id"]
+    pg_details = get_placement_group(pg_id)
+    bundles = pg_details["bundles"]
+    node_ids = []
+    for bd in bundles:
+        node_ids.append(bd["node_id"])
+    # query available resources given list of node id
+    all_nodes_available_res = ray._private.state.state._available_resources_per_node()
+    pg_res = {"CPU": 0, "memory": 0, "object_store_memory": 0}
+    for node_id in node_ids:
+        if node_id in all_nodes_available_res:
+            v = all_nodes_available_res[node_id]
+            node_detail = get_node(node_id)
+            pg_res["CPU"] += node_detail["resources_total"]["CPU"]
+            pg_res["memory"] += v["memory"]
+            pg_res["object_store_memory"] += v["object_store_memory"]
+    cluster_resources["CPU"] = int(pg_res["CPU"])
+    cluster_resources["memory"] = float(pg_res["memory"])
+    cluster_resources["object_store_memory"] = float(pg_res["object_store_memory"])
+    pg_config = PlacementGroupConfig(opts, cluster_resources)
+    logger.info(f"pg has resources:{cluster_resources}")

+    return pg_config