deltacat 0.1.6__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +183 -194
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +249 -198
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +153 -260
  22. deltacat/compute/compactor/steps/hash_bucket.py +56 -56
  23. deltacat/compute/compactor/steps/materialize.py +139 -100
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +276 -228
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +36 -29
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/METADATA +21 -11
  79. deltacat-0.1.11.dist-info/RECORD +110 -0
  80. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  81. deltacat-0.1.6.dist-info/RECORD +0 -108
  82. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  83. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/utils/placement.py
@@ -1,238 +1,286 @@
-import ray
+import logging
 import re
 import time
-import yaml
-import logging
-from typing import Optional, Union, List, Dict, Any, Callable, Tuple
-from ray.util.placement_group import (
-    placement_group,
-    placement_group_table,
-    get_current_placement_group
-)
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+import ray
+import yaml
 from ray.experimental.state.api import get_node, get_placement_group
-
+from ray.util.placement_group import placement_group, placement_group_table
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
 from deltacat import logs
+
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-#Limitation of current node group or placement group manager
-#Must run on driver or head node bc state.api needs to query dashboard api server at 127.0.0.1.
-#Issue: https://github.com/ray-project/ray/issues/29959
-
-class NodeGroupManager():
-
-    def __init__(self,path: str, gname: str):
-        """Node Group Manager
-        Args:
-            path: cluster yaml file
-            gname: node group prefix, e.g., 'partition'
-        """
-        #cluster init status:
-        self.NODE_GROUP_PREFIX=gname
-        self.cluster_config=self._read_yaml(path)
-        self.init_groups = self._cluster_node_groups(self.cluster_config)
-        self.init_group_res = self._parse_node_resources()
-
-    def _cluster_node_groups(self, config: Dict[str, Any]) -> Dict[str, Any]:
-        """Get Worker Groups
-        Args:
-            config: cluster yaml data
-        Returns:
-            worker groups: a dict of worker node group
-
-        """
-        avail_node_types = list(config['available_node_types'].items())
-        #exclude head node type
-        head_node_types = [nt for nt in avail_node_types if 'resources' in nt[1] and 'CPU' in nt[1]['resources'] and nt[1]['resources']['CPU']==0][0]
-        worker_node_types = [x for x in avail_node_types if x !=head_node_types]
-        #assuming homogenous cluster
-        #in future, update with fleet resource
-        if len(worker_node_types)>0:
-            self.INSTANCE_TYPE = worker_node_types[0][1]['node_config']['InstanceType']
-        return worker_node_types
-
-
-    def _read_yaml(self, path: str) -> Dict[str, Any]:
-        with open(path, "rt") as f:
-            return yaml.safe_load(f)
-
-    def _update_groups(self) -> List[Tuple[str, float]]:
-        """
-        Node groups can come and go during runtime, whenever a node group is needed, we need to check the current available groups
-        Returns:
-            current_groups: dict of custom resource groups
-        """
-        #Add 1.1 second latency to avoid inconsistency issue between raylet and head
-        time.sleep(1.1)
-        all_available_res = ray.available_resources()
-        current_groups =[(k,all_available_res[k]) for k in all_available_res.keys() if self.NODE_GROUP_PREFIX in k]
-        return current_groups
-
-    def _parse_node_resources(self) -> Dict[str, Dict[str, float]]:
-        """
-        Parse resources per node to get detailed resource tighted to each node group
-        Returns:
-            group_res: a dict of resources, e.g., {'CPU':0,'memory':0,'object_store_memory':0}
-        """
-        all_available_resources= ray._private.state.state._available_resources_per_node()
-        group_keys = [x[0] for x in self.init_groups]
-        group_res={}
-        for k in group_keys:
-            group_res[k]={'CPU':0,'memory':0,'object_store_memory':0,'node_id':[]}
-        for v in all_available_resources.values():
-            keys =v.keys()
-            r = re.compile(self.NODE_GROUP_PREFIX)
-            partition=list(filter(r.match, list(keys)))
-            r = re.compile("node:")
-            node_id = list(filter(r.match, list(keys)))
-            if len(partition)>0:
-                partition = partition[0]
-            if len(node_id)>0:
-                node_id = node_id[0]
-            if self.NODE_GROUP_PREFIX in partition:
-                group_res[partition]['CPU']+=v['CPU']
-                group_res[partition]['memory']+=v['memory']
-                group_res[partition]['object_store_memory']+=v['object_store_memory']
-                group_res[partition]['node_id'].append(node_id)
-        return group_res
-
-    def _update_group_res(self, gname: str) -> Dict[str, Union[str, float]]:
-        """
-        Get the realtime resource of a node group
-        Args:
-            gname: name of node group
-        Returns:
-            group_res: dict of updated resource(cpu, memory, object store memory) for a given group
-        """
-        all_available_resources= ray._private.state.state._available_resources_per_node()
-        group_res={'CPU':0,'memory':0,'object_store_memory':0,'node_id':[]}
-        for v in all_available_resources.values():
-            keys =v.keys()
-            r = re.compile("node:")
-            node_id = list(filter(r.match, list(keys)))
-            if len(node_id)>0:
-                node_id = node_id[0]
-            if gname in v.keys():
-                group_res['CPU']+=v['CPU']
-                group_res['memory']+=v['memory']
-                group_res['object_store_memory']+=v['object_store_memory']
-                group_res['node_id'].append(node_id)
-        return group_res
-
-    def get_one_group(self) -> Optional[Dict[str, Union[str, float]]]:
-        """
-        Pop up one node group
-        Returns:
-            group_res: dict of node group resource, {"group":"partition_1","CPU":2,...}
-        """
-        current_groups = self._update_groups()
-        if len(current_groups)>0:
-            gname = current_groups[-1][0]
-            group_res=self._update_group_res(gname)
-            group_res['group']=gname
-            try:
-                group_res['group_res']=ray.available_resources()[gname]
-            except Exception as e:
-                logger.info(f"There is no available resources for {gname}")
-                return None
-            return group_res
-        else:
-            return None
-
-    def get_group_by_name(self, gname: str) -> Optional[Dict[str, Union[str, float]]]:
-        """
-        Get the specific node group given its pre-filled name
-        Args:
-            gname: name of the node group
-        Returns:
-            group_res: dict of node group resource
-
-        """
-        group_res=self._update_group_res(gname)
-        group_res['group']=gname
-        try:
-            group_res['group_res']=ray.available_resources()[gname]
-        except Exception as e:
-            logger.info(f"There is no available resources for {gname}")
-            return None
-        return group_res
-
-
-class PlacementGroupManager():
-    """Placement Group Manager
-    Create a list of placement group with the desired number of cpus
-    e.g., create a pg with 32 cpus, then this class will look for a node that has 32 cpus, and collect all
-    resources, including cpu, memory, and object store;
-    How to use:
-    ```
-    from deltacat.utils.placement import PlacementGroupManager as pgm
-    pgm = pgm(10, 32)
-    pg_configs = pgm.pgs
-    opts = pg_configs[0][0]
-    fun.options(**opts).remote()
-    ```
-    Args:
-        num_pgs: number of placement groups to be created
-        instance_cpus: number of cpus per instance
-    """
-    def __init__(self, num_pgs: int, instance_cpus: int, instance_type: int = 8, time_out: Optional[float] = None):
-        head_res_key = self.get_current_node_resource_key()
-        self._pg_configs = ray.get([_config.options(resources={head_res_key:0.01}).remote(instance_cpus, instance_type) for _ in range(num_pgs)])
-    @property
-    def pgs(self):
-        return self._pg_configs
-
-    def get_current_node_resource_key(self) -> str:
-        current_node_id = ray.get_runtime_context().node_id.hex()
-        for node in ray.nodes():
-            if node["NodeID"] == current_node_id:
-                # Found the node.
-                for key in node["Resources"].keys():
-                    if key.startswith("node:"):
-                        return key
+# Limitation of current node group or placement group manager
+# Must run on driver or head node bc state.api needs to query dashboard api server at 127.0.0.1.
+# Issue: https://github.com/ray-project/ray/issues/29959
+
+
+@dataclass
+class PlacementGroupConfig:
+    def __init__(self, opts, resource):
+        self.opts = opts
+        self.resource = resource
+
+
+class NodeGroupManager:
+    def __init__(self, path: str, gname: str):
+        """Node Group Manager
+        Args:
+            path: cluster yaml file
+            gname: node group prefix, e.g., 'partition'
+        """
+        # cluster init status:
+        self.NODE_GROUP_PREFIX = gname
+        self.cluster_config = self._read_yaml(path)
+        self.init_groups = self._cluster_node_groups(self.cluster_config)
+        self.init_group_res = self._parse_node_resources()
+
+    def _cluster_node_groups(self, config: Dict[str, Any]) -> Dict[str, Any]:
+        """Get Worker Groups
+        Args:
+            config: cluster yaml data
+        Returns:
+            worker groups: a dict of worker node group
+
+        """
+        avail_node_types = list(config["available_node_types"].items())
+        # exclude head node type
+        head_node_types = [
+            nt
+            for nt in avail_node_types
+            if "resources" in nt[1]
+            and "CPU" in nt[1]["resources"]
+            and nt[1]["resources"]["CPU"] == 0
+        ][0]
+        worker_node_types = [x for x in avail_node_types if x != head_node_types]
+        # assuming homogenous cluster
+        # in future, update with fleet resource
+        if len(worker_node_types) > 0:
+            self.INSTANCE_TYPE = worker_node_types[0][1]["node_config"]["InstanceType"]
+        return worker_node_types
+
+    def _read_yaml(self, path: str) -> Dict[str, Any]:
+        with open(path, "rt") as f:
+            return yaml.safe_load(f)
+
+    def _update_groups(self) -> List[Tuple[str, float]]:
+        """
+        Node groups can come and go during runtime, whenever a node group is needed, we need to check the current available groups
+        Returns:
+            current_groups: dict of custom resource groups
+        """
+        # Add 1.1 second latency to avoid inconsistency issue between raylet and head
+        time.sleep(1.1)
+        all_available_res = ray.available_resources()
+        current_groups = [
+            (k, all_available_res[k])
+            for k in all_available_res.keys()
+            if self.NODE_GROUP_PREFIX in k
+        ]
+        return current_groups
+
+    def _parse_node_resources(self) -> Dict[str, Dict[str, float]]:
+        """
+        Parse resources per node to get detailed resource tighted to each node group
+        Returns:
+            group_res: a dict of resources, e.g., {'CPU':0,'memory':0,'object_store_memory':0}
+        """
+        all_available_resources = (
+            ray._private.state.state._available_resources_per_node()
+        )
+        group_keys = [x[0] for x in self.init_groups]
+        group_res = {}
+        for k in group_keys:
+            group_res[k] = {
+                "CPU": 0,
+                "memory": 0,
+                "object_store_memory": 0,
+                "node_id": [],
+            }
+        for v in all_available_resources.values():
+            keys = v.keys()
+            r = re.compile(self.NODE_GROUP_PREFIX)
+            partition = list(filter(r.match, list(keys)))
+            r = re.compile("node:")
+            node_id = list(filter(r.match, list(keys)))
+            if len(partition) > 0:
+                partition = partition[0]
+            if len(node_id) > 0:
+                node_id = node_id[0]
+            if self.NODE_GROUP_PREFIX in partition:
+                group_res[partition]["CPU"] += v["CPU"]
+                group_res[partition]["memory"] += v["memory"]
+                group_res[partition]["object_store_memory"] += v["object_store_memory"]
+                group_res[partition]["node_id"].append(node_id)
+        return group_res
+
+    def _update_group_res(self, gname: str) -> Dict[str, Union[str, float]]:
+        """
+        Get the realtime resource of a node group
+        Args:
+            gname: name of node group
+        Returns:
+            group_res: dict of updated resource(cpu, memory, object store memory) for a given group
+        """
+        all_available_resources = (
+            ray._private.state.state._available_resources_per_node()
+        )
+        group_res = {"CPU": 0, "memory": 0, "object_store_memory": 0, "node_id": []}
+        for v in all_available_resources.values():
+            keys = v.keys()
+            r = re.compile("node:")
+            node_id = list(filter(r.match, list(keys)))
+            if len(node_id) > 0:
+                node_id = node_id[0]
+            if gname in v.keys():
+                group_res["CPU"] += v["CPU"]
+                group_res["memory"] += v["memory"]
+                group_res["object_store_memory"] += v["object_store_memory"]
+                group_res["node_id"].append(node_id)
+        return group_res
+
+    def get_one_group(self) -> Optional[Dict[str, Union[str, float]]]:
+        """
+        Pop up one node group
+        Returns:
+            group_res: dict of node group resource, {"group":"partition_1","CPU":2,...}
+        """
+        current_groups = self._update_groups()
+        if len(current_groups) > 0:
+            gname = current_groups[-1][0]
+            group_res = self._update_group_res(gname)
+            group_res["group"] = gname
+            try:
+                group_res["group_res"] = ray.available_resources()[gname]
+            except Exception as e:
+                logger.info(f"Error: {e}. There is no available resources for {gname}")
+                return None
+            return group_res
+        else:
+            return None
+
+    def get_group_by_name(self, gname: str) -> Optional[Dict[str, Union[str, float]]]:
+        """
+        Get the specific node group given its pre-filled name
+        Args:
+            gname: name of the node group
+        Returns:
+            group_res: dict of node group resource
+
+        """
+        group_res = self._update_group_res(gname)
+        group_res["group"] = gname
+        try:
+            group_res["group_res"] = ray.available_resources()[gname]
+        except Exception as e:
+            logger.info(f"Error: {e}. There is no available resources for {gname}")
+            return None
+        return group_res
+
+
+class PlacementGroupManager:
+    """Placement Group Manager
+    Create a list of placement group with the desired number of cpus
+    e.g., create a pg with 32 cpus, then this class will look for a node that has 32 cpus, and collect all
+    resources, including cpu, memory, and object store;
+    How to use:
+    ```
+    from deltacat.utils.placement import PlacementGroupManager as pgm
+    pgm = pgm(10, 32)
+    pg_configs = pgm.pgs
+    opts = pg_configs[0][0]
+    fun.options(**opts).remote()
+    ```
+    Args:
+        num_pgs: number of placement groups to be created
+        instance_cpus: number of cpus per instance
+    """
+
+    def __init__(
+        self,
+        num_pgs: int,
+        total_cpus_per_pg: int,
+        cpu_per_bundle: int,
+        strategy="SPREAD",
+        capture_child_tasks=True,
+    ):
+        head_res_key = self.get_current_node_resource_key()
+        # run the task on head and consume a fractional cpu, so that pg can be created on non-head node
+        # if cpu_per_bundle is less than the cpus per node, the pg can still be created on head
+        # curent assumption is that the cpu_per_bundle = cpus per node
+        # TODO: figure out how to create pg on non-head explicitly
+        self._pg_configs = ray.get(
+            [
+                _config.options(resources={head_res_key: 0.01}).remote(
+                    total_cpus_per_pg, cpu_per_bundle, strategy, capture_child_tasks
+                )
+                for i in range(num_pgs)
+            ]
+        )
+        # TODO: handle the cases where cpu_per_bundle is larger than max cpus per node, support it on ec2/flex/manta
+
+    @property
+    def pgs(self):
+        return self._pg_configs
+
+    def get_current_node_resource_key(self) -> str:
+        # on ec2: address="172.31.34.51:6379"
+        # on manta: address = "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379"
+        current_node_name = ray.experimental.internal_kv.global_gcs_client.address[:-5]
+        for node in ray.nodes():
+            if node["NodeName"] == current_node_name:
+                # Found the node.
+                for key in node["Resources"].keys():
+                    if key.startswith("node:"):
+                        return key
+
+
 @ray.remote(num_cpus=0.01)
-def _config(instance_cpus: int, instance_type: int, time_out: Optional[float] = None) -> Tuple[Dict[str,Any], Dict[str,Any]]:
-    pg_config = None
-    try:
-        opts ={}
-        cluster_resources={}
-        num_bundles = (int)(instance_cpus/instance_type)
-        bundles = [{'CPU':instance_type} for _ in range(num_bundles)]
-        pg = placement_group(bundles, strategy="SPREAD")
-        ray.get(pg.ready(), timeout=time_out)
-        if not pg:
-            return None
-        opts = {"scheduling_strategy":PlacementGroupSchedulingStrategy(
-            placement_group=pg, placement_group_capture_child_tasks=True)
-        }
-        pg_id = placement_group_table(pg)['placement_group_id']
-        pg_details = get_placement_group(pg_id)
-        bundles = pg_details['bundles']
-        node_ids =[]
-        for bd in bundles:
-            node_ids.append(bd['node_id'])
-        #query available resources given list of node id
-        all_nodes_available_res = ray._private.state.state._available_resources_per_node()
-        pg_res = {'CPU':0,'memory':0,'object_store_memory':0,'node_id':[]}
-        for node_id in node_ids:
-            if node_id in all_nodes_available_res:
-                v = all_nodes_available_res[node_id]
-                node_detail = get_node(node_id)
-                pg_res['CPU']+=node_detail['resources_total']['CPU']
-                pg_res['memory']+=v['memory']
-                pg_res['object_store_memory']+=v['object_store_memory']
-                pg_res['node_id'].append(node_id)
-        cluster_resources['CPU'] = int(pg_res['CPU'])
-        cluster_resources['memory'] = float(pg_res['memory'])
-        cluster_resources['object_store_memory'] = float(pg_res['object_store_memory'])
-        cluster_resources['node_id'] = pg_res['node_id']
-        pg_config=[opts,cluster_resources]
-        logger.info(f"pg has resources:{cluster_resources}")
-
-    except Exception as e:
-        logger.error(f"placement group error:{e}")
-        pass
-    return pg_config
+def _config(
+    total_cpus_per_pg: int,
+    cpu_per_node: int,
+    strategy="SPREAD",
+    capture_child_tasks=True,
+    time_out: Optional[float] = None,
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    pg_config = None
+    opts = {}
+    cluster_resources = {}
+    num_bundles = (int)(total_cpus_per_pg / cpu_per_node)
+    bundles = [{"CPU": cpu_per_node} for i in range(num_bundles)]
+    pg = placement_group(bundles, strategy=strategy)
+    ray.get(pg.ready(), timeout=time_out)
+    if not pg:
+        return None
+    opts = {
+        "scheduling_strategy": PlacementGroupSchedulingStrategy(
+            placement_group=pg, placement_group_capture_child_tasks=capture_child_tasks
+        )
+    }
+    pg_id = placement_group_table(pg)["placement_group_id"]
+    pg_details = get_placement_group(pg_id)
+    bundles = pg_details["bundles"]
+    node_ids = []
+    for bd in bundles:
+        node_ids.append(bd["node_id"])
+    # query available resources given list of node id
+    all_nodes_available_res = ray._private.state.state._available_resources_per_node()
+    pg_res = {"CPU": 0, "memory": 0, "object_store_memory": 0}
+    for node_id in node_ids:
+        if node_id in all_nodes_available_res:
+            v = all_nodes_available_res[node_id]
+            node_detail = get_node(node_id)
+            pg_res["CPU"] += node_detail["resources_total"]["CPU"]
+            pg_res["memory"] += v["memory"]
+            pg_res["object_store_memory"] += v["object_store_memory"]
+    cluster_resources["CPU"] = int(pg_res["CPU"])
+    cluster_resources["memory"] = float(pg_res["memory"])
+    cluster_resources["object_store_memory"] = float(pg_res["object_store_memory"])
+    pg_config = PlacementGroupConfig(opts, cluster_resources)
+    logger.info(f"pg has resources:{cluster_resources}")
 
+    return pg_config
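
Note: the "How to use" snippet in the PlacementGroupManager docstring above still shows the pre-0.1.11 calling convention (pgm(10, 32), opts = pg_configs[0][0]). Based only on the code added in this diff, usage of the 0.1.11 API would look roughly like the minimal sketch below. It assumes a running Ray cluster whose worker nodes each expose 32 CPUs; my_task is a hypothetical workload, not part of deltacat.
```
# Minimal usage sketch (not from the deltacat docs); assumes Ray is installed
# and a cluster with 32-CPU worker nodes is already running.
import ray

from deltacat.utils.placement import PlacementGroupManager

ray.init(address="auto")


@ray.remote
def my_task(x):  # hypothetical workload consuming the placement group options
    return x * 2


# 10 placement groups, 32 total CPUs per group, 32 CPUs per bundle
# (one bundle per worker node, matching the class's stated assumption).
pgm = PlacementGroupManager(num_pgs=10, total_cpus_per_pg=32, cpu_per_bundle=32)

pg_config = pgm.pgs[0]          # a PlacementGroupConfig per the new _config() return type
opts = pg_config.opts           # {"scheduling_strategy": PlacementGroupSchedulingStrategy(...)}
resources = pg_config.resource  # {"CPU": ..., "memory": ..., "object_store_memory": ...}

result = ray.get(my_task.options(**opts).remote(21))
```
Per the inline comments in __init__, the fractional head_res_key reservation only pins the _config tasks to the head node; the placement groups themselves are expected to land on the worker nodes.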