deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +188 -218
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +259 -316
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +152 -259
  22. deltacat/compute/compactor/steps/hash_bucket.py +57 -73
  23. deltacat/compute/compactor/steps/materialize.py +138 -99
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  64. deltacat/types/media.py +3 -4
  65. deltacat/types/tables.py +31 -21
  66. deltacat/utils/common.py +5 -11
  67. deltacat/utils/numpy.py +20 -22
  68. deltacat/utils/pandas.py +73 -100
  69. deltacat/utils/performance.py +3 -9
  70. deltacat/utils/placement.py +276 -231
  71. deltacat/utils/pyarrow.py +302 -89
  72. deltacat/utils/ray_utils/collections.py +2 -1
  73. deltacat/utils/ray_utils/concurrency.py +38 -32
  74. deltacat/utils/ray_utils/dataset.py +28 -28
  75. deltacat/utils/ray_utils/performance.py +5 -9
  76. deltacat/utils/ray_utils/runtime.py +9 -10
  77. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
  78. deltacat-0.1.11.dist-info/RECORD +110 -0
  79. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  80. deltacat/autoscaler/events/__init__.py +0 -0
  81. deltacat/autoscaler/events/compaction/__init__.py +0 -0
  82. deltacat/autoscaler/events/compaction/cluster.py +0 -82
  83. deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
  84. deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
  85. deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
  86. deltacat/autoscaler/events/compaction/input.py +0 -27
  87. deltacat/autoscaler/events/compaction/process.py +0 -25
  88. deltacat/autoscaler/events/compaction/session_manager.py +0 -13
  89. deltacat/autoscaler/events/compaction/utils.py +0 -216
  90. deltacat/autoscaler/events/compaction/workflow.py +0 -303
  91. deltacat/autoscaler/events/dispatcher.py +0 -95
  92. deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
  93. deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
  94. deltacat/autoscaler/events/event_store.py +0 -55
  95. deltacat/autoscaler/events/exceptions.py +0 -6
  96. deltacat/autoscaler/events/processor.py +0 -177
  97. deltacat/autoscaler/events/session_manager.py +0 -25
  98. deltacat/autoscaler/events/states.py +0 -88
  99. deltacat/autoscaler/events/workflow.py +0 -54
  100. deltacat/autoscaler/node_group.py +0 -230
  101. deltacat/autoscaler/utils.py +0 -69
  102. deltacat-0.1.8.dist-info/RECORD +0 -131
  103. /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
  104. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  105. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/utils/placement.py
@@ -1,241 +1,286 @@
- import ray
+ import logging
  import re
  import time
- import yaml
- import logging
- from typing import Optional, Union, List, Dict, Any, Callable, Tuple
- from ray.util.placement_group import (
-     placement_group,
-     placement_group_table,
-     get_current_placement_group
- )
+ from dataclasses import dataclass
+ from typing import Any, Dict, List, Optional, Tuple, Union
 
- from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+ import ray
+ import yaml
  from ray.experimental.state.api import get_node, get_placement_group
-
+ from ray.util.placement_group import placement_group, placement_group_table
+ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
  from deltacat import logs
- from deltacat.utils.ray_utils.runtime import live_node_resource_keys
+
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
- #Limitation of current node group or placement group manager
- #Must run on driver or head node bc state.api needs to query dashboard api server at 127.0.0.1.
- #Issue: https://github.com/ray-project/ray/issues/29959
-
- class NodeGroupManager():
-
-     def __init__(self,path: str, gname: str):
-         """Node Group Manager
-         Args:
-             path: cluster yaml file
-             gname: node group prefix, e.g., 'partition'
-         """
-         #cluster init status:
-         self.NODE_GROUP_PREFIX=gname
-         self.cluster_config=self._read_yaml(path)
-         self.init_groups = self._cluster_node_groups(self.cluster_config)
-         self.init_group_res = self._parse_node_resources()
-
-     def _cluster_node_groups(self, config: Dict[str, Any]) -> Dict[str, Any]:
-         """Get Worker Groups
-         Args:
-             config: cluster yaml data
-         Returns:
-             worker groups: a dict of worker node group
-
-         """
-         avail_node_types = list(config['available_node_types'].items())
-         #exclude head node type
-         head_node_types = [nt for nt in avail_node_types if 'resources' in nt[1] and 'CPU' in nt[1]['resources'] and nt[1]['resources']['CPU']==0][0]
-         worker_node_types = [x for x in avail_node_types if x !=head_node_types]
-         #assuming homogenous cluster
-         #in future, update with fleet resource
-         if len(worker_node_types)>0:
-             self.INSTANCE_TYPE = worker_node_types[0][1]['node_config']['InstanceType']
-         return worker_node_types
-
-
-     def _read_yaml(self, path: str) -> Dict[str, Any]:
-         with open(path, "rt") as f:
-             return yaml.safe_load(f)
-
-     def _update_groups(self) -> List[Tuple[str, float]]:
-         """
-         Node groups can come and go during runtime, whenever a node group is needed, we need to check the current available groups
-         Returns:
-             current_groups: dict of custom resource groups
-         """
-         #Add 1.1 second latency to avoid inconsistency issue between raylet and head
-         time.sleep(1.1)
-         all_available_res = ray.available_resources()
-         current_groups =[(k,all_available_res[k]) for k in all_available_res.keys() if self.NODE_GROUP_PREFIX in k]
-         return current_groups
-
-     def _parse_node_resources(self) -> Dict[str, Dict[str, float]]:
-         """
-         Parse resources per node to get detailed resource tighted to each node group
-         Returns:
-             group_res: a dict of resources, e.g., {'CPU':0,'memory':0,'object_store_memory':0}
-         """
-         all_available_resources= ray._private.state.state._available_resources_per_node()
-         group_keys = [x[0] for x in self.init_groups]
-         group_res={}
-         for k in group_keys:
-             group_res[k]={'CPU':0,'memory':0,'object_store_memory':0,'node_id':[]}
-         for v in all_available_resources.values():
-             keys =v.keys()
-             r = re.compile(self.NODE_GROUP_PREFIX)
-             partition=list(filter(r.match, list(keys)))
-             r = re.compile("node:")
-             node_id = list(filter(r.match, list(keys)))
-             if len(partition)>0:
-                 partition = partition[0]
-             if len(node_id)>0:
-                 node_id = node_id[0]
-             if self.NODE_GROUP_PREFIX in partition:
-                 group_res[partition]['CPU']+=v['CPU']
-                 group_res[partition]['memory']+=v['memory']
-                 group_res[partition]['object_store_memory']+=v['object_store_memory']
-                 group_res[partition]['node_id'].append(node_id)
-         return group_res
-
-     def _update_group_res(self, gname: str) -> Dict[str, Union[str, float]]:
-         """
-         Get the realtime resource of a node group
-         Args:
-             gname: name of node group
-         Returns:
-             group_res: dict of updated resource(cpu, memory, object store memory) for a given group
-         """
-         all_available_resources= ray._private.state.state._available_resources_per_node()
-         group_res={'CPU':0,'memory':0,'object_store_memory':0,'node_id':[]}
-         for v in all_available_resources.values():
-             keys =v.keys()
-             r = re.compile("node:")
-             node_id = list(filter(r.match, list(keys)))
-             if len(node_id)>0:
-                 node_id = node_id[0]
-             if gname in v.keys():
-                 group_res['CPU']+=v['CPU']
-                 group_res['memory']+=v['memory']
-                 group_res['object_store_memory']+=v['object_store_memory']
-                 group_res['node_id'].append(node_id)
-         return group_res
-
-     def get_one_group(self) -> Optional[Dict[str, Union[str, float]]]:
-         """
-         Pop up one node group
-         Returns:
-             group_res: dict of node group resource, {"group":"partition_1","CPU":2,...}
-         """
-         current_groups = self._update_groups()
-         if len(current_groups)>0:
-             gname = current_groups[-1][0]
-             group_res=self._update_group_res(gname)
-             group_res['group']=gname
-             try:
-                 group_res['group_res']=ray.available_resources()[gname]
-             except Exception as e:
-                 logger.info(f"There is no available resources for {gname}")
-                 return None
-             return group_res
-         else:
-             return None
-
-     def get_group_by_name(self, gname: str) -> Optional[Dict[str, Union[str, float]]]:
-         """
-         Get the specific node group given its pre-filled name
-         Args:
-             gname: name of the node group
-         Returns:
-             group_res: dict of node group resource
-
-         """
-         group_res=self._update_group_res(gname)
-         group_res['group']=gname
-         try:
-             group_res['group_res']=ray.available_resources()[gname]
-         except Exception as e:
-             logger.info(f"There is no available resources for {gname}")
-             return None
-         return group_res
-
-
- class PlacementGroupManager():
-     """Placement Group Manager
-     Create a list of placement group with the desired number of cpus
-     e.g., create a pg with 32 cpus, then this class will look for a node that has 32 cpus, and collect all
-     resources, including cpu, memory, and object store;
-     How to use:
-     ```
-     from deltacat.utils.placement import PlacementGroupManager as pgm
-     pgm = pgm(10, 32)
-     pg_configs = pgm.pgs
-     opts = pg_configs[0][0]
-     fun.options(**opts).remote()
-     ```
-     Args:
-         num_pgs: number of placement groups to be created
-         instance_cpus: number of cpus per instance
-     """
-     def __init__(self, num_pgs: int, instance_cpus: int, instance_type: int = 8, time_out: Optional[float] = None):
-         head_res_key = self.get_current_node_resource_key()
-         all_node_res_key = live_node_resource_keys()
-         all_node_res_key.remove(head_res_key)
-         num_bundles = (int)(instance_cpus/instance_type)
-         self._pg_configs = ray.get([_config.options(resources={head_res_key:0.01}).remote(instance_cpus, instance_type, all_node_res_key[i*num_bundles:(i+1)*num_bundles]) for i in range(num_pgs)])
-     @property
-     def pgs(self):
-         return self._pg_configs
-
-     def get_current_node_resource_key(self) -> str:
-         current_node_id = ray.get_runtime_context().node_id.hex()
-         for node in ray.nodes():
-             if node["NodeID"] == current_node_id:
-                 # Found the node.
-                 for key in node["Resources"].keys():
-                     if key.startswith("node:"):
-                         return key
+ # Limitation of current node group or placement group manager
+ # Must run on driver or head node bc state.api needs to query dashboard api server at 127.0.0.1.
+ # Issue: https://github.com/ray-project/ray/issues/29959
+
+
+ @dataclass
+ class PlacementGroupConfig:
+     def __init__(self, opts, resource):
+         self.opts = opts
+         self.resource = resource
+
+
+ class NodeGroupManager:
+     def __init__(self, path: str, gname: str):
+         """Node Group Manager
+         Args:
+             path: cluster yaml file
+             gname: node group prefix, e.g., 'partition'
+         """
+         # cluster init status:
+         self.NODE_GROUP_PREFIX = gname
+         self.cluster_config = self._read_yaml(path)
+         self.init_groups = self._cluster_node_groups(self.cluster_config)
+         self.init_group_res = self._parse_node_resources()
+
+     def _cluster_node_groups(self, config: Dict[str, Any]) -> Dict[str, Any]:
+         """Get Worker Groups
+         Args:
+             config: cluster yaml data
+         Returns:
+             worker groups: a dict of worker node group
+
+         """
+         avail_node_types = list(config["available_node_types"].items())
+         # exclude head node type
+         head_node_types = [
+             nt
+             for nt in avail_node_types
+             if "resources" in nt[1]
+             and "CPU" in nt[1]["resources"]
+             and nt[1]["resources"]["CPU"] == 0
+         ][0]
+         worker_node_types = [x for x in avail_node_types if x != head_node_types]
+         # assuming homogenous cluster
+         # in future, update with fleet resource
+         if len(worker_node_types) > 0:
+             self.INSTANCE_TYPE = worker_node_types[0][1]["node_config"]["InstanceType"]
+         return worker_node_types
+
+     def _read_yaml(self, path: str) -> Dict[str, Any]:
+         with open(path, "rt") as f:
+             return yaml.safe_load(f)
+
+     def _update_groups(self) -> List[Tuple[str, float]]:
+         """
+         Node groups can come and go during runtime, whenever a node group is needed, we need to check the current available groups
+         Returns:
+             current_groups: dict of custom resource groups
+         """
+         # Add 1.1 second latency to avoid inconsistency issue between raylet and head
+         time.sleep(1.1)
+         all_available_res = ray.available_resources()
+         current_groups = [
+             (k, all_available_res[k])
+             for k in all_available_res.keys()
+             if self.NODE_GROUP_PREFIX in k
+         ]
+         return current_groups
+
+     def _parse_node_resources(self) -> Dict[str, Dict[str, float]]:
+         """
+         Parse resources per node to get detailed resource tighted to each node group
+         Returns:
+             group_res: a dict of resources, e.g., {'CPU':0,'memory':0,'object_store_memory':0}
+         """
+         all_available_resources = (
+             ray._private.state.state._available_resources_per_node()
+         )
+         group_keys = [x[0] for x in self.init_groups]
+         group_res = {}
+         for k in group_keys:
+             group_res[k] = {
+                 "CPU": 0,
+                 "memory": 0,
+                 "object_store_memory": 0,
+                 "node_id": [],
+             }
+         for v in all_available_resources.values():
+             keys = v.keys()
+             r = re.compile(self.NODE_GROUP_PREFIX)
+             partition = list(filter(r.match, list(keys)))
+             r = re.compile("node:")
+             node_id = list(filter(r.match, list(keys)))
+             if len(partition) > 0:
+                 partition = partition[0]
+             if len(node_id) > 0:
+                 node_id = node_id[0]
+             if self.NODE_GROUP_PREFIX in partition:
+                 group_res[partition]["CPU"] += v["CPU"]
+                 group_res[partition]["memory"] += v["memory"]
+                 group_res[partition]["object_store_memory"] += v["object_store_memory"]
+                 group_res[partition]["node_id"].append(node_id)
+         return group_res
+
+     def _update_group_res(self, gname: str) -> Dict[str, Union[str, float]]:
+         """
+         Get the realtime resource of a node group
+         Args:
+             gname: name of node group
+         Returns:
+             group_res: dict of updated resource(cpu, memory, object store memory) for a given group
+         """
+         all_available_resources = (
+             ray._private.state.state._available_resources_per_node()
+         )
+         group_res = {"CPU": 0, "memory": 0, "object_store_memory": 0, "node_id": []}
+         for v in all_available_resources.values():
+             keys = v.keys()
+             r = re.compile("node:")
+             node_id = list(filter(r.match, list(keys)))
+             if len(node_id) > 0:
+                 node_id = node_id[0]
+             if gname in v.keys():
+                 group_res["CPU"] += v["CPU"]
+                 group_res["memory"] += v["memory"]
+                 group_res["object_store_memory"] += v["object_store_memory"]
+                 group_res["node_id"].append(node_id)
+         return group_res
+
+     def get_one_group(self) -> Optional[Dict[str, Union[str, float]]]:
+         """
+         Pop up one node group
+         Returns:
+             group_res: dict of node group resource, {"group":"partition_1","CPU":2,...}
+         """
+         current_groups = self._update_groups()
+         if len(current_groups) > 0:
+             gname = current_groups[-1][0]
+             group_res = self._update_group_res(gname)
+             group_res["group"] = gname
+             try:
+                 group_res["group_res"] = ray.available_resources()[gname]
+             except Exception as e:
+                 logger.info(f"Error: {e}. There is no available resources for {gname}")
+                 return None
+             return group_res
+         else:
+             return None
+
+     def get_group_by_name(self, gname: str) -> Optional[Dict[str, Union[str, float]]]:
+         """
+         Get the specific node group given its pre-filled name
+         Args:
+             gname: name of the node group
+         Returns:
+             group_res: dict of node group resource
+
+         """
+         group_res = self._update_group_res(gname)
+         group_res["group"] = gname
+         try:
+             group_res["group_res"] = ray.available_resources()[gname]
+         except Exception as e:
+             logger.info(f"Error: {e}. There is no available resources for {gname}")
+             return None
+         return group_res
+
+
+ class PlacementGroupManager:
+     """Placement Group Manager
+     Create a list of placement group with the desired number of cpus
+     e.g., create a pg with 32 cpus, then this class will look for a node that has 32 cpus, and collect all
+     resources, including cpu, memory, and object store;
+     How to use:
+     ```
+     from deltacat.utils.placement import PlacementGroupManager as pgm
+     pgm = pgm(10, 32)
+     pg_configs = pgm.pgs
+     opts = pg_configs[0][0]
+     fun.options(**opts).remote()
+     ```
+     Args:
+         num_pgs: number of placement groups to be created
+         instance_cpus: number of cpus per instance
+     """
+
+     def __init__(
+         self,
+         num_pgs: int,
+         total_cpus_per_pg: int,
+         cpu_per_bundle: int,
+         strategy="SPREAD",
+         capture_child_tasks=True,
+     ):
+         head_res_key = self.get_current_node_resource_key()
+         # run the task on head and consume a fractional cpu, so that pg can be created on non-head node
+         # if cpu_per_bundle is less than the cpus per node, the pg can still be created on head
+         # curent assumption is that the cpu_per_bundle = cpus per node
+         # TODO: figure out how to create pg on non-head explicitly
+         self._pg_configs = ray.get(
+             [
+                 _config.options(resources={head_res_key: 0.01}).remote(
+                     total_cpus_per_pg, cpu_per_bundle, strategy, capture_child_tasks
+                 )
+                 for i in range(num_pgs)
+             ]
+         )
+         # TODO: handle the cases where cpu_per_bundle is larger than max cpus per node, support it on ec2/flex/manta
+
+     @property
+     def pgs(self):
+         return self._pg_configs
+
+     def get_current_node_resource_key(self) -> str:
+         # on ec2: address="172.31.34.51:6379"
+         # on manta: address = "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379"
+         current_node_name = ray.experimental.internal_kv.global_gcs_client.address[:-5]
+         for node in ray.nodes():
+             if node["NodeName"] == current_node_name:
+                 # Found the node.
+                 for key in node["Resources"].keys():
+                     if key.startswith("node:"):
+                         return key
+
+
  @ray.remote(num_cpus=0.01)
- def _config(instance_cpus: int, instance_type: int, node_res_keys: List[str], time_out: Optional[float] = None) -> Tuple[Dict[str,Any], Dict[str,Any]]:
-     pg_config = None
-     try:
-         opts ={}
-         cluster_resources={}
-         num_bundles = (int)(instance_cpus/instance_type)
-         bundles = [{'CPU':instance_type,node_res_keys[i]:1} for i in range(num_bundles)]
-         pg = placement_group(bundles, strategy="SPREAD")
-         ray.get(pg.ready(), timeout=time_out)
-         if not pg:
-             return None
-         opts = {"scheduling_strategy":PlacementGroupSchedulingStrategy(
-             placement_group=pg, placement_group_capture_child_tasks=True)
-         }
-         pg_id = placement_group_table(pg)['placement_group_id']
-         pg_details = get_placement_group(pg_id)
-         bundles = pg_details['bundles']
-         node_ids =[]
-         for bd in bundles:
-             node_ids.append(bd['node_id'])
-         #query available resources given list of node id
-         all_nodes_available_res = ray._private.state.state._available_resources_per_node()
-         pg_res = {'CPU':0,'memory':0,'object_store_memory':0,'node_id':[]}
-         for node_id in node_ids:
-             if node_id in all_nodes_available_res:
-                 v = all_nodes_available_res[node_id]
-                 node_detail = get_node(node_id)
-                 pg_res['CPU']+=node_detail['resources_total']['CPU']
-                 pg_res['memory']+=v['memory']
-                 pg_res['object_store_memory']+=v['object_store_memory']
-         cluster_resources['CPU'] = int(pg_res['CPU'])
-         cluster_resources['memory'] = float(pg_res['memory'])
-         cluster_resources['object_store_memory'] = float(pg_res['object_store_memory'])
-         cluster_resources['node_id'] = node_res_keys
-         pg_config=[opts,cluster_resources]
-         logger.info(f"pg has resources:{cluster_resources}")
-
-     except Exception as e:
-         logger.error(f"placement group error:{e}")
-         pass
-     return pg_config
+ def _config(
+     total_cpus_per_pg: int,
+     cpu_per_node: int,
+     strategy="SPREAD",
+     capture_child_tasks=True,
+     time_out: Optional[float] = None,
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+     pg_config = None
+     opts = {}
+     cluster_resources = {}
+     num_bundles = (int)(total_cpus_per_pg / cpu_per_node)
+     bundles = [{"CPU": cpu_per_node} for i in range(num_bundles)]
+     pg = placement_group(bundles, strategy=strategy)
+     ray.get(pg.ready(), timeout=time_out)
+     if not pg:
+         return None
+     opts = {
+         "scheduling_strategy": PlacementGroupSchedulingStrategy(
+             placement_group=pg, placement_group_capture_child_tasks=capture_child_tasks
+         )
+     }
+     pg_id = placement_group_table(pg)["placement_group_id"]
+     pg_details = get_placement_group(pg_id)
+     bundles = pg_details["bundles"]
+     node_ids = []
+     for bd in bundles:
+         node_ids.append(bd["node_id"])
+     # query available resources given list of node id
+     all_nodes_available_res = ray._private.state.state._available_resources_per_node()
+     pg_res = {"CPU": 0, "memory": 0, "object_store_memory": 0}
+     for node_id in node_ids:
+         if node_id in all_nodes_available_res:
+             v = all_nodes_available_res[node_id]
+             node_detail = get_node(node_id)
+             pg_res["CPU"] += node_detail["resources_total"]["CPU"]
+             pg_res["memory"] += v["memory"]
+             pg_res["object_store_memory"] += v["object_store_memory"]
+     cluster_resources["CPU"] = int(pg_res["CPU"])
+     cluster_resources["memory"] = float(pg_res["memory"])
+     cluster_resources["object_store_memory"] = float(pg_res["object_store_memory"])
+     pg_config = PlacementGroupConfig(opts, cluster_resources)
+     logger.info(f"pg has resources:{cluster_resources}")
 
+     return pg_config
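
Usage note: in 0.1.11 the `PlacementGroupManager` constructor changes from `(num_pgs, instance_cpus, instance_type, time_out)` to `(num_pgs, total_cpus_per_pg, cpu_per_bundle, strategy, capture_child_tasks)`, and each entry of `pgm.pgs` is now a `PlacementGroupConfig` (with `.opts` and `.resource`) rather than the old `[opts, cluster_resources]` list. The sketch below is inferred from the diff above and is not part of the package: the task name `my_task` and the CPU counts are illustrative assumptions, and per the module comment the manager should be constructed on the driver/head node.

```python
import ray

from deltacat.utils.placement import PlacementGroupManager

ray.init(address="auto")


@ray.remote
def my_task() -> str:
    # Hypothetical workload; any Ray task or actor can be scheduled the same way.
    return "ran inside a managed placement group"


# Ten placement groups, each reserving 32 CPUs as a single 32-CPU bundle
# (illustrative sizes; match these to your cluster's node shape).
pgm = PlacementGroupManager(num_pgs=10, total_cpus_per_pg=32, cpu_per_bundle=32)

pg_config = pgm.pgs[0]          # PlacementGroupConfig, not the old [opts, resources] list
opts = pg_config.opts           # {"scheduling_strategy": PlacementGroupSchedulingStrategy(...)}
resources = pg_config.resource  # {"CPU": ..., "memory": ..., "object_store_memory": ...}

print(ray.get(my_task.options(**opts).remote()), resources)
```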