ray-embedding 0.12.9__tar.gz → 0.12.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ray-embedding might be problematic.
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/PKG-INFO +1 -1
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/ray_embedding/deploy.py +3 -1
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/ray_embedding/dto.py +0 -7
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/ray_embedding/embedding_model.py +18 -12
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/ray_embedding/model_router.py +2 -2
- ray_embedding-0.12.11/ray_embedding/node_health.py +90 -0
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/ray_embedding.egg-info/PKG-INFO +1 -1
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/setup.cfg +1 -1
- ray_embedding-0.12.9/ray_embedding/node_health.py +0 -48
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/README.md +0 -0
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/pyproject.toml +0 -0
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/ray_embedding/__init__.py +0 -0
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/ray_embedding.egg-info/SOURCES.txt +0 -0
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/ray_embedding.egg-info/dependency_links.txt +0 -0
- {ray_embedding-0.12.9 → ray_embedding-0.12.11}/ray_embedding.egg-info/top_level.txt +0 -0
ray_embedding/deploy.py

@@ -58,7 +58,9 @@ def build_app(args: AppConfig) -> Application:
     assert model_router.path_prefix

     node_health_check_interval_s = args.node_health_check_interval_s or DEFAULT_NODE_HEALTH_CHECK_INTERVAL_S
-
+    tracked_model_deployments = [model_config.deployment for model_config in models]
+    node_health_tracker = (NodeHealthTracker.options(health_check_period_s=node_health_check_interval_s)
+                           .bind(tracked_model_deployments=tracked_model_deployments))
     deployed_models = {model_config.served_model_name: build_model(model_config, node_health_tracker=node_health_tracker)
                        for model_config in models}
     router = (ModelRouter.options(name=model_router.deployment)
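For readers unfamiliar with the wiring above: Ray Serve lets a deployment bound with .bind() be passed as a constructor argument to other deployments, and that argument arrives at runtime as a deployment handle. The following is a minimal, self-contained sketch of that pattern; the Tracker and Model classes are illustrative only and are not part of ray-embedding.

    from ray import serve

    @serve.deployment
    class Tracker:
        async def ping(self) -> str:
            return "ok"

    @serve.deployment
    class Model:
        def __init__(self, tracker):
            # A deployment bound with .bind() and passed in here arrives as a handle.
            self.tracker = tracker

        async def __call__(self, request) -> str:
            # Handle methods are invoked with .remote() and awaited for their result.
            return await self.tracker.ping.remote()

    tracker = Tracker.bind()
    app = Model.bind(tracker)
    # serve.run(app)  # deploys Model with a handle to Tracker, analogous to build_app wiring NodeHealthTracker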
ray_embedding/dto.py

@@ -38,13 +38,6 @@ class ModelDeploymentConfig(BaseModel):
     deployment: str


-class ReplicaState(BaseModel):
-    deployment_name: str
-    replica_actor_name: str
-    node_ip: str
-    state: Optional[str] = None
-
-
 class AppConfig(BaseModel):
     model_router: ModelRouterConfig
     node_health_check_interval_s: Optional[int] = 30
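For context, a config object for the slimmed-down dto.py might be populated as in the sketch below. Only the model_router and node_health_check_interval_s fields are visible in this hunk; the models field and all concrete values are assumptions made for illustration, not the package's documented schema.

    from ray_embedding.dto import AppConfig, ModelDeploymentConfig, ModelRouterConfig

    # Hypothetical values; field names beyond those shown in the diff are assumed.
    app_config = AppConfig(
        model_router=ModelRouterConfig(deployment="model-router", path_prefix="/v1"),
        node_health_check_interval_s=30,
        models=[ModelDeploymentConfig(deployment="bge-small", served_model_name="BAAI/bge-small-en-v1.5")],
    )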
ray_embedding/embedding_model.py

@@ -3,6 +3,7 @@ import os.path
 import time
 from typing import Optional, Dict, Any, List, Union

+import ray
 import torch
 from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
 from ray import serve
@@ -76,16 +77,11 @@ class EmbeddingModel:
     def wait_for_cuda(self, wait: int = 10):
         if self.init_device == "cuda" and not torch.cuda.is_available():
             time.sleep(wait)
-            self.
+            self.check_cuda()

-
+    def check_cuda(self) -> Any:
         if self.init_device != "cuda":
-            return
-
-        if self.node_health_tracker:
-            if await self.node_health_tracker.is_bad_node.remote(self.node_ip):
-                raise RuntimeError(f"The node {self.node_ip} is marked bad.")
-
+            return None
         try:
             # Even though CUDA was available at init time,
             # CUDA can become unavailable - this is a known problem in AWS EC2+Docker
@@ -96,12 +92,22 @@ class EmbeddingModel:

             # replicas only have access to GPU 0
             handle = nvmlDeviceGetHandleByIndex(0)
-
+            return handle
         except Exception as e:
-
-
-
+            error_msg = f"CUDA health check failed for deployment: " \
+                        f"{self.deployment_name}, replica: {self.replica_actor_name}, node: {self.node_ip}.\n{e}"
+            self.logger.error(error_msg)
+            if self.node_health_tracker:
+                self.node_health_tracker.report_bad_gpu_node.remote(self.node_ip, self.deployment_name, self.replica_actor_name)
+            raise RuntimeError(error_msg)
+
+    async def check_health(self):
+        if self.node_health_tracker:
+            if await self.node_health_tracker.is_bad_gpu_node.remote(self.node_ip):
+                raise RuntimeError(f"The node {self.node_ip} is marked bad.")

+        handle = self.check_cuda() # Raises an exception if CUDA is unavailable
+        mem_info = nvmlDeviceGetMemoryInfo(handle)
         reserved = torch.cuda.memory_reserved() # bytes currently reserved by CUDA cache
         threshold_bytes = self.cuda_memory_flush_threshold * mem_info.total

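The check_health methods above plug into Ray Serve's replica health-check mechanism: Serve periodically calls a deployment's user-defined check_health() at the interval set by health_check_period_s, and a raised exception marks the replica unhealthy so Serve restarts it. Below is a minimal standalone sketch of that mechanism; the Probe class is illustrative only and not part of this package.

    from ray import serve

    @serve.deployment(health_check_period_s=30, health_check_timeout_s=30)
    class Probe:
        def __init__(self):
            self.healthy = True

        async def check_health(self):
            # Serve calls this every health_check_period_s seconds;
            # raising marks the replica unhealthy and triggers a restart.
            if not self.healthy:
                raise RuntimeError("replica marked unhealthy")

        async def __call__(self, request) -> str:
            return "ok"

    app = Probe.bind()
    # serve.run(app)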
ray_embedding/model_router.py

@@ -124,5 +124,5 @@ class ModelRouter:

     async def check_health(self):
         if self.node_health_tracker:
-            if await self.node_health_tracker.
-                raise RuntimeError(f"The node {self.node_ip} is marked bad.")
+            if await self.node_health_tracker.is_bad_gpu_or_no_tracked_model_on_node.remote(self.node_ip):
+                raise RuntimeError(f"The node {self.node_ip} is marked bad, or there are no tracked models running on it.")
ray_embedding-0.12.11/ray_embedding/node_health.py (new file)

@@ -0,0 +1,90 @@
+import logging
+import threading
+from typing import Set, List
+
+import ray
+from networkx.algorithms.boundary import node_boundary
+from networkx.algorithms.cuts import node_expansion
+from ray import serve
+from ray._private.services import get_node_ip_address
+from ray.util.state import list_actors
+
+
+@serve.deployment(autoscaling_config=dict(min_replicas=1, max_replicas=1),
+                  ray_actor_options=dict(num_cpus=0.1))
+class NodeHealthTracker:
+    """Maintains a list of bad nodes, as reported by replicas that call the report_bad_node func.
+    Bad nodes are those that fail GPU/CUDA health check.
+    What's the purpose? Because when an embedding model replica becomes unhealthy
+    (due to GPU/CUDA issues), we want Ray to kill all replicas running on the node.
+    When Ray detects that there are no running replicas on a node, the node is stopped
+    and replaced with a new one.
+    """
+    def __init__(self, tracked_model_deployments: List[str] = None):
+        logging.basicConfig(level=logging.INFO)
+        self.logger = logging.getLogger(self.__class__.__name__)
+        self.tracked_model_deployments = tracked_model_deployments or []
+        self.bad_gpu_node_ips: Set[str] = set()
+        self.lock = threading.RLock()
+        replica_context = serve.get_replica_context()
+        self.deployment_name = replica_context.deployment
+        self.replica_actor_name = replica_context.replica_id.to_full_id_str()
+        self.node_ip = get_node_ip_address()
+        self.logger.info(f"Successfully initialized NodeHealthTracker. Tracked model deployments: {self.tracked_model_deployments}")
+
+    async def report_bad_gpu_node(self, node_ip: str, deployment_name: str, replica_actor_name: str):
+        with self.lock:
+            if node_ip not in self.bad_gpu_node_ips:
+                self.bad_gpu_node_ips.add(node_ip)
+                self.logger.warning(
+                    f"[Bad GPU node reported] Deployment: {deployment_name}, Replica: {replica_actor_name}, Node IP: {node_ip}"
+                )
+
+    async def is_bad_gpu_node(self, node_ip: str) -> bool:
+        with self.lock:
+            return node_ip in self.bad_gpu_node_ips
+
+    async def is_bad_gpu_or_no_tracked_model_on_node(self, node_ip: str):
+        return (await self.is_bad_gpu_node(node_ip) or
+                not await self.is_tracked_model_running_on_node(node_ip))
+
+    async def check_health(self):
+        """Called periodically by Ray Serve. Used here to clean up stale node IDs."""
+        try:
+            current_node_ips = {node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]}
+            with self.lock:
+                stale_nodes = self.bad_gpu_node_ips - current_node_ips
+                if stale_nodes:
+                    self.logger.info(f"Removing stale bad node_ips: {stale_nodes}")
+                    self.bad_gpu_node_ips.intersection_update(current_node_ips)
+                self.logger.info(f"Current nodes: {current_node_ips}. Bad GPU nodes: {self.bad_gpu_node_ips}.")
+        except Exception as e:
+            raise RuntimeError(f"An error occurred in check_health during bad node cleanup: {e}")
+
+        if not await self.is_tracked_model_running_on_node(node_ip=self.node_ip):
+            raise RuntimeError(f"There are no model replicas running on node {self.node_ip}.")
+
+
+    async def is_tracked_model_running_on_node(self, node_ip: str) -> bool:
+        """
+        Return True if there is at least one replica of any of the self.tracked_model_deployments
+        running on the specified node_ip.
+        """
+        try:
+            target_node_id = next(node["NodeID"] for node in ray.nodes() if node["Alive"] and node["NodeManagerAddress"] == node_ip)
+            assert target_node_id, f"No node found with IP {node_ip}"
+            prefixes = tuple(f"SERVE_REPLICA::{d}" for d in self.tracked_model_deployments)
+
+            for actor in list_actors(detail=False):
+                if (actor.state in ["DEPENDENCIES_UNREADY", 'PENDING_CREATION', 'ALIVE', 'RESTARTING'] and
+                        actor.node_id == target_node_id and
+                        actor.name.startswith(prefixes)):
+                    self.logger.info(f"There is at least one replica of tracked_deployments={self.tracked_model_deployments} "
+                                     f"running on node {node_ip}")
+                    return True
+
+            self.logger.info(f"No replicas of tracked deployments={self.tracked_model_deployments} running on node: {node_ip}.")
+            return False
+        except Exception as e:
+            self.logger.error(f"An error occurred while checking replicas on node {node_ip}: {e}")
+            return False
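is_tracked_model_running_on_node above leans on the Ray state API: ray.util.state.list_actors returns lightweight actor records whose name, state, and node_id fields can be filtered per node. The sketch below shows that query in isolation, run against a live cluster; the SERVE_REPLICA:: name prefix is the package's assumption about Serve's internal actor naming, not a stable Ray API.

    import ray
    from ray.util.state import list_actors

    ray.init(address="auto", ignore_reinit_error=True)  # assumes an existing Ray cluster

    # Pick any alive node and list the Serve replica actors scheduled on it.
    node_id = next(node["NodeID"] for node in ray.nodes() if node["Alive"])
    replicas_on_node = [
        actor.name
        for actor in list_actors(detail=False)
        if actor.name and actor.node_id == node_id and actor.name.startswith("SERVE_REPLICA::")
    ]
    print(replicas_on_node)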
ray_embedding-0.12.9/ray_embedding/node_health.py (deleted)

@@ -1,48 +0,0 @@
-import logging
-import threading
-from typing import Set
-
-import ray
-from ray import serve
-
-
-@serve.deployment(autoscaling_config=dict(min_replicas=1, max_replicas=1))
-class NodeHealthTracker:
-    """Maintains a list of bad nodes, as reported by replicas that call the report_bad_node func.
-    Bad nodes are those that fail GPU/CUDA health check.
-    What's the purpose? Because when an embedding model replica becomes unhealthy
-    (due to GPU/CUDA issues), we want Ray to kill all replicas running on the node.
-    When Ray detects that there are no running replicas on a node, the node is stopped
-    and replaced with a new one.
-    """
-    def __init__(self):
-        logging.basicConfig(level=logging.INFO)
-        self.logger = logging.getLogger(self.__class__.__name__)
-        self.bad_node_ips: Set[str] = set()
-        self.lock = threading.RLock()
-        self.logger.info(f"Successfully initialized NodeHealthTracker.")
-
-    async def report_bad_node(self, node_ip: str, deployment_name: str, replica_actor_name: str):
-        with self.lock:
-            if node_ip not in self.bad_node_ips:
-                self.bad_node_ips.add(node_ip)
-                self.logger.warning(
-                    f"[Bad Node Reported] Deployment: {deployment_name}, Replica: {replica_actor_name}, Node IP: {node_ip}"
-                )
-
-    async def is_bad_node(self, node_ip: str) -> bool:
-        with self.lock:
-            return node_ip in self.bad_node_ips
-
-    async def check_health(self):
-        """Called periodically by Ray Serve. Used here to clean up stale node IDs."""
-        try:
-            current_node_ips = {node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]}
-            with self.lock:
-                stale_nodes = self.bad_node_ips - current_node_ips
-                if stale_nodes:
-                    self.logger.info(f"Removing stale bad node_ips: {stale_nodes}")
-                    self.bad_node_ips.intersection_update(current_node_ips)
-                self.logger.info(f"Current nodes: {current_node_ips}. Bad nodes: {self.bad_node_ips}.")
-        except Exception as e:
-            raise RuntimeError(f"Exception in check_health during bad node cleanup: {e}")