ray-embedding 0.12.6.tar.gz → 0.12.8.tar.gz

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.


@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ray-embedding
-Version: 0.12.6
+Version: 0.12.8
 Summary: Deploy SentenceTransformers embedding models to a ray cluster
 Author: Crispin Almodovar
 Author-email:
@@ -31,6 +31,6 @@ to see how this library is used.
 - onnx-gpu
 - onnx-cpu
 - openvino-cpu
-- fastembed-onnx-cpu
+
 
 
@@ -19,6 +19,6 @@ to see how this library is used.
 - onnx-gpu
 - onnx-cpu
 - openvino-cpu
-- fastembed-onnx-cpu
+
 
 
@@ -1,14 +1,19 @@
 import os
+from typing import Optional
 
 import torch
 from ray.serve import Application
+from ray.serve.handle import DeploymentHandle
 
 from ray_embedding.dto import AppConfig, ModelDeploymentConfig, DeployedModel
 from ray_embedding.embedding_model import EmbeddingModel
 from ray_embedding.model_router import ModelRouter
+from ray_embedding.node_health import NodeHealthTracker
 
+DEFAULT_NODE_HEALTH_CHECK_INTERVAL_S = 30
 
-def build_model(model_config: ModelDeploymentConfig) -> DeployedModel:
+
+def build_model(model_config: ModelDeploymentConfig, node_health_tracker: Optional[DeploymentHandle] = None) -> DeployedModel:
     deployment_name = model_config.deployment
     model = model_config.model
     served_model_name = model_config.served_model_name or os.path.basename(model)
@@ -37,7 +42,8 @@ def build_model(model_config: ModelDeploymentConfig) -> DeployedModel:
         matryoshka_dim=matryoshka_dim,
         trust_remote_code=trust_remote_code,
         model_kwargs=model_kwargs,
-        cuda_memory_flush_threshold=cuda_memory_flush_threshold
+        cuda_memory_flush_threshold=cuda_memory_flush_threshold,
+        node_health_tracker=node_health_tracker
     )
     return DeployedModel(model=served_model_name,
                          deployment_handle=deployment,
@@ -51,6 +57,12 @@ def build_app(args: AppConfig) -> Application:
     assert model_router and models
     assert model_router.path_prefix
 
-    deployed_models = {model_config.served_model_name: build_model(model_config) for model_config in models}
-    router = ModelRouter.options(name=model_router.deployment).bind(deployed_models, model_router.path_prefix)
+    node_health_check_interval_s = args.node_health_check_interval_s or DEFAULT_NODE_HEALTH_CHECK_INTERVAL_S
+    node_health_tracker = NodeHealthTracker.options(health_check_period_s=node_health_check_interval_s).bind()
+    deployed_models = {model_config.served_model_name: build_model(model_config, node_health_tracker=node_health_tracker)
+                       for model_config in models}
+    router = (ModelRouter.options(name=model_router.deployment)
+              .bind(deployed_models=deployed_models,
+                    path_prefix=model_router.path_prefix,
+                    node_health_tracker=node_health_tracker))
     return router
@@ -38,8 +38,16 @@ class ModelDeploymentConfig(BaseModel):
     deployment: str
 
 
+class ReplicaState(BaseModel):
+    deployment_name: str
+    replica_actor_name: str
+    node_ip: str
+    state: Optional[str] = None
+
+
 class AppConfig(BaseModel):
     model_router: ModelRouterConfig
+    node_health_check_interval_s: Optional[int] = 30
     models: List[ModelDeploymentConfig]
 
 
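The only new user-facing knob in the config model is node_health_check_interval_s, which build_app() above forwards to the tracker's health_check_period_s. Below is a minimal, hedged sketch of an AppConfig using it, assuming ModelRouterConfig is defined in ray_embedding.dto alongside the classes shown here and that only the fields visible in this diff are required; names and values are illustrative.

# Hedged sketch: exercising the new node_health_check_interval_s field.
# ModelRouterConfig is assumed to live in ray_embedding.dto next to AppConfig
# (AppConfig references it above); field values below are illustrative only.
from ray_embedding.dto import AppConfig, ModelDeploymentConfig, ModelRouterConfig

app_config = AppConfig(
    model_router=ModelRouterConfig(
        deployment="model-router",      # Serve deployment name for the router
        path_prefix=["/v1"],            # API path prefixes the router accepts
    ),
    models=[
        ModelDeploymentConfig(
            deployment="embedder",                  # Serve deployment name
            model="BAAI/bge-small-en-v1.5",         # illustrative model id
            served_model_name="bge-small-en-v1.5",  # optional; defaults to basename(model)
        ),
    ],
    node_health_check_interval_s=60,  # how often NodeHealthTracker.check_health runs; defaults to 30
)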
@@ -6,6 +6,8 @@ from typing import Optional, Dict, Any, List, Union
 import torch
 from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
 from ray import serve
+from ray.util import get_node_ip_address
+from ray.serve.handle import DeploymentHandle
 from sentence_transformers import SentenceTransformer
 
 
@@ -14,12 +16,14 @@ class EmbeddingModel:
     def __init__(self, model: str, served_model_name: Optional[str] = None,
                  device: Optional[str] = None, backend: Optional[str] = "torch",
                  matryoshka_dim: Optional[int] = None, trust_remote_code: Optional[bool] = False,
-                 model_kwargs: Dict[str, Any] = None, cuda_memory_flush_threshold: Optional[float] = 0.8):
+                 model_kwargs: Dict[str, Any] = None, cuda_memory_flush_threshold: Optional[float] = 0.8,
+                 node_health_tracker: Optional[DeploymentHandle] = None):
         logging.basicConfig(level=logging.INFO)
         self.logger = logging.getLogger(self.__class__.__name__)
         self.model = model
         self.served_model_name = served_model_name or os.path.basename(self.model)
         self.init_device = device
+        self.cuda_memory_flush_threshold = cuda_memory_flush_threshold
         if self.init_device is None or self.init_device == "auto":
             self.init_device = "cuda" if torch.cuda.is_available() else "cpu"
         if self.init_device == "cuda":
@@ -29,13 +33,19 @@ class EmbeddingModel:
         self.matryoshka_dim = matryoshka_dim
         self.trust_remote_code = trust_remote_code or False
         self.model_kwargs = model_kwargs or {}
-        self.cuda_memory_flush_threshold = cuda_memory_flush_threshold
+
         self.logger.info(f"Initializing embedding model: {self.model}")
         self.embedding_model = SentenceTransformer(self.model, device=self.init_device, backend=self.backend,
                                                    trust_remote_code=self.trust_remote_code,
                                                    model_kwargs=self.model_kwargs)
 
-        self.logger.info(f"Successfully initialized model {self.model} using device {self.torch_device}")
+        self.node_health_tracker = node_health_tracker
+        replica_context = serve.get_replica_context()
+        self.deployment_name = replica_context.deployment
+        self.replica_actor_name = replica_context.replica_id.to_full_id_str()
+        self.node_ip = get_node_ip_address()
+        self.logger.info(f"Successfully initialized model {self.model} using device {self.torch_device}. "
+                         f"Deployment name: {self.deployment_name}, Replica actor name: {self.replica_actor_name}, Node IP: {self.node_ip}")
 
     async def __call__(self, text: Union[str, List[str]], dimensions: Optional[int] = None) -> List[List[float]]:
         """Compute embeddings for the input text using the current model."""
@@ -68,10 +78,14 @@ class EmbeddingModel:
             time.sleep(wait)
             self.check_health()
 
-    def check_health(self):
+    async def check_health(self):
         if self.init_device != "cuda":
             return
 
+        if self.node_health_tracker:
+            if await self.node_health_tracker.is_bad_node.remote(self.node_ip):
+                raise RuntimeError(f"The node {self.node_ip} is marked bad.")
+
         try:
             # Even though CUDA was available at init time,
             # CUDA can become unavailable - this is a known problem in AWS EC2+Docker
@@ -84,7 +98,9 @@ class EmbeddingModel:
             handle = nvmlDeviceGetHandleByIndex(0)
             mem_info = nvmlDeviceGetMemoryInfo(handle)
         except Exception as e:
-            raise RuntimeError(f"CUDA health check failed: {e}")
+            await self.node_health_tracker.report_bad_node.remote(self.node_ip, self.deployment_name, self.replica_actor_name)
+            raise RuntimeError(f"CUDA health check failed for deployment: "
+                               f"{self.deployment_name}, replica: {self.replica_actor_name}, node: {self.node_ip}.\n{e}")
 
         reserved = torch.cuda.memory_reserved()  # bytes currently reserved by CUDA cache
         threshold_bytes = self.cuda_memory_flush_threshold * mem_info.total
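The last two context lines above show the flush heuristic: compare the bytes reserved by the CUDA caching allocator against a fraction of total device memory reported by NVML. Below is a standalone, hedged sketch of that check; the maybe_flush_cuda_cache helper is hypothetical, and the empty_cache() call is an assumed flush mechanism, since the hunk cuts off before the flush itself.

# Standalone, hedged sketch of the reserved-memory threshold logic shown above.
# Hypothetical helper, not part of the package; requires torch and pynvml.
import torch
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo


def maybe_flush_cuda_cache(flush_threshold: float = 0.8) -> bool:
    """Flush the CUDA cache when reserved memory exceeds flush_threshold * total."""
    if not torch.cuda.is_available():
        return False
    nvmlInit()
    mem_info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0))
    reserved = torch.cuda.memory_reserved()            # bytes held by the caching allocator
    threshold_bytes = flush_threshold * mem_info.total # same comparison as in the hunk above
    if reserved > threshold_bytes:
        torch.cuda.empty_cache()                       # assumed flush mechanism
        return True
    return False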
@@ -6,6 +6,7 @@ from typing import Optional, Dict, List, Tuple
 from fastapi import FastAPI, HTTPException
 from ray import serve
 from ray.serve.handle import DeploymentHandle
+from ray.util import get_node_ip_address
 
 from ray_embedding.dto import DeployedModel, EmbeddingRequest, EmbeddingResponse
 
@@ -14,7 +15,9 @@ web_api = FastAPI(title="Ray Embeddings - OpenAI-compatible API")
 @serve.deployment
 @serve.ingress(web_api)
 class ModelRouter:
-    def __init__(self, deployed_models: Dict[str, DeployedModel], path_prefix: List[str], max_concurrency: Optional[int] = 32):
+    def __init__(self, deployed_models: Dict[str, DeployedModel],
+                 path_prefix: List[str], max_concurrency: Optional[int] = 32,
+                 node_health_tracker: Optional[DeploymentHandle] = None):
         assert deployed_models, "models cannot be empty"
         assert path_prefix, "path_prefix cannot be empty"
 
@@ -32,6 +35,13 @@ class ModelRouter:
              "permission": []} for item in self.deployed_models.keys()
         ]
         self.logger.info(f"Successfully registered models: {self.available_models}")
+        self.node_health_tracker = node_health_tracker
+        replica_context = serve.get_replica_context()
+        self.deployment_name = replica_context.deployment
+        self.replica_actor_name = replica_context.replica_id.to_full_id_str()
+        self.node_ip = get_node_ip_address()
+        self.logger.info(f"Successfully initialized model router. "
+                         f"Deployment name: {self.deployment_name}, Replica actor name: {self.replica_actor_name}, Node IP: {self.node_ip}")
 
     async def _compute_embeddings_from_resized_batches(self, model: str, inputs: List[str], dimensions: Optional[int] = None):
         deployed_model = self.deployed_models[model]
@@ -111,3 +121,8 @@ class ModelRouter:
         if path_prefix not in self.path_prefix:
             raise HTTPException(status_code=400, detail=f"The API path prefix specified is invalid: '{path_prefix}'")
         return {"object": "list", "data": self.available_models}
+
+    async def check_health(self):
+        if self.node_health_tracker:
+            if await self.node_health_tracker.is_bad_node.remote(self.node_ip):
+                raise RuntimeError(f"The node {self.node_ip} is marked bad.")
@@ -0,0 +1,48 @@
+import logging
+import threading
+from typing import Set
+
+import ray
+from ray import serve
+
+
+@serve.deployment(min_replicas=1, max_replicas=1)
+class NodeHealthTracker:
+    """Maintains a list of bad nodes, as reported by replicas that call the report_bad_node func.
+    Bad nodes are those that fail GPU/CUDA health check.
+    What's the purpose? Because when an embedding model replica becomes unhealthy
+    (due to GPU/CUDA issues), we want Ray to kill all replicas running on the node.
+    When Ray detects that there are no running replicas on a node, the node is stopped
+    and replaced with a new one.
+    """
+    def __init__(self):
+        logging.basicConfig(level=logging.INFO)
+        self.logger = logging.getLogger(self.__class__.__name__)
+        self.bad_node_ips: Set[str] = set()
+        self.lock = threading.RLock()
+        self.logger.info(f"Successfully initialized NodeHealthTracker.")
+
+    async def report_bad_node(self, node_ip: str, deployment_name: str, replica_actor_name: str):
+        with self.lock:
+            if node_ip not in self.bad_node_ips:
+                self.bad_node_ips.add(node_ip)
+                self.logger.warning(
+                    f"[Bad Node Reported] Deployment: {deployment_name}, Replica: {replica_actor_name}, Node IP: {node_ip}"
+                )
+
+    async def is_bad_node(self, node_ip: str) -> bool:
+        with self.lock:
+            return node_ip in self.bad_node_ips
+
+    async def check_health(self):
+        """Called periodically by Ray Serve. Used here to clean up stale node IDs."""
+        try:
+            current_node_ips = {node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]}
+            with self.lock:
+                stale_nodes = self.bad_node_ips - current_node_ips
+                if stale_nodes:
+                    self.logger.info(f"Removing stale bad node_ips: {stale_nodes}")
+                    self.bad_node_ips.intersection_update(current_node_ips)
+                self.logger.info(f"Current nodes: {current_node_ips}. Bad nodes: {self.bad_node_ips}.")
+        except Exception as e:
+            raise RuntimeError(f"Exception in check_health during bad node cleanup: {e}")
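The new file above is the hub of the scheme described in its docstring: a replica reports its node once, and every replica that later polls the tracker from that node fails its own health check, which is what lets Serve recycle the replicas and ultimately the node. A minimal, hedged wiring sketch follows; SomeGpuWorker and its settings are illustrative and not part of the package.

# Hedged usage sketch. Mirrors how deploy.py and embedding_model.py wire the
# tracker above; the SomeGpuWorker deployment is hypothetical.
from ray import serve
from ray.serve.handle import DeploymentHandle
from ray.util import get_node_ip_address

from ray_embedding.node_health import NodeHealthTracker


@serve.deployment
class SomeGpuWorker:
    def __init__(self, node_health_tracker: DeploymentHandle):
        self.node_health_tracker = node_health_tracker
        self.node_ip = get_node_ip_address()

    async def check_health(self):
        # Raising makes Ray Serve restart this replica; once every replica on a
        # bad node fails its check, the node can be drained and replaced.
        if await self.node_health_tracker.is_bad_node.remote(self.node_ip):
            raise RuntimeError(f"The node {self.node_ip} is marked bad.")


# Bind the single-replica tracker once and pass its handle to other deployments,
# as build_app() does above; deploy with serve.run(app).
tracker = NodeHealthTracker.options(health_check_period_s=30).bind()
app = SomeGpuWorker.bind(tracker)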
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ray-embedding
-Version: 0.12.6
+Version: 0.12.8
 Summary: Deploy SentenceTransformers embedding models to a ray cluster
 Author: Crispin Almodovar
 Author-email:
@@ -31,6 +31,6 @@ to see how this library is used.
 - onnx-gpu
 - onnx-cpu
 - openvino-cpu
-- fastembed-onnx-cpu
+
 
 
@@ -6,6 +6,7 @@ ray_embedding/deploy.py
 ray_embedding/dto.py
 ray_embedding/embedding_model.py
 ray_embedding/model_router.py
+ray_embedding/node_health.py
 ray_embedding.egg-info/PKG-INFO
 ray_embedding.egg-info/SOURCES.txt
 ray_embedding.egg-info/dependency_links.txt
@@ -1,6 +1,6 @@
 [metadata]
 name = ray-embedding
-version = 0.12.6
+version = 0.12.8
 author = Crispin Almodovar
 author_email =
 description = Deploy SentenceTransformers embedding models to a ray cluster