ray-embedding: ray_embedding-0.12.4-py3-none-any.whl → ray_embedding-0.12.5-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ray-embedding might be problematic. Click here for more details.

ray_embedding/deploy.py CHANGED
@@ -17,6 +17,7 @@ def build_model(model_config: ModelDeploymentConfig) -> DeployedModel:
17
17
  matryoshka_dim = model_config.matryoshka_dim
18
18
  trust_remote_code = model_config.trust_remote_code or False
19
19
  model_kwargs = model_config.model_kwargs or {}
20
+ cuda_memory_flush_threshold = model_config.cuda_memory_flush_threshold or 0.8
20
21
 
21
22
  if "torch_dtype" in model_kwargs:
22
23
  torch_dtype = model_kwargs["torch_dtype"].strip()
ray_embedding/dto.py CHANGED
@@ -25,7 +25,6 @@ class ModelRouterConfig(BaseModel):
25
25
 
26
26
 
27
27
  class ModelDeploymentConfig(BaseModel):
28
- deployment: str
29
28
  model: str
30
29
  served_model_name: str
31
30
  batch_size: Optional[int] = 8
@@ -35,6 +34,8 @@ class ModelDeploymentConfig(BaseModel):
35
34
  matryoshka_dim: Optional[int] = 768
36
35
  trust_remote_code: Optional[bool] = False
37
36
  model_kwargs: Optional[Dict[str, Any]] = {}
37
+ cuda_memory_flush_threshold: Optional[float] = 0.8
38
+ deployment: str
38
39
 
39
40
 
40
41
  class AppConfig(BaseModel):
ray_embedding/embedding_model.py CHANGED
@@ -4,7 +4,7 @@ import time
4
4
  from typing import Optional, Dict, Any, List, Union
5
5
 
6
6
  import torch
7
- from pynvml import nvmlInit, nvmlDeviceGetCount
7
+ from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
8
8
  from ray import serve
9
9
  from sentence_transformers import SentenceTransformer
10
10
 
@@ -14,7 +14,7 @@ class EmbeddingModel:
14
14
  def __init__(self, model: str, served_model_name: Optional[str] = None,
15
15
  device: Optional[str] = None, backend: Optional[str] = "torch",
16
16
  matryoshka_dim: Optional[int] = None, trust_remote_code: Optional[bool] = False,
17
- model_kwargs: Dict[str, Any] = None):
17
+ model_kwargs: Dict[str, Any] = None, cuda_memory_flush_threshold: Optional[float] = 0.8):
18
18
  logging.basicConfig(level=logging.INFO)
19
19
  self.logger = logging.getLogger(self.__class__.__name__)
20
20
  self.model = model
@@ -29,6 +29,7 @@ class EmbeddingModel:
29
29
  self.matryoshka_dim = matryoshka_dim
30
30
  self.trust_remote_code = trust_remote_code or False
31
31
  self.model_kwargs = model_kwargs or {}
32
+ self.cuda_memory_flush_threshold = cuda_memory_flush_threshold
32
33
  self.logger.info(f"Initializing embedding model: {self.model}")
33
34
  self.embedding_model = SentenceTransformer(self.model, device=self.init_device, backend=self.backend,
34
35
  trust_remote_code=self.trust_remote_code,
@@ -57,10 +58,8 @@ class EmbeddingModel:
57
58
  # Move all embeddings to CPU at once before conversion
58
59
  embeddings_list = embeddings.cpu().tolist()
59
60
 
60
- # free GPU memory now, don't wait for GC
61
+ # don't wait for GC
61
62
  del embeddings
62
- if torch.cuda.is_available():
63
- torch.cuda.empty_cache()
64
63
 
65
64
  return embeddings_list
66
65
 
@@ -70,15 +69,29 @@ class EmbeddingModel:
70
69
  self.check_health()
71
70
 
72
71
  def check_health(self):
73
- if self.init_device == "cuda":
72
+ if self.init_device != "cuda":
73
+ return
74
+
75
+ try:
74
76
  # Even though CUDA was available at init time,
75
77
  # CUDA can become unavailable - this is a known problem in AWS EC2+Docker
76
78
  # https://github.com/ray-project/ray/issues/49594
77
- try:
78
- nvmlInit()
79
- assert nvmlDeviceGetCount() >= 1
80
- except:
81
- raise RuntimeError("CUDA device is not available")
79
+ nvmlInit()
80
+ count = nvmlDeviceGetCount()
81
+ assert count >= 1, "No CUDA devices found"
82
+
83
+ # replicas only have access to GPU 0
84
+ handle = nvmlDeviceGetHandleByIndex(0)
85
+ mem_info = nvmlDeviceGetMemoryInfo(handle)
86
+ except Exception as e:
87
+ raise RuntimeError(f"CUDA health check failed: {e}")
88
+
89
+ reserved = torch.cuda.memory_reserved() # bytes currently reserved by CUDA cache
90
+ threshold_bytes = self.cuda_memory_flush_threshold * mem_info.total
91
+
92
+ if reserved > threshold_bytes:
93
+ # flush only when cache exceeds the percentage threshold
94
+ torch.cuda.empty_cache()
82
95
 
83
96
  def __del__(self):
84
97
  # Clean up and free any remaining GPU memory
ray_embedding-0.12.4.dist-info/METADATA → ray_embedding-0.12.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ray-embedding
3
- Version: 0.12.4
3
+ Version: 0.12.5
4
4
  Summary: Deploy SentenceTransformers embedding models to a ray cluster
5
5
  Author: Crispin Almodovar
6
6
  Author-email:
ray_embedding-0.12.5.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
1
+ ray_embedding/__init__.py,sha256=YS5LAZfRIwwVvE3C9g7hsauvjgIkqKtHyxkwMFFfAGY,46
2
+ ray_embedding/deploy.py,sha256=1Nzb39OylxBEMDqCcfD-ByTefhANXbjLMzLo_YAkCfw,2710
3
+ ray_embedding/dto.py,sha256=l0hxz_fdGjZtLMZS3BzQ1tLzAOiO_8NpX4i5Wdyuk6Q,1519
4
+ ray_embedding/embedding_model.py,sha256=6iEaIg_mCpGEY-5F0uff2wTOMH1wI42u2N8DnaZE3mA,4670
5
+ ray_embedding/model_router.py,sha256=BsOEz24ttvpDD4LZsDVg9rLhn26FxgUsDAvcjI0Feao,5917
6
+ ray_embedding-0.12.5.dist-info/METADATA,sha256=EvWeadexmzrfUATF6dYl8c54cGCfdp5EEeW23vkni38,1094
7
+ ray_embedding-0.12.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ ray_embedding-0.12.5.dist-info/top_level.txt,sha256=ziCblpJq1YsrryshFqxTRuRMgNuO1_tgvAAkGShATNA,14
9
+ ray_embedding-0.12.5.dist-info/RECORD,,
ray_embedding-0.12.4.dist-info/RECORD REMOVED
@@ -1,9 +0,0 @@
1
- ray_embedding/__init__.py,sha256=YS5LAZfRIwwVvE3C9g7hsauvjgIkqKtHyxkwMFFfAGY,46
2
- ray_embedding/deploy.py,sha256=xE-NznVlYftTPIfN3aAqCF0DpFIvTuC4vMTNrGfjkxI,2627
3
- ray_embedding/dto.py,sha256=rzPEB-R7XDYlTqeaXGGgfOjTWyTeRinnJ6LbI1oOWGY,1463
4
- ray_embedding/embedding_model.py,sha256=j8jPhfVqS_x11oZrlXDbjo6z1NHXxOk8CnGIQemGfUQ,4040
5
- ray_embedding/model_router.py,sha256=BsOEz24ttvpDD4LZsDVg9rLhn26FxgUsDAvcjI0Feao,5917
6
- ray_embedding-0.12.4.dist-info/METADATA,sha256=e7i2fUgJhRYIvIrzLVlGYVr3FvPD1Eaoj7L08Slbxp4,1094
7
- ray_embedding-0.12.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
- ray_embedding-0.12.4.dist-info/top_level.txt,sha256=ziCblpJq1YsrryshFqxTRuRMgNuO1_tgvAAkGShATNA,14
9
- ray_embedding-0.12.4.dist-info/RECORD,,