PyPI - ray-embedding - Versions diffs - 0.12.4__py3-none-any.whl → 0.12.6__py3-none-any.whl - Mend

ray-embedding 0.12.4__py3-none-any.whl → 0.12.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

ray_embedding/deploy.py CHANGED Viewed

@@ -17,6 +17,7 @@ def build_model(model_config: ModelDeploymentConfig) -> DeployedModel:
     matryoshka_dim = model_config.matryoshka_dim
     trust_remote_code = model_config.trust_remote_code or False
     model_kwargs = model_config.model_kwargs or {}
+    cuda_memory_flush_threshold = model_config.cuda_memory_flush_threshold or 0.8
     if "torch_dtype" in model_kwargs:
         torch_dtype = model_kwargs["torch_dtype"].strip()
@@ -35,7 +36,8 @@ def build_model(model_config: ModelDeploymentConfig) -> DeployedModel:
                                                                    backend=backend,
                                                                    matryoshka_dim=matryoshka_dim,
                                                                    trust_remote_code=trust_remote_code,
-                                                                   model_kwargs=model_kwargs
+                                                                   model_kwargs=model_kwargs,
+                                                                   cuda_memory_flush_threshold=cuda_memory_flush_threshold
                                                                    )
     return DeployedModel(model=served_model_name,
                          deployment_handle=deployment,

ray_embedding/dto.py CHANGED Viewed

@@ -25,7 +25,6 @@ class ModelRouterConfig(BaseModel):
 class ModelDeploymentConfig(BaseModel):
-    deployment: str
     model: str
     served_model_name: str
     batch_size: Optional[int] = 8
@@ -35,6 +34,8 @@ class ModelDeploymentConfig(BaseModel):
     matryoshka_dim: Optional[int] = 768
     trust_remote_code: Optional[bool] = False
     model_kwargs: Optional[Dict[str, Any]] = {}
+    cuda_memory_flush_threshold: Optional[float] = 0.8
+    deployment: str
 class AppConfig(BaseModel):

ray_embedding/embedding_model.py CHANGED Viewed

@@ -4,7 +4,7 @@ import time
 from typing import Optional, Dict, Any, List, Union
 import torch
-from pynvml import nvmlInit, nvmlDeviceGetCount
+from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
 from ray import serve
 from sentence_transformers import SentenceTransformer
@@ -14,7 +14,7 @@ class EmbeddingModel:
     def __init__(self, model: str, served_model_name: Optional[str] = None,
                  device: Optional[str] = None, backend: Optional[str] = "torch",
                  matryoshka_dim: Optional[int] = None, trust_remote_code: Optional[bool] = False,
-                 model_kwargs: Dict[str, Any] = None):
+                 model_kwargs: Dict[str, Any] = None, cuda_memory_flush_threshold: Optional[float] = 0.8):
         logging.basicConfig(level=logging.INFO)
         self.logger = logging.getLogger(self.__class__.__name__)
         self.model = model
@@ -29,6 +29,7 @@ class EmbeddingModel:
         self.matryoshka_dim = matryoshka_dim
         self.trust_remote_code = trust_remote_code or False
         self.model_kwargs = model_kwargs or {}
+        self.cuda_memory_flush_threshold = cuda_memory_flush_threshold
         self.logger.info(f"Initializing embedding model: {self.model}")
         self.embedding_model = SentenceTransformer(self.model, device=self.init_device, backend=self.backend,
                                                    trust_remote_code=self.trust_remote_code,
@@ -57,10 +58,8 @@ class EmbeddingModel:
         # Move all embeddings to CPU at once before conversion
         embeddings_list = embeddings.cpu().tolist()
-        # free GPU memory now, don't wait for GC
+        # don't wait for GC
         del embeddings
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
         return embeddings_list
@@ -70,15 +69,29 @@ class EmbeddingModel:
         self.check_health()
     def check_health(self):
-        if self.init_device == "cuda":
+        if self.init_device != "cuda":
+            return
+        try:
             # Even though CUDA was available at init time,
             # CUDA can become unavailable - this is a known problem in AWS EC2+Docker
             # https://github.com/ray-project/ray/issues/49594
-            try:
-                nvmlInit()
-                assert nvmlDeviceGetCount() >= 1
-            except:
-                raise RuntimeError("CUDA device is not available")
+            nvmlInit()
+            count = nvmlDeviceGetCount()
+            assert count >= 1, "No CUDA devices found"
+            # replicas only have access to GPU 0
+            handle = nvmlDeviceGetHandleByIndex(0)
+            mem_info = nvmlDeviceGetMemoryInfo(handle)
+        except Exception as e:
+            raise RuntimeError(f"CUDA health check failed: {e}")
+        reserved = torch.cuda.memory_reserved()  # bytes currently reserved by CUDA cache
+        threshold_bytes = self.cuda_memory_flush_threshold * mem_info.total
+        if reserved > threshold_bytes:
+            # flush only when cache exceeds the percentage threshold
+            torch.cuda.empty_cache()
     def __del__(self):
         # Clean up and free any remaining GPU memory

{ray_embedding-0.12.4.dist-info → ray_embedding-0.12.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ray-embedding
-Version: 0.12.4
+Version: 0.12.6
 Summary: Deploy SentenceTransformers embedding models to a ray cluster
 Author: Crispin Almodovar
 Author-email:

ray_embedding-0.12.6.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+ray_embedding/__init__.py,sha256=YS5LAZfRIwwVvE3C9g7hsauvjgIkqKtHyxkwMFFfAGY,46
+ray_embedding/deploy.py,sha256=VzFqfGTLA5cr-6e50GAhpdfw5KlRnJvdpXXQRSdaO6o,2835
+ray_embedding/dto.py,sha256=l0hxz_fdGjZtLMZS3BzQ1tLzAOiO_8NpX4i5Wdyuk6Q,1519
+ray_embedding/embedding_model.py,sha256=6iEaIg_mCpGEY-5F0uff2wTOMH1wI42u2N8DnaZE3mA,4670
+ray_embedding/model_router.py,sha256=BsOEz24ttvpDD4LZsDVg9rLhn26FxgUsDAvcjI0Feao,5917
+ray_embedding-0.12.6.dist-info/METADATA,sha256=DBrk-wJPVgwwYh06A2tmrgBM2ethb_65ololwManbbc,1094
+ray_embedding-0.12.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ray_embedding-0.12.6.dist-info/top_level.txt,sha256=ziCblpJq1YsrryshFqxTRuRMgNuO1_tgvAAkGShATNA,14
+ray_embedding-0.12.6.dist-info/RECORD,,

ray_embedding-0.12.4.dist-info/RECORD DELETED Viewed

@@ -1,9 +0,0 @@
-ray_embedding/__init__.py,sha256=YS5LAZfRIwwVvE3C9g7hsauvjgIkqKtHyxkwMFFfAGY,46
-ray_embedding/deploy.py,sha256=xE-NznVlYftTPIfN3aAqCF0DpFIvTuC4vMTNrGfjkxI,2627
-ray_embedding/dto.py,sha256=rzPEB-R7XDYlTqeaXGGgfOjTWyTeRinnJ6LbI1oOWGY,1463
-ray_embedding/embedding_model.py,sha256=j8jPhfVqS_x11oZrlXDbjo6z1NHXxOk8CnGIQemGfUQ,4040
-ray_embedding/model_router.py,sha256=BsOEz24ttvpDD4LZsDVg9rLhn26FxgUsDAvcjI0Feao,5917
-ray_embedding-0.12.4.dist-info/METADATA,sha256=e7i2fUgJhRYIvIrzLVlGYVr3FvPD1Eaoj7L08Slbxp4,1094
-ray_embedding-0.12.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ray_embedding-0.12.4.dist-info/top_level.txt,sha256=ziCblpJq1YsrryshFqxTRuRMgNuO1_tgvAAkGShATNA,14
-ray_embedding-0.12.4.dist-info/RECORD,,

{ray_embedding-0.12.4.dist-info → ray_embedding-0.12.6.dist-info}/WHEEL RENAMED Viewed

File without changes

{ray_embedding-0.12.4.dist-info → ray_embedding-0.12.6.dist-info}/top_level.txt RENAMED Viewed

File without changes

ray-embedding 0.12.4__py3-none-any.whl → 0.12.6__py3-none-any.whl

ray-embedding 0.12.4__py3-none-any.whl → 0.12.6__py3-none-any.whl