ray-embedding 0.12.4__py3-none-any.whl → 0.12.6__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- ray_embedding/deploy.py +3 -1
- ray_embedding/dto.py +2 -1
- ray_embedding/embedding_model.py +24 -11
- {ray_embedding-0.12.4.dist-info → ray_embedding-0.12.6.dist-info}/METADATA +1 -1
- ray_embedding-0.12.6.dist-info/RECORD +9 -0
- ray_embedding-0.12.4.dist-info/RECORD +0 -9
- {ray_embedding-0.12.4.dist-info → ray_embedding-0.12.6.dist-info}/WHEEL +0 -0
- {ray_embedding-0.12.4.dist-info → ray_embedding-0.12.6.dist-info}/top_level.txt +0 -0
ray_embedding/deploy.py
CHANGED
|
@@ -17,6 +17,7 @@ def build_model(model_config: ModelDeploymentConfig) -> DeployedModel:
|
|
|
17
17
|
matryoshka_dim = model_config.matryoshka_dim
|
|
18
18
|
trust_remote_code = model_config.trust_remote_code or False
|
|
19
19
|
model_kwargs = model_config.model_kwargs or {}
|
|
20
|
+
cuda_memory_flush_threshold = model_config.cuda_memory_flush_threshold or 0.8
|
|
20
21
|
|
|
21
22
|
if "torch_dtype" in model_kwargs:
|
|
22
23
|
torch_dtype = model_kwargs["torch_dtype"].strip()
|
|
@@ -35,7 +36,8 @@ def build_model(model_config: ModelDeploymentConfig) -> DeployedModel:
|
|
|
35
36
|
backend=backend,
|
|
36
37
|
matryoshka_dim=matryoshka_dim,
|
|
37
38
|
trust_remote_code=trust_remote_code,
|
|
38
|
-
model_kwargs=model_kwargs
|
|
39
|
+
model_kwargs=model_kwargs,
|
|
40
|
+
cuda_memory_flush_threshold=cuda_memory_flush_threshold
|
|
39
41
|
)
|
|
40
42
|
return DeployedModel(model=served_model_name,
|
|
41
43
|
deployment_handle=deployment,
|
ray_embedding/dto.py
CHANGED
|
@@ -25,7 +25,6 @@ class ModelRouterConfig(BaseModel):
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class ModelDeploymentConfig(BaseModel):
|
|
28
|
-
deployment: str
|
|
29
28
|
model: str
|
|
30
29
|
served_model_name: str
|
|
31
30
|
batch_size: Optional[int] = 8
|
|
@@ -35,6 +34,8 @@ class ModelDeploymentConfig(BaseModel):
|
|
|
35
34
|
matryoshka_dim: Optional[int] = 768
|
|
36
35
|
trust_remote_code: Optional[bool] = False
|
|
37
36
|
model_kwargs: Optional[Dict[str, Any]] = {}
|
|
37
|
+
cuda_memory_flush_threshold: Optional[float] = 0.8
|
|
38
|
+
deployment: str
|
|
38
39
|
|
|
39
40
|
|
|
40
41
|
class AppConfig(BaseModel):
|
ray_embedding/embedding_model.py
CHANGED
|
@@ -4,7 +4,7 @@ import time
|
|
|
4
4
|
from typing import Optional, Dict, Any, List, Union
|
|
5
5
|
|
|
6
6
|
import torch
|
|
7
|
-
from pynvml import nvmlInit, nvmlDeviceGetCount
|
|
7
|
+
from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
|
|
8
8
|
from ray import serve
|
|
9
9
|
from sentence_transformers import SentenceTransformer
|
|
10
10
|
|
|
@@ -14,7 +14,7 @@ class EmbeddingModel:
|
|
|
14
14
|
def __init__(self, model: str, served_model_name: Optional[str] = None,
|
|
15
15
|
device: Optional[str] = None, backend: Optional[str] = "torch",
|
|
16
16
|
matryoshka_dim: Optional[int] = None, trust_remote_code: Optional[bool] = False,
|
|
17
|
-
model_kwargs: Dict[str, Any] = None):
|
|
17
|
+
model_kwargs: Dict[str, Any] = None, cuda_memory_flush_threshold: Optional[float] = 0.8):
|
|
18
18
|
logging.basicConfig(level=logging.INFO)
|
|
19
19
|
self.logger = logging.getLogger(self.__class__.__name__)
|
|
20
20
|
self.model = model
|
|
@@ -29,6 +29,7 @@ class EmbeddingModel:
|
|
|
29
29
|
self.matryoshka_dim = matryoshka_dim
|
|
30
30
|
self.trust_remote_code = trust_remote_code or False
|
|
31
31
|
self.model_kwargs = model_kwargs or {}
|
|
32
|
+
self.cuda_memory_flush_threshold = cuda_memory_flush_threshold
|
|
32
33
|
self.logger.info(f"Initializing embedding model: {self.model}")
|
|
33
34
|
self.embedding_model = SentenceTransformer(self.model, device=self.init_device, backend=self.backend,
|
|
34
35
|
trust_remote_code=self.trust_remote_code,
|
|
@@ -57,10 +58,8 @@ class EmbeddingModel:
|
|
|
57
58
|
# Move all embeddings to CPU at once before conversion
|
|
58
59
|
embeddings_list = embeddings.cpu().tolist()
|
|
59
60
|
|
|
60
|
-
#
|
|
61
|
+
# don't wait for GC
|
|
61
62
|
del embeddings
|
|
62
|
-
if torch.cuda.is_available():
|
|
63
|
-
torch.cuda.empty_cache()
|
|
64
63
|
|
|
65
64
|
return embeddings_list
|
|
66
65
|
|
|
@@ -70,15 +69,29 @@ class EmbeddingModel:
|
|
|
70
69
|
self.check_health()
|
|
71
70
|
|
|
72
71
|
def check_health(self):
|
|
73
|
-
if self.init_device
|
|
72
|
+
if self.init_device != "cuda":
|
|
73
|
+
return
|
|
74
|
+
|
|
75
|
+
try:
|
|
74
76
|
# Even though CUDA was available at init time,
|
|
75
77
|
# CUDA can become unavailable - this is a known problem in AWS EC2+Docker
|
|
76
78
|
# https://github.com/ray-project/ray/issues/49594
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
79
|
+
nvmlInit()
|
|
80
|
+
count = nvmlDeviceGetCount()
|
|
81
|
+
assert count >= 1, "No CUDA devices found"
|
|
82
|
+
|
|
83
|
+
# replicas only have access to GPU 0
|
|
84
|
+
handle = nvmlDeviceGetHandleByIndex(0)
|
|
85
|
+
mem_info = nvmlDeviceGetMemoryInfo(handle)
|
|
86
|
+
except Exception as e:
|
|
87
|
+
raise RuntimeError(f"CUDA health check failed: {e}")
|
|
88
|
+
|
|
89
|
+
reserved = torch.cuda.memory_reserved() # bytes currently reserved by CUDA cache
|
|
90
|
+
threshold_bytes = self.cuda_memory_flush_threshold * mem_info.total
|
|
91
|
+
|
|
92
|
+
if reserved > threshold_bytes:
|
|
93
|
+
# flush only when cache exceeds the percentage threshold
|
|
94
|
+
torch.cuda.empty_cache()
|
|
82
95
|
|
|
83
96
|
def __del__(self):
|
|
84
97
|
# Clean up and free any remaining GPU memory
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
ray_embedding/__init__.py,sha256=YS5LAZfRIwwVvE3C9g7hsauvjgIkqKtHyxkwMFFfAGY,46
|
|
2
|
+
ray_embedding/deploy.py,sha256=VzFqfGTLA5cr-6e50GAhpdfw5KlRnJvdpXXQRSdaO6o,2835
|
|
3
|
+
ray_embedding/dto.py,sha256=l0hxz_fdGjZtLMZS3BzQ1tLzAOiO_8NpX4i5Wdyuk6Q,1519
|
|
4
|
+
ray_embedding/embedding_model.py,sha256=6iEaIg_mCpGEY-5F0uff2wTOMH1wI42u2N8DnaZE3mA,4670
|
|
5
|
+
ray_embedding/model_router.py,sha256=BsOEz24ttvpDD4LZsDVg9rLhn26FxgUsDAvcjI0Feao,5917
|
|
6
|
+
ray_embedding-0.12.6.dist-info/METADATA,sha256=DBrk-wJPVgwwYh06A2tmrgBM2ethb_65ololwManbbc,1094
|
|
7
|
+
ray_embedding-0.12.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
+
ray_embedding-0.12.6.dist-info/top_level.txt,sha256=ziCblpJq1YsrryshFqxTRuRMgNuO1_tgvAAkGShATNA,14
|
|
9
|
+
ray_embedding-0.12.6.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
ray_embedding/__init__.py,sha256=YS5LAZfRIwwVvE3C9g7hsauvjgIkqKtHyxkwMFFfAGY,46
|
|
2
|
-
ray_embedding/deploy.py,sha256=xE-NznVlYftTPIfN3aAqCF0DpFIvTuC4vMTNrGfjkxI,2627
|
|
3
|
-
ray_embedding/dto.py,sha256=rzPEB-R7XDYlTqeaXGGgfOjTWyTeRinnJ6LbI1oOWGY,1463
|
|
4
|
-
ray_embedding/embedding_model.py,sha256=j8jPhfVqS_x11oZrlXDbjo6z1NHXxOk8CnGIQemGfUQ,4040
|
|
5
|
-
ray_embedding/model_router.py,sha256=BsOEz24ttvpDD4LZsDVg9rLhn26FxgUsDAvcjI0Feao,5917
|
|
6
|
-
ray_embedding-0.12.4.dist-info/METADATA,sha256=e7i2fUgJhRYIvIrzLVlGYVr3FvPD1Eaoj7L08Slbxp4,1094
|
|
7
|
-
ray_embedding-0.12.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
-
ray_embedding-0.12.4.dist-info/top_level.txt,sha256=ziCblpJq1YsrryshFqxTRuRMgNuO1_tgvAAkGShATNA,14
|
|
9
|
-
ray_embedding-0.12.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|