ray-embedding 0.12.3__tar.gz → 0.12.4__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ray-embedding might be problematic. Click here for more details.
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/PKG-INFO +1 -1
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/ray_embedding/embedding_model.py +16 -0
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/ray_embedding/model_router.py +2 -2
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/ray_embedding.egg-info/PKG-INFO +1 -1
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/setup.cfg +1 -1
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/README.md +0 -0
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/pyproject.toml +0 -0
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/ray_embedding/__init__.py +0 -0
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/ray_embedding/deploy.py +0 -0
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/ray_embedding/dto.py +0 -0
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/ray_embedding.egg-info/SOURCES.txt +0 -0
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/ray_embedding.egg-info/dependency_links.txt +0 -0
- {ray_embedding-0.12.3 → ray_embedding-0.12.4}/ray_embedding.egg-info/top_level.txt +0 -0
|
@@ -56,6 +56,12 @@ class EmbeddingModel:
|
|
|
56
56
|
|
|
57
57
|
# Move all embeddings to CPU at once before conversion
|
|
58
58
|
embeddings_list = embeddings.cpu().tolist()
|
|
59
|
+
|
|
60
|
+
# free GPU memory now, don't wait for GC
|
|
61
|
+
del embeddings
|
|
62
|
+
if torch.cuda.is_available():
|
|
63
|
+
torch.cuda.empty_cache()
|
|
64
|
+
|
|
59
65
|
return embeddings_list
|
|
60
66
|
|
|
61
67
|
def wait_for_cuda(self, wait: int = 10):
|
|
@@ -73,3 +79,13 @@ class EmbeddingModel:
|
|
|
73
79
|
assert nvmlDeviceGetCount() >= 1
|
|
74
80
|
except:
|
|
75
81
|
raise RuntimeError("CUDA device is not available")
|
|
82
|
+
|
|
83
|
+
def __del__(self):
|
|
84
|
+
# Clean up and free any remaining GPU memory
|
|
85
|
+
try:
|
|
86
|
+
if hasattr(self, 'embedding_model'):
|
|
87
|
+
del self.embedding_model
|
|
88
|
+
if torch.cuda.is_available():
|
|
89
|
+
torch.cuda.empty_cache()
|
|
90
|
+
except Exception as e:
|
|
91
|
+
self.logger.warning(f"Error during cleanup: {e}")
|
|
@@ -46,7 +46,7 @@ class ModelRouter:
|
|
|
46
46
|
f"to {len(batches)} mini-batches, each with max length {batch_size}.")
|
|
47
47
|
|
|
48
48
|
# Call embedding model replicas in parallel (rate-limited)
|
|
49
|
-
tasks = [self.
|
|
49
|
+
tasks = [self._compute_embeddings_rate_limited(model_handle, batch, dimensions) for batch in batches]
|
|
50
50
|
all_results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
51
51
|
|
|
52
52
|
# Retry any failed model calls
|
|
@@ -64,7 +64,7 @@ class ModelRouter:
|
|
|
64
64
|
self.logger.info(f"Successfully computed embeddings from {len(batches)} mini-batches")
|
|
65
65
|
return [emb for result in all_results for emb in result]
|
|
66
66
|
|
|
67
|
-
async def
|
|
67
|
+
async def _compute_embeddings_rate_limited(self, model_handle: DeploymentHandle, batch: List[str], dimensions: int):
|
|
68
68
|
async with self.rate_limiter:
|
|
69
69
|
return await model_handle.remote(batch, dimensions)
|
|
70
70
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|