ray-embedding 0.12.2.tar.gz → 0.12.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ray-embedding
-Version: 0.12.2
+Version: 0.12.4
 Summary: Deploy SentenceTransformers embedding models to a ray cluster
 Author: Crispin Almodovar
 Author-email:
@@ -56,6 +56,12 @@ class EmbeddingModel:
 
         # Move all embeddings to CPU at once before conversion
         embeddings_list = embeddings.cpu().tolist()
+
+        # free GPU memory now, don't wait for GC
+        del embeddings
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
         return embeddings_list
 
     def wait_for_cuda(self, wait: int = 10):
@@ -73,3 +79,13 @@ class EmbeddingModel:
             assert nvmlDeviceGetCount() >= 1
         except:
             raise RuntimeError("CUDA device is not available")
+
+    def __del__(self):
+        # Clean up and free any remaining GPU memory
+        try:
+            if hasattr(self, 'embedding_model'):
+                del self.embedding_model
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except Exception as e:
+            self.logger.warning(f"Error during cleanup: {e}")
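
Both hunks above exist to release GPU memory eagerly rather than waiting for Python's garbage collector: the first drops the tensor reference as soon as the embeddings have been copied to the CPU, and the new __del__ does the same for the model when the replica is torn down. Below is a minimal standalone sketch of the pattern, assuming a SentenceTransformers-style model; the function name encode_to_list is illustrative, not part of the package.

import torch
from sentence_transformers import SentenceTransformer

def encode_to_list(model: SentenceTransformer, texts: list) -> list:
    # Encode on whatever device the model lives on, then copy results to CPU
    embeddings = model.encode(texts, convert_to_tensor=True)
    embeddings_list = embeddings.cpu().tolist()
    # Drop the last reference so the GPU allocation becomes freeable immediately
    del embeddings
    if torch.cuda.is_available():
        # Hand cached, now-unused blocks back to the driver; without this,
        # PyTorch's caching allocator keeps them reserved for later reuse
        torch.cuda.empty_cache()
    return embeddings_list

Note that torch.cuda.empty_cache() only returns memory the caching allocator is no longer using; tensors still referenced elsewhere stay allocated. Relying on __del__ is also best-effort, since Python does not guarantee when, or whether, it runs.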
@@ -46,7 +46,7 @@ class ModelRouter:
                          f"to {len(batches)} mini-batches, each with max length {batch_size}.")
 
         # Call embedding model replicas in parallel (rate-limited)
-        tasks = [self._rate_limited_embedding_call(model_handle, batch, dimensions) for batch in batches]
+        tasks = [self._compute_embeddings_rate_limited(model_handle, batch, dimensions) for batch in batches]
         all_results = await asyncio.gather(*tasks, return_exceptions=True)
 
         # Retry any failed model calls
@@ -64,7 +64,7 @@ class ModelRouter:
         self.logger.info(f"Successfully computed embeddings from {len(batches)} mini-batches")
         return [emb for result in all_results for emb in result]
 
-    async def _rate_limited_embedding_call(self, model_handle: DeploymentHandle, batch: List[str], dimensions: int):
+    async def _compute_embeddings_rate_limited(self, model_handle: DeploymentHandle, batch: List[str], dimensions: int):
         async with self.rate_limiter:
             return await model_handle.remote(batch, dimensions)
 
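
The rename in these two hunks makes the helper's purpose explicit: every replica call is funneled through self.rate_limiter. The diff does not show how rate_limiter is constructed; the sketch below assumes a plain asyncio.Semaphore, which is consistent with the async with usage above.

import asyncio

rate_limiter = asyncio.Semaphore(8)  # illustrative cap on in-flight replica calls

async def compute_embeddings_rate_limited(model_handle, batch, dimensions):
    # At most 8 calls run concurrently; the rest queue on the semaphore
    async with rate_limiter:
        return await model_handle.remote(batch, dimensions)

async def fan_out(model_handle, batches, dimensions):
    tasks = [compute_embeddings_rate_limited(model_handle, b, dimensions)
             for b in batches]
    # return_exceptions=True keeps one failed mini-batch from cancelling the rest
    return await asyncio.gather(*tasks, return_exceptions=True)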
@@ -88,10 +88,10 @@ class ModelRouter:
 
     @web_api.post("/{path_prefix}/v1/embeddings", response_model=EmbeddingResponse)
     async def compute_embeddings(self, path_prefix: str, request: EmbeddingRequest):
-        assert path_prefix in self.path_prefix, f"Invalid path prefix: {path_prefix}"
-        assert request.model in self.deployed_models, f"Invalid model: {request.model}"
-
         try:
+            assert path_prefix in self.path_prefix, f"The API path prefix specified is invalid: '{path_prefix}'"
+            assert request.model in self.deployed_models, f"The model specified is invalid: {request.model}"
+
             inputs = request.input if isinstance(request.input, list) else [request.input]
             self.logger.info(f"Computing embeddings for a batch of {len(inputs)} texts using model: {request.model}")
             embeddings = await self._compute_embeddings_from_resized_batches(request.model, inputs, request.dimensions)
@@ -101,11 +101,13 @@ class ModelRouter:
             ]
             return EmbeddingResponse(object="list", data=response_data, model=request.model)
         except Exception as e:
+            status_code = 400 if isinstance(e, AssertionError) else 500
             self.logger.error(f"Failed to create embeddings: {e}")
-            raise HTTPException(status_code=500, detail=str(e))
+            raise HTTPException(status_code=status_code, detail=str(e))
 
     @web_api.get("/{path_prefix}/v1/models")
     async def list_models(self, path_prefix: str):
         """Returns the list of available models in OpenAI-compatible format."""
-        assert path_prefix in self.path_prefix, f"Invalid path prefix: {path_prefix}"
+        if path_prefix not in self.path_prefix:
+            raise HTTPException(status_code=400, detail=f"The API path prefix specified is invalid: '{path_prefix}'")
         return {"object": "list", "data": self.available_models}
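
Taken together, moving the asserts inside the try block and mapping AssertionError to 400 changes what clients observe: an unknown path prefix or model is now reported as a client error instead of a generic 500. Below is a hypothetical call against the OpenAI-compatible endpoint; the host, port, prefix, and model name are illustrative deployment values, not defaults shipped with the package.

import requests

resp = requests.post(
    "http://localhost:8000/embeddings/v1/embeddings",  # /{path_prefix}/v1/embeddings
    json={"model": "all-MiniLM-L6-v2", "input": ["hello world"]},
)
# Validation failures (bad prefix or unknown model) now return 400 with the
# assertion message as the detail; unexpected server errors still return 500
print(resp.status_code, resp.json())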
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ray-embedding
-Version: 0.12.2
+Version: 0.12.4
 Summary: Deploy SentenceTransformers embedding models to a ray cluster
 Author: Crispin Almodovar
 Author-email:
@@ -1,6 +1,6 @@
 [metadata]
 name = ray-embedding
-version = 0.12.2
+version = 0.12.4
 author = Crispin Almodovar
 author_email =
 description = Deploy SentenceTransformers embedding models to a ray cluster