ray-embedding 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ray-embedding might be problematic. Click here for more details.

ray_embedding/deploy.py CHANGED
@@ -5,6 +5,10 @@ import torch
5
5
 
6
6
 
7
7
  def deploy_model(args: Dict[str, Any]) -> Application:
8
+ """Builds and deploys a SentenceTransformer embedding model.
9
+ :arg args: arguments for initializing a SentenceTransformer model
10
+ :returns: a Ray Serve Application
11
+ """
8
12
  assert args
9
13
  deployment_name: str = args.pop("deployment", "")
10
14
  assert deployment_name
@@ -17,13 +21,15 @@ def deploy_model(args: Dict[str, Any]) -> Application:
17
21
  trust_remote_code: Optional[bool] = args.pop("trust_remote_code", False)
18
22
  model_kwargs: Dict[str, Any] = args.pop("model_kwargs", {})
19
23
  if "torch_dtype" in model_kwargs:
20
- model_kwargs["torch_dtype"] = model_kwargs["torch_dtype"].strip()
21
- if model_kwargs["torch_dtype"] == "float16":
24
+ torch_dtype = model_kwargs["torch_dtype"].strip()
25
+ if torch_dtype == "float16":
22
26
  model_kwargs["torch_dtype"] = torch.float16
23
- elif model_kwargs["torch_dtype"] == "bfloat16":
27
+ elif torch_dtype == "bfloat16":
24
28
  model_kwargs["torch_dtype"] = torch.bfloat16
29
+ elif torch_dtype == "float32":
30
+ model_kwargs["torch_dtype"] = torch.float32
25
31
  else:
26
- del model_kwargs["torch_dtype"]
32
+ del model_kwargs["torch_dtype"] # Remove the unrecognized torch_dtype value
27
33
 
28
34
  deployment = EmbeddingModel.options(name=deployment_name).bind(model=model,
29
35
  backend=backend,
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  import os.path
3
3
  import time
4
- from typing import Optional, Dict, Any
4
+ from typing import Optional, Dict, Any, List
5
5
 
6
6
  import torch
7
7
  from fastapi import FastAPI, HTTPException
@@ -14,12 +14,22 @@ web_api = FastAPI(title=f"Ray Embeddings - OpenAI-compatible API")
14
14
 
15
15
 
16
16
  @serve.deployment(
17
- ray_actor_options={"num_gpus": 1},
17
+ num_replicas="auto",
18
+ ray_actor_options={
19
+ "num_cpus": 1,
20
+ "num_gpus": 0,
21
+ "max_restarts": -1,
22
+ "max_task_retries": -1
23
+ },
18
24
  autoscaling_config={
19
25
  "target_ongoing_requests": 2,
20
26
  "min_replicas": 0,
21
27
  "initial_replicas": 1,
22
28
  "max_replicas": 1,
29
+ },
30
+ user_config={
31
+ "max_batch_size": 8,
32
+ "batch_wait_timeout_s": 0.25,
23
33
  }
24
34
  )
25
35
  @serve.ingress(web_api)
@@ -46,7 +56,13 @@ class EmbeddingModel:
46
56
  "owned_by": "openai",
47
57
  "permission": []}
48
58
  ]
49
- self.logger.info(f"Successfully initialized embedding model {self.model}")
59
+ self.logger.info(f"Successfully initialized embedding model {self.model} using device {self.torch_device}")
60
+
61
+ def reconfigure(self, user_config: Dict):
62
+ assert "max_batch_size" in user_config and "batch_wait_timeout_s" in user_config, "Invalid user config"
63
+ self.logger.info(f"Reconfiguring dynamic batching parameters: {user_config}")
64
+ self.create_embeddings_batch.set_max_batch_size(user_config["max_batch_size"])
65
+ self.create_embeddings_batch.set_batch_wait_timeout_s(user_config["batch_wait_timeout_s"])
50
66
 
51
67
  @web_api.post("/v1/embeddings", response_model=EmbeddingResponse)
52
68
  async def create_embeddings(self, request: EmbeddingRequest):
@@ -55,34 +71,46 @@ class EmbeddingModel:
55
71
  assert request.model == self.served_model_name, (
56
72
  f"Model '{request.model}' is not supported. Use '{self.served_model_name}' instead."
57
73
  )
74
+ return await self.create_embeddings_batch(request)
75
+ except Exception as e:
76
+ raise HTTPException(status_code=500, detail=str(e))
58
77
 
78
+ @serve.batch(max_batch_size=8, batch_wait_timeout_s=0.25)
79
+ async def create_embeddings_batch(self, requests: List[EmbeddingRequest]) -> List[EmbeddingResponse]:
80
+ # Batch the text inputs
81
+ inputs = [], truncate_dims = []
82
+ for request in requests:
59
83
  if isinstance(request.input, str):
60
84
  request.input = [request.input]
61
-
62
- truncate_dim = request.dimensions or self.matryoshka_dim
63
-
64
- # Compute embeddings and convert to a PyTorch tensor on the GPU
65
- embeddings = self.embedding_model.encode(
66
- request.input, convert_to_tensor=True, normalize_embeddings=True
67
- ).to(self.torch_device)
85
+ inputs.extend(request.input)
86
+ truncate_dims.append(request.dimensions or self.matryoshka_dim)
87
+
88
+ # Compute embeddings for the batch of text inputs
89
+ embeddings = self.embedding_model.encode(
90
+ inputs, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False,
91
+ ).to(self.torch_device)
92
+
93
+ # Truncate the embeddings; note that the truncate_dim can be different for each request
94
+ # so we need to do this step one by one
95
+ results = []
96
+ ix = 0
97
+ for truncate_dim, request in zip(truncate_dims, requests):
98
+ num_inputs = len(request.input)
99
+ batch_embeddings = embeddings[ix: ix + num_inputs]
100
+ ix += num_inputs
68
101
 
69
102
  if truncate_dim is not None:
70
- # Truncate and re-normalize the embeddings
71
- embeddings = embeddings[:, :truncate_dim]
72
- embeddings = embeddings / torch.norm(embeddings, dim=1, keepdim=True)
73
-
74
- # Move all embeddings to CPU at once before conversion
75
- embeddings = embeddings.cpu().tolist()
103
+ batch_embeddings = batch_embeddings[:, :truncate_dim]
104
+ batch_embeddings = batch_embeddings / torch.norm(batch_embeddings, dim=1, keepdim=True)
76
105
 
77
- # Convert embeddings to list format for response
106
+ batch_embeddings = batch_embeddings.cpu().tolist()
78
107
  response_data = [
79
- {"index": idx, "embedding": emb}
80
- for idx, emb in enumerate(embeddings)
108
+ {"index": emb_ix, "embedding": emb}
109
+ for emb_ix, emb in enumerate(batch_embeddings)
81
110
  ]
82
- return EmbeddingResponse(object="list", data=response_data, model=request.model)
111
+ results.append(EmbeddingResponse(object="list", data=response_data, model=request.model))
83
112
 
84
- except Exception as e:
85
- raise HTTPException(status_code=500, detail=str(e))
113
+ return results
86
114
 
87
115
  @web_api.get("/v1/models")
88
116
  async def list_models(self):
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ray-embedding
3
- Version: 0.9.0
4
- Summary: Deploy SentenceTransformers models to a ray cluster
3
+ Version: 0.9.1
4
+ Summary: Deploy SentenceTransformers embedding models to a ray cluster
5
5
  Author: Crispin Almodovar
6
6
  Author-email: crispin.almodovar@docorto.ai
7
7
  Classifier: Programming Language :: Python :: 3
@@ -16,9 +16,15 @@ A tool for deploying SentenceTransformers models to a ray cluster.
16
16
 
17
17
  ### Supports the following backends
18
18
 
19
- - sbert-pytorch-gpu
20
- - sbert-pytorch-cpu
21
- - sbert-onnx-gpu
22
- - sbert-onnx-cpu
23
- - sbert-openvino-cpu
19
+ - pytorch-gpu
20
+ - pytorch-cpu
21
+
22
+ ### Planned:
23
+ - onnx-gpu
24
+ - onnx-cpu
25
+ - openvino-cpu
24
26
  - fastembed-onnx-cpu
27
+
28
+ - spot instances
29
+ - grpc
30
+
@@ -0,0 +1,8 @@
1
+ ray_embedding/__init__.py,sha256=OYJT0rVaaGzY613JqgfktsCgroDnBkGOHxR2FE9UtRU,49
2
+ ray_embedding/deploy.py,sha256=YD_udSm13QbFPgSAkCrTQso15DmtIn0QEhErOFNg7jM,1841
3
+ ray_embedding/dto.py,sha256=e91ejZbM_NB9WTjF1YnfuV71cajYIh0vOX8oV_g2OwM,595
4
+ ray_embedding/embedding_model.py,sha256=QvJRwQYgqMs99F8Nbo3aC0N5oMsMEtNB6W2C4-zSfPs,5131
5
+ ray_embedding-0.9.1.dist-info/METADATA,sha256=w1Khp-pKHk6AwTzljTU_Z-JQG3kqImS6ICEMzHQHF1c,657
6
+ ray_embedding-0.9.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
7
+ ray_embedding-0.9.1.dist-info/top_level.txt,sha256=ziCblpJq1YsrryshFqxTRuRMgNuO1_tgvAAkGShATNA,14
8
+ ray_embedding-0.9.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (77.0.3)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,8 +0,0 @@
1
- ray_embedding/__init__.py,sha256=OYJT0rVaaGzY613JqgfktsCgroDnBkGOHxR2FE9UtRU,49
2
- ray_embedding/deploy.py,sha256=E79J0bcVNXAFlFMVrjZTaSXLnrMZ6LvtRdD4d1DKu1w,1598
3
- ray_embedding/dto.py,sha256=e91ejZbM_NB9WTjF1YnfuV71cajYIh0vOX8oV_g2OwM,595
4
- ray_embedding/embedding_model.py,sha256=1sx5jXo61UgLLL8BtFBPflsbBay6J3yG1u2UMIPgAtk,3768
5
- ray_embedding-0.9.0.dist-info/METADATA,sha256=zfS00PtWu6mTcHloTDJDqEQ9bwbLJWUSgiv21degWNc,636
6
- ray_embedding-0.9.0.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
7
- ray_embedding-0.9.0.dist-info/top_level.txt,sha256=ziCblpJq1YsrryshFqxTRuRMgNuO1_tgvAAkGShATNA,14
8
- ray_embedding-0.9.0.dist-info/RECORD,,