ray-embedding 0.9.0__tar.gz → 0.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ray-embedding might be problematic. Click here for more details.
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/PKG-INFO +13 -7
- ray_embedding-0.9.1/README.md +18 -0
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/ray_embedding/deploy.py +10 -4
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/ray_embedding/embedding_model.py +50 -22
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/ray_embedding.egg-info/PKG-INFO +13 -7
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/setup.cfg +2 -2
- ray_embedding-0.9.0/README.md +0 -12
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/pyproject.toml +0 -0
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/ray_embedding/__init__.py +0 -0
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/ray_embedding/dto.py +0 -0
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/ray_embedding.egg-info/SOURCES.txt +0 -0
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/ray_embedding.egg-info/dependency_links.txt +0 -0
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/ray_embedding.egg-info/top_level.txt +0 -0
- {ray_embedding-0.9.0 → ray_embedding-0.9.1}/test/test.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ray-embedding
|
|
3
|
-
Version: 0.9.0
|
|
4
|
-
Summary: Deploy SentenceTransformers models to a ray cluster
|
|
3
|
+
Version: 0.9.1
|
|
4
|
+
Summary: Deploy SentenceTransformers embedding models to a ray cluster
|
|
5
5
|
Author: Crispin Almodovar
|
|
6
6
|
Author-email: crispin.almodovar@docorto.ai
|
|
7
7
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -16,9 +16,15 @@ A tool for deploying SentenceTransformers models to a ray cluster.
|
|
|
16
16
|
|
|
17
17
|
### Supports the following backends
|
|
18
18
|
|
|
19
|
-
- sbert-pytorch-gpu
|
|
20
|
-
- sbert-pytorch-cpu
|
|
21
|
-
- sbert-onnx-gpu
|
|
22
|
-
- sbert-onnx-cpu
|
|
23
|
-
- sbert-openvino-cpu
|
|
19
|
+
- pytorch-gpu
|
|
20
|
+
- pytorch-cpu
|
|
21
|
+
|
|
22
|
+
### Planned:
|
|
23
|
+
- onnx-gpu
|
|
24
|
+
- onnx-cpu
|
|
25
|
+
- openvino-cpu
|
|
24
26
|
- fastembed-onnx-cpu
|
|
27
|
+
|
|
28
|
+
- spot instances
|
|
29
|
+
- grpc
|
|
30
|
+
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# ray-embedding
|
|
2
|
+
|
|
3
|
+
A tool for deploying SentenceTransformers models to a ray cluster.
|
|
4
|
+
|
|
5
|
+
### Supports the following backends
|
|
6
|
+
|
|
7
|
+
- pytorch-gpu
|
|
8
|
+
- pytorch-cpu
|
|
9
|
+
|
|
10
|
+
### Planned:
|
|
11
|
+
- onnx-gpu
|
|
12
|
+
- onnx-cpu
|
|
13
|
+
- openvino-cpu
|
|
14
|
+
- fastembed-onnx-cpu
|
|
15
|
+
|
|
16
|
+
- spot instances
|
|
17
|
+
- grpc
|
|
18
|
+
|
|
@@ -5,6 +5,10 @@ import torch
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
def deploy_model(args: Dict[str, Any]) -> Application:
|
|
8
|
+
"""Builds and deploys a SentenceTransformer embedding model.
|
|
9
|
+
:arg args: arguments for initializing a SentenceTransformer model
|
|
10
|
+
:returns: a Ray Serve Application
|
|
11
|
+
"""
|
|
8
12
|
assert args
|
|
9
13
|
deployment_name: str = args.pop("deployment", "")
|
|
10
14
|
assert deployment_name
|
|
@@ -17,13 +21,15 @@ def deploy_model(args: Dict[str, Any]) -> Application:
|
|
|
17
21
|
trust_remote_code: Optional[bool] = args.pop("trust_remote_code", False)
|
|
18
22
|
model_kwargs: Dict[str, Any] = args.pop("model_kwargs", {})
|
|
19
23
|
if "torch_dtype" in model_kwargs:
|
|
20
|
-
|
|
21
|
-
if
|
|
24
|
+
torch_dtype = model_kwargs["torch_dtype"].strip()
|
|
25
|
+
if torch_dtype == "float16":
|
|
22
26
|
model_kwargs["torch_dtype"] = torch.float16
|
|
23
|
-
elif
|
|
27
|
+
elif torch_dtype == "bfloat16":
|
|
24
28
|
model_kwargs["torch_dtype"] = torch.bfloat16
|
|
29
|
+
elif torch_dtype == "float32":
|
|
30
|
+
model_kwargs["torch_dtype"] = torch.float32
|
|
25
31
|
else:
|
|
26
|
-
del model_kwargs["torch_dtype"]
|
|
32
|
+
del model_kwargs["torch_dtype"] # Remove
|
|
27
33
|
|
|
28
34
|
deployment = EmbeddingModel.options(name=deployment_name).bind(model=model,
|
|
29
35
|
backend=backend,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import os.path
|
|
3
3
|
import time
|
|
4
|
-
from typing import Optional, Dict, Any
|
|
4
|
+
from typing import Optional, Dict, Any, List
|
|
5
5
|
|
|
6
6
|
import torch
|
|
7
7
|
from fastapi import FastAPI, HTTPException
|
|
@@ -14,12 +14,22 @@ web_api = FastAPI(title=f"Ray Embeddings - OpenAI-compatible API")
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
@serve.deployment(
|
|
17
|
-
|
|
17
|
+
num_replicas="auto",
|
|
18
|
+
ray_actor_options={
|
|
19
|
+
"num_cpus": 1,
|
|
20
|
+
"num_gpus": 0,
|
|
21
|
+
"max_restarts": -1,
|
|
22
|
+
"max_task_retries": -1
|
|
23
|
+
},
|
|
18
24
|
autoscaling_config={
|
|
19
25
|
"target_ongoing_requests": 2,
|
|
20
26
|
"min_replicas": 0,
|
|
21
27
|
"initial_replicas": 1,
|
|
22
28
|
"max_replicas": 1,
|
|
29
|
+
},
|
|
30
|
+
user_config={
|
|
31
|
+
"max_batch_size": 8,
|
|
32
|
+
"batch_wait_timeout_s": 0.25,
|
|
23
33
|
}
|
|
24
34
|
)
|
|
25
35
|
@serve.ingress(web_api)
|
|
@@ -46,7 +56,13 @@ class EmbeddingModel:
|
|
|
46
56
|
"owned_by": "openai",
|
|
47
57
|
"permission": []}
|
|
48
58
|
]
|
|
49
|
-
self.logger.info(f"Successfully initialized embedding model {self.model}")
|
|
59
|
+
self.logger.info(f"Successfully initialized embedding model {self.model} using device {self.torch_device}")
|
|
60
|
+
|
|
61
|
+
def reconfigure(self, user_config: Dict):
|
|
62
|
+
assert "max_batch_size" in user_config and "batch_wait_timeout_s" in user_config, "Invalid user config"
|
|
63
|
+
self.logger.info(f"Reconfiguring dynamic batching parameters: {user_config}")
|
|
64
|
+
self.create_embeddings_batch.set_max_batch_size(user_config["max_batch_size"])
|
|
65
|
+
self.create_embeddings_batch.set_batch_wait_timeout_s(user_config["batch_wait_timeout_s"])
|
|
50
66
|
|
|
51
67
|
@web_api.post("/v1/embeddings", response_model=EmbeddingResponse)
|
|
52
68
|
async def create_embeddings(self, request: EmbeddingRequest):
|
|
@@ -55,34 +71,46 @@ class EmbeddingModel:
|
|
|
55
71
|
assert request.model == self.served_model_name, (
|
|
56
72
|
f"Model '{request.model}' is not supported. Use '{self.served_model_name}' instead."
|
|
57
73
|
)
|
|
74
|
+
return await self.create_embeddings_batch(request)
|
|
75
|
+
except Exception as e:
|
|
76
|
+
raise HTTPException(status_code=500, detail=str(e))
|
|
58
77
|
|
|
78
|
+
@serve.batch(max_batch_size=8, batch_wait_timeout_s=0.25)
|
|
79
|
+
async def create_embeddings_batch(self, requests: List[EmbeddingRequest]) -> List[EmbeddingResponse]:
|
|
80
|
+
# Batch the text inputs
|
|
81
|
+
inputs = [], truncate_dims = []
|
|
82
|
+
for request in requests:
|
|
59
83
|
if isinstance(request.input, str):
|
|
60
84
|
request.input = [request.input]
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
85
|
+
inputs.extend(request.input)
|
|
86
|
+
truncate_dims.append(request.dimensions or self.matryoshka_dim)
|
|
87
|
+
|
|
88
|
+
# Compute embeddings for the batch of text inputs
|
|
89
|
+
embeddings = self.embedding_model.encode(
|
|
90
|
+
inputs, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False,
|
|
91
|
+
).to(self.torch_device)
|
|
92
|
+
|
|
93
|
+
# Truncate the embeddings; note that the truncate_dim can be different for each request
|
|
94
|
+
# so we need to this step one by one
|
|
95
|
+
results = []
|
|
96
|
+
ix = 0
|
|
97
|
+
for truncate_dim, request in zip(truncate_dims, requests):
|
|
98
|
+
num_inputs = len(request.input)
|
|
99
|
+
batch_embeddings = embeddings[ix: ix + num_inputs]
|
|
100
|
+
ix += num_inputs
|
|
68
101
|
|
|
69
102
|
if truncate_dim is not None:
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
embeddings = embeddings / torch.norm(embeddings, dim=1, keepdim=True)
|
|
73
|
-
|
|
74
|
-
# Move all embeddings to CPU at once before conversion
|
|
75
|
-
embeddings = embeddings.cpu().tolist()
|
|
103
|
+
batch_embeddings = batch_embeddings[:, :truncate_dim]
|
|
104
|
+
batch_embeddings = batch_embeddings / torch.norm(batch_embeddings, dim=1, keepdim=True)
|
|
76
105
|
|
|
77
|
-
|
|
106
|
+
batch_embeddings = batch_embeddings.cpu().tolist()
|
|
78
107
|
response_data = [
|
|
79
|
-
{"index":
|
|
80
|
-
for
|
|
108
|
+
{"index": emb_ix, "embedding": emb}
|
|
109
|
+
for emb_ix, emb in enumerate(batch_embeddings)
|
|
81
110
|
]
|
|
82
|
-
|
|
111
|
+
results.append(EmbeddingResponse(object="list", data=response_data, model=request.model))
|
|
83
112
|
|
|
84
|
-
|
|
85
|
-
raise HTTPException(status_code=500, detail=str(e))
|
|
113
|
+
return results
|
|
86
114
|
|
|
87
115
|
@web_api.get("/v1/models")
|
|
88
116
|
async def list_models(self):
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ray-embedding
|
|
3
|
-
Version: 0.9.0
|
|
4
|
-
Summary: Deploy SentenceTransformers models to a ray cluster
|
|
3
|
+
Version: 0.9.1
|
|
4
|
+
Summary: Deploy SentenceTransformers embedding models to a ray cluster
|
|
5
5
|
Author: Crispin Almodovar
|
|
6
6
|
Author-email: crispin.almodovar@docorto.ai
|
|
7
7
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -16,9 +16,15 @@ A tool for deploying SentenceTransformers models to a ray cluster.
|
|
|
16
16
|
|
|
17
17
|
### Supports the following backends
|
|
18
18
|
|
|
19
|
-
- sbert-pytorch-gpu
|
|
20
|
-
- sbert-pytorch-cpu
|
|
21
|
-
- sbert-onnx-gpu
|
|
22
|
-
- sbert-onnx-cpu
|
|
23
|
-
- sbert-openvino-cpu
|
|
19
|
+
- pytorch-gpu
|
|
20
|
+
- pytorch-cpu
|
|
21
|
+
|
|
22
|
+
### Planned:
|
|
23
|
+
- onnx-gpu
|
|
24
|
+
- onnx-cpu
|
|
25
|
+
- openvino-cpu
|
|
24
26
|
- fastembed-onnx-cpu
|
|
27
|
+
|
|
28
|
+
- spot instances
|
|
29
|
+
- grpc
|
|
30
|
+
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
[metadata]
|
|
2
2
|
name = ray-embedding
|
|
3
|
-
version = 0.9.0
|
|
3
|
+
version = 0.9.1
|
|
4
4
|
author = Crispin Almodovar
|
|
5
5
|
author_email = crispin.almodovar@docorto.ai
|
|
6
|
-
description = Deploy SentenceTransformers models to a ray cluster
|
|
6
|
+
description = Deploy SentenceTransformers embedding models to a ray cluster
|
|
7
7
|
long_description = file: README.md
|
|
8
8
|
long_description_content_type = text/markdown
|
|
9
9
|
classifiers =
|
ray_embedding-0.9.0/README.md
DELETED
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
# ray-embedding
|
|
2
|
-
|
|
3
|
-
A tool for deploying SentenceTransformers models to a ray cluster.
|
|
4
|
-
|
|
5
|
-
### Supports the following backends
|
|
6
|
-
|
|
7
|
-
- sbert-pytorch-gpu
|
|
8
|
-
- sbert-pytorch-cpu
|
|
9
|
-
- sbert-onnx-gpu
|
|
10
|
-
- sbert-onnx-cpu
|
|
11
|
-
- sbert-openvino-cpu
|
|
12
|
-
- fastembed-onnx-cpu
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|