xinference 1.6.1__py3-none-any.whl → 1.7.0.post1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (76)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +79 -2
  3. xinference/client/restful/restful_client.py +64 -2
  4. xinference/core/media_interface.py +123 -0
  5. xinference/core/model.py +31 -0
  6. xinference/core/supervisor.py +8 -17
  7. xinference/core/worker.py +5 -17
  8. xinference/deploy/cmdline.py +6 -2
  9. xinference/model/audio/chattts.py +24 -39
  10. xinference/model/audio/cosyvoice.py +18 -30
  11. xinference/model/audio/funasr.py +42 -0
  12. xinference/model/audio/model_spec.json +18 -0
  13. xinference/model/audio/model_spec_modelscope.json +19 -1
  14. xinference/model/audio/utils.py +75 -0
  15. xinference/model/core.py +1 -0
  16. xinference/model/embedding/__init__.py +74 -18
  17. xinference/model/embedding/core.py +98 -597
  18. xinference/model/embedding/embed_family.py +133 -0
  19. xinference/model/embedding/flag/__init__.py +13 -0
  20. xinference/model/embedding/flag/core.py +282 -0
  21. xinference/model/embedding/model_spec.json +24 -0
  22. xinference/model/embedding/model_spec_modelscope.json +24 -0
  23. xinference/model/embedding/sentence_transformers/__init__.py +13 -0
  24. xinference/model/embedding/sentence_transformers/core.py +399 -0
  25. xinference/model/embedding/vllm/__init__.py +0 -0
  26. xinference/model/embedding/vllm/core.py +95 -0
  27. xinference/model/image/model_spec.json +20 -2
  28. xinference/model/image/model_spec_modelscope.json +21 -2
  29. xinference/model/image/stable_diffusion/core.py +144 -53
  30. xinference/model/llm/llama_cpp/memory.py +4 -2
  31. xinference/model/llm/llm_family.json +57 -0
  32. xinference/model/llm/llm_family_modelscope.json +61 -0
  33. xinference/model/llm/sglang/core.py +4 -0
  34. xinference/model/llm/utils.py +11 -0
  35. xinference/model/llm/vllm/core.py +3 -0
  36. xinference/model/rerank/core.py +96 -4
  37. xinference/model/rerank/model_spec.json +24 -0
  38. xinference/model/rerank/model_spec_modelscope.json +24 -0
  39. xinference/model/rerank/utils.py +4 -3
  40. xinference/model/utils.py +38 -1
  41. xinference/model/video/diffusers.py +65 -3
  42. xinference/model/video/model_spec.json +31 -4
  43. xinference/model/video/model_spec_modelscope.json +32 -4
  44. xinference/web/ui/build/asset-manifest.json +6 -6
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.013f296b.css +2 -0
  47. xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
  49. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
  56. xinference/web/ui/src/locales/en.json +18 -7
  57. xinference/web/ui/src/locales/ja.json +224 -0
  58. xinference/web/ui/src/locales/ko.json +224 -0
  59. xinference/web/ui/src/locales/zh.json +18 -7
  60. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/METADATA +9 -8
  61. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/RECORD +66 -57
  62. xinference/web/ui/build/static/css/main.337afe76.css +0 -2
  63. xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
  64. xinference/web/ui/build/static/js/main.ddf9eaee.js +0 -3
  65. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
  72. /xinference/web/ui/build/static/js/{main.ddf9eaee.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
  73. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/WHEEL +0 -0
  74. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/entry_points.txt +0 -0
  75. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/licenses/LICENSE +0 -0
  76. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/top_level.txt +0 -0
xinference/model/embedding/sentence_transformers/core.py (new file)
@@ -0,0 +1,399 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.util
+import logging
+from collections import defaultdict
+from typing import List, Optional, Union, no_type_check
+
+import numpy as np
+import torch
+
+from ....types import Dict, Embedding, EmbeddingData, EmbeddingUsage
+from ..core import EmbeddingModel, EmbeddingModelSpec
+
+logger = logging.getLogger(__name__)
+
+# Used for check whether the model is cached.
+# Init when registering all the builtin models.
+MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
+SENTENCE_TRANSFORMER_MODEL_LIST: List[str] = []
+
+
+class SentenceTransformerEmbeddingModel(EmbeddingModel):
+    def load(self):
+        # TODO: load model
+        try:
+            import sentence_transformers
+            from sentence_transformers import SentenceTransformer
+
+            if sentence_transformers.__version__ < "3.1.0":
+                raise ValueError(
+                    "The sentence_transformers version must be greater than 3.1.0. "
+                    "Please upgrade your version via `pip install -U sentence_transformers` or refer to "
+                    "https://github.com/UKPLab/sentence-transformers"
+                )
+        except ImportError:
+            error_message = "Failed to import module 'SentenceTransformer'"
+            installation_guide = [
+                "Please make sure 'sentence-transformers' is installed. ",
+                "You can install it by `pip install sentence-transformers`\n",
+            ]
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        class XSentenceTransformer(SentenceTransformer):
+            def to(self, *args, **kwargs):
+                pass
+
+        torch_dtype = None
+        if torch_dtype_str := self._kwargs.get("torch_dtype"):
+            try:
+                torch_dtype = getattr(torch, torch_dtype_str)
+                if torch_dtype not in [
+                    torch.float16,
+                    torch.float32,
+                    torch.bfloat16,
+                ]:
+                    logger.warning(
+                        f"Load embedding model with unsupported torch dtype : {torch_dtype_str}. Using default torch dtype: fp32."
+                    )
+                    torch_dtype = torch.float32
+            except AttributeError:
+                logger.warning(
+                    f"Load embedding model with unknown torch dtype '{torch_dtype_str}'. Using default torch dtype: fp32."
+                )
+                torch_dtype = torch.float32
+
+        if (
+            "gte" in self._model_spec.model_name.lower()
+            and "qwen2" in self._model_spec.model_name.lower()
+        ):
+            model_kwargs = {"device_map": "auto"}
+            if torch_dtype:
+                model_kwargs["torch_dtype"] = torch_dtype
+            self._model = XSentenceTransformer(
+                self._model_path,
+                device=self._device,
+                model_kwargs=model_kwargs,
+            )
+        elif "qwen3" in self._model_spec.model_name.lower():
+            # qwen3 embedding
+            flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+            model_kwargs = {"device_map": "auto"}
+            tokenizer_kwargs = {}
+            if flash_attn_installed:
+                model_kwargs["attn_implementation"] = "flash_attention_2"
+                model_kwargs["torch_dtype"] = "bfloat16"
+                tokenizer_kwargs["padding_side"] = "left"
+            if torch_dtype:
+                model_kwargs["torch_dtype"] = torch_dtype
+            logger.debug(
+                "Loading qwen3 embedding with model kwargs: %s, tokenizer kwargs: %s",
+                model_kwargs,
+                tokenizer_kwargs,
+            )
+            self._model = XSentenceTransformer(
+                self._model_path,
+                device=self._device,
+                model_kwargs=model_kwargs,
+                tokenizer_kwargs=tokenizer_kwargs,
+            )
+        else:
+            model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
+            self._model = SentenceTransformer(
+                self._model_path,
+                device=self._device,
+                model_kwargs=model_kwargs,
+                trust_remote_code=True,
+            )
+
+        self._tokenizer = self._model.tokenizer
+
+    def create_embedding(
+        self,
+        sentences: Union[str, List[str]],
+        **kwargs,
+    ):
+        sentences = self._fix_langchain_openai_inputs(sentences)
+        model_uid = kwargs.pop("model_uid", None)
+
+        from sentence_transformers import SentenceTransformer
+
+        kwargs.setdefault("normalize_embeddings", True)
+        if kwargs.get("return_sparse", False):
+            raise ValueError(
+                "`return_sparse` is not supported for `sentence_transformers` backend, "
+                "please use `flag` instead"
+            )
+
+        # copied from sentence-transformers, and modify it to return tokens num
+        @no_type_check
+        def encode(
+            model: SentenceTransformer,
+            sentences: Union[str, List[str]],
+            prompt_name: Optional[str] = None,
+            prompt: Optional[str] = None,
+            batch_size: int = 32,
+            show_progress_bar: bool = None,
+            output_value: str = "sentence_embedding",
+            convert_to_numpy: bool = True,
+            convert_to_tensor: bool = False,
+            device: str = None,
+            normalize_embeddings: bool = False,
+            **kwargs,
+        ):
+            """
+            Computes sentence embeddings
+
+            :param sentences: the sentences to embed
+            :param batch_size: the batch size used for the computation
+            :param show_progress_bar: Output a progress bar when encode sentences
+            :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values
+            :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors.
+            :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy
+            :param device: Which torch.device to use for the computation
+            :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used.
+
+            :return:
+                By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned.
+            """
+            import torch
+            from sentence_transformers.util import batch_to_device
+            from tqdm.autonotebook import trange
+
+            model.eval()
+            if show_progress_bar is None:
+                show_progress_bar = (
+                    logger.getEffectiveLevel() == logging.INFO
+                    or logger.getEffectiveLevel() == logging.DEBUG
+                )
+
+            if convert_to_tensor:
+                convert_to_numpy = False
+
+            if output_value != "sentence_embedding":
+                convert_to_tensor = False
+                convert_to_numpy = False
+
+            input_was_string = False
+            if isinstance(sentences, str) or not hasattr(
+                sentences, "__len__"
+            ):  # Cast an individual sentence to a list with length 1
+                sentences = [sentences]
+                input_was_string = True
+
+            if prompt is None:
+                if prompt_name is not None:
+                    try:
+                        prompt = model.prompts[prompt_name]
+                    except KeyError:
+                        raise ValueError(
+                            f"Prompt name '{prompt_name}' not found in the configured prompts dictionary with keys {list(model.prompts.keys())!r}."
+                        )
+                elif model.default_prompt_name is not None:
+                    prompt = model.prompts.get(model.default_prompt_name, None)
+            else:
+                if prompt_name is not None:
+                    logger.warning(
+                        "Encode with either a `prompt`, a `prompt_name`, or neither, but not both. "
+                        "Ignoring the `prompt_name` in favor of `prompt`."
+                    )
+
+            extra_features = {}
+            if prompt is not None:
+                sentences = [prompt + sentence for sentence in sentences]
+
+                # Some models (e.g. INSTRUCTOR, GRIT) require removing the prompt before pooling
+                # Tracking the prompt length allow us to remove the prompt during pooling
+                tokenized_prompt = model.tokenize([prompt])
+                if "input_ids" in tokenized_prompt:
+                    extra_features["prompt_length"] = (
+                        tokenized_prompt["input_ids"].shape[-1] - 1
+                    )
+
+            if device is None:
+                device = model._target_device
+
+            if (
+                "gte" in self._model_spec.model_name.lower()
+                and "qwen2" in self._model_spec.model_name.lower()
+            ):
+                model.to(device)
+
+            all_embeddings = []
+            all_token_nums = 0
+            length_sorted_idx = np.argsort(
+                [-self._text_length(sen) for sen in sentences]
+            )
+            sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
+
+            for start_index in trange(
+                0,
+                len(sentences),
+                batch_size,
+                desc="Batches",
+                disable=not show_progress_bar,
+            ):
+                sentences_batch = sentences_sorted[
+                    start_index : start_index + batch_size
+                ]
+                features = model.tokenize(sentences_batch)
+                features = batch_to_device(features, device)
+                features.update(extra_features)
+                # when batching, the attention mask 1 means there is a token
+                # thus we just sum up it to get the total number of tokens
+                if "clip" in self._model_spec.model_name.lower():
+                    all_token_nums += features["input_ids"].numel()
+                    all_token_nums += features["pixel_values"].numel()
+                else:
+                    all_token_nums += features["attention_mask"].sum().item()
+
+                with torch.no_grad():
+                    out_features = model.forward(features, **kwargs)
+
+                    if output_value == "token_embeddings":
+                        embeddings = []
+                        for token_emb, attention in zip(
+                            out_features[output_value], out_features["attention_mask"]
+                        ):
+                            last_mask_id = len(attention) - 1
+                            while (
+                                last_mask_id > 0 and attention[last_mask_id].item() == 0
+                            ):
+                                last_mask_id -= 1
+
+                            embeddings.append(token_emb[0 : last_mask_id + 1])
+                    elif output_value is None:  # Return all outputs
+                        embeddings = []
+                        for sent_idx in range(len(out_features["sentence_embedding"])):
+                            row = {
+                                name: out_features[name][sent_idx]
+                                for name in out_features
+                            }
+                            embeddings.append(row)
+                    else:  # Sentence embeddings
+                        embeddings = out_features[output_value]
+                        embeddings = embeddings.detach()
+                        if normalize_embeddings:
+                            embeddings = torch.nn.functional.normalize(
+                                embeddings, p=2, dim=1
+                            )
+
+                        # fixes for #522 and #487 to avoid oom problems on gpu with large datasets
+                        if convert_to_numpy:
+                            embeddings = embeddings.cpu()
+
+                    all_embeddings.extend(embeddings)
+
+            all_embeddings = [
+                all_embeddings[idx] for idx in np.argsort(length_sorted_idx)
+            ]
+
+            if convert_to_tensor:
+                if len(all_embeddings):
+                    all_embeddings = torch.stack(all_embeddings)
+                else:
+                    all_embeddings = torch.Tensor()
+            elif convert_to_numpy:
+                all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
+
+            if input_was_string:
+                all_embeddings = all_embeddings[0]
+
+            return all_embeddings, all_token_nums
+
+        # seems already support prompt in embedding model
+        if (
+            "gte" in self._model_spec.model_name.lower()
+            and "qwen2" in self._model_spec.model_name.lower()
+        ):
+            all_embeddings, all_token_nums = encode(
+                self._model,
+                sentences,
+                prompt_name="query",
+                convert_to_numpy=False,
+                **kwargs,
+            )
+        elif "clip" in self._model_spec.model_name.lower():
+            import base64
+            import re
+            from io import BytesIO
+
+            from PIL import Image
+
+            def base64_to_image(base64_str: str) -> Image.Image:
+                # base64_data = re.sub("^data:image/.+;base64,", "", base64_str)
+                base64_data = base64_str.split(",", 1)[1]
+                byte_data = base64.b64decode(base64_data)
+                image_data = BytesIO(byte_data)
+                img = Image.open(image_data)
+                return img
+
+            objs: list[dict[str, str]] = []
+            for item in sentences:
+                if isinstance(item, dict):
+                    if item.get("text") is not None:
+                        objs.append(item["text"])
+                    elif item.get("image") is not None:
+                        if re.match(r"^data:image/.+;base64,", item["image"]):
+                            image = base64_to_image(item["image"])
+                            objs.append(image)
+                        else:
+                            objs.append(item["image"])
+                else:
+                    logger.error("Please check the input data.")
+            all_embeddings, all_token_nums = encode(
+                self._model,
+                objs,
+                convert_to_numpy=False,
+                **self._kwargs,
+            )
+        else:
+            all_embeddings, all_token_nums = encode(
+                self._model,
+                sentences,
+                convert_to_numpy=False,
+                **kwargs,
+            )
+        if isinstance(sentences, str):
+            all_embeddings = [all_embeddings]
+        embedding_list = []
+        for index, data in enumerate(all_embeddings):
+            embedding_list.append(
+                EmbeddingData(index=index, object="embedding", embedding=data.tolist())
+            )
+        usage = EmbeddingUsage(
+            prompt_tokens=all_token_nums, total_tokens=all_token_nums
+        )
+        result = Embedding(
+            object="list",
+            model=model_uid,
+            model_replica=self._model_uid,
+            data=embedding_list,
+            usage=usage,
+        )
+
+        # clean cache if possible
+        self._clean_cache_if_needed(all_token_nums)
+
+        return result
+
+    @classmethod
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("sentence_transformers") is not None
+
+    @classmethod
+    def match_json(cls, model_spec: EmbeddingModelSpec) -> bool:
+        # As default embedding engine, sentence-transformer support all models
+        return True
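For orientation, here is a minimal client-side sketch of how the sentence-transformers backend above is typically exercised. This is not taken from the diff: the server URL and model UID are placeholders, and it assumes an embedding model has already been launched on a running xinference server. The `usage` field in the response carries the token counts that the modified `encode` now returns.

# Hedged usage sketch (placeholder endpoint and model UID).
from xinference.client import Client

client = Client("http://127.0.0.1:9997")            # placeholder endpoint
model = client.get_model("my-embedding")            # placeholder model UID
result = model.create_embedding(["hello", "world"])
print(len(result["data"][0]["embedding"]))          # embedding dimension
print(result["usage"]["total_tokens"])              # token count computed by encode()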
xinference/model/embedding/vllm/__init__.py (file without changes)
xinference/model/embedding/vllm/core.py (new file)
@@ -0,0 +1,95 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.util
+import logging
+from typing import List, Union
+
+from ....types import Embedding, EmbeddingData, EmbeddingUsage
+from ..core import EmbeddingModel, EmbeddingModelSpec
+
+logger = logging.getLogger(__name__)
+SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"]
+
+
+class VLLMEmbeddingModel(EmbeddingModel):
+    def load(self):
+        try:
+            from vllm import LLM
+
+        except ImportError:
+            error_message = "Failed to import module 'vllm'"
+            installation_guide = [
+                "Please make sure 'vllm' is installed. ",
+                "You can install it by `pip install vllm`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        self._model = LLM(model=self._model_path, task="embed")
+        self._tokenizer = self._model.get_tokenizer()
+
+    @staticmethod
+    def _get_detailed_instruct(task_description: str, query: str) -> str:
+        return f"Instruct: {task_description}\nQuery:{query}"
+
+    def create_embedding(
+        self,
+        sentences: Union[str, List[str]],
+        **kwargs,
+    ):
+        sentences = self._fix_langchain_openai_inputs(sentences)
+        model_uid = kwargs.pop("model_uid", None)
+
+        normalize_embedding = kwargs.get("normalize_embedding", True)
+        if not normalize_embedding:
+            raise ValueError(
+                "vllm embedding engine does not support "
+                "setting `normalize_embedding=False`"
+            )
+
+        assert self._model is not None
+        outputs = self._model.embed(sentences, use_tqdm=False)
+        embedding_list = []
+        all_token_nums = 0
+        for index, output in enumerate(outputs):
+            embedding_list.append(
+                EmbeddingData(
+                    index=index, object="embedding", embedding=output.outputs.embedding
+                )
+            )
+            all_token_nums += len(output.prompt_token_ids)
+        usage = EmbeddingUsage(
+            prompt_tokens=all_token_nums, total_tokens=all_token_nums
+        )
+        result = Embedding(
+            object="list",
+            model=model_uid,
+            model_replica=self._model_uid,
+            data=embedding_list,
+            usage=usage,
+        )
+
+        return result
+
+    @classmethod
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("vllm") is not None
+
+    @classmethod
+    def match_json(cls, model_spec: EmbeddingModelSpec) -> bool:
+        prefix = model_spec.model_name.split("-", 1)[0]
+        if prefix in SUPPORTED_MODELS_PREFIXES:
+            return True
+        return False
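The embedding engine is chosen per model at launch time. A hedged sketch follows, assuming that in this release the `model_engine` launch parameter (see `embed_family.py` and the client changes in the file list) accepts "vllm" for embedding models; "bge-m3" is only an example of a name whose prefix matches SUPPORTED_MODELS_PREFIXES, and the endpoint is a placeholder.

# Hedged launch sketch; parameters other than model_engine follow the usual client API.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")            # placeholder endpoint
uid = client.launch_model(
    model_name="bge-m3",                            # example model with a supported prefix
    model_type="embedding",
    model_engine="vllm",                            # assumed to select VLLMEmbeddingModel
)
print(client.get_model(uid).create_embedding("hello"))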
xinference/model/image/model_spec.json
@@ -123,7 +123,7 @@
 "quantize": true,
 "quantize_text_encoder": "text_encoder_3",
 "torch_dtype": "bfloat16",
-"transformer_nf4": true
+"transformer_quantization": "nf4"
 },
 "gguf_model_id": "city96/stable-diffusion-3.5-large-gguf",
 "gguf_quantizations": [
@@ -150,7 +150,7 @@
 "quantize": true,
 "quantize_text_encoder": "text_encoder_3",
 "torch_dtype": "bfloat16",
-"transformer_nf4": true
+"transformer_quantization": "nf4"
 },
 "default_generate_config": {
 "guidance_scale": 1.0,
@@ -314,6 +314,24 @@
 ]
 }
 },
+{
+"model_name": "cogview4",
+"model_family": "stable_diffusion",
+"model_id": "THUDM/CogView4-6B",
+"model_revision": "63a52b7f6dace7033380cd6da14d0915eab3e6b5",
+"model_ability": [
+"text2image"
+],
+"default_model_config": {
+"torch_dtype": "bfloat16"
+},
+"virtualenv": {
+"packages": [
+"diffusers>=0.33.0",
+"#system_numpy#"
+]
+}
+},
 {
 "model_name": "stable-diffusion-inpainting",
 "model_family": "stable_diffusion",
xinference/model/image/model_spec_modelscope.json
@@ -128,7 +128,7 @@
 "quantize": true,
 "quantize_text_encoder": "text_encoder_3",
 "torch_dtype": "bfloat16",
-"transformer_nf4": true
+"transformer_quantization": "nf4"
 },
 "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-gguf",
 "gguf_quantizations": [
@@ -156,7 +156,7 @@
 "quantize": true,
 "quantize_text_encoder": "text_encoder_3",
 "torch_dtype": "bfloat16",
-"transformer_nf4": true
+"transformer_quantization": "nf4"
 },
 "default_generate_config": {
 "guidance_scale": 1.0,
@@ -327,6 +327,25 @@
 ]
 }
 },
+{
+"model_name": "cogview4",
+"model_family": "stable_diffusion",
+"model_hub": "modelscope",
+"model_id": "ZhipuAI/CogView4-6B",
+"model_revision": "master",
+"model_ability": [
+"text2image"
+],
+"default_model_config": {
+"torch_dtype": "bfloat16"
+},
+"virtualenv": {
+"packages": [
+"diffusers>=0.33.0",
+"#system_numpy#"
+]
+}
+},
 {
 "model_name": "GOT-OCR2_0",
 "model_family": "ocr",