xinference 0.13.3__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +4 -1
- xinference/client/restful/restful_client.py +2 -2
- xinference/constants.py +0 -4
- xinference/core/image_interface.py +6 -3
- xinference/core/model.py +1 -1
- xinference/core/supervisor.py +2 -0
- xinference/core/worker.py +7 -0
- xinference/deploy/utils.py +6 -0
- xinference/model/audio/core.py +4 -2
- xinference/model/core.py +25 -4
- xinference/model/embedding/core.py +88 -13
- xinference/model/embedding/model_spec.json +8 -0
- xinference/model/embedding/model_spec_modelscope.json +8 -0
- xinference/model/flexible/core.py +8 -2
- xinference/model/image/core.py +8 -5
- xinference/model/image/model_spec.json +30 -6
- xinference/model/image/model_spec_modelscope.json +21 -3
- xinference/model/image/stable_diffusion/core.py +30 -27
- xinference/model/llm/core.py +6 -4
- xinference/model/llm/ggml/llamacpp.py +7 -5
- xinference/model/llm/llm_family.py +6 -6
- xinference/model/llm/mlx/core.py +7 -0
- xinference/model/llm/pytorch/chatglm.py +4 -1
- xinference/model/llm/pytorch/deepseek_vl.py +2 -1
- xinference/model/llm/pytorch/falcon.py +2 -1
- xinference/model/llm/pytorch/llama_2.py +4 -2
- xinference/model/llm/pytorch/omnilmm.py +2 -1
- xinference/model/llm/pytorch/qwen_vl.py +2 -1
- xinference/model/llm/pytorch/vicuna.py +2 -1
- xinference/model/llm/pytorch/yi_vl.py +2 -1
- xinference/model/llm/sglang/core.py +12 -6
- xinference/model/llm/vllm/core.py +1 -5
- xinference/model/rerank/core.py +4 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.2ef0cfaf.js → main.af906659.js} +3 -3
- xinference/web/ui/build/static/js/main.af906659.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2cd5e4279ad7e13a1f41d486e9fca7756295bfad5bd77d90992f4ac3e10b496d.json +1 -0
- {xinference-0.13.3.dist-info → xinference-0.14.0.dist-info}/METADATA +24 -4
- {xinference-0.13.3.dist-info → xinference-0.14.0.dist-info}/RECORD +46 -46
- xinference/web/ui/build/static/js/main.2ef0cfaf.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b6807ecc0c231fea699533518a0eb2a2bf68a081ce00d452be40600dbffa17a7.json +0 -1
- /xinference/web/ui/build/static/js/{main.2ef0cfaf.js.LICENSE.txt → main.af906659.js.LICENSE.txt} +0 -0
- {xinference-0.13.3.dist-info → xinference-0.14.0.dist-info}/LICENSE +0 -0
- {xinference-0.13.3.dist-info → xinference-0.14.0.dist-info}/WHEEL +0 -0
- {xinference-0.13.3.dist-info → xinference-0.14.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.13.3.dist-info → xinference-0.14.0.dist-info}/top_level.txt +0 -0
xinference/model/image/model_spec_modelscope.json CHANGED

@@ -4,21 +4,31 @@
         "model_family": "stable_diffusion",
         "model_hub": "modelscope",
         "model_id": "AI-ModelScope/stable-diffusion-3-medium-diffusers",
-        "model_revision": "master"
+        "model_revision": "master",
+        "abilities": [
+            "text2iamge",
+            "image2image"
+        ]
     },
     {
         "model_name": "sd-turbo",
         "model_family": "stable_diffusion",
         "model_hub": "modelscope",
         "model_id": "AI-ModelScope/sd-turbo",
-        "model_revision": "master"
+        "model_revision": "master",
+        "abilities": [
+            "text2iamge"
+        ]
     },
     {
         "model_name": "sdxl-turbo",
         "model_family": "stable_diffusion",
         "model_hub": "modelscope",
         "model_id": "AI-ModelScope/sdxl-turbo",
-        "model_revision": "master"
+        "model_revision": "master",
+        "abilities": [
+            "text2iamge"
+        ]
     },
     {
         "model_name": "stable-diffusion-v1.5",
@@ -26,6 +36,10 @@
         "model_hub": "modelscope",
         "model_id": "AI-ModelScope/stable-diffusion-v1-5",
         "model_revision": "master",
+        "abilities": [
+            "text2iamge",
+            "image2image"
+        ],
         "controlnet": [
             {
                 "model_name":"canny",
@@ -77,6 +91,10 @@
         "model_hub": "modelscope",
         "model_id": "AI-ModelScope/stable-diffusion-xl-base-1.0",
         "model_revision": "master",
+        "abilities": [
+            "text2iamge",
+            "image2image"
+        ],
         "controlnet": [
             {
                 "model_name":"canny",
xinference/model/image/stable_diffusion/core.py CHANGED

@@ -35,22 +35,23 @@ class DiffusionModel:
     def __init__(
         self,
         model_uid: str,
-        model_path: str,
+        model_path: Optional[str] = None,
         device: Optional[str] = None,
         lora_model: Optional[List[LoRA]] = None,
         lora_load_kwargs: Optional[Dict] = None,
         lora_fuse_kwargs: Optional[Dict] = None,
-
+        abilities: Optional[List[str]] = None,
         **kwargs,
     ):
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
         self._model = None
+        self._i2i_model = None  # image to image model
         self._lora_model = lora_model
         self._lora_load_kwargs = lora_load_kwargs or {}
         self._lora_fuse_kwargs = lora_fuse_kwargs or {}
-        self.
+        self._abilities = abilities
         self._kwargs = kwargs

     def _apply_lora(self):
@@ -69,12 +70,12 @@
     def load(self):
         import torch

-        if
+        if "text2image" in self._abilities or "image2image" in self._abilities:
             from diffusers import AutoPipelineForText2Image as AutoPipelineModel
-        elif
+        elif "inpainting" in self._abilities:
             from diffusers import AutoPipelineForInpainting as AutoPipelineModel
         else:
-            raise ValueError(f"Unknown ability: {self.
+            raise ValueError(f"Unknown ability: {self._abilities}")

         controlnet = self._kwargs.get("controlnet")
         if controlnet is not None:
@@ -106,28 +107,17 @@

     def _call_model(
         self,
-        height: int,
-        width: int,
-        num_images_per_prompt: int,
         response_format: str,
+        model=None,
         **kwargs,
     ):
         logger.debug(
             "stable diffusion args: %s",
-
-            kwargs,
-            height=height,
-            width=width,
-            num_images_per_prompt=num_images_per_prompt,
-            ),
+            kwargs,
         )
-
-
-
-            width=width,
-            num_images_per_prompt=num_images_per_prompt,
-            **kwargs,
-        ).images
+        model = model if model is not None else self._model
+        assert callable(model)
+        images = model(**kwargs).images
         if response_format == "url":
             os.makedirs(XINFERENCE_IMAGE_DIR, exist_ok=True)
             image_list = []
@@ -145,7 +135,7 @@
             return base64.b64encode(buffered.getvalue()).decode()

         with ThreadPoolExecutor() as executor:
-            results = list(map(partial(executor.submit, _gen_base64_image), images))
+            results = list(map(partial(executor.submit, _gen_base64_image), images))  # type: ignore
             image_list = [Image(url=None, b64_json=s.result()) for s in results]
             return ImageList(created=int(time.time()), data=image_list)
         else:
@@ -177,19 +167,32 @@
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         n: int = 1,
-        size: str =
+        size: Optional[str] = None,
         response_format: str = "url",
         **kwargs,
     ):
-
+        if "controlnet" in self._kwargs:
+            model = self._model
+        else:
+            if self._i2i_model is not None:
+                model = self._i2i_model
+            else:
+                from diffusers import AutoPipelineForImage2Image
+
+                self._i2i_model = model = AutoPipelineForImage2Image.from_pipe(
+                    self._model
+                )
+        if size:
+            width, height = map(int, re.split(r"[^\d]+", size))
+            kwargs["width"] = width
+            kwargs["height"] = height
         return self._call_model(
             image=image,
             prompt=prompt,
             negative_prompt=negative_prompt,
-            height=height,
-            width=width,
             num_images_per_prompt=n,
             response_format=response_format,
+            model=model,
            **kwargs,
        )
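A rough usage sketch of the new image-to-image path, based only on the signatures visible in the hunks above; the method name image_to_image, the local model path, and the prompt/image inputs are assumptions for illustration:

from PIL import Image as PILImage

from xinference.model.image.stable_diffusion.core import DiffusionModel

# "abilities" now comes from the model spec's new "abilities" field
model = DiffusionModel(
    model_uid="my-sd-1.5",                        # placeholder uid
    model_path="/path/to/stable-diffusion-v1-5",  # placeholder local path
    abilities=["text2image", "image2image"],
)
model.load()  # text2image/image2image present, so AutoPipelineForText2Image is used

# without a controlnet, an AutoPipelineForImage2Image is derived lazily from the
# loaded pipeline via from_pipe() and cached as the _i2i_model
result = model.image_to_image(
    image=PILImage.open("input.png"),  # placeholder input image
    prompt="a watercolor painting",
    size="512*512",                    # parsed into width/height by the new code
    response_format="b64_json",
)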
xinference/model/llm/core.py CHANGED

@@ -194,6 +194,7 @@ def create_llm_model_instance(
     quantization: Optional[str] = None,
     peft_model_config: Optional[PeftModelConfig] = None,
     download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[LLM, LLMDescription]:
     from .llm_family import cache, check_engine_by_spec_parameters, match_llm
@@ -221,7 +222,8 @@ def create_llm_model_instance(
     )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")

-
+    if not model_path:
+        model_path = cache(llm_family, llm_spec, quantization)

     peft_model = peft_model_config.peft_model if peft_model_config else None
     if peft_model is not None:
@@ -231,7 +233,7 @@
                 llm_family,
                 llm_spec,
                 quantization,
-
+                model_path,
                 kwargs,
                 peft_model,
             )
@@ -241,11 +243,11 @@
                 f"Load this without lora."
             )
             model = llm_cls(
-                model_uid, llm_family, llm_spec, quantization,
+                model_uid, llm_family, llm_spec, quantization, model_path, kwargs
             )
     else:
         model = llm_cls(
-            model_uid, llm_family, llm_spec, quantization,
+            model_uid, llm_family, llm_spec, quantization, model_path, kwargs
         )
     return model, LLMDescription(
         subpool_addr, devices, llm_family, llm_spec, quantization
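The new model_path argument lets a caller hand an already-downloaded model directory to create_llm_model_instance instead of always resolving it through cache(). A minimal client-side sketch, assuming the argument is forwarded through launch_model's extra kwargs (the endpoint, model choice, and local path are placeholders):

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # placeholder endpoint
model_uid = client.launch_model(
    model_name="llama-3-instruct",
    model_engine="transformers",
    model_format="pytorch",
    model_size_in_billions=8,
    quantization="none",
    # assumed to reach create_llm_model_instance(), skipping the cache() download
    model_path="/data/models/Meta-Llama-3-8B-Instruct",
)
model = client.get_model(model_uid)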
xinference/model/llm/ggml/llamacpp.py CHANGED

@@ -155,11 +155,13 @@ class LlamaCppModel(LLM):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

         # handle legacy cache.
-        model_path = os.path.
-
-
-
-
+        model_path = os.path.realpath(
+            os.path.join(
+                self.model_path,
+                self.model_spec.model_file_name_template.format(
+                    quantization=self.quantization
+                ),
+            )
         )
         legacy_model_file_path = os.path.join(self.model_path, "model.bin")
         if os.path.exists(legacy_model_file_path):
xinference/model/llm/llm_family.py CHANGED

@@ -699,12 +699,12 @@ def _generate_model_file_names(
 def _merge_cached_files(
     cache_dir: str, input_file_names: List[str], output_file_name: str
 ):
-
-
-
-
-
-
+    # now llama.cpp can find the gguf parts automatically
+    # we only need to provide the first part
+    # thus we create the symlink to the first part
+    symlink_local_file(
+        os.path.join(cache_dir, input_file_names[0]), cache_dir, output_file_name
+    )

     logger.info(f"Merge complete.")

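_merge_cached_files no longer concatenates multi-part GGUF downloads; it symlinks the expected single-file name to the first shard and lets llama.cpp discover the rest. A plain-Python illustration of the same idea (file names are hypothetical):

import os

cache_dir = "/path/to/cache"  # placeholder
parts = [
    "model-q4_k_m-00001-of-00003.gguf",  # hypothetical shard names
    "model-q4_k_m-00002-of-00003.gguf",
    "model-q4_k_m-00003-of-00003.gguf",
]

# point the name the loader expects at shard 1; llama.cpp resolves the remaining parts
os.symlink(os.path.join(cache_dir, parts[0]), os.path.join(cache_dir, "model-q4_k_m.gguf"))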
xinference/model/llm/mlx/core.py CHANGED

@@ -101,6 +101,7 @@ class MLXModel(LLM):

     def _load_model(self, **kwargs):
         try:
+            import mlx.core as mx
             from mlx_lm import load
         except ImportError:
             error_message = "Failed to import module 'mlx_lm'"
@@ -122,6 +123,11 @@
             self._model_config,
         )

+        cache_limit_gb = kwargs.get("cache_limit_gb", None)
+        if cache_limit_gb:
+            logger.debug(f"Setting cache limit to {cache_limit_gb} GB")
+            mx.metal.set_cache_limit(cache_limit_gb * 1024 * 1024 * 1024)
+
         return load(
             self.model_path,
             tokenizer_config=tokenizer_config,
@@ -134,6 +140,7 @@
             "revision", self.model_spec.model_revision
         )
         kwargs["trust_remote_code"] = self._model_config.get("trust_remote_code")
+        kwargs["cache_limit_gb"] = self._model_config.pop("cache_limit_gb", None)

         self._model, self._tokenizer = self._load_model(**kwargs)

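The new cache_limit_gb option caps MLX's Metal buffer cache when an MLX model is loaded. A small sketch of what the added branch amounts to, with the launch-time kwarg shown as an assumption (it is popped from the model config above, so passing it through launch_model should reach _load_model):

import mlx.core as mx

cache_limit_gb = 4  # placeholder value
mx.metal.set_cache_limit(cache_limit_gb * 1024 * 1024 * 1024)  # limit is given in bytes

# hypothetical launch-time usage:
# client.launch_model(model_name="qwen2-instruct", model_engine="MLX", cache_limit_gb=4)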
xinference/model/llm/pytorch/chatglm.py CHANGED

@@ -430,7 +430,10 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         outputs = self._model.generate(**kwargs)
         outputs = outputs[:, kwargs["input_ids"].shape[1] :]
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
+        if tools:
+            return self._process_response(response, history, tools, end=True)
+        else:
+            return self._process_response(response, history, tools)

     def chat(
         self,
xinference/model/llm/pytorch/deepseek_vl.py CHANGED

@@ -52,7 +52,8 @@ class DeepSeekVLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-
+        llm_family = model_family.model_family or model_family.model_name
+        if "deepseek-vl" in llm_family:
             return True
         return False

xinference/model/llm/pytorch/falcon.py CHANGED

@@ -71,7 +71,8 @@ class FalconPytorchModel(PytorchModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-
+        model_family = llm_family.model_family or llm_family.model_name
+        if "falcon" not in model_family:
             return False
         if "generate" not in llm_family.model_ability:
             return False
xinference/model/llm/pytorch/llama_2.py CHANGED

@@ -55,7 +55,8 @@ class LlamaPytorchModel(PytorchModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-
+        model_family = llm_family.model_family or llm_family.model_name
+        if "llama-2" not in model_family:
             return False
         if "generate" not in llm_family.model_ability:
             return False
@@ -99,7 +100,8 @@ class LlamaPytorchChatModel(PytorchChatModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-
+        model_family = llm_family.model_family or llm_family.model_name
+        if "llama-2" not in model_family:
             return False
         if "chat" not in llm_family.model_ability:
             return False
xinference/model/llm/pytorch/omnilmm.py CHANGED

@@ -44,7 +44,8 @@ class OmniLMMModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-
+        llm_family = model_family.model_family or model_family.model_name
+        if "OmniLMM" in llm_family:
             return True
         return False

xinference/model/llm/pytorch/qwen_vl.py CHANGED

@@ -52,7 +52,8 @@ class QwenVLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-
+        llm_family = model_family.model_family or model_family.model_name
+        if "qwen" in llm_family and "vision" in model_family.model_ability:
             return True
         return False

xinference/model/llm/pytorch/vicuna.py CHANGED

@@ -61,7 +61,8 @@ class VicunaPytorchChatModel(PytorchChatModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-
+        model_family = llm_family.model_family or llm_family.model_name
+        if "vicuna" not in model_family:
             return False
         if "chat" not in llm_family.model_ability:
             return False
xinference/model/llm/pytorch/yi_vl.py CHANGED

@@ -51,7 +51,8 @@ class YiVLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-
+        llm_family = model_family.model_family or model_family.model_name
+        if "yi-vl" in llm_family:
             return True
         return False

xinference/model/llm/sglang/core.py CHANGED

@@ -17,7 +17,6 @@ import time
 import uuid
 from typing import AsyncGenerator, Dict, List, Optional, TypedDict, Union

-from ....constants import XINFERENCE_ENABLE_SGLANG
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -63,15 +62,26 @@ try:
 except ImportError:
     SGLANG_INSTALLED = False

-SGLANG_SUPPORTED_MODELS = [
+SGLANG_SUPPORTED_MODELS = [
+    "llama-2",
+    "llama-3",
+    "llama-3.1",
+    "mistral-v0.1",
+    "mixtral-v0.1",
+]
 SGLANG_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
+    "llama-3-instruct",
+    "llama-3.1-instruct",
     "qwen-chat",
     "qwen1.5-chat",
+    "qwen2-instruct",
+    "qwen2-moe-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
     "gemma-it",
+    "gemma-2-it",
 ]

@@ -168,8 +178,6 @@ class SGLANGModel(LLM):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if not XINFERENCE_ENABLE_SGLANG:
-            return False
         if not cls._has_cuda_device():
             return False
         if not cls._is_linux():
@@ -332,8 +340,6 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if not XINFERENCE_ENABLE_SGLANG:
-            return False
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         if llm_spec.model_format == "pytorch":
xinference/model/llm/vllm/core.py CHANGED

@@ -28,7 +28,6 @@ from typing import (
     Union,
 )

-from ....constants import XINFERENCE_DISABLE_VLLM
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -152,6 +151,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01")

 if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
+    VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it")
     VLLM_SUPPORTED_CHAT_MODELS.append("mistral-nemo-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("mistral-large-instruct")

@@ -296,8 +296,6 @@ class VLLMModel(LLM):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if XINFERENCE_DISABLE_VLLM:
-            return False
         if not cls._has_cuda_device():
             return False
         if not cls._is_linux():
@@ -522,8 +520,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if XINFERENCE_DISABLE_VLLM:
-            return False
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         if llm_spec.model_format == "pytorch":
xinference/model/rerank/core.py CHANGED

@@ -107,7 +107,7 @@ class RerankModel:
         self,
         model_spec: RerankModelSpec,
         model_uid: str,
-        model_path: str,
+        model_path: Optional[str] = None,
         device: Optional[str] = None,
         use_fp16: bool = False,
         model_config: Optional[Dict] = None,
@@ -290,6 +290,7 @@ def create_rerank_model_instance(
     model_uid: str,
     model_name: str,
     download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[RerankModel, RerankModelDescription]:
     from ..utils import download_from_modelscope
@@ -321,8 +322,8 @@
             f"Huggingface: {BUILTIN_RERANK_MODELS.keys()}"
             f"ModelScope: {MODELSCOPE_RERANK_MODELS.keys()}"
         )
-
-
+    if not model_path:
+        model_path = cache(model_spec)
     use_fp16 = kwargs.pop("use_fp16", False)
     model = RerankModel(
         model_spec, model_uid, model_path, use_fp16=use_fp16, model_config=kwargs
xinference/web/ui/build/asset-manifest.json CHANGED

@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.4bafd904.css",
-    "main.js": "./static/js/main.2ef0cfaf.js",
+    "main.js": "./static/js/main.af906659.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.4bafd904.css.map": "./static/css/main.4bafd904.css.map",
-    "main.2ef0cfaf.js.map": "./static/js/main.2ef0cfaf.js.map"
+    "main.af906659.js.map": "./static/js/main.af906659.js.map"
   },
   "entrypoints": [
     "static/css/main.4bafd904.css",
-    "static/js/main.2ef0cfaf.js"
+    "static/js/main.af906659.js"
   ]
 }
xinference/web/ui/build/index.html CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.2ef0cfaf.js"></script><link href="./static/css/main.4bafd904.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.af906659.js"></script><link href="./static/css/main.4bafd904.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>