xinference 1.5.1-py3-none-any.whl → 1.6.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +97 -8
- xinference/client/restful/restful_client.py +51 -11
- xinference/core/media_interface.py +758 -0
- xinference/core/model.py +49 -9
- xinference/core/worker.py +31 -37
- xinference/deploy/utils.py +0 -3
- xinference/model/audio/__init__.py +16 -27
- xinference/model/audio/core.py +1 -0
- xinference/model/audio/cosyvoice.py +4 -2
- xinference/model/audio/model_spec.json +20 -3
- xinference/model/audio/model_spec_modelscope.json +18 -1
- xinference/model/embedding/__init__.py +16 -24
- xinference/model/image/__init__.py +15 -25
- xinference/model/llm/__init__.py +37 -110
- xinference/model/llm/core.py +15 -6
- xinference/model/llm/llama_cpp/core.py +25 -353
- xinference/model/llm/llm_family.json +613 -89
- xinference/model/llm/llm_family.py +9 -1
- xinference/model/llm/llm_family_modelscope.json +540 -90
- xinference/model/llm/mlx/core.py +6 -3
- xinference/model/llm/reasoning_parser.py +281 -5
- xinference/model/llm/sglang/core.py +16 -3
- xinference/model/llm/transformers/chatglm.py +2 -2
- xinference/model/llm/transformers/cogagent.py +1 -1
- xinference/model/llm/transformers/cogvlm2.py +1 -1
- xinference/model/llm/transformers/core.py +9 -3
- xinference/model/llm/transformers/glm4v.py +1 -1
- xinference/model/llm/transformers/minicpmv26.py +1 -1
- xinference/model/llm/transformers/qwen-omni.py +6 -0
- xinference/model/llm/transformers/qwen_vl.py +1 -1
- xinference/model/llm/utils.py +68 -45
- xinference/model/llm/vllm/core.py +38 -18
- xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -10
- xinference/model/rerank/__init__.py +13 -24
- xinference/model/video/__init__.py +15 -25
- xinference/model/video/core.py +3 -3
- xinference/model/video/diffusers.py +133 -16
- xinference/model/video/model_spec.json +54 -0
- xinference/model/video/model_spec_modelscope.json +56 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
- xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
- xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
- xinference/thirdparty/cosyvoice/bin/train.py +7 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
- xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
- xinference/thirdparty/cosyvoice/cli/model.py +140 -155
- xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
- xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
- xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
- xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
- xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
- xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
- xinference/thirdparty/cosyvoice/utils/common.py +1 -1
- xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
- xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
- xinference/types.py +0 -71
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
- xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
- xinference/web/ui/src/locales/en.json +6 -4
- xinference/web/ui/src/locales/zh.json +6 -4
- {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
- {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/RECORD +87 -87
- {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
- xinference/core/image_interface.py +0 -377
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
- xinference/web/ui/build/static/js/main.91e77b5c.js +0 -3
- xinference/web/ui/build/static/js/main.91e77b5c.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
- /xinference/web/ui/build/static/js/{main.91e77b5c.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
- {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py
CHANGED
@@ -42,6 +42,7 @@ from ...types import (
     ChatCompletion,
     ChatCompletionChoice,
     ChatCompletionChunk,
+    ChatCompletionChunkChoice,
     ChatCompletionChunkDelta,
     ChatCompletionMessage,
     Completion,

@@ -68,8 +69,11 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen2-moe-instruct",
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
+    "XiYanSQL-QwenCoder-2504",
     "QwQ-32B",
     "qwen3",
+    "HuatuoGPT-o1-Qwen2.5",
+    "DianJin-R1",
 ]

 GLM4_TOOL_CALL_FAMILY = [

@@ -79,6 +83,7 @@ GLM4_TOOL_CALL_FAMILY = [

 LLAMA3_TOOL_CALL_FAMILY = [
     "llama-3.1-instruct",
+    "HuatuoGPT-o1-LLaMA-3.1",
 ]

 DEEPSEEK_TOOL_CALL_FAMILY = [
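The tool-call family lists above gate OpenAI-style function calling: only models whose family appears in one of them get tool-call prompting and parsing. A minimal client-side sketch, assuming an xinference server on its default endpoint and a model uid from QWEN_TOOL_CALL_FAMILY; the tool schema is made up for illustration.

    # Illustrative only: calling a qwen-family model with OpenAI-style tools
    # through xinference's OpenAI-compatible endpoint (base_url and model uid assumed).
    import openai

    client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Look up the current weather for a city.",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ]
    resp = client.chat.completions.create(
        model="qwen2.5-instruct",  # any family listed in QWEN_TOOL_CALL_FAMILY
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        tools=tools,
    )
    print(resp.choices[0].message.tool_calls)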
@@ -160,7 +165,12 @@ class ChatModelMixin:
     @staticmethod
     def _get_chat_template_kwargs_from_generate_config(
         generate_config: Optional[Union[dict, Any]],
+        reasoning_parser: Optional[ReasoningParser] = None,
     ) -> Optional[dict]:
+        if reasoning_parser and not reasoning_parser.enable_thinking:
+            # hybrid model like qwen3,
+            # disabled thinking
+            return {"enable_thinking": False}
         if not generate_config:
             return None
         if "chat_template_kwargs" in generate_config:
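The new `reasoning_parser` argument lets hybrid reasoning models such as qwen3 have thinking switched off at the chat-template level. A minimal sketch of the underlying mechanism, assuming a Qwen3 checkpoint and a recent transformers release; this is not xinference's code, just how an `enable_thinking` kwarg typically reaches the template.

    # Illustrative sketch, not xinference's code: how an `enable_thinking`
    # chat-template kwarg reaches a Qwen3-style Jinja template.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")  # model id assumed
    messages = [{"role": "user", "content": "What is 17 * 24?"}]

    # Extra keyword arguments are forwarded to the chat template; Qwen3's
    # template uses `enable_thinking=False` to suppress the <think> block.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    print(prompt)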
@@ -285,7 +295,7 @@ class ChatModelMixin:
             and "delta" in choices[0]
         ):
             if choices[0]["finish_reason"] is None:
-                if reasoning_parser
+                if reasoning_parser and reasoning_parser.check_content_parser():
                     # process parsing reasoning content
                     assert previous_texts is not None
                     delta = choices[0]["delta"]  # type: ignore

@@ -302,7 +312,7 @@ class ChatModelMixin:
                 delta = choices[0]["delta"]  # type: ignore
                 if "content" not in delta:
                     delta["content"] = ""  # type: ignore
-                if reasoning_parser
+                if reasoning_parser and reasoning_parser.check_content_parser():
                     delta["reasoning_content"] = None  # type: ignore
             # Already a ChatCompletionChunk, we don't need to convert chunk.
             return cast(ChatCompletionChunk, chunk)

@@ -311,7 +321,7 @@
         for i, choice in enumerate(choices):  # type: ignore
             delta = ChatCompletionChunkDelta()
             if "text" in choice and choice["finish_reason"] is None:
-                if reasoning_parser
+                if not reasoning_parser or not reasoning_parser.check_content_parser():
                     delta["content"] = choice["text"]
                 else:
                     assert previous_texts is not None

@@ -324,7 +334,7 @@
                     previous_texts[-1] = current_text
             elif "text" in choice and choice["finish_reason"] is not None:
                 delta["content"] = choice["text"]
-                if reasoning_parser
+                if reasoning_parser and reasoning_parser.check_content_parser():
                     delta["reasoning_content"] = None
             elif "tool_calls" in choice:
                 delta["tool_calls"] = choice["tool_calls"]

@@ -338,7 +348,9 @@
         assert choices is not None
         usage = (
             chunk["usage"]
-            if choices[0]["finish_reason"] is not None
+            if choices[0]["finish_reason"] is not None
+            and reasoning_parser
+            and reasoning_parser.check_content_parser()
             else None
         )
         chat_chunk = {
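With `check_content_parser()` guarding the branches above, streamed deltas carry the thinking text in `reasoning_content` and the answer in `content`. A sketch of consuming that split on the client side, assuming xinference's OpenAI-compatible endpoint on the default port and a reasoning-capable model uid.

    # Sketch of reading the split fields from a streamed chat completion,
    # assuming an OpenAI-compatible xinference server at this base_url.
    import openai

    client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")
    stream = client.chat.completions.create(
        model="qwen3",  # model uid assumed
        messages=[{"role": "user", "content": "Why is the sky blue?"}],
        stream=True,
    )
    for chunk in stream:
        delta = chunk.choices[0].delta
        # `reasoning_content` carries the <think> text; `content` carries the answer.
        reasoning = getattr(delta, "reasoning_content", None)
        if reasoning:
            print(reasoning, end="", flush=True)
        elif delta.content:
            print(delta.content, end="", flush=True)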
@@ -356,28 +368,32 @@
         cls,
         chunk: CompletionChunk,
         reasoning_parser: Optional[ReasoningParser] = None,
-    ) -> ChatCompletionChunk:
-        choices_list = []
+    ) -> List[ChatCompletionChunk]:
+        choices_list: List[ChatCompletionChunkChoice] = []
+        chunks: List[ChatCompletionChunk] = []
         for i, choice in enumerate(chunk["choices"]):
             delta = ChatCompletionChunkDelta(role="assistant", content="")
-            if reasoning_parser
+            if reasoning_parser and reasoning_parser.check_content_parser():
                 delta["content"] = None
                 delta["reasoning_content"] = ""
             choices_list.append(
-
-
-
-
-
+                ChatCompletionChunkChoice(
+                    index=i,
+                    delta=delta,
+                    finish_reason=None,
+                )
             )
-        chat_chunk =
-
-
-
-
-
-
-
+        chat_chunk = ChatCompletionChunk(
+            id="chat" + chunk["id"],
+            model=chunk["model"],
+            created=chunk["created"],
+            object="chat.completion.chunk",
+            choices=choices_list,
+        )
+        chunks.append(chat_chunk)
+        if reasoning_parser:
+            chunks.extend(reasoning_parser.prepare_first_reasoning_content_chunk(chunk))
+        return chunks

     @classmethod
     def _get_final_chat_completion_chunk(
@@ -402,6 +418,8 @@
         reasoning_parse: Optional[ReasoningParser] = None,
     ) -> Iterator[ChatCompletionChunk]:
         previous_texts = [""]
+        if reasoning_parse:
+            chunks = reasoning_parse.prepare_reasoning_content_sync(chunks)
         for _, chunk in enumerate(chunks):
             # usage
             choices = chunk.get("choices")

@@ -449,6 +467,9 @@
         reasoning_parser: Optional[ReasoningParser] = None,
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         previous_texts = [""]
+        # Process chunks
+        if reasoning_parser:
+            chunks = reasoning_parser.prepare_reasoning_content_streaming(chunks)
         async for chunk in chunks:
             choices = chunk.get("choices")
             if not choices:
@@ -464,19 +485,25 @@
     def _to_chat_completion(
         completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
     ) -> ChatCompletion:
+        # prepare reasoning content
+        if reasoning_parser:
+            completion = reasoning_parser.prepare_reasoning_content(completion)
+
         if completion.get("object") == "chat.completion" and completion.get("choices"):
             # Already a ChatCompletion
-
-
-
-
+            for choice in completion["choices"]:
+                message = choice["message"]  # type: ignore
+                text = message["content"]  # Original content from the message
+
+                if reasoning_parser and reasoning_parser.check_content_parser():
+                    # Parse into reasoning and content parts
                     (
-
-
+                        reasoning_val,
+                        content_val,
                     ) = reasoning_parser.extract_reasoning_content(text)
-                    message["content"] =
-                    if
-                    message["reasoning_content"] =
+                    message["content"] = content_val
+                    if reasoning_val is not None:
+                        message["reasoning_content"] = reasoning_val
             return cast(ChatCompletion, completion)

         choices = []

@@ -484,7 +511,7 @@
             content = choice["text"]
             reasoning_content = None

-            if reasoning_parser
+            if reasoning_parser and reasoning_parser.check_content_parser():
                 reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
                     choice
                 )
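`extract_reasoning_content` above splits a completion into a reasoning part and the remaining answer. The sketch below illustrates the idea for DeepSeek-R1/Qwen3-style `<think>...</think>` output; it is not xinference's parser, just the general technique.

    # Minimal illustration of splitting "<think>...</think>answer" style text.
    import re
    from typing import Optional, Tuple

    THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)

    def split_reasoning(text: str) -> Tuple[Optional[str], str]:
        """Return (reasoning, content); reasoning is None when no think block exists."""
        match = THINK_RE.search(text)
        if not match:
            return None, text
        reasoning = match.group(1).strip()
        content = (text[: match.start()] + text[match.end():]).strip()
        return reasoning, content

    print(split_reasoning("<think>2 + 2 = 4</think>The answer is 4."))
    # -> ('2 + 2 = 4', 'The answer is 4.')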
@@ -681,20 +708,12 @@
             failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"

-        reasoning_content = None
         content = ". ".join(failed_contents) if failed_contents else None
-        if reasoning_parser is not None:
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
-                content
-            )
         d = {
             "role": "assistant",
             "content": content,
             "tool_calls": tool_calls,
         }
-        # add only reasoning_content is None
-        if reasoning_content is not None:
-            d["reasoning_content"] = reasoning_content

         try:
             usage = c.get("usage")

@@ -729,7 +748,17 @@
         c,
         reasoning_parser: Optional[ReasoningParser] = None,
     ):
+        if reasoning_parser:
+            c = reasoning_parser.prepare_reasoning_content(c)
         _id = str(uuid.uuid4())
+        reasoning_content = None
+        if reasoning_parser and reasoning_parser.check_content_parser():
+            text = c["choices"][0]["text"]
+            reasoning_content, content = reasoning_parser.extract_reasoning_content(
+                text
+            )
+            c["choices"][0]["text"] = content
+
         tool_result = cls._eval_tool_arguments(model_family, c)

         tool_calls = []

@@ -750,12 +779,6 @@
             failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"

-        reasoning_content = None
-        content = ". ".join(failed_contents) if failed_contents else None
-        if reasoning_parser is not None:
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
-                content
-            )
         m = {
             "role": "assistant",
             "content": content,
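After this refactor, reasoning extraction happens up front on the raw completion text rather than on the joined failure contents, and the assembled assistant message keeps only role, content, and tool_calls. Roughly the choice shape a client sees when a tool call is produced; the id and argument values below are made up for illustration.

    # Rough shape of a tool-call response choice after the changes above
    # (id and arguments are illustrative, not real output).
    choice = {
        "index": 0,
        "message": {
            "role": "assistant",
            "content": None,
            "tool_calls": [
                {
                    "id": "call_0",
                    "type": "function",
                    "function": {
                        "name": "get_weather",
                        "arguments": '{"city": "Paris"}',
                    },
                }
            ],
        },
        "finish_reason": "tool_calls",
    }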
xinference/model/llm/vllm/core.py
CHANGED

@@ -170,6 +170,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
     VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("XiYanSQL-QwenCoder-2504")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B")
     VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")

@@ -177,6 +178,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("fin-r1")
     VLLM_SUPPORTED_CHAT_MODELS.append("seallms-v3")
     VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1-preview")
+    VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1")
+    VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-Qwen2.5")
+    VLLM_SUPPORTED_CHAT_MODELS.append("DianJin-R1")

 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")

@@ -207,6 +211,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.3-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-llama")
+    VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-LLaMA-3.1")

 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
@@ -347,8 +352,10 @@ class VLLMModel(LLM):
         self._device_count = self._get_cuda_count()
         self._model_config = self._sanitize_model_config(self._model_config)
         reasoning_content = self._model_config.pop("reasoning_content")
-
-        self.prepare_parse_reasoning_content(
+        enable_thinking = self._model_config.pop("enable_thinking", False)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )

         if (
             isinstance(self.model_spec, LlamaCppLLMSpecV1)

@@ -811,10 +818,6 @@
         raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

         sanitized_generate_config = self._sanitize_generate_config(generate_config)
-        if self.reasoning_parser:
-            # For reasoning model, the </think> we be split into multiple words,
-            # if `stop` param is passed, so we pop it from config.
-            sanitized_generate_config.pop("stop")
         logger.debug(
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
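The constructor change above reads two model-config keys, `reasoning_content` and `enable_thinking`. A hedged sketch of setting them at launch time with the RESTful client, assuming extra launch kwargs flow into the vLLM model config that these pops read; adjust the endpoint and model name to your deployment.

    # Sketch only: the kwarg pass-through is assumed from the pops above.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    model_uid = client.launch_model(
        model_name="qwen3",          # model name assumed to be registered
        model_engine="vllm",
        reasoning_content=True,      # enable reasoning-content parsing
        enable_thinking=False,       # hybrid model: render template without thinking
    )
    model = client.get_model(model_uid)
    print(model.chat(messages=[{"role": "user", "content": "hello"}]))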
@@ -1029,13 +1032,19 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> Dict:
         if not generate_config:
             generate_config = {}
-        if
-            generate_config
-
-
-            and self.model_family.
-
-
+        if "reasoning" in getattr(self.model_family, "model_ability", []):
+            generate_config.pop("stop", None)
+            generate_config.pop("stop_token_ids", None)
+        else:
+            if not generate_config.get("stop") and self.model_family.stop:
+                generate_config["stop"] = self.model_family.stop.copy()
+            if (
+                not generate_config.get("stop_token_ids")
+                and self.model_family.stop_token_ids
+            ):
+                generate_config[
+                    "stop_token_ids"
+                ] = self.model_family.stop_token_ids.copy()
         return generate_config

     @staticmethod

@@ -1047,11 +1056,15 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         chunks: AsyncGenerator[CompletionChunk, None],
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         i = 0
+        previous_texts = [""]
+        if self.reasoning_parser:
+            chunks = self.reasoning_parser.prepare_reasoning_content(chunks)
         async for chunk in chunks:
             if i == 0:
-
+                for first_chunk in self._get_first_chat_completion_chunk(
                     chunk, self.reasoning_parser
-                )
+                ):
+                    yield first_chunk
             # usage
             choices = chunk.get("choices")
             if not choices:

@@ -1065,7 +1078,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                     reasoning_parser=self.reasoning_parser,
                 )
             else:
-                yield self._to_chat_completion_chunk(
+                yield self._to_chat_completion_chunk(
+                    chunk, self.reasoning_parser, previous_texts
+                )
             i += 1

     @vllm_check

@@ -1078,7 +1093,10 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         tools = generate_config.pop("tools", []) if generate_config else None
         model_family = self.model_family.model_family or self.model_family.model_name
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
         if tools:
             if (

@@ -1198,7 +1216,9 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         from qwen_vl_utils import process_vision_info

         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
             or {}
         )
         if tools and model_family in QWEN_TOOL_CALL_FAMILY:
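The new branching above drops user stop criteria for reasoning-capable families (so a `</think>` tag split across several tokens is never cut off mid-stream) and otherwise falls back to the family's default stop words and stop token ids. A standalone paraphrase of that logic, not the actual method:

    # Simplified restatement of the stop-word handling above.
    from typing import Any, Dict, List, Optional

    def sanitize_stop(
        generate_config: Dict[str, Any],
        model_ability: List[str],
        family_stop: Optional[List[str]] = None,
        family_stop_token_ids: Optional[List[int]] = None,
    ) -> Dict[str, Any]:
        if "reasoning" in model_ability:
            # Reasoning models: user-supplied stop criteria could truncate </think>.
            generate_config.pop("stop", None)
            generate_config.pop("stop_token_ids", None)
        else:
            if not generate_config.get("stop") and family_stop:
                generate_config["stop"] = list(family_stop)
            if not generate_config.get("stop_token_ids") and family_stop_token_ids:
                generate_config["stop_token_ids"] = list(family_stop_token_ids)
        return generate_config

    print(sanitize_stop({"stop": ["###"]}, ["chat", "reasoning"]))  # -> {}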
xinference/model/llm/vllm/xavier/test/test_xavier.py
CHANGED

@@ -11,8 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-import sys

 import pytest
 import xoscar as xo

@@ -30,14 +28,7 @@ class ExtendedBlockTracker(VLLMBlockTracker):

 @pytest.fixture
 async def actor_pool_context():
-
-        os.environ.get("POOL_START_METHOD", "forkserver")
-        if sys.platform != "win32"
-        else None
-    )
-    pool = await xo.create_actor_pool(
-        "127.0.0.1", n_process=2, subprocess_start_method=start_method
-    )
+    pool = await xo.create_actor_pool("127.0.0.1", n_process=2)
     async with pool:
         yield pool
xinference/model/rerank/__init__.py
CHANGED

@@ -56,29 +56,8 @@ def register_custom_model():


 def _install():
-
-
-        os.path.dirname(__file__), "model_spec_modelscope.json"
-    )
-    BUILTIN_RERANK_MODELS.update(
-        dict(
-            (spec["model_name"], RerankModelSpec(**spec))
-            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
-        )
-    )
-    for model_name, model_spec in BUILTIN_RERANK_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
-
-    MODELSCOPE_RERANK_MODELS.update(
-        dict(
-            (spec["model_name"], RerankModelSpec(**spec))
-            for spec in json.load(
-                codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
-            )
-        )
-    )
-    for model_name, model_spec in MODELSCOPE_RERANK_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+    load_model_family_from_json("model_spec.json", BUILTIN_RERANK_MODELS)
+    load_model_family_from_json("model_spec_modelscope.json", MODELSCOPE_RERANK_MODELS)

     # register model description after recording model revision
     for model_spec_info in [BUILTIN_RERANK_MODELS, MODELSCOPE_RERANK_MODELS]:

@@ -94,5 +73,15 @@ def _install():
     for ud_rerank in get_user_defined_reranks():
         RERANK_MODEL_DESCRIPTIONS.update(generate_rerank_description(ud_rerank))

+
+def load_model_family_from_json(json_filename, target_families):
+    _model_spec_json = os.path.join(os.path.dirname(__file__), json_filename)
+    target_families.update(
+        dict(
+            (spec["model_name"], RerankModelSpec(**spec))
+            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
+        )
+    )
+    for model_name, model_spec in target_families.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
     del _model_spec_json
-    del _model_spec_modelscope_json
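The `load_model_family_from_json` helper above collapses two copies of the same load loop (Hugging Face and ModelScope specs) into one. A simplified standalone sketch of the pattern, using a plain dict in place of RerankModelSpec; the spec file name is assumed to sit next to the module.

    # Minimal sketch of the shared loading pattern (plain dicts instead of specs).
    import codecs
    import json
    import os
    from collections import defaultdict

    MODEL_NAME_TO_REVISION: dict = defaultdict(list)

    def load_model_family_from_json(json_filename: str, target_families: dict) -> None:
        path = os.path.join(os.path.dirname(__file__), json_filename)
        with codecs.open(path, "r", encoding="utf-8") as f:
            specs = json.load(f)
        # Key each spec by its model name, then record the pinned revision.
        target_families.update({spec["model_name"]: spec for spec in specs})
        for model_name, spec in target_families.items():
            MODEL_NAME_TO_REVISION[model_name].append(spec.get("model_revision"))

    builtin: dict = {}
    load_model_family_from_json("model_spec.json", builtin)  # file assumed to exist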
xinference/model/video/__init__.py
CHANGED

@@ -30,29 +30,8 @@ from .core import (


 def _install():
-
-
-        os.path.dirname(__file__), "model_spec_modelscope.json"
-    )
-    BUILTIN_VIDEO_MODELS.update(
-        dict(
-            (spec["model_name"], VideoModelFamilyV1(**spec))
-            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
-        )
-    )
-    for model_name, model_spec in BUILTIN_VIDEO_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
-
-    MODELSCOPE_VIDEO_MODELS.update(
-        dict(
-            (spec["model_name"], VideoModelFamilyV1(**spec))
-            for spec in json.load(
-                codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
-            )
-        )
-    )
-    for model_name, model_spec in MODELSCOPE_VIDEO_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+    load_model_family_from_json("model_spec.json", BUILTIN_VIDEO_MODELS)
+    load_model_family_from_json("model_spec_modelscope.json", MODELSCOPE_VIDEO_MODELS)

     # register model description
     for model_name, model_spec in chain(

@@ -60,5 +39,16 @@ def _install():
     ):
         VIDEO_MODEL_DESCRIPTIONS.update(generate_video_description(model_spec))

-
-
+
+def load_model_family_from_json(json_filename, target_families):
+    json_path = os.path.join(os.path.dirname(__file__), json_filename)
+    target_families.update(
+        dict(
+            (spec["model_name"], VideoModelFamilyV1(**spec))
+            for spec in json.load(codecs.open(json_path, "r", encoding="utf-8"))
+        )
+    )
+    for model_name, model_spec in target_families.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+
+    del json_path
xinference/model/video/core.py
CHANGED
@@ -19,7 +19,7 @@ from typing import Any, Dict, List, Literal, Optional, Tuple
 from ...constants import XINFERENCE_CACHE_DIR
 from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
 from ..utils import valid_model_revision
-from .diffusers import
+from .diffusers import DiffusersVideoModel

 logger = logging.getLogger(__name__)
@@ -169,13 +169,13 @@ def create_video_model_instance(
     ] = None,
     model_path: Optional[str] = None,
     **kwargs,
-) -> Tuple[
+) -> Tuple[DiffusersVideoModel, VideoModelDescription]:
     model_spec = match_diffusion(model_name, download_hub)
     if not model_path:
         model_path = cache(model_spec)
     assert model_path is not None

-    model =
+    model = DiffusersVideoModel(
         model_uid,
         model_path,
         model_spec,
|
181
181
|
model_spec,
|