xinference 1.3.1__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/core/chat_interface.py +39 -24
- xinference/model/llm/__init__.py +3 -0
- xinference/model/llm/core.py +2 -5
- xinference/model/llm/llama_cpp/core.py +52 -16
- xinference/model/llm/llm_family.json +364 -21
- xinference/model/llm/llm_family_modelscope.json +258 -23
- xinference/model/llm/mlx/core.py +15 -11
- xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py} +19 -14
- xinference/model/llm/sglang/core.py +2 -0
- xinference/model/llm/transformers/core.py +3 -2
- xinference/model/llm/transformers/gemma3.py +185 -0
- xinference/model/llm/transformers/intern_vl.py +0 -2
- xinference/model/llm/utils.py +78 -32
- xinference/model/llm/vllm/core.py +10 -3
- xinference/types.py +2 -2
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.b494ae7e.css +2 -0
- xinference/web/ui/build/static/css/main.b494ae7e.css.map +1 -0
- xinference/web/ui/build/static/js/main.3cea968e.js +3 -0
- xinference/web/ui/build/static/js/main.3cea968e.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +1 -0
- xinference/web/ui/src/locales/en.json +2 -2
- xinference/web/ui/src/locales/zh.json +1 -1
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/METADATA +3 -3
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/RECORD +35 -36
- xinference/model/llm/reasoning_parsers/__init__.py +0 -13
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +0 -98
- xinference/web/ui/build/static/css/main.f8177338.css +0 -2
- xinference/web/ui/build/static/css/main.f8177338.css.map +0 -1
- xinference/web/ui/build/static/js/main.55b70cb7.js +0 -3
- xinference/web/ui/build/static/js/main.55b70cb7.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +0 -1
- /xinference/web/ui/build/static/js/{main.55b70cb7.js.LICENSE.txt → main.3cea968e.js.LICENSE.txt} +0 -0
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/LICENSE +0 -0
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/WHEEL +0 -0
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/top_level.txt +0 -0
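The bulk of the user-facing change is the registration of Google's Gemma 3 family (gemma-3-1b-it and the multimodal gemma-3-it sizes) in the built-in model catalogs, a dedicated transformers implementation (gemma3.py), and a generalized reasoning parser. As a quick orientation, here is a minimal sketch of launching one of the newly registered models through the Python client; the endpoint address and the engine/size/quantization combination are illustrative choices, not prescribed by this diff, and the chat call follows the messages-based client API.

```python
# Minimal sketch: launch one of the newly registered Gemma 3 models and chat with it.
# Assumes an xinference supervisor is already running at this (illustrative) address.
from xinference.client import Client

client = Client("http://localhost:9997")

# "gemma-3-it" and its sizes/formats come from the llm_family JSON added in this release;
# the engine and quantization below are just one plausible combination.
model_uid = client.launch_model(
    model_name="gemma-3-it",
    model_engine="transformers",
    model_format="pytorch",
    model_size_in_billions=4,
    quantization="none",
)

model = client.get_model(model_uid)
print(model.chat(messages=[{"role": "user", "content": "Say hello in one sentence."}]))
```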
xinference/model/llm/llm_family_modelscope.json
CHANGED

@@ -3738,6 +3738,241 @@
       "<start_of_turn>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "gemma-3-1b-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 1,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "LLM-Research/gemma-3-1b-it",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 1,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "bf16"
+        ],
+        "model_id": "bartowski/google_gemma-3-1b-it-GGUF",
+        "model_file_name_template": "google_gemma-3-1b-it-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 1,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-1b-it-{quantization}",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
+    "stop_token_ids": [
+      1,
+      106,
+      107
+    ],
+    "stop": [
+      "<eos>",
+      "<end_of_turn>",
+      "<start_of_turn>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "gemma-3-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "LLM-Research/gemma-3-4b-it",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "LLM-Research/gemma-3-12b-it",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "LLM-Research/gemma-3-27b-it",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "bf16"
+        ],
+        "model_id": "bartowski/google_gemma-3-4b-it-GGUF",
+        "model_file_name_template": "google_gemma-3-4b-it-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "bf16"
+        ],
+        "model_id": "bartowski/google_gemma-3-12b-it-GGUF",
+        "model_file_name_template": "google_gemma-3-12b-it-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "bf16"
+        ],
+        "model_id": "bartowski/google_gemma-3-27b-it-GGUF",
+        "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-4b-it-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-12b-it-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "4bit",
+          "6bit",
+          "8bit",
+          "fp16"
+        ],
+        "model_id": "mlx-community/gemma-3-27b-it-{quantization}",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
+    "stop_token_ids": [
+      1,
+      106,
+      107
+    ],
+    "stop": [
+      "<eos>",
+      "<end_of_turn>",
+      "<start_of_turn>"
+    ]
+  },
   {
     "version":1,
     "context_length":2048,
@@ -4673,7 +4908,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO
+        "model_id": "OpenGVLab/InternVL2_5-1B-MPO",
         "model_revision": "master"
       },
       {
@@ -4685,7 +4920,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO
+        "model_id": "OpenGVLab/InternVL2_5-2B-MPO",
         "model_revision": "master"
       },
       {
@@ -4697,7 +4932,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO
+        "model_id": "OpenGVLab/InternVL2_5-4B-MPO",
         "model_revision": "master"
       },
       {
@@ -4707,7 +4942,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO-
+        "model_id": "OpenGVLab/InternVL2_5-4B-MPO-AWQ",
         "model_revision": "master"
       },
       {
@@ -4719,7 +4954,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO
+        "model_id": "OpenGVLab/InternVL2_5-8B-MPO",
         "model_revision": "master"
       },
       {
@@ -4729,7 +4964,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO-
+        "model_id": "OpenGVLab/InternVL2_5-8B-MPO-AWQ",
         "model_revision": "master"
       },
       {
@@ -4741,7 +4976,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO
+        "model_id": "OpenGVLab/InternVL2_5-26B-MPO",
         "model_revision": "master"
       },
       {
@@ -4751,7 +4986,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO-
+        "model_id": "OpenGVLab/InternVL2_5-26B-MPO-AWQ",
         "model_revision": "master"
       },
       {
@@ -4763,7 +4998,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO
+        "model_id": "OpenGVLab/InternVL2_5-38B-MPO",
         "model_revision": "master"
       },
       {
@@ -4773,7 +5008,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO-
+        "model_id": "OpenGVLab/InternVL2_5-38B-MPO-AWQ",
         "model_revision": "master"
       },
       {
@@ -4785,7 +5020,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO
+        "model_id": "OpenGVLab/InternVL2_5-78B-MPO",
         "model_revision": "master"
       },
       {
@@ -4795,7 +5030,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-MPO-
+        "model_id": "OpenGVLab/InternVL2_5-78B-MPO-AWQ",
         "model_revision": "master"
       }
     ],
@@ -5657,7 +5892,7 @@
        "model_hub": "modelscope"
      }
    ],
-    "chat_template": "{% if
+    "chat_template": "{% if messages %} {% if system or tools %} {% if system %} {{ system }} {% endif %} {% if tools %} {# Handle tools here if needed #} {% endif %} {% endif %} {% for message in messages %} {% set last = loop.index == loop.length %} {% if message.role == \"user\" %} <|User|> {% if tools and last %} Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}. Do not use variables. {{ tools }} {% endif %} {{ message.content }} {% if last %} <|Assistant|> {% endif %} {% elif message.role == \"assistant\" %} <|Assistant|> {% if message.tool_calls %} <|tool▁calls▁begin|> {% for tool in message.tool_calls %} <|tool▁call▁begin|> {\"name\": \"{{ tool.function.name }}\", \"parameters\": {{ tool.function.arguments }}} <|tool▁call▁end|> {% endfor %} <|tool▁calls▁end|> {% else %} {{ message.content }} {% if not last %} <|end▁of▁sentence|> {% endif %} {% endif %} {% elif message.role == \"tool\" %} <|tool▁outputs▁begin|> <|tool▁output▁begin|> {{ message.content }} <|tool▁output▁end|> <|tool▁outputs▁end|> {% if last and message.role != \"assistant\" %} <|Assistant|> {% endif %} {% endif %} {% endfor %} {% else %} {% if system %} {{ system }} {% endif %} {% if prompt %} <|User|> {{ prompt }} {% endif %} <|Assistant|> {{ response }} {% if response %} {{ response }} {% endif %} {% endif %}",
    "stop_token_ids": [
      1
    ],
@@ -7217,7 +7452,7 @@
        ],
        "model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
        "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf",
-
+        "model_hub": "modelscope"
      }
    ],
    "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
@@ -7234,7 +7469,7 @@
    },
    {
      "version": 1,
-      "context_length":
+      "context_length": 131072,
      "model_name": "QwQ-32B",
      "model_lang": [
        "en",
@@ -7284,14 +7519,14 @@
        "model_size_in_billions": 32,
        "quantizations": [
          "fp16",
-          "
-          "
-          "
-          "
-          "
-          "
-          "
-          "
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
        ],
        "model_id": "Qwen/QwQ-32B-GGUF",
        "model_file_name_template": "qwq-32b-{quantization}.gguf",
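Both gemma-3 entries above carry the same Jinja chat template, which maps the assistant role to "model" and wraps every turn in <start_of_turn>/<end_of_turn> markers. A quick way to see what such a template produces is to render one with jinja2 directly; the snippet below is a trimmed-down stand-in for the registered template (the real one also handles system prompts, multimodal content items, and role-alternation checks via a raise_exception helper supplied by the renderer), so treat it as an illustration rather than the shipped template.

```python
# Sketch: render a Gemma-style chat template with jinja2 to inspect the prompt text.
# The template here is a simplified stand-in for the chat_template registered above.
from jinja2 import Environment, BaseLoader

TEMPLATE = (
    "{{ bos_token }}"
    "{%- for message in messages -%}"
    "{%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}"
    "{{ '<start_of_turn>' ~ role ~ '\n' ~ (message['content'] | trim) ~ '<end_of_turn>\n' }}"
    "{%- endfor -%}"
    "{%- if add_generation_prompt -%}{{ '<start_of_turn>model\n' }}{%- endif -%}"
)

prompt = Environment(loader=BaseLoader()).from_string(TEMPLATE).render(
    bos_token="<bos>",
    messages=[{"role": "user", "content": "Describe this release in one line."}],
    add_generation_prompt=True,
)
print(prompt)
# <bos><start_of_turn>user
# Describe this release in one line.<end_of_turn>
# <start_of_turn>model
```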
xinference/model/llm/mlx/core.py
CHANGED

@@ -148,11 +148,15 @@ class MLXModel(LLM):
         self._max_kv_size = kwargs.get("max_kv_size", None)
         self._prompt_cache = PromptCache()
 
-
+        model, tokenizer = load(
             self.model_path,
             tokenizer_config=tokenizer_config,
             model_config=self._model_config,
         )
+        if stop_token_ids := self.model_family.stop_token_ids:
+            for stop_token_id in stop_token_ids:
+                tokenizer.add_eos_token(stop_token_id)
+        return model, tokenizer
 
     def load(self):
         reasoning_content = self._model_config.pop("reasoning_content")
@@ -260,7 +264,7 @@ class MLXModel(LLM):
         start = time.time()
         output = ""
         tokens = []
-        for
+        for i, chunk_resp in enumerate(
             self._generate_stream_inner(
                 prompt_token_ids=prompt_token_ids,
                 max_tokens=max_tokens,
@@ -269,8 +273,7 @@ class MLXModel(LLM):
                 repetition_penalty=kwargs["repetition_penalty"],
                 repetition_context_size=kwargs["repetition_context_size"],
                 prompt_cache=self._prompt_cache.cache if self._prompt_cache else None,  # type: ignore
-            )
-            range(max_tokens),
+            )
         ):
             token = chunk_resp.token
             tokens.append(token)
@@ -435,10 +438,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
         if tools:
-            if
+            if (
+                model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in DEEPSEEK_TOOL_CALL_FAMILY
+            ):
                 full_context_kwargs["tools"] = tools
-            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
-                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
@@ -507,19 +511,19 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         from mlx_lm.utils import GenerationResponse
         from mlx_vlm.utils import generate_step
 
-        inputs = kwargs
+        inputs = kwargs.pop("prompt_token_ids")
 
-
+        extra_kwargs = kwargs.copy()
         input_ids, pixel_values, mask, kwargs = inputs
+        kwargs.update(extra_kwargs)
 
         tokenizer = self._processor.tokenizer
         detokenizer = self._processor.detokenizer
 
         detokenizer.reset()
         tic = time.perf_counter()
-        for (token, logprobs)
+        for n, (token, logprobs) in enumerate(
             generate_step(input_ids, self._model, pixel_values, mask, **kwargs),
-            range(max_tokens),
         ):
             if n == 0:
                 prompt_time = time.perf_counter() - tic
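The loader change above is what makes the new stop_token_ids (1, 106, 107 for gemma-3) take effect on the MLX backend: they are registered as extra EOS tokens immediately after load(). The same idea, sketched standalone with mlx_lm and assuming an mlx_lm version whose tokenizer wrapper exposes add_eos_token (the method the patched code relies on); the model path and token ids are illustrative.

```python
# Sketch: mirror what the patched MLXModel loader now does — load an MLX model and
# register a model family's stop_token_ids as extra EOS tokens on the tokenizer wrapper.
# Assumes mlx_lm is installed and its tokenizer wrapper provides add_eos_token.
from mlx_lm import load

model, tokenizer = load("mlx-community/gemma-3-4b-it-4bit")  # illustrative path

for stop_token_id in (1, 106, 107):  # stop_token_ids from the gemma-3 family definition
    tokenizer.add_eos_token(stop_token_id)
```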
xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py}
RENAMED

@@ -1,20 +1,17 @@
 import re
 from typing import Optional, Tuple, Union
 
-from
-from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+from ...types import ChatCompletionChunkDelta, CompletionChoice
 
 
-
-
-@ReasoningParserManager.register_module("deepseek-r1-distill-llama")
-class DeepSeekR1ReasoningParser(ReasoningParser):
-    """Reasoning parser for DeepSeek-R1 model."""
+class ReasoningParser:
+    """Reasoning parser for reasoning model."""
 
     def __init__(
         self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
     ):
-
+        self.reasoning_start_tag = reasoning_start_tag
+        self.reasoning_end_tag = reasoning_end_tag
         self.reasoning_regex = re.compile(
             rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
         )
@@ -34,9 +31,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
         Yields:
             str: Extracted reasoning content chunks.
         """
-        delta = ChatCompletionChunkDelta(
-            content=delta_text,
-        )
+        delta = ChatCompletionChunkDelta()
 
         # Check if <think> is present in previous or delta.
         # Keep compatibility with models that don't generate <think> tokens.
@@ -50,17 +45,21 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
                 delta["reasoning_content"] = reasoning_content
                 if content is not None:
                     delta["content"] = content
+                else:
+                    delta["content"] = None
                 return delta
             elif self.reasoning_end_tag in previous_text:
                 # <think> in previous, </think> in previous,
                 # <think> in previous, </think> in previous,
                 # reasoning content ends
+                delta["reasoning_content"] = None
+                delta["content"] = delta_text
                 return delta
             else:
                 # <think> in previous, no </think> in previous or delta,
                 # reasoning content continues
                 delta["reasoning_content"] = delta_text
-                delta["content"] =
+                delta["content"] = None
                 return delta
         elif self.reasoning_start_tag in delta_text:
             if self.reasoning_end_tag in delta_text:
@@ -74,12 +73,14 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
                 delta["reasoning_content"] = reasoning_content
                 if content is not None:
                     delta["content"] = content
+                else:
+                    delta["content"] = None
                 return delta
             else:
                 # <think> in delta, no </think> in delta,
                 # reasoning content continues
                 delta["reasoning_content"] = delta_text
-                delta["content"] =
+                delta["content"] = None
                 return delta
         else:
             # No <think> in previous or delta, also need to check for </think>.
@@ -94,14 +95,18 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
                 delta["reasoning_content"] = reasoning_content
                 if content is not None:
                     delta["content"] = content
+                else:
+                    delta["content"] = None
                 return delta
             elif self.reasoning_end_tag in previous_text:
                 # </think> in previous, thinking content ends
+                delta["reasoning_content"] = None
+                delta["content"] = delta_text
                 return delta
             else:
                 # no </think> in previous or delta, reasoning content continues
                 delta["reasoning_content"] = delta_text
-                delta["content"] =
+                delta["content"] = None
                 return delta
 
     def extract_reasoning_content(
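After the rename, the parser is model-agnostic: all it holds is a pair of configurable tags and a regex that splits reasoning from answer text. A self-contained sketch of the non-streaming extraction it performs (re-implemented here for illustration rather than imported; the real class lives in xinference.model.llm.reasoning_parser and additionally emits per-chunk deltas with reasoning_content and content fields):

```python
# Sketch: the tag-splitting behaviour the generic ReasoningParser provides.
import re


def extract_reasoning(text: str, start_tag: str = "<think>", end_tag: str = "</think>"):
    """Return (reasoning_content, content) split around the reasoning tags."""
    pattern = re.compile(rf"{start_tag}(.*?){end_tag}", re.DOTALL)
    match = pattern.search(text)
    if match is None:
        # No reasoning block: everything is ordinary content.
        return None, text
    reasoning = match.group(1).strip()
    content = (text[: match.start()] + text[match.end() :]).strip()
    return reasoning, content or None


print(extract_reasoning("<think>Check units first.</think>The answer is 42."))
# -> ('Check units first.', 'The answer is 42.')
```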
xinference/model/llm/transformers/core.py
CHANGED

@@ -79,6 +79,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "glm-edge-v",
     "QvQ-72B-Preview",
     "cogagent",
+    "gemma-3-1b-it",
+    "gemma-3-it",
 ]
 
 
@@ -691,10 +693,9 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             tools
             and model_family in QWEN_TOOL_CALL_FAMILY
             or model_family in LLAMA3_TOOL_CALL_FAMILY
+            or model_family in DEEPSEEK_TOOL_CALL_FAMILY
         ):
             full_context_kwargs["tools"] = tools
-        elif tools and model_family in DEEPSEEK_TOOL_CALL_FAMILY:
-            self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages,