xinference-1.3.1-py3-none-any.whl → xinference-1.4.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the xinference package, as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (45)
  1. xinference/_version.py +3 -3
  2. xinference/core/chat_interface.py +39 -24
  3. xinference/model/llm/__init__.py +3 -0
  4. xinference/model/llm/core.py +2 -5
  5. xinference/model/llm/llama_cpp/core.py +52 -16
  6. xinference/model/llm/llm_family.json +364 -21
  7. xinference/model/llm/llm_family_modelscope.json +258 -23
  8. xinference/model/llm/mlx/core.py +15 -11
  9. xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py} +19 -14
  10. xinference/model/llm/sglang/core.py +2 -0
  11. xinference/model/llm/transformers/core.py +3 -2
  12. xinference/model/llm/transformers/gemma3.py +185 -0
  13. xinference/model/llm/transformers/intern_vl.py +0 -2
  14. xinference/model/llm/utils.py +78 -32
  15. xinference/model/llm/vllm/core.py +10 -3
  16. xinference/types.py +2 -2
  17. xinference/web/ui/build/asset-manifest.json +6 -6
  18. xinference/web/ui/build/index.html +1 -1
  19. xinference/web/ui/build/static/css/main.b494ae7e.css +2 -0
  20. xinference/web/ui/build/static/css/main.b494ae7e.css.map +1 -0
  21. xinference/web/ui/build/static/js/main.3cea968e.js +3 -0
  22. xinference/web/ui/build/static/js/main.3cea968e.js.map +1 -0
  23. xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +1 -0
  24. xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +1 -0
  25. xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +1 -0
  26. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +1 -0
  27. xinference/web/ui/src/locales/en.json +2 -2
  28. xinference/web/ui/src/locales/zh.json +1 -1
  29. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/METADATA +3 -3
  30. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/RECORD +35 -36
  31. xinference/model/llm/reasoning_parsers/__init__.py +0 -13
  32. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +0 -98
  33. xinference/web/ui/build/static/css/main.f8177338.css +0 -2
  34. xinference/web/ui/build/static/css/main.f8177338.css.map +0 -1
  35. xinference/web/ui/build/static/js/main.55b70cb7.js +0 -3
  36. xinference/web/ui/build/static/js/main.55b70cb7.js.map +0 -1
  37. xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +0 -1
  38. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +0 -1
  39. xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +0 -1
  40. xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +0 -1
  41. xinference/web/ui/build/static/js/{main.55b70cb7.js.LICENSE.txt → main.3cea968e.js.LICENSE.txt} +0 -0
  42. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/LICENSE +0 -0
  43. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/WHEEL +0 -0
  44. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/entry_points.txt +0 -0
  45. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/top_level.txt +0 -0
@@ -3738,6 +3738,241 @@
3738
3738
  "<start_of_turn>"
3739
3739
  ]
3740
3740
  },
3741
+ {
3742
+ "version": 1,
3743
+ "context_length": 32768,
3744
+ "model_name": "gemma-3-1b-it",
3745
+ "model_lang": [
3746
+ "en"
3747
+ ],
3748
+ "model_ability": [
3749
+ "chat"
3750
+ ],
3751
+ "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
3752
+ "model_specs": [
3753
+ {
3754
+ "model_format": "pytorch",
3755
+ "model_size_in_billions": 1,
3756
+ "quantizations": [
3757
+ "none",
3758
+ "4-bit",
3759
+ "8-bit"
3760
+ ],
3761
+ "model_id": "LLM-Research/gemma-3-1b-it",
3762
+ "model_hub": "modelscope"
3763
+ },
3764
+ {
3765
+ "model_format": "ggufv2",
3766
+ "model_size_in_billions": 1,
3767
+ "quantizations": [
3768
+ "Q2_K",
3769
+ "Q3_K_L",
3770
+ "Q3_K_M",
3771
+ "Q3_K_S",
3772
+ "Q4_K_L",
3773
+ "Q4_K_M",
3774
+ "Q4_K_S",
3775
+ "Q5_K_L",
3776
+ "Q5_K_M",
3777
+ "Q5_K_S",
3778
+ "Q6_K",
3779
+ "Q6_K_L",
3780
+ "Q8_0",
3781
+ "bf16"
3782
+ ],
3783
+ "model_id": "bartowski/google_gemma-3-1b-it-GGUF",
3784
+ "model_file_name_template": "google_gemma-3-1b-it-{quantization}.gguf",
3785
+ "model_hub": "modelscope"
3786
+ },
3787
+ {
3788
+ "model_format": "mlx",
3789
+ "model_size_in_billions": 1,
3790
+ "quantizations": [
3791
+ "4bit",
3792
+ "6bit",
3793
+ "8bit",
3794
+ "fp16"
3795
+ ],
3796
+ "model_id": "mlx-community/gemma-3-1b-it-{quantization}",
3797
+ "model_hub": "modelscope"
3798
+ }
3799
+ ],
3800
+ "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
3801
+ "stop_token_ids": [
3802
+ 1,
3803
+ 106,
3804
+ 107
3805
+ ],
3806
+ "stop": [
3807
+ "<eos>",
3808
+ "<end_of_turn>",
3809
+ "<start_of_turn>"
3810
+ ]
3811
+ },
3812
+ {
3813
+ "version": 1,
3814
+ "context_length": 131072,
3815
+ "model_name": "gemma-3-it",
3816
+ "model_lang": [
3817
+ "en"
3818
+ ],
3819
+ "model_ability": [
3820
+ "chat",
3821
+ "vision"
3822
+ ],
3823
+ "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
3824
+ "model_specs": [
3825
+ {
3826
+ "model_format": "pytorch",
3827
+ "model_size_in_billions": 4,
3828
+ "quantizations": [
3829
+ "none",
3830
+ "4-bit",
3831
+ "8-bit"
3832
+ ],
3833
+ "model_id": "LLM-Research/gemma-3-4b-it",
3834
+ "model_hub": "modelscope"
3835
+ },
3836
+ {
3837
+ "model_format": "pytorch",
3838
+ "model_size_in_billions": 12,
3839
+ "quantizations": [
3840
+ "none",
3841
+ "4-bit",
3842
+ "8-bit"
3843
+ ],
3844
+ "model_id": "LLM-Research/gemma-3-12b-it",
3845
+ "model_hub": "modelscope"
3846
+ },
3847
+ {
3848
+ "model_format": "pytorch",
3849
+ "model_size_in_billions": 27,
3850
+ "quantizations": [
3851
+ "none",
3852
+ "4-bit",
3853
+ "8-bit"
3854
+ ],
3855
+ "model_id": "LLM-Research/gemma-3-27b-it",
3856
+ "model_hub": "modelscope"
3857
+ },
3858
+ {
3859
+ "model_format": "ggufv2",
3860
+ "model_size_in_billions": 4,
3861
+ "quantizations": [
3862
+ "Q2_K",
3863
+ "Q3_K_L",
3864
+ "Q3_K_M",
3865
+ "Q3_K_S",
3866
+ "Q4_K_L",
3867
+ "Q4_K_M",
3868
+ "Q4_K_S",
3869
+ "Q5_K_L",
3870
+ "Q5_K_M",
3871
+ "Q5_K_S",
3872
+ "Q6_K",
3873
+ "Q6_K_L",
3874
+ "Q8_0",
3875
+ "bf16"
3876
+ ],
3877
+ "model_id": "bartowski/google_gemma-3-4b-it-GGUF",
3878
+ "model_file_name_template": "google_gemma-3-4b-it-{quantization}.gguf",
3879
+ "model_hub": "modelscope"
3880
+ },
3881
+ {
3882
+ "model_format": "ggufv2",
3883
+ "model_size_in_billions": 12,
3884
+ "quantizations": [
3885
+ "Q2_K",
3886
+ "Q3_K_L",
3887
+ "Q3_K_M",
3888
+ "Q3_K_S",
3889
+ "Q4_K_L",
3890
+ "Q4_K_M",
3891
+ "Q4_K_S",
3892
+ "Q5_K_L",
3893
+ "Q5_K_M",
3894
+ "Q5_K_S",
3895
+ "Q6_K",
3896
+ "Q6_K_L",
3897
+ "Q8_0",
3898
+ "bf16"
3899
+ ],
3900
+ "model_id": "bartowski/google_gemma-3-12b-it-GGUF",
3901
+ "model_file_name_template": "google_gemma-3-12b-it-{quantization}.gguf",
3902
+ "model_hub": "modelscope"
3903
+ },
3904
+ {
3905
+ "model_format": "ggufv2",
3906
+ "model_size_in_billions": 27,
3907
+ "quantizations": [
3908
+ "Q2_K",
3909
+ "Q3_K_L",
3910
+ "Q3_K_M",
3911
+ "Q3_K_S",
3912
+ "Q4_K_L",
3913
+ "Q4_K_M",
3914
+ "Q4_K_S",
3915
+ "Q5_K_L",
3916
+ "Q5_K_M",
3917
+ "Q5_K_S",
3918
+ "Q6_K",
3919
+ "Q6_K_L",
3920
+ "Q8_0",
3921
+ "bf16"
3922
+ ],
3923
+ "model_id": "bartowski/google_gemma-3-27b-it-GGUF",
3924
+ "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf",
3925
+ "model_hub": "modelscope"
3926
+ },
3927
+ {
3928
+ "model_format": "mlx",
3929
+ "model_size_in_billions": 4,
3930
+ "quantizations": [
3931
+ "4bit",
3932
+ "6bit",
3933
+ "8bit",
3934
+ "fp16"
3935
+ ],
3936
+ "model_id": "mlx-community/gemma-3-4b-it-{quantization}",
3937
+ "model_hub": "modelscope"
3938
+ },
3939
+ {
3940
+ "model_format": "mlx",
3941
+ "model_size_in_billions": 12,
3942
+ "quantizations": [
3943
+ "4bit",
3944
+ "6bit",
3945
+ "8bit",
3946
+ "fp16"
3947
+ ],
3948
+ "model_id": "mlx-community/gemma-3-12b-it-{quantization}",
3949
+ "model_hub": "modelscope"
3950
+ },
3951
+ {
3952
+ "model_format": "mlx",
3953
+ "model_size_in_billions": 27,
3954
+ "quantizations": [
3955
+ "4bit",
3956
+ "6bit",
3957
+ "8bit",
3958
+ "fp16"
3959
+ ],
3960
+ "model_id": "mlx-community/gemma-3-27b-it-{quantization}",
3961
+ "model_hub": "modelscope"
3962
+ }
3963
+ ],
3964
+ "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
3965
+ "stop_token_ids": [
3966
+ 1,
3967
+ 106,
3968
+ 107
3969
+ ],
3970
+ "stop": [
3971
+ "<eos>",
3972
+ "<end_of_turn>",
3973
+ "<start_of_turn>"
3974
+ ]
3975
+ },
3741
3976
  {
3742
3977
  "version":1,
3743
3978
  "context_length":2048,
@@ -4673,7 +4908,7 @@
4673
4908
  "none"
4674
4909
  ],
4675
4910
  "model_hub": "modelscope",
4676
- "model_id": "OpenGVLab/InternVL2_5-MPO-1B",
4911
+ "model_id": "OpenGVLab/InternVL2_5-1B-MPO",
4677
4912
  "model_revision": "master"
4678
4913
  },
4679
4914
  {
@@ -4685,7 +4920,7 @@
4685
4920
  "none"
4686
4921
  ],
4687
4922
  "model_hub": "modelscope",
4688
- "model_id": "OpenGVLab/InternVL2_5-MPO-2B",
4923
+ "model_id": "OpenGVLab/InternVL2_5-2B-MPO",
4689
4924
  "model_revision": "master"
4690
4925
  },
4691
4926
  {
@@ -4697,7 +4932,7 @@
4697
4932
  "none"
4698
4933
  ],
4699
4934
  "model_hub": "modelscope",
4700
- "model_id": "OpenGVLab/InternVL2_5-MPO-4B",
4935
+ "model_id": "OpenGVLab/InternVL2_5-4B-MPO",
4701
4936
  "model_revision": "master"
4702
4937
  },
4703
4938
  {
@@ -4707,7 +4942,7 @@
4707
4942
  "Int4"
4708
4943
  ],
4709
4944
  "model_hub": "modelscope",
4710
- "model_id": "OpenGVLab/InternVL2_5-MPO-4B-AWQ",
4945
+ "model_id": "OpenGVLab/InternVL2_5-4B-MPO-AWQ",
4711
4946
  "model_revision": "master"
4712
4947
  },
4713
4948
  {
@@ -4719,7 +4954,7 @@
4719
4954
  "none"
4720
4955
  ],
4721
4956
  "model_hub": "modelscope",
4722
- "model_id": "OpenGVLab/InternVL2_5-MPO-8B",
4957
+ "model_id": "OpenGVLab/InternVL2_5-8B-MPO",
4723
4958
  "model_revision": "master"
4724
4959
  },
4725
4960
  {
@@ -4729,7 +4964,7 @@
4729
4964
  "Int4"
4730
4965
  ],
4731
4966
  "model_hub": "modelscope",
4732
- "model_id": "OpenGVLab/InternVL2_5-MPO-8B-AWQ",
4967
+ "model_id": "OpenGVLab/InternVL2_5-8B-MPO-AWQ",
4733
4968
  "model_revision": "master"
4734
4969
  },
4735
4970
  {
@@ -4741,7 +4976,7 @@
4741
4976
  "none"
4742
4977
  ],
4743
4978
  "model_hub": "modelscope",
4744
- "model_id": "OpenGVLab/InternVL2_5-MPO-26B",
4979
+ "model_id": "OpenGVLab/InternVL2_5-26B-MPO",
4745
4980
  "model_revision": "master"
4746
4981
  },
4747
4982
  {
@@ -4751,7 +4986,7 @@
4751
4986
  "Int4"
4752
4987
  ],
4753
4988
  "model_hub": "modelscope",
4754
- "model_id": "OpenGVLab/InternVL2_5-MPO-26B-AWQ",
4989
+ "model_id": "OpenGVLab/InternVL2_5-26B-MPO-AWQ",
4755
4990
  "model_revision": "master"
4756
4991
  },
4757
4992
  {
@@ -4763,7 +4998,7 @@
4763
4998
  "none"
4764
4999
  ],
4765
5000
  "model_hub": "modelscope",
4766
- "model_id": "OpenGVLab/InternVL2_5-MPO-38B",
5001
+ "model_id": "OpenGVLab/InternVL2_5-38B-MPO",
4767
5002
  "model_revision": "master"
4768
5003
  },
4769
5004
  {
@@ -4773,7 +5008,7 @@
4773
5008
  "Int4"
4774
5009
  ],
4775
5010
  "model_hub": "modelscope",
4776
- "model_id": "OpenGVLab/InternVL2_5-MPO-38B-AWQ",
5011
+ "model_id": "OpenGVLab/InternVL2_5-38B-MPO-AWQ",
4777
5012
  "model_revision": "master"
4778
5013
  },
4779
5014
  {
@@ -4785,7 +5020,7 @@
4785
5020
  "none"
4786
5021
  ],
4787
5022
  "model_hub": "modelscope",
4788
- "model_id": "OpenGVLab/InternVL2_5-MPO-78B",
5023
+ "model_id": "OpenGVLab/InternVL2_5-78B-MPO",
4789
5024
  "model_revision": "master"
4790
5025
  },
4791
5026
  {
@@ -4795,7 +5030,7 @@
4795
5030
  "Int4"
4796
5031
  ],
4797
5032
  "model_hub": "modelscope",
4798
- "model_id": "OpenGVLab/InternVL2_5-MPO-78B-AWQ",
5033
+ "model_id": "OpenGVLab/InternVL2_5-78B-MPO-AWQ",
4799
5034
  "model_revision": "master"
4800
5035
  }
4801
5036
  ],
@@ -5657,7 +5892,7 @@
5657
5892
  "model_hub": "modelscope"
5658
5893
  }
5659
5894
  ],
5660
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁callend|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
5895
+ "chat_template": "{% if messages %} {% if system or tools %} {% if system %} {{ system }} {% endif %} {% if tools %} {# Handle tools here if needed #} {% endif %} {% endif %} {% for message in messages %} {% set last = loop.index == loop.length %} {% if message.role == \"user\" %} <|User|> {% if tools and last %} Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}. Do not use variables. {{ tools }} {% endif %} {{ message.content }} {% if last %} <|Assistant|> {% endif %} {% elif message.role == \"assistant\" %} <|Assistant|> {% if message.tool_calls %} <|tool▁callsbegin|> {% for tool in message.tool_calls %} <|tool▁call▁begin|> {\"name\": \"{{ tool.function.name }}\", \"parameters\": {{ tool.function.arguments }}} <|tool▁call▁end|> {% endfor %} <|tool▁calls▁end|> {% else %} {{ message.content }} {% if not last %} <|end▁of▁sentence|> {% endif %} {% endif %} {% elif message.role == \"tool\" %} <|tool▁outputs▁begin|> <|tool▁output▁begin|> {{ message.content }} <|tool▁output▁end|> <|tool▁outputs▁end|> {% if last and message.role != \"assistant\" %} <|Assistant|> {% endif %} {% endif %} {% endfor %} {% else %} {% if system %} {{ system }} {% endif %} {% if prompt %} <|User|> {{ prompt }} {% endif %} <|Assistant|> {{ response }} {% if response %} {{ response }} {% endif %} {% endif %}",
5661
5896
  "stop_token_ids": [
5662
5897
  1
5663
5898
  ],
@@ -7217,7 +7452,7 @@
7217
7452
  ],
7218
7453
  "model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
7219
7454
  "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf",
7220
- "model_hub": "modelscope"
7455
+ "model_hub": "modelscope"
7221
7456
  }
7222
7457
  ],
7223
7458
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
@@ -7234,7 +7469,7 @@
7234
7469
  },
7235
7470
  {
7236
7471
  "version": 1,
7237
- "context_length": 32768,
7472
+ "context_length": 131072,
7238
7473
  "model_name": "QwQ-32B",
7239
7474
  "model_lang": [
7240
7475
  "en",
@@ -7284,14 +7519,14 @@
7284
7519
  "model_size_in_billions": 32,
7285
7520
  "quantizations": [
7286
7521
  "fp16",
7287
- "Q2_k",
7288
- "Q3_K_M",
7289
- "Q4_0",
7290
- "Q4_K_M",
7291
- "Q5_0",
7292
- "Q5_K_M",
7293
- "Q6_K",
7294
- "Q8_0"
7522
+ "q2_k",
7523
+ "q3_k_m",
7524
+ "q4_0",
7525
+ "q4_k_m",
7526
+ "q5_0",
7527
+ "q5_k_m",
7528
+ "q6_k",
7529
+ "q8_0"
7295
7530
  ],
7296
7531
  "model_id": "Qwen/QwQ-32B-GGUF",
7297
7532
  "model_file_name_template": "qwq-32b-{quantization}.gguf",
@@ -148,11 +148,15 @@ class MLXModel(LLM):
148
148
  self._max_kv_size = kwargs.get("max_kv_size", None)
149
149
  self._prompt_cache = PromptCache()
150
150
 
151
- return load(
151
+ model, tokenizer = load(
152
152
  self.model_path,
153
153
  tokenizer_config=tokenizer_config,
154
154
  model_config=self._model_config,
155
155
  )
156
+ if stop_token_ids := self.model_family.stop_token_ids:
157
+ for stop_token_id in stop_token_ids:
158
+ tokenizer.add_eos_token(stop_token_id)
159
+ return model, tokenizer
156
160
 
157
161
  def load(self):
158
162
  reasoning_content = self._model_config.pop("reasoning_content")
@@ -260,7 +264,7 @@ class MLXModel(LLM):
260
264
  start = time.time()
261
265
  output = ""
262
266
  tokens = []
263
- for chunk_resp, i in zip(
267
+ for i, chunk_resp in enumerate(
264
268
  self._generate_stream_inner(
265
269
  prompt_token_ids=prompt_token_ids,
266
270
  max_tokens=max_tokens,
@@ -269,8 +273,7 @@ class MLXModel(LLM):
269
273
  repetition_penalty=kwargs["repetition_penalty"],
270
274
  repetition_context_size=kwargs["repetition_context_size"],
271
275
  prompt_cache=self._prompt_cache.cache if self._prompt_cache else None, # type: ignore
272
- ),
273
- range(max_tokens),
276
+ )
274
277
  ):
275
278
  token = chunk_resp.token
276
279
  tokens.append(token)
@@ -435,10 +438,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):
435
438
  tools = generate_config.pop("tools", []) if generate_config else None
436
439
  full_context_kwargs = {}
437
440
  if tools:
438
- if model_family in QWEN_TOOL_CALL_FAMILY:
441
+ if (
442
+ model_family in QWEN_TOOL_CALL_FAMILY
443
+ or model_family in DEEPSEEK_TOOL_CALL_FAMILY
444
+ ):
439
445
  full_context_kwargs["tools"] = tools
440
- elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
441
- self._tools_to_messages_for_deepseek(messages, tools)
442
446
  assert self.model_family.chat_template is not None
443
447
  full_prompt = self.get_full_context(
444
448
  messages, self.model_family.chat_template, **full_context_kwargs
@@ -507,19 +511,19 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
507
511
  from mlx_lm.utils import GenerationResponse
508
512
  from mlx_vlm.utils import generate_step
509
513
 
510
- inputs = kwargs["prompt_token_ids"]
514
+ inputs = kwargs.pop("prompt_token_ids")
511
515
 
512
- max_tokens = kwargs.pop("max_tokens")
516
+ extra_kwargs = kwargs.copy()
513
517
  input_ids, pixel_values, mask, kwargs = inputs
518
+ kwargs.update(extra_kwargs)
514
519
 
515
520
  tokenizer = self._processor.tokenizer
516
521
  detokenizer = self._processor.detokenizer
517
522
 
518
523
  detokenizer.reset()
519
524
  tic = time.perf_counter()
520
- for (token, logprobs), n in zip(
525
+ for n, (token, logprobs) in enumerate(
521
526
  generate_step(input_ids, self._model, pixel_values, mask, **kwargs),
522
- range(max_tokens),
523
527
  ):
524
528
  if n == 0:
525
529
  prompt_time = time.perf_counter() - tic
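
For readers unfamiliar with the iteration pattern used in the mlx/core.py hunks above, here is a standalone sketch with purely illustrative names: enumerate() numbers every chunk the generator yields, whereas zipping the stream against range(max_tokens) would also silently stop the loop after max_tokens items.

    def token_stream():
        # Stand-in for the model's streaming generator.
        for piece in ("Hel", "lo", ", ", "world", "!"):
            yield piece

    pieces = []
    for i, chunk in enumerate(token_stream()):
        if i == 0:
            pass  # e.g. record time-to-first-token here
        pieces.append(chunk)

    print("".join(pieces))  # Hello, world!
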
@@ -1,20 +1,17 @@
1
1
  import re
2
2
  from typing import Optional, Tuple, Union
3
3
 
4
- from ....types import ChatCompletionChunkDelta, CompletionChoice
5
- from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
4
+ from ...types import ChatCompletionChunkDelta, CompletionChoice
6
5
 
7
6
 
8
- @ReasoningParserManager.register_module("deepseek-v3")
9
- @ReasoningParserManager.register_module("deepseek-r1-distill-qwen")
10
- @ReasoningParserManager.register_module("deepseek-r1-distill-llama")
11
- class DeepSeekR1ReasoningParser(ReasoningParser):
12
- """Reasoning parser for DeepSeek-R1 model."""
7
+ class ReasoningParser:
8
+ """Reasoning parser for reasoning model."""
13
9
 
14
10
  def __init__(
15
11
  self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
16
12
  ):
17
- super().__init__(reasoning_start_tag, reasoning_end_tag)
13
+ self.reasoning_start_tag = reasoning_start_tag
14
+ self.reasoning_end_tag = reasoning_end_tag
18
15
  self.reasoning_regex = re.compile(
19
16
  rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
20
17
  )
@@ -34,9 +31,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
34
31
  Yields:
35
32
  str: Extracted reasoning content chunks.
36
33
  """
37
- delta = ChatCompletionChunkDelta(
38
- content=delta_text,
39
- )
34
+ delta = ChatCompletionChunkDelta()
40
35
 
41
36
  # Check if <think> is present in previous or delta.
42
37
  # Keep compatibility with models that don't generate <think> tokens.
@@ -50,17 +45,21 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
50
45
  delta["reasoning_content"] = reasoning_content
51
46
  if content is not None:
52
47
  delta["content"] = content
48
+ else:
49
+ delta["content"] = None
53
50
  return delta
54
51
  elif self.reasoning_end_tag in previous_text:
55
52
  # <think> in previous, </think> in previous,
56
53
  # <think> in previous, </think> in previous,
57
54
  # reasoning content ends
55
+ delta["reasoning_content"] = None
56
+ delta["content"] = delta_text
58
57
  return delta
59
58
  else:
60
59
  # <think> in previous, no </think> in previous or delta,
61
60
  # reasoning content continues
62
61
  delta["reasoning_content"] = delta_text
63
- delta["content"] = ""
62
+ delta["content"] = None
64
63
  return delta
65
64
  elif self.reasoning_start_tag in delta_text:
66
65
  if self.reasoning_end_tag in delta_text:
@@ -74,12 +73,14 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
74
73
  delta["reasoning_content"] = reasoning_content
75
74
  if content is not None:
76
75
  delta["content"] = content
76
+ else:
77
+ delta["content"] = None
77
78
  return delta
78
79
  else:
79
80
  # <think> in delta, no </think> in delta,
80
81
  # reasoning content continues
81
82
  delta["reasoning_content"] = delta_text
82
- delta["content"] = ""
83
+ delta["content"] = None
83
84
  return delta
84
85
  else:
85
86
  # No <think> in previous or delta, also need to check for </think>.
@@ -94,14 +95,18 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
94
95
  delta["reasoning_content"] = reasoning_content
95
96
  if content is not None:
96
97
  delta["content"] = content
98
+ else:
99
+ delta["content"] = None
97
100
  return delta
98
101
  elif self.reasoning_end_tag in previous_text:
99
102
  # </think> in previous, thinking content ends
103
+ delta["reasoning_content"] = None
104
+ delta["content"] = delta_text
100
105
  return delta
101
106
  else:
102
107
  # no </think> in previous or delta, reasoning content continues
103
108
  delta["reasoning_content"] = delta_text
104
- delta["content"] = ""
109
+ delta["content"] = None
105
110
  return delta
106
111
 
107
112
  def extract_reasoning_content(
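
To make the behaviour of the consolidated ReasoningParser above concrete, here is a minimal self-contained sketch of the tag-splitting idea; it reuses the regex shown in the hunk but omits the class, the streaming bookkeeping, and the ChatCompletionChunkDelta plumbing.

    import re

    reasoning_start_tag, reasoning_end_tag = "<think>", "</think>"
    reasoning_regex = re.compile(
        rf"{reasoning_start_tag}(.*?){reasoning_end_tag}", re.DOTALL
    )

    text = "<think>The user greeted me; answer briefly.</think>Hello! How can I help?"
    match = reasoning_regex.search(text)
    reasoning_content = match.group(1) if match else None
    content = text[match.end():] if match else text

    print(reasoning_content)  # The user greeted me; answer briefly.
    print(content)            # Hello! How can I help?
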
@@ -94,6 +94,8 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
94
94
  "mixtral-instruct-v0.1",
95
95
  "gemma-it",
96
96
  "gemma-2-it",
97
+ "gemma-3-1b-it",
98
+ "gemma-3-it",
97
99
  "deepseek-v2.5",
98
100
  "deepseek-v2-chat",
99
101
  "deepseek-v2-chat-0628",
@@ -79,6 +79,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
79
79
  "glm-edge-v",
80
80
  "QvQ-72B-Preview",
81
81
  "cogagent",
82
+ "gemma-3-1b-it",
83
+ "gemma-3-it",
82
84
  ]
83
85
 
84
86
 
@@ -691,10 +693,9 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
691
693
  tools
692
694
  and model_family in QWEN_TOOL_CALL_FAMILY
693
695
  or model_family in LLAMA3_TOOL_CALL_FAMILY
696
+ or model_family in DEEPSEEK_TOOL_CALL_FAMILY
694
697
  ):
695
698
  full_context_kwargs["tools"] = tools
696
- elif tools and model_family in DEEPSEEK_TOOL_CALL_FAMILY:
697
- self._tools_to_messages_for_deepseek(messages, tools)
698
699
  assert self.model_family.chat_template is not None
699
700
  full_prompt = self.get_full_context(
700
701
  messages,
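
Finally, a hedged usage sketch related to the tool-call change in transformers/core.py above, where DeepSeek-family models now receive tools through the chat template like the Qwen and Llama 3 families: a tool-call request can be sent through xinference's OpenAI-compatible endpoint. The base URL and model identifier are placeholders for a locally launched model.

    import openai

    client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-used")

    response = client.chat.completions.create(
        model="deepseek-v3",  # placeholder: name/uid of a launched DeepSeek-family model
        messages=[{"role": "user", "content": "What is the weather in Paris today?"}],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get the current weather for a city.",
                    "parameters": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                },
            }
        ],
    )
    print(response.choices[0].message)
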