xinference 1.3.1.post1__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (75)
  1. xinference/_compat.py +1 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +4 -0
  4. xinference/core/chat_interface.py +1 -1
  5. xinference/core/model.py +23 -3
  6. xinference/core/supervisor.py +6 -0
  7. xinference/core/worker.py +54 -11
  8. xinference/model/llm/__init__.py +7 -2
  9. xinference/model/llm/core.py +1 -0
  10. xinference/model/llm/llama_cpp/core.py +50 -15
  11. xinference/model/llm/llm_family.json +388 -13
  12. xinference/model/llm/llm_family_modelscope.json +373 -14
  13. xinference/model/llm/mlx/core.py +15 -11
  14. xinference/model/llm/reasoning_parser.py +17 -9
  15. xinference/model/llm/sglang/core.py +112 -12
  16. xinference/model/llm/transformers/core.py +4 -2
  17. xinference/model/llm/transformers/deepseek_vl.py +1 -1
  18. xinference/model/llm/transformers/deepseek_vl2.py +287 -0
  19. xinference/model/llm/transformers/gemma3.py +185 -0
  20. xinference/model/llm/transformers/intern_vl.py +0 -2
  21. xinference/model/llm/utils.py +62 -42
  22. xinference/model/llm/vllm/core.py +157 -11
  23. xinference/model/llm/vllm/distributed_executor.py +314 -0
  24. xinference/model/rerank/core.py +16 -11
  25. xinference/thirdparty/deepseek_vl2/__init__.py +31 -0
  26. xinference/thirdparty/deepseek_vl2/models/__init__.py +26 -0
  27. xinference/thirdparty/deepseek_vl2/models/configuration_deepseek.py +210 -0
  28. xinference/thirdparty/deepseek_vl2/models/conversation.py +310 -0
  29. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek.py +1975 -0
  30. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek_vl_v2.py +697 -0
  31. xinference/thirdparty/deepseek_vl2/models/processing_deepseek_vl_v2.py +675 -0
  32. xinference/thirdparty/deepseek_vl2/models/siglip_vit.py +661 -0
  33. xinference/thirdparty/deepseek_vl2/serve/__init__.py +0 -0
  34. xinference/thirdparty/deepseek_vl2/serve/app_modules/__init__.py +0 -0
  35. xinference/thirdparty/deepseek_vl2/serve/app_modules/gradio_utils.py +83 -0
  36. xinference/thirdparty/deepseek_vl2/serve/app_modules/overwrites.py +81 -0
  37. xinference/thirdparty/deepseek_vl2/serve/app_modules/presets.py +115 -0
  38. xinference/thirdparty/deepseek_vl2/serve/app_modules/utils.py +333 -0
  39. xinference/thirdparty/deepseek_vl2/serve/assets/Kelpy-Codos.js +100 -0
  40. xinference/thirdparty/deepseek_vl2/serve/assets/avatar.png +0 -0
  41. xinference/thirdparty/deepseek_vl2/serve/assets/custom.css +355 -0
  42. xinference/thirdparty/deepseek_vl2/serve/assets/custom.js +22 -0
  43. xinference/thirdparty/deepseek_vl2/serve/assets/favicon.ico +0 -0
  44. xinference/thirdparty/deepseek_vl2/serve/assets/simsun.ttc +0 -0
  45. xinference/thirdparty/deepseek_vl2/serve/inference.py +197 -0
  46. xinference/thirdparty/deepseek_vl2/utils/__init__.py +18 -0
  47. xinference/thirdparty/deepseek_vl2/utils/io.py +80 -0
  48. xinference/types.py +2 -2
  49. xinference/web/ui/build/asset-manifest.json +6 -6
  50. xinference/web/ui/build/index.html +1 -1
  51. xinference/web/ui/build/static/css/main.b494ae7e.css +2 -0
  52. xinference/web/ui/build/static/css/main.b494ae7e.css.map +1 -0
  53. xinference/web/ui/build/static/js/main.5ca4eea1.js +3 -0
  54. xinference/web/ui/build/static/js/main.5ca4eea1.js.map +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/0f0967acaec5df1d45b80010949c258d64297ebbb0f44b8bb3afcbd45c6f0ec4.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/68249645124f37d01eef83b1d897e751f895bea919b6fb466f907c1f87cebc84.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +1 -0
  59. xinference/web/ui/src/locales/en.json +2 -2
  60. xinference/web/ui/src/locales/zh.json +1 -1
  61. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/METADATA +4 -4
  62. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/RECORD +67 -41
  63. xinference/web/ui/build/static/css/main.f8177338.css +0 -2
  64. xinference/web/ui/build/static/css/main.f8177338.css.map +0 -1
  65. xinference/web/ui/build/static/js/main.55b70cb7.js +0 -3
  66. xinference/web/ui/build/static/js/main.55b70cb7.js.map +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +0 -1
  71. /xinference/web/ui/build/static/js/{main.55b70cb7.js.LICENSE.txt → main.5ca4eea1.js.LICENSE.txt} +0 -0
  72. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/LICENSE +0 -0
  73. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/WHEEL +0 -0
  74. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/entry_points.txt +0 -0
  75. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/top_level.txt +0 -0
@@ -3738,6 +3738,241 @@
  "<start_of_turn>"
  ]
  },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "gemma-3-1b-it",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 1,
+ "quantizations": [
+ "none",
+ "4-bit",
+ "8-bit"
+ ],
+ "model_id": "LLM-Research/gemma-3-1b-it",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 1,
+ "quantizations": [
+ "Q2_K",
+ "Q3_K_L",
+ "Q3_K_M",
+ "Q3_K_S",
+ "Q4_K_L",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_K_L",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q6_K_L",
+ "Q8_0",
+ "bf16"
+ ],
+ "model_id": "bartowski/google_gemma-3-1b-it-GGUF",
+ "model_file_name_template": "google_gemma-3-1b-it-{quantization}.gguf",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "mlx",
+ "model_size_in_billions": 1,
+ "quantizations": [
+ "4bit",
+ "6bit",
+ "8bit",
+ "fp16"
+ ],
+ "model_id": "mlx-community/gemma-3-1b-it-{quantization}",
+ "model_hub": "modelscope"
+ }
+ ],
+ "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
+ "stop_token_ids": [
+ 1,
+ 106,
+ 107
+ ],
+ "stop": [
+ "<eos>",
+ "<end_of_turn>",
+ "<start_of_turn>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "gemma-3-it",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "chat",
+ "vision"
+ ],
+ "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 4,
+ "quantizations": [
+ "none",
+ "4-bit",
+ "8-bit"
+ ],
+ "model_id": "LLM-Research/gemma-3-4b-it",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 12,
+ "quantizations": [
+ "none",
+ "4-bit",
+ "8-bit"
+ ],
+ "model_id": "LLM-Research/gemma-3-12b-it",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 27,
+ "quantizations": [
+ "none",
+ "4-bit",
+ "8-bit"
+ ],
+ "model_id": "LLM-Research/gemma-3-27b-it",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 4,
+ "quantizations": [
+ "Q2_K",
+ "Q3_K_L",
+ "Q3_K_M",
+ "Q3_K_S",
+ "Q4_K_L",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_K_L",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q6_K_L",
+ "Q8_0",
+ "bf16"
+ ],
+ "model_id": "bartowski/google_gemma-3-4b-it-GGUF",
+ "model_file_name_template": "google_gemma-3-4b-it-{quantization}.gguf",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 12,
+ "quantizations": [
+ "Q2_K",
+ "Q3_K_L",
+ "Q3_K_M",
+ "Q3_K_S",
+ "Q4_K_L",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_K_L",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q6_K_L",
+ "Q8_0",
+ "bf16"
+ ],
+ "model_id": "bartowski/google_gemma-3-12b-it-GGUF",
+ "model_file_name_template": "google_gemma-3-12b-it-{quantization}.gguf",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 27,
+ "quantizations": [
+ "Q2_K",
+ "Q3_K_L",
+ "Q3_K_M",
+ "Q3_K_S",
+ "Q4_K_L",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_K_L",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q6_K_L",
+ "Q8_0",
+ "bf16"
+ ],
+ "model_id": "bartowski/google_gemma-3-27b-it-GGUF",
+ "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "mlx",
+ "model_size_in_billions": 4,
+ "quantizations": [
+ "4bit",
+ "6bit",
+ "8bit",
+ "fp16"
+ ],
+ "model_id": "mlx-community/gemma-3-4b-it-{quantization}",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "mlx",
+ "model_size_in_billions": 12,
+ "quantizations": [
+ "4bit",
+ "6bit",
+ "8bit",
+ "fp16"
+ ],
+ "model_id": "mlx-community/gemma-3-12b-it-{quantization}",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "mlx",
+ "model_size_in_billions": 27,
+ "quantizations": [
+ "4bit",
+ "6bit",
+ "8bit",
+ "fp16"
+ ],
+ "model_id": "mlx-community/gemma-3-27b-it-{quantization}",
+ "model_hub": "modelscope"
+ }
+ ],
+ "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
3965
+ "stop_token_ids": [
3966
+ 1,
3967
+ 106,
3968
+ 107
3969
+ ],
3970
+ "stop": [
3971
+ "<eos>",
3972
+ "<end_of_turn>",
3973
+ "<start_of_turn>"
3974
+ ]
3975
+ },
3741
3976
  {
3742
3977
  "version":1,
3743
3978
  "context_length":2048,
@@ -4673,7 +4908,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-1B",
+ "model_id": "OpenGVLab/InternVL2_5-1B-MPO",
  "model_revision": "master"
  },
  {
@@ -4685,7 +4920,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-2B",
+ "model_id": "OpenGVLab/InternVL2_5-2B-MPO",
  "model_revision": "master"
  },
  {
@@ -4697,7 +4932,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-4B",
+ "model_id": "OpenGVLab/InternVL2_5-4B-MPO",
  "model_revision": "master"
  },
  {
@@ -4707,7 +4942,7 @@
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-4B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-4B-MPO-AWQ",
  "model_revision": "master"
  },
  {
@@ -4719,7 +4954,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-8B",
+ "model_id": "OpenGVLab/InternVL2_5-8B-MPO",
  "model_revision": "master"
  },
  {
@@ -4729,7 +4964,7 @@
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-8B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-8B-MPO-AWQ",
  "model_revision": "master"
  },
  {
@@ -4741,7 +4976,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-26B",
+ "model_id": "OpenGVLab/InternVL2_5-26B-MPO",
  "model_revision": "master"
  },
  {
@@ -4751,7 +4986,7 @@
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-26B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-26B-MPO-AWQ",
  "model_revision": "master"
  },
  {
@@ -4763,7 +4998,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-38B",
+ "model_id": "OpenGVLab/InternVL2_5-38B-MPO",
  "model_revision": "master"
  },
  {
@@ -4773,7 +5008,7 @@
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-38B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-38B-MPO-AWQ",
  "model_revision": "master"
  },
  {
@@ -4785,7 +5020,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-78B",
+ "model_id": "OpenGVLab/InternVL2_5-78B-MPO",
  "model_revision": "master"
  },
  {
@@ -4795,7 +5030,7 @@
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-MPO-78B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-78B-MPO-AWQ",
  "model_revision": "master"
  }
  ],
@@ -5164,6 +5399,15 @@
  "model_hub": "modelscope",
  "model_id":"Qwen/Qwen2.5-VL-7B-Instruct"
  },
+ {
+ "model_format":"pytorch",
+ "model_size_in_billions":32,
+ "quantizations":[
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id":"Qwen/Qwen2.5-VL-32B-Instruct"
+ },
  {
  "model_format":"pytorch",
  "model_size_in_billions":72,
@@ -5188,9 +5432,18 @@
  "quantizations":[
  "Int4"
  ],
- "model_hub": "awq",
+ "model_hub": "modelscope",
  "model_id":"Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
  },
+ {
+ "model_format":"awq",
+ "model_size_in_billions":32,
+ "quantizations":[
+ "Int4"
+ ],
+ "model_hub": "modelscope",
+ "model_id":"Qwen/Qwen2.5-VL-32B-Instruct-AWQ"
+ },
  {
  "model_format":"pytorch",
  "model_size_in_billions":72,
@@ -5657,7 +5910,7 @@
  "model_hub": "modelscope"
  }
  ],
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁callend|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
+ "chat_template": "{% if messages %} {% if system or tools %} {% if system %} {{ system }} {% endif %} {% if tools %} {# Handle tools here if needed #} {% endif %} {% endif %} {% for message in messages %} {% set last = loop.index == loop.length %} {% if message.role == \"user\" %} <|User|> {% if tools and last %} Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}. Do not use variables. {{ tools }} {% endif %} {{ message.content }} {% if last %} <|Assistant|> {% endif %} {% elif message.role == \"assistant\" %} <|Assistant|> {% if message.tool_calls %} <|tool▁callsbegin|> {% for tool in message.tool_calls %} <|tool▁call▁begin|> {\"name\": \"{{ tool.function.name }}\", \"parameters\": {{ tool.function.arguments }}} <|tool▁call▁end|> {% endfor %} <|tool▁calls▁end|> {% else %} {{ message.content }} {% if not last %} <|end▁of▁sentence|> {% endif %} {% endif %} {% elif message.role == \"tool\" %} <|tool▁outputs▁begin|> <|tool▁output▁begin|> {{ message.content }} <|tool▁output▁end|> <|tool▁outputs▁end|> {% if last and message.role != \"assistant\" %} <|Assistant|> {% endif %} {% endif %} {% endfor %} {% else %} {% if system %} {{ system }} {% endif %} {% if prompt %} <|User|> {{ prompt }} {% endif %} <|Assistant|> {{ response }} {% if response %} {{ response }} {% endif %} {% endif %}",
  "stop_token_ids": [
  1
  ],
@@ -8185,5 +8438,111 @@
  "stop": [
  "<|im_end|>"
  ]
+ },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "fin-r1",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Fin-R1 is a large language model specifically designed for the field of financial reasoning",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "AI-ModelScope/Fin-R1",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "JunHowie/Fin-R1-GPTQ-{quantization}",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "fp8",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "FP8"
+ ],
+ "model_id": "JunHowie/Fin-R1-FP8-Dynamic",
+ "model_hub": "modelscope"
+ }
+ ],
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "stop_token_ids": [
+ 151643,
+ 151644,
+ 151645
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 4096,
+ "model_name": "deepseek-vl2",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "vision"
+ ],
+ "model_description": "DeepSeek-VL2, an advanced series of large Mixture-of-Experts (MoE) Vision-Language Models that significantly improves upon its predecessor, DeepSeek-VL. DeepSeek-VL2 demonstrates superior capabilities across various tasks, including but not limited to visual question answering, optical character recognition, document/table/chart understanding, and visual grounding.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 27,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "deepseek-ai/deepseek-vl2",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 16,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "deepseek-ai/deepseek-vl2-small",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "deepseek-ai/deepseek-vl2-tiny",
+ "model_hub": "modelscope"
+ }
+ ],
+ "chat_template": "",
+ "stop_token_ids": [
+ 1
+ ],
+ "stop": [
+ "<|end▁of▁sentence|>"
+ ]
  }
  ]
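The registry entries added above (gemma-3-1b-it, gemma-3-it, the Qwen2.5-VL 32B specs, fin-r1, deepseek-vl2) only register models; nothing in the diff shows how they are used. As a minimal sketch of what such an entry enables, the snippet below launches one of the new Gemma 3 specs through the Xinference Python client. The endpoint URL, the `model_engine` value, and the exact `chat` call shape are assumptions for illustration, not part of this release.

```python
# Minimal sketch (not from the diff): launch a newly registered model.
# Assumes a local xinference supervisor is already running.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed default endpoint

# model_name/format/size/quantization must match one of the
# "model_specs" entries registered above.
model_uid = client.launch_model(
    model_name="gemma-3-1b-it",
    model_engine="transformers",  # assumed engine choice
    model_format="pytorch",
    model_size_in_billions=1,
    quantization="none",
)

model = client.get_model(model_uid)
print(model.chat(messages=[{"role": "user", "content": "Hello"}]))
```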
@@ -148,11 +148,15 @@ class MLXModel(LLM):
  self._max_kv_size = kwargs.get("max_kv_size", None)
  self._prompt_cache = PromptCache()

- return load(
+ model, tokenizer = load(
  self.model_path,
  tokenizer_config=tokenizer_config,
  model_config=self._model_config,
  )
+ if stop_token_ids := self.model_family.stop_token_ids:
+ for stop_token_id in stop_token_ids:
+ tokenizer.add_eos_token(stop_token_id)
+ return model, tokenizer

  def load(self):
  reasoning_content = self._model_config.pop("reasoning_content")
@@ -260,7 +264,7 @@ class MLXModel(LLM):
  start = time.time()
  output = ""
  tokens = []
- for chunk_resp, i in zip(
+ for i, chunk_resp in enumerate(
  self._generate_stream_inner(
  prompt_token_ids=prompt_token_ids,
  max_tokens=max_tokens,
@@ -269,8 +273,7 @@
  repetition_penalty=kwargs["repetition_penalty"],
  repetition_context_size=kwargs["repetition_context_size"],
  prompt_cache=self._prompt_cache.cache if self._prompt_cache else None, # type: ignore
- ),
- range(max_tokens),
+ )
  ):
  token = chunk_resp.token
  tokens.append(token)
@@ -435,10 +438,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):
  tools = generate_config.pop("tools", []) if generate_config else None
  full_context_kwargs = {}
  if tools:
- if model_family in QWEN_TOOL_CALL_FAMILY:
+ if (
+ model_family in QWEN_TOOL_CALL_FAMILY
+ or model_family in DEEPSEEK_TOOL_CALL_FAMILY
+ ):
  full_context_kwargs["tools"] = tools
- elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
- self._tools_to_messages_for_deepseek(messages, tools)
  assert self.model_family.chat_template is not None
  full_prompt = self.get_full_context(
  messages, self.model_family.chat_template, **full_context_kwargs
@@ -507,19 +511,19 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
  from mlx_lm.utils import GenerationResponse
  from mlx_vlm.utils import generate_step

- inputs = kwargs["prompt_token_ids"]
+ inputs = kwargs.pop("prompt_token_ids")

- max_tokens = kwargs.pop("max_tokens")
+ extra_kwargs = kwargs.copy()
  input_ids, pixel_values, mask, kwargs = inputs
+ kwargs.update(extra_kwargs)

  tokenizer = self._processor.tokenizer
  detokenizer = self._processor.detokenizer

  detokenizer.reset()
  tic = time.perf_counter()
- for (token, logprobs), n in zip(
+ for n, (token, logprobs) in enumerate(
  generate_step(input_ids, self._model, pixel_values, mask, **kwargs),
- range(max_tokens),
  ):
  if n == 0:
  prompt_time = time.perf_counter() - tic
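One subtle point in the two loop changes above: `zip(generator, range(max_tokens))` both counts the items and truncates the stream after `max_tokens`, while `enumerate(generator)` only counts, so after this change the token budget presumably has to be enforced by the generator itself (both generators still receive `max_tokens` through their arguments in these hunks). A tiny standalone illustration of that difference, my own example rather than package code:

```python
def gen():
    # Unbounded counter standing in for a token stream.
    n = 0
    while True:
        n += 1
        yield n

# zip with range numbers the items AND stops the stream after 3:
print([x for x, _ in zip(gen(), range(3))])  # [1, 2, 3]

# enumerate only numbers the items; the bound must come from elsewhere:
for i, x in enumerate(gen()):
    if i == 3:  # explicit cutoff, otherwise this would never end
        break
```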
@@ -31,9 +31,7 @@ class ReasoningParser:
  Yields:
  str: Extracted reasoning content chunks.
  """
- delta = ChatCompletionChunkDelta(
- content=delta_text,
- )
+ delta = ChatCompletionChunkDelta()

  # Check if <think> is present in previous or delta.
  # Keep compatibility with models that don't generate <think> tokens.
@@ -45,19 +43,23 @@
  reasoning_content = delta_text[:end_idx]
  content = delta_text[end_idx + len(self.reasoning_end_tag) :]
  delta["reasoning_content"] = reasoning_content
- if content is not None:
+ if content:
  delta["content"] = content
+ else:
+ delta["content"] = None
  return delta
  elif self.reasoning_end_tag in previous_text:
  # <think> in previous, </think> in previous,
  # <think> in previous, </think> in previous,
  # reasoning content ends
+ delta["reasoning_content"] = None
+ delta["content"] = delta_text
  return delta
  else:
  # <think> in previous, no </think> in previous or delta,
  # reasoning content continues
  delta["reasoning_content"] = delta_text
- delta["content"] = ""
+ delta["content"] = None
  return delta
  elif self.reasoning_start_tag in delta_text:
  if self.reasoning_end_tag in delta_text:
@@ -69,14 +71,16 @@
  ]
  content = delta_text[end_idx + len(self.reasoning_end_tag) :]
  delta["reasoning_content"] = reasoning_content
- if content is not None:
+ if content:
  delta["content"] = content
+ else:
+ delta["content"] = None
  return delta
  else:
  # <think> in delta, no </think> in delta,
  # reasoning content continues
  delta["reasoning_content"] = delta_text
- delta["content"] = ""
+ delta["content"] = None
  return delta
  else:
  # No <think> in previous or delta, also need to check for </think>.
@@ -89,16 +93,20 @@
  reasoning_content = delta_text[:end_idx]
  content = delta_text[end_idx + len(self.reasoning_end_tag) :]
  delta["reasoning_content"] = reasoning_content
- if content is not None:
+ if content:
  delta["content"] = content
+ else:
+ delta["content"] = None
  return delta
  elif self.reasoning_end_tag in previous_text:
  # </think> in previous, thinking content ends
+ delta["reasoning_content"] = None
+ delta["content"] = delta_text
  return delta
  else:
  # no </think> in previous or delta, reasoning content continues
  delta["reasoning_content"] = delta_text
- delta["content"] = ""
+ delta["content"] = None
  return delta

  def extract_reasoning_content(
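The net behavioral change in these reasoning-parser hunks: while the model is still inside a `<think>` block, a streamed delta now reports `content` as `None` instead of an empty string, and once the closing tag has been seen the delta carries plain `content` with `reasoning_content` set to `None`. The following standalone sketch (my own illustration, not the module's API) shows the splitting rule for the simple case where the closing tag arrives inside a single delta:

```python
# Illustration of the tag-splitting behavior described above.
reasoning_end_tag = "</think>"

def split_delta(delta_text: str) -> dict:
    if reasoning_end_tag in delta_text:
        end_idx = delta_text.find(reasoning_end_tag)
        content = delta_text[end_idx + len(reasoning_end_tag):]
        # Text before the tag is reasoning; text after it (if any) is content.
        return {
            "reasoning_content": delta_text[:end_idx],
            "content": content if content else None,
        }
    # No closing tag yet: still reasoning, so content stays None.
    return {"reasoning_content": delta_text, "content": None}

print(split_delta("so the answer is 42.</think>The answer is 42."))
# {'reasoning_content': 'so the answer is 42.', 'content': 'The answer is 42.'}
print(split_delta("still thinking"))
# {'reasoning_content': 'still thinking', 'content': None}
```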