xinference 1.3.0.post2__py3-none-any.whl → 1.3.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +1 -0
  3. xinference/conftest.py +7 -0
  4. xinference/core/chat_interface.py +39 -24
  5. xinference/core/model.py +3 -1
  6. xinference/core/scheduler.py +3 -0
  7. xinference/core/worker.py +1 -1
  8. xinference/model/embedding/core.py +12 -5
  9. xinference/model/llm/__init__.py +2 -1
  10. xinference/model/llm/core.py +10 -0
  11. xinference/model/llm/llama_cpp/core.py +266 -3
  12. xinference/model/llm/llm_family.json +390 -17
  13. xinference/model/llm/llm_family_modelscope.json +348 -29
  14. xinference/model/llm/mlx/core.py +15 -4
  15. xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py} +9 -13
  16. xinference/model/llm/sglang/core.py +7 -2
  17. xinference/model/llm/transformers/chatglm.py +4 -4
  18. xinference/model/llm/transformers/core.py +22 -5
  19. xinference/model/llm/transformers/intern_vl.py +2 -1
  20. xinference/model/llm/transformers/utils.py +1 -1
  21. xinference/model/llm/utils.py +134 -60
  22. xinference/model/llm/vllm/core.py +31 -42
  23. xinference/types.py +4 -0
  24. xinference/web/ui/build/asset-manifest.json +3 -3
  25. xinference/web/ui/build/index.html +1 -1
  26. xinference/web/ui/build/static/js/main.55b70cb7.js +3 -0
  27. xinference/web/ui/build/static/js/main.55b70cb7.js.map +1 -0
  28. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +1 -0
  30. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/87a9b13f2466f375ae5c6e7c08b279cc38351d29710d7f7626bbb07a85262b79.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +1 -0
  35. xinference/web/ui/src/locales/en.json +9 -1
  36. xinference/web/ui/src/locales/zh.json +9 -1
  37. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/METADATA +9 -5
  38. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/RECORD +43 -44
  39. xinference/model/llm/reasoning_parsers/__init__.py +0 -13
  40. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +0 -98
  41. xinference/web/ui/build/static/js/main.ad42919c.js +0 -3
  42. xinference/web/ui/build/static/js/main.ad42919c.js.map +0 -1
  43. xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +0 -1
  44. xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +0 -1
  45. xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +0 -1
  46. xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +0 -1
  47. xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +0 -1
  48. xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +0 -1
  49. /xinference/web/ui/build/static/js/{main.ad42919c.js.LICENSE.txt → main.55b70cb7.js.LICENSE.txt} +0 -0
  50. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/LICENSE +0 -0
  51. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/WHEEL +0 -0
  52. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/entry_points.txt +0 -0
  53. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/top_level.txt +0 -0
@@ -4523,36 +4523,169 @@
  "model_id": "OpenGVLab/InternVL2_5-1B",
  "model_revision": "master"
  },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 2,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2_5-2B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 4,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2_5-4B",
+ "model_revision": "master"
+ },
  {
  "model_format": "awq",
- "model_size_in_billions": 1,
+ "model_size_in_billions": 4,
  "quantizations": [
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-1B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-4B-AWQ",
  "model_revision": "master"
  },
  {
  "model_format": "pytorch",
- "model_size_in_billions": 2,
+ "model_size_in_billions": 8,
  "quantizations": [
  "4-bit",
  "8-bit",
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-2B",
+ "model_id": "OpenGVLab/InternVL2_5-8B",
  "model_revision": "master"
  },
  {
  "model_format": "awq",
- "model_size_in_billions": 2,
+ "model_size_in_billions": 8,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2_5-8B-AWQ",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 26,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2_5-26B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 26,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2_5-26B-AWQ",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 38,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2_5-38B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 38,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2_5-38B-AWQ",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 78,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2_5-78B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 78,
  "quantizations": [
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-2B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-78B-AWQ",
+ "model_revision": "master"
+ }
+ ],
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "stop_token_ids": [],
+ "stop": []
+ },
+ {
+ "version": 1,
+ "context_length": 16384,
+ "model_name": "InternVL2.5-MPO",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "vision"
+ ],
+ "model_description": "InternVL 2.5 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 1,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-1B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 2,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-2B",
  "model_revision": "master"
  },
  {
@@ -4564,7 +4697,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-4B",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-4B",
  "model_revision": "master"
  },
  {
@@ -4574,7 +4707,7 @@
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-4B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-4B-AWQ",
  "model_revision": "master"
  },
  {
@@ -4586,7 +4719,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-8B",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-8B",
  "model_revision": "master"
  },
  {
@@ -4596,7 +4729,7 @@
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-8B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-8B-AWQ",
  "model_revision": "master"
  },
  {
@@ -4608,7 +4741,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-26B",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-26B",
  "model_revision": "master"
  },
  {
@@ -4618,7 +4751,7 @@
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-26B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-26B-AWQ",
  "model_revision": "master"
  },
  {
@@ -4630,7 +4763,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-38B",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-38B",
  "model_revision": "master"
  },
  {
@@ -4640,7 +4773,7 @@
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-38B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-38B-AWQ",
  "model_revision": "master"
  },
  {
@@ -4652,7 +4785,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-78B",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-78B",
  "model_revision": "master"
  },
  {
@@ -4662,7 +4795,7 @@
  "Int4"
  ],
  "model_hub": "modelscope",
- "model_id": "OpenGVLab/InternVL2_5-78B-AWQ",
+ "model_id": "OpenGVLab/InternVL2_5-MPO-78B-AWQ",
  "model_revision": "master"
  }
  ],
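The `chat_template` added for the new InternVL2.5 modelscope entry above is an ordinary Jinja2 template, so the prompt it produces can be checked offline. A minimal sketch, assuming only the `jinja2` package; the template string is copied verbatim from the hunk, and the sample messages are hypothetical:

```python
# Render the chat_template added above to see the prompt it produces.
from jinja2 import Template

# Copied from the added InternVL2.5 entry; "\n" becomes a real newline
# once the JSON string is parsed, so Python escapes are used here.
CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{% if loop.first and messages[0]['role'] != 'system' %}"
    "{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}"
    "{% endif %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)

messages = [{"role": "user", "content": "Describe this image."}]
print(Template(CHAT_TEMPLATE).render(messages=messages, add_generation_prompt=True))
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Describe this image.<|im_end|>
# <|im_start|>assistant
```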
@@ -5020,7 +5153,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id":"qwen/Qwen2.5-VL-3B-Instruct"
+ "model_id":"Qwen/Qwen2.5-VL-3B-Instruct"
  },
  {
  "model_format":"pytorch",
@@ -5029,7 +5162,7 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id":"qwen/Qwen2.5-VL-7B-Instruct"
+ "model_id":"Qwen/Qwen2.5-VL-7B-Instruct"
  },
  {
  "model_format":"pytorch",
@@ -5038,7 +5171,34 @@
  "none"
  ],
  "model_hub": "modelscope",
- "model_id":"qwen/Qwen2.5-VL-72B-Instruct"
+ "model_id":"Qwen/Qwen2.5-VL-72B-Instruct"
+ },
+ {
+ "model_format":"awq",
+ "model_size_in_billions":3,
+ "quantizations":[
+ "Int4"
+ ],
+ "model_hub": "modelscope",
+ "model_id":"Qwen/Qwen2.5-VL-3B-Instruct-AWQ"
+ },
+ {
+ "model_format":"awq",
+ "model_size_in_billions":7,
+ "quantizations":[
+ "Int4"
+ ],
+ "model_hub": "awq",
+ "model_id":"Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
+ },
+ {
+ "model_format":"pytorch",
+ "model_size_in_billions":72,
+ "quantizations":[
+ "Int4"
+ ],
+ "model_hub": "modelscope",
+ "model_id":"Qwen/Qwen2.5-VL-72B-Instruct-AWQ"
  },
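This hunk fixes the `qwen/…` hub IDs to `Qwen/…` and adds Int4 AWQ specs for Qwen2.5-VL. Note two oddities preserved verbatim from the release: the 7B AWQ spec ships with `"model_hub": "awq"` and the 72B AWQ spec is declared as `"model_format":"pytorch"`. A hedged sketch of launching the 3B AWQ spec through the RESTful client; the endpoint, engine choice, and environment variable are illustrative assumptions, not part of the diff:

```python
# Sketch: launch the newly added Qwen2.5-VL 3B AWQ spec from modelscope.
import os

from xinference.client import Client

os.environ["XINFERENCE_MODEL_SRC"] = "modelscope"  # resolve specs against modelscope

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="qwen2.5-vl-instruct",
    model_engine="transformers",  # assumed engine; vLLM also supports AWQ
    model_format="awq",
    model_size_in_billions=3,
    quantization="Int4",
)
model = client.get_model(model_uid)
```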
  {
  "model_format":"mlx",
@@ -5363,8 +5523,7 @@
  "zh"
  ],
  "model_ability": [
- "chat",
- "reasoning"
+ "chat"
  ],
  "model_description": "DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. ",
  "model_specs": [
@@ -5498,15 +5657,13 @@
  "model_hub": "modelscope"
  }
  ],
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
  "stop_token_ids": [
  1
  ],
  "stop": [
  "<|end▁of▁sentence|>"
- ],
- "reasoning_start_tag": "<think>",
- "reasoning_end_tag": "</think>"
+ ]
  },
  {
  "version": 1,
@@ -5517,7 +5674,8 @@
  "zh"
  ],
  "model_ability": [
- "chat"
+ "chat",
+ "reasoning"
  ],
  "model_description": "DeepSeek-R1, which incorporates cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1 across math, code, and reasoning tasks.",
  "model_specs": [
@@ -5720,13 +5878,15 @@
  "model_hub": "modelscope"
  }
  ],
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}",
  "stop_token_ids": [
  1
  ],
  "stop": [
  "<|end▁of▁sentence|>"
- ]
+ ],
+ "reasoning_start_tag": "<think>",
+ "reasoning_end_tag": "</think>"
  },
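These hunks move the `reasoning` ability and the `reasoning_start_tag`/`reasoning_end_tag` pair off DeepSeek-V3 (which does not emit `<think>` blocks) and onto DeepSeek-R1 (which does). The tags feed the renamed `reasoning_parser.py` (file 15 in the list above); that file's body is not shown here, but the split it performs amounts to something like the following sketch, with a hypothetical helper name:

```python
# Hypothetical stand-in for the tag-based split that reasoning_parser.py
# performs on a completed (non-streaming) response.
REASONING_START_TAG = "<think>"
REASONING_END_TAG = "</think>"


def split_reasoning(text: str) -> tuple[str | None, str]:
    """Return (reasoning_content, content) for one finished response."""
    if REASONING_END_TAG not in text:
        return None, text  # model produced no reasoning block
    head, _, tail = text.partition(REASONING_END_TAG)
    reasoning = head.replace(REASONING_START_TAG, "", 1).strip()
    return reasoning, tail.lstrip("\n")


reasoning, answer = split_reasoning("<think>\n2 + 2 = 4\n</think>\n4")
assert reasoning == "2 + 2 = 4" and answer == "4"
```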
  {
  "version": 1,
@@ -7057,7 +7217,7 @@
  ],
  "model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
  "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf",
- "model_hub": "modelscope"
+ "model_hub": "modelscope"
  }
  ],
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
@@ -7072,6 +7232,86 @@
  "<|im_end|>"
  ]
  },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "QwQ-32B",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "reasoning"
+ ],
+ "model_description": "QwQ is the reasoning model of the Qwen series. Compared with conventional instruction-tuned models, QwQ, which is capable of thinking and reasoning, can achieve significantly enhanced performance in downstream tasks, especially hard problems. QwQ-32B is the medium-sized reasoning model, which is capable of achieving competitive performance against state-of-the-art reasoning models, e.g., DeepSeek-R1, o1-mini.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/QwQ-32B",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/QwQ-32B-AWQ",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "mlx",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "3bit",
+ "4bit",
+ "6bit",
+ "8bit",
+ "bf16"
+ ],
+ "model_id": "mlx-community/QwQ-32B-{quantization}",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "fp16",
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0"
+ ],
+ "model_id": "Qwen/QwQ-32B-GGUF",
+ "model_file_name_template": "qwq-32b-{quantization}.gguf",
+ "model_hub": "modelscope"
+ }
+ ],
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- '' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" and not message.tool_calls %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n<think>\\n' }}\n{%- endif %}\n",
+ "stop_token_ids": [
+ 151643,
+ 151644,
+ 151645
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ],
+ "reasoning_start_tag": "<think>",
+ "reasoning_end_tag": "</think>"
+ },
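The new QwQ-32B entry reuses the same reasoning tags, and its chat template both strips earlier `</think>` blocks out of assistant history and opens every new assistant turn with `<think>\n`. Once the model is launched, the parsed reasoning should surface through xinference's OpenAI-compatible endpoint; a sketch under those assumptions (the URL and the `reasoning_content` attribute are not shown in this diff):

```python
# Sketch: inspect the reasoning/content split over the OpenAI-compatible API.
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-used")
resp = client.chat.completions.create(
    model="QwQ-32B",  # assumed model_uid from a prior launch
    messages=[{"role": "user", "content": "How many r's are in strawberry?"}],
)
message = resp.choices[0].message
print(getattr(message, "reasoning_content", None))  # parsed <think> block, if any
print(message.content)                              # final answer only
```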
  {
  "version": 1,
  "context_length": 131072,
@@ -7866,5 +8106,84 @@
  "</s>",
  "<|im_end|>"
  ]
+ },
+ {
+ "version": 1,
+ "context_length": 1010000,
+ "model_name": "qwen2.5-instruct-1m",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Qwen2.5-1M is the long-context version of the Qwen2.5 series models, supporting a context length of up to 1M tokens.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-7B-Instruct-1M",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2.5-14B-Instruct-1M",
+ "model_hub": "modelscope"
+ }
+ ],
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "stop_token_ids": [
+ 151645,
+ 151643
+ ],
+ "stop": [
+ "<|im_end|>",
+ "<|endoftext|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 8192,
+ "model_name": "moonlight-16b-a3b-instruct",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Kimi Muon is Scalable for LLM Training",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "moonshotai/Moonlight-16B-A3B-Instruct",
+ "model_hub": "modelscope"
+ }
+ ],
+ "chat_template":"{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>{%- endif -%}{%- if message['role'] == 'system' -%}<|im_system|>{%- endif -%}{%- if message['role'] == 'user' -%}<|im_user|>{%- endif -%}{%- if message['role'] == 'assistant' -%}<|im_assistant|>{%- endif -%}{{ message['role'] }}<|im_middle|>{{message['content']}}<|im_end|>{%- endfor -%}{%- if add_generation_prompt -%}<|im_assistant|>assistant<|im_middle|>{%- endif -%}",
+ "stop_token_ids": [
+ 163586
+ ],
+ "stop": [
+ "<|im_end|>"
+ ]
  }
  ]
@@ -45,6 +45,7 @@ class MLXModelConfig(TypedDict, total=False):
  revision: Optional[str]
  max_gpu_memory: str
  trust_remote_code: bool
+ reasoning_content: bool


  class MLXGenerateConfig(TypedDict, total=False):
@@ -95,6 +96,7 @@ class MLXModel(LLM):
  model_config = MLXModelConfig()
  model_config.setdefault("revision", self.model_spec.model_revision)
  model_config.setdefault("trust_remote_code", True)
+ model_config.setdefault("reasoning_content", False)
  return model_config

  def _sanitize_generate_config(
@@ -153,6 +155,9 @@ class MLXModel(LLM):
  )

  def load(self):
+ reasoning_content = self._model_config.pop("reasoning_content")
+ self.prepare_parse_reasoning_content(reasoning_content)
+
  kwargs = {}
  kwargs["revision"] = self._model_config.get(
  "revision", self.model_spec.model_revision
@@ -445,13 +450,15 @@ class MLXChatModel(MLXModel, ChatModelMixin):
  if stream:
  it = self.generate(full_prompt, generate_config)
  assert isinstance(it, Iterator)
- return self._to_chat_completion_chunks(it)
+ return self._to_chat_completion_chunks(it, self.reasoning_parser)
  else:
  c = self.generate(full_prompt, generate_config)
  assert not isinstance(c, Iterator)
  if tools:
- return self._tool_calls_completion(self.model_family, self.model_uid, c)
- return self._to_chat_completion(c)
+ return self._post_process_completion(
+ self.model_family, self.model_uid, c, self.reasoning_parser
+ )
+ return self._to_chat_completion(c, self.reasoning_parser)


  class MLXVisionModel(MLXModel, ChatModelMixin):
@@ -527,6 +534,7 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
  text=detokenizer.last_segment,
  token=token,
  logprobs=logprobs,
+ from_draft=False,
  prompt_tokens=len(input_ids),
  prompt_tps=prompt_tps,
  generation_tokens=n + 1,
@@ -539,6 +547,7 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
  text=detokenizer.last_segment,
  token=token,
  logprobs=logprobs,
+ from_draft=False,
  prompt_tokens=len(input_ids),
  prompt_tps=prompt_tps,
  generation_tokens=n + 1,
@@ -634,5 +643,7 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
  c = self.generate(inputs, generate_config)
  assert not isinstance(c, Iterator)
  if tools:
- return self._tool_calls_completion(self.model_family, self.model_uid, c)
+ return self._post_process_completion(
+ self.model_family, self.model_uid, c
+ )
  return self._to_chat_completion(c)
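In `mlx/core.py`, `reasoning_content` becomes an ordinary `MLXModelConfig` key: it defaults to `False` in `_sanitize_model_config` and is popped in `load()` to call `prepare_parse_reasoning_content`, after which `MLXChatModel.chat` threads `self.reasoning_parser` through the completion helpers. Since launch-time kwargs generally flow into the model config, opting in should look roughly like this sketch (endpoint, engine name, and model family are assumptions, not part of the diff):

```python
# Sketch: enable MLX reasoning parsing at launch time; the flag lands in
# MLXModelConfig and, per the load() hunk above, sets up the reasoning parser.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="deepseek-r1-distill-qwen",  # assumed: a family with reasoning tags
    model_engine="mlx",
    model_format="mlx",
    model_size_in_billions=7,
    quantization="4bit",
    reasoning_content=True,  # forwarded into MLXModelConfig
)
```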