xinference 1.3.0.post2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +1 -0
- xinference/conftest.py +7 -0
- xinference/core/model.py +3 -1
- xinference/core/scheduler.py +3 -0
- xinference/core/worker.py +1 -1
- xinference/model/embedding/core.py +12 -5
- xinference/model/llm/__init__.py +2 -1
- xinference/model/llm/core.py +13 -0
- xinference/model/llm/llama_cpp/core.py +260 -3
- xinference/model/llm/llm_family.json +306 -17
- xinference/model/llm/llm_family_modelscope.json +347 -28
- xinference/model/llm/mlx/core.py +15 -4
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +1 -1
- xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +4 -5
- xinference/model/llm/sglang/core.py +7 -2
- xinference/model/llm/transformers/chatglm.py +4 -4
- xinference/model/llm/transformers/core.py +22 -5
- xinference/model/llm/transformers/intern_vl.py +2 -1
- xinference/model/llm/transformers/utils.py +1 -1
- xinference/model/llm/utils.py +103 -67
- xinference/model/llm/vllm/core.py +29 -42
- xinference/types.py +4 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.55b70cb7.js +3 -0
- xinference/web/ui/build/static/js/main.55b70cb7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/87a9b13f2466f375ae5c6e7c08b279cc38351d29710d7f7626bbb07a85262b79.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +1 -0
- xinference/web/ui/src/locales/en.json +9 -1
- xinference/web/ui/src/locales/zh.json +9 -1
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/METADATA +7 -3
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/RECORD +43 -42
- xinference/web/ui/build/static/js/main.ad42919c.js +0 -3
- xinference/web/ui/build/static/js/main.ad42919c.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +0 -1
- /xinference/web/ui/build/static/js/{main.ad42919c.js.LICENSE.txt → main.55b70cb7.js.LICENSE.txt} +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/LICENSE +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/WHEEL +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/top_level.txt +0 -0
@@ -4523,36 +4523,169 @@
         "model_id": "OpenGVLab/InternVL2_5-1B",
         "model_revision": "master"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-2B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-4B",
+        "model_revision": "master"
+      },
       {
         "model_format": "awq",
-        "model_size_in_billions":
+        "model_size_in_billions": 4,
         "quantizations": [
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-
+        "model_id": "OpenGVLab/InternVL2_5-4B-AWQ",
         "model_revision": "master"
       },
       {
         "model_format": "pytorch",
-        "model_size_in_billions":
+        "model_size_in_billions": 8,
         "quantizations": [
           "4-bit",
           "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-
+        "model_id": "OpenGVLab/InternVL2_5-8B",
         "model_revision": "master"
       },
       {
         "model_format": "awq",
-        "model_size_in_billions":
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-8B-AWQ",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 26,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-26B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 26,
         "quantizations": [
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-
+        "model_id": "OpenGVLab/InternVL2_5-26B-AWQ",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 38,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-38B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 38,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-38B-AWQ",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 78,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-78B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 78,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-78B-AWQ",
+        "model_revision": "master"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [],
+    "stop": []
+  },
+  {
+    "version": 1,
+    "context_length": 16384,
+    "model_name": "InternVL2.5-MPO",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "InternVL 2.5 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 1,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-1B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-2B",
         "model_revision": "master"
       },
       {
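The hunk above registers ModelScope specs for InternVL2.5 from 2B through 78B in both pytorch and AWQ formats. As a minimal usage sketch (assuming a local supervisor on the default port, and assuming these specs belong to an "InternVL2.5" family, inferred from the sibling "InternVL2.5-MPO" entry), one of the new entries could be launched with the RESTful client:

# Minimal sketch, not part of the diff; endpoint and family name are assumptions.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="InternVL2.5",      # assumed family name for the specs above
    model_engine="transformers",
    model_format="pytorch",
    model_size_in_billions=8,
    quantization="none",
)
model = client.get_model(model_uid)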
@@ -4564,7 +4697,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-4B",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-4B",
         "model_revision": "master"
       },
       {
@@ -4574,7 +4707,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-4B-AWQ",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-4B-AWQ",
         "model_revision": "master"
       },
       {
@@ -4586,7 +4719,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-8B",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-8B",
         "model_revision": "master"
       },
       {
@@ -4596,7 +4729,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-8B-AWQ",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-8B-AWQ",
         "model_revision": "master"
       },
       {
@@ -4608,7 +4741,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-26B",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-26B",
         "model_revision": "master"
       },
       {
@@ -4618,7 +4751,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-26B-AWQ",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-26B-AWQ",
         "model_revision": "master"
       },
       {
@@ -4630,7 +4763,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-38B",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-38B",
         "model_revision": "master"
       },
       {
@@ -4640,7 +4773,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-38B-AWQ",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-38B-AWQ",
         "model_revision": "master"
       },
       {
@@ -4652,7 +4785,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-78B",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-78B",
         "model_revision": "master"
       },
       {
@@ -4662,7 +4795,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-78B-AWQ",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-78B-AWQ",
         "model_revision": "master"
       }
     ],
@@ -5020,7 +5153,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id":"
+        "model_id":"Qwen/Qwen2.5-VL-3B-Instruct"
       },
       {
         "model_format":"pytorch",
@@ -5029,7 +5162,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id":"
+        "model_id":"Qwen/Qwen2.5-VL-7B-Instruct"
       },
       {
         "model_format":"pytorch",
@@ -5038,7 +5171,34 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id":"
+        "model_id":"Qwen/Qwen2.5-VL-72B-Instruct"
+      },
+      {
+        "model_format":"awq",
+        "model_size_in_billions":3,
+        "quantizations":[
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"Qwen/Qwen2.5-VL-3B-Instruct-AWQ"
+      },
+      {
+        "model_format":"awq",
+        "model_size_in_billions":7,
+        "quantizations":[
+          "Int4"
+        ],
+        "model_hub": "awq",
+        "model_id":"Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":72,
+        "quantizations":[
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"Qwen/Qwen2.5-VL-72B-Instruct-AWQ"
+      },
       {
         "model_format":"mlx",
@@ -5363,8 +5523,7 @@
       "zh"
     ],
     "model_ability": [
-      "chat",
-      "reasoning"
+      "chat"
     ],
     "model_description": "DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. ",
     "model_specs": [
@@ -5498,15 +5657,13 @@
         "model_hub": "modelscope"
       }
     ],
-    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '
+    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
     "stop_token_ids": [
       1
     ],
     "stop": [
       "<|end▁of▁sentence|>"
-    ],
-    "reasoning_start_tag": "<think>",
-    "reasoning_end_tag": "</think>"
+    ]
   },
   {
     "version": 1,
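The replacement DeepSeek-V3 template above differs from the removed one chiefly in the new is_first_sp namespace flag: all system messages are now concatenated into one system prompt, where the old template let each system message overwrite the previous one. A tiny Python equivalent of that accumulation logic:

# Python equivalent of the is_first_sp logic in the template above:
# system messages are joined into one prompt, separated by blank lines.
messages = [
    {"role": "system", "content": "You are helpful."},
    {"role": "system", "content": "Answer in English."},
    {"role": "user", "content": "Hi"},
]
system_prompt = "\n\n".join(
    m["content"] for m in messages if m["role"] == "system"
)
assert system_prompt == "You are helpful.\n\nAnswer in English."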
@@ -5517,7 +5674,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "reasoning"
     ],
     "model_description": "DeepSeek-R1, which incorporates cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1 across math, code, and reasoning tasks.",
     "model_specs": [
@@ -5720,13 +5878,15 @@
         "model_hub": "modelscope"
       }
     ],
-    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and
+    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}",
     "stop_token_ids": [
       1
     ],
     "stop": [
       "<|end▁of▁sentence|>"
-    ]
+    ],
+    "reasoning_start_tag": "<think>",
+    "reasoning_end_tag": "</think>"
   },
   {
     "version": 1,
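This hunk moves the reasoning ability and the <think>/</think> tag pair from DeepSeek-V3 (earlier hunks) onto DeepSeek-R1, where they belong; the new template also strips </think> blocks from assistant history and opens generation with '<|Assistant|><think>\n'. An illustrative splitter showing what the tag pair is for (not the parser's exact implementation):

# Illustrative only: splits a finished R1-style completion into
# reasoning_content and the final answer using the tags declared above.
def split_reasoning(text: str, start: str = "<think>", end: str = "</think>"):
    if end not in text:
        return None, text
    head, _, tail = text.partition(end)
    reasoning = head.split(start, 1)[-1]  # drop the opening tag if present
    return reasoning.strip(), tail.strip()

reasoning, answer = split_reasoning("<think>2+2 is 4</think>The answer is 4.")
assert reasoning == "2+2 is 4" and answer == "The answer is 4."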
@@ -7072,6 +7232,86 @@
       "<|im_end|>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "QwQ-32B",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "reasoning"
+    ],
+    "model_description": "QwQ is the reasoning model of the Qwen series. Compared with conventional instruction-tuned models, QwQ, which is capable of thinking and reasoning, can achieve significantly enhanced performance in downstream tasks, especially hard problems. QwQ-32B is the medium-sized reasoning model, which is capable of achieving competitive performance against state-of-the-art reasoning models, e.g., DeepSeek-R1, o1-mini.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/QwQ-32B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/QwQ-32B-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_id": "mlx-community/QwQ-32B-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "fp16",
+          "Q2_k",
+          "Q3_K_M",
+          "Q4_0",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "Qwen/QwQ-32B-GGUF",
+        "model_file_name_template": "qwq-32b-{quantization}.gguf",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- '' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" and not message.tool_calls %}\n        {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n        {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n<think>\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ],
+    "reasoning_start_tag": "<think>",
+    "reasoning_end_tag": "</think>"
+  },
   {
     "version": 1,
     "context_length": 131072,
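Two of the QwQ-32B specs above are templated: the mlx model_id and the ggufv2 model_file_name_template both carry a {quantization} placeholder that is filled in with the chosen quantization when the model is resolved. Illustratively:

# Illustrative expansion of the templated fields in the QwQ-32B specs above.
quantization = "Q4_K_M"
gguf_file = "qwq-32b-{quantization}.gguf".format(quantization=quantization)
assert gguf_file == "qwq-32b-Q4_K_M.gguf"

mlx_repo = "mlx-community/QwQ-32B-{quantization}".format(quantization="4bit")
assert mlx_repo == "mlx-community/QwQ-32B-4bit"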
@@ -7866,5 +8106,84 @@
       "</s>",
       "<|im_end|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 1010000,
+    "model_name": "qwen2.5-instruct-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen2.5-1M is the long-context version of the Qwen2.5 series models, supporting a context length of up to 1M tokens.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-7B-Instruct-1M",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-14B-Instruct-1M",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151645,
+      151643
+    ],
+    "stop": [
+      "<|im_end|>",
+      "<|endoftext|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "moonlight-16b-a3b-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Kimi Muon is Scalable for LLM Training",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "moonshotai/Moonlight-16B-A3B-Instruct",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template":"{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>{%- endif -%}{%- if message['role'] == 'system' -%}<|im_system|>{%- endif -%}{%- if message['role'] == 'user' -%}<|im_user|>{%- endif -%}{%- if message['role'] == 'assistant' -%}<|im_assistant|>{%- endif -%}{{ message['role'] }}<|im_middle|>{{message['content']}}<|im_end|>{%- endfor -%}{%- if add_generation_prompt -%}<|im_assistant|>assistant<|im_middle|>{%- endif -%}",
+    "stop_token_ids": [
+      163586
+    ],
+    "stop": [
+      "<|im_end|>"
+    ]
   }
 ]
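That is the last hunk of the ModelScope registry diff. A quick sanity check one could run over the edited file (illustrative only; the authoritative validation lives in the package's pydantic schema):

# Illustrative check over llm_family_modelscope.json after edits like the above.
import json

REQUIRED_SPEC_KEYS = {"model_format", "model_size_in_billions", "quantizations"}

with open("xinference/model/llm/llm_family_modelscope.json") as f:
    families = json.load(f)

for family in families:
    assert family["model_specs"], family["model_name"]
    for spec in family["model_specs"]:
        # every spec must at least declare format, size, and quantizations
        assert REQUIRED_SPEC_KEYS <= spec.keys(), (family["model_name"], spec)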
xinference/model/llm/mlx/core.py
CHANGED
@@ -45,6 +45,7 @@ class MLXModelConfig(TypedDict, total=False):
     revision: Optional[str]
     max_gpu_memory: str
     trust_remote_code: bool
+    reasoning_content: bool


 class MLXGenerateConfig(TypedDict, total=False):
@@ -95,6 +96,7 @@ class MLXModel(LLM):
         model_config = MLXModelConfig()
         model_config.setdefault("revision", self.model_spec.model_revision)
         model_config.setdefault("trust_remote_code", True)
+        model_config.setdefault("reasoning_content", False)
         return model_config

     def _sanitize_generate_config(
@@ -153,6 +155,9 @@ class MLXModel(LLM):
         )

     def load(self):
+        reasoning_content = self._model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         kwargs = {}
         kwargs["revision"] = self._model_config.get(
             "revision", self.model_spec.model_revision
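With the two hunks above, reasoning_content becomes a recognized MLX model-config key that load() consumes before loading weights. A hedged sketch of switching it on at launch time (assuming extra launch_model kwargs reach MLXModelConfig, as the setdefault/pop pair above implies; the model name is only an example):

# Sketch, not part of the diff; model name and kwarg passthrough are assumptions.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="deepseek-r1-distill-qwen",  # example reasoning-capable model
    model_engine="mlx",
    model_format="mlx",
    model_size_in_billions=7,
    quantization="4bit",
    reasoning_content=True,  # popped in load() and fed to prepare_parse_reasoning_content
)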
@@ -445,13 +450,15 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         if stream:
             it = self.generate(full_prompt, generate_config)
             assert isinstance(it, Iterator)
-            return self._to_chat_completion_chunks(it)
+            return self._to_chat_completion_chunks(it, self.reasoning_parser)
         else:
             c = self.generate(full_prompt, generate_config)
             assert not isinstance(c, Iterator)
             if tools:
-                return self.
-
+                return self._post_process_completion(
+                    self.model_family, self.model_uid, c, self.reasoning_parser
+                )
+            return self._to_chat_completion(c, self.reasoning_parser)


 class MLXVisionModel(MLXModel, ChatModelMixin):
@@ -527,6 +534,7 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
                 text=detokenizer.last_segment,
                 token=token,
                 logprobs=logprobs,
+                from_draft=False,
                 prompt_tokens=len(input_ids),
                 prompt_tps=prompt_tps,
                 generation_tokens=n + 1,
@@ -539,6 +547,7 @@
                 text=detokenizer.last_segment,
                 token=token,
                 logprobs=logprobs,
+                from_draft=False,
                 prompt_tokens=len(input_ids),
                 prompt_tps=prompt_tps,
                 generation_tokens=n + 1,
@@ -634,5 +643,7 @@
         c = self.generate(inputs, generate_config)
         assert not isinstance(c, Iterator)
         if tools:
-            return self.
+            return self._post_process_completion(
+                self.model_family, self.model_uid, c
+            )
         return self._to_chat_completion(c)
xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py
CHANGED

@@ -23,7 +23,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
         self,
         previous_text: str,
         current_text: str,
-
+        delta_text: str,
     ) -> ChatCompletionChunkDelta:
         """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.

@@ -34,10 +34,9 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
        Yields:
            str: Extracted reasoning content chunks.
        """
-
-
-
-        delta_text = delta["content"]
+        delta = ChatCompletionChunkDelta(
+            content=delta_text,
+        )

        # Check if <think> is present in previous or delta.
        # Keep compatibility with models that don't generate <think> tokens.