xinference 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +4 -7
- xinference/client/handlers.py +3 -0
- xinference/core/chat_interface.py +6 -1
- xinference/core/model.py +2 -0
- xinference/core/scheduler.py +4 -7
- xinference/core/supervisor.py +114 -23
- xinference/core/worker.py +70 -4
- xinference/deploy/local.py +2 -1
- xinference/model/audio/core.py +11 -0
- xinference/model/audio/cosyvoice.py +16 -5
- xinference/model/audio/kokoro.py +139 -0
- xinference/model/audio/melotts.py +110 -0
- xinference/model/audio/model_spec.json +80 -0
- xinference/model/audio/model_spec_modelscope.json +18 -0
- xinference/model/audio/whisper.py +35 -10
- xinference/model/llm/llama_cpp/core.py +21 -14
- xinference/model/llm/llm_family.json +527 -1
- xinference/model/llm/llm_family.py +4 -1
- xinference/model/llm/llm_family_modelscope.json +495 -3
- xinference/model/llm/memory.py +1 -1
- xinference/model/llm/mlx/core.py +24 -6
- xinference/model/llm/transformers/core.py +9 -1
- xinference/model/llm/transformers/qwen2_audio.py +3 -1
- xinference/model/llm/transformers/qwen2_vl.py +20 -3
- xinference/model/llm/transformers/utils.py +22 -11
- xinference/model/llm/utils.py +115 -1
- xinference/model/llm/vllm/core.py +14 -4
- xinference/model/llm/vllm/xavier/block.py +3 -4
- xinference/model/llm/vllm/xavier/block_tracker.py +71 -58
- xinference/model/llm/vllm/xavier/collective.py +74 -0
- xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
- xinference/model/llm/vllm/xavier/executor.py +18 -16
- xinference/model/llm/vllm/xavier/scheduler.py +79 -63
- xinference/model/llm/vllm/xavier/test/test_xavier.py +60 -35
- xinference/model/llm/vllm/xavier/transfer.py +53 -32
- xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
- xinference/thirdparty/melo/__init__.py +0 -0
- xinference/thirdparty/melo/api.py +135 -0
- xinference/thirdparty/melo/app.py +61 -0
- xinference/thirdparty/melo/attentions.py +459 -0
- xinference/thirdparty/melo/commons.py +160 -0
- xinference/thirdparty/melo/configs/config.json +94 -0
- xinference/thirdparty/melo/data/example/metadata.list +20 -0
- xinference/thirdparty/melo/data_utils.py +413 -0
- xinference/thirdparty/melo/download_utils.py +67 -0
- xinference/thirdparty/melo/infer.py +25 -0
- xinference/thirdparty/melo/init_downloads.py +14 -0
- xinference/thirdparty/melo/losses.py +58 -0
- xinference/thirdparty/melo/main.py +36 -0
- xinference/thirdparty/melo/mel_processing.py +174 -0
- xinference/thirdparty/melo/models.py +1030 -0
- xinference/thirdparty/melo/modules.py +598 -0
- xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
- xinference/thirdparty/melo/monotonic_align/core.py +46 -0
- xinference/thirdparty/melo/preprocess_text.py +135 -0
- xinference/thirdparty/melo/split_utils.py +174 -0
- xinference/thirdparty/melo/text/__init__.py +35 -0
- xinference/thirdparty/melo/text/chinese.py +199 -0
- xinference/thirdparty/melo/text/chinese_bert.py +107 -0
- xinference/thirdparty/melo/text/chinese_mix.py +253 -0
- xinference/thirdparty/melo/text/cleaner.py +36 -0
- xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
- xinference/thirdparty/melo/text/cmudict.rep +129530 -0
- xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
- xinference/thirdparty/melo/text/english.py +284 -0
- xinference/thirdparty/melo/text/english_bert.py +39 -0
- xinference/thirdparty/melo/text/english_utils/__init__.py +0 -0
- xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
- xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
- xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
- xinference/thirdparty/melo/text/es_phonemizer/__init__.py +0 -0
- xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
- xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
- xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
- xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
- xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
- xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
- xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
- xinference/thirdparty/melo/text/fr_phonemizer/__init__.py +0 -0
- xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
- xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
- xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
- xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
- xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
- xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
- xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
- xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
- xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
- xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
- xinference/thirdparty/melo/text/french.py +94 -0
- xinference/thirdparty/melo/text/french_bert.py +39 -0
- xinference/thirdparty/melo/text/japanese.py +647 -0
- xinference/thirdparty/melo/text/japanese_bert.py +49 -0
- xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
- xinference/thirdparty/melo/text/korean.py +192 -0
- xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
- xinference/thirdparty/melo/text/spanish.py +122 -0
- xinference/thirdparty/melo/text/spanish_bert.py +39 -0
- xinference/thirdparty/melo/text/symbols.py +290 -0
- xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
- xinference/thirdparty/melo/train.py +635 -0
- xinference/thirdparty/melo/train.sh +19 -0
- xinference/thirdparty/melo/transforms.py +209 -0
- xinference/thirdparty/melo/utils.py +424 -0
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.1eb206d1.js → main.b0936c54.js} +3 -3
- xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/METADATA +37 -27
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/RECORD +122 -45
- xinference/web/ui/build/static/js/main.1eb206d1.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +0 -1
- /xinference/web/ui/build/static/js/{main.1eb206d1.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/LICENSE +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/WHEEL +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/entry_points.txt +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json CHANGED

@@ -4769,10 +4769,11 @@
         "model_format":"mlx",
         "model_size_in_billions":2,
         "quantizations":[
+          "4bit",
           "8bit"
         ],
         "model_hub": "modelscope",
-        "model_id":"
+        "model_id":"mlx-community/Qwen2-VL-2B-Instruct-{quantization}",
         "model_revision":"master"
       },
       {
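The `{quantization}` placeholder in `model_id` is substituted with the chosen quantization when the model is fetched; a minimal sketch of that substitution (the helper name is hypothetical, not xinference's actual API):

# Hypothetical helper: expand a spec's "{quantization}" placeholder into the
# concrete ModelScope repo id that gets downloaded.
def resolve_model_id(model_id_template: str, quantization: str) -> str:
    return model_id_template.format(quantization=quantization)

assert (
    resolve_model_id("mlx-community/Qwen2-VL-2B-Instruct-{quantization}", "4bit")
    == "mlx-community/Qwen2-VL-2B-Instruct-4bit"
)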
@@ -4825,6 +4826,97 @@
       "<|endoftext|>"
     ]
   },
+  {
+    "version":1,
+    "context_length":128000,
+    "model_name":"qwen2.5-vl-instruct",
+    "model_lang":[
+      "en",
+      "zh"
+    ],
+    "model_ability":[
+      "chat",
+      "vision"
+    ],
+    "model_description":"Qwen2.5-VL: Qwen2.5-VL is the latest version of the vision language models in the Qwen model familities.",
+    "model_specs":[
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":3,
+        "quantizations":[
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"qwen/Qwen2.5-VL-3B-Instruct"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":7,
+        "quantizations":[
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"qwen/Qwen2.5-VL-7B-Instruct"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":72,
+        "quantizations":[
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"qwen/Qwen2.5-VL-72B-Instruct"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":3,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":7,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":72,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}"
+      }
+    ],
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+    "stop_token_ids": [
+      151645,
+      151643
+    ],
+    "stop": [
+      "<|im_end|>",
+      "<|endoftext|>"
+    ]
+  },
   {
     "version": 1,
     "context_length": 32768,
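With this entry registered, the model becomes launchable by name. A sketch of launching it through the Python client, assuming a locally running endpoint; the URL, engine name, size, and quantization below are assumptions, not values taken from the diff:

# Hedged sketch: launch the newly registered qwen2.5-vl-instruct by name.
# Endpoint URL, model_engine, size, and quantization are assumptions; recent
# xinference versions require model_engine for LLMs.
from xinference.client import Client

client = Client("http://localhost:9997")
model_uid = client.launch_model(
    model_name="qwen2.5-vl-instruct",
    model_engine="MLX",
    model_format="mlx",
    model_size_in_billions=7,
    quantization="4bit",
)
model = client.get_model(model_uid)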
@@ -5558,7 +5650,7 @@
           "q8_0"
         ],
         "model_id": "qwen/Qwen2.5-7B-Instruct-GGUF",
-        "model_file_name_template": "
+        "model_file_name_template": "qwen2.5-7b-instruct-{quantization}.gguf",
        "model_hub": "modelscope",
         "model_file_name_split_template": "qwen2.5-7b-instruct-{quantization}-{part}.gguf",
         "quantization_parts": {
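Single-file quantizations resolve through the restored `model_file_name_template`; quantizations listed in `quantization_parts` go through `model_file_name_split_template` instead. A sketch of both expansions (the concrete values are illustrative):

# Sketch: expanding the two GGUF file-name templates from the spec above.
single = "qwen2.5-7b-instruct-{quantization}.gguf".format(quantization="q4_k_m")
# -> "qwen2.5-7b-instruct-q4_k_m.gguf"

split = "qwen2.5-7b-instruct-{quantization}-{part}.gguf".format(
    quantization="q8_0", part="00001-of-00002"
)
# -> "qwen2.5-7b-instruct-q8_0-00001-of-00002.gguf"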
@@ -6433,6 +6525,326 @@
       "<|im_end|>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "deepseek-r1-distill-qwen",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF",
+        "model_file_name_template": "DeepSeek-R1-Distill-Qwen-1.5B-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_id": "mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "tclf90/deepseek-r1-distill-qwen-7b-gptq-int4",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0",
+          "F16"
+        ],
+        "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF",
+        "model_file_name_template": "DeepSeek-R1-Distill-Qwen-7B-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit"
+        ],
+        "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-7B-MLX-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0",
+          "F16"
+        ],
+        "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF",
+        "model_file_name_template": "DeepSeek-R1-Distill-Qwen-14B-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit"
+        ],
+        "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-14B-MLX-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "tclf90/deepseek-r1-distill-qwen-32b-gptq-int4",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0",
+          "F16"
+        ],
+        "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF",
+        "model_file_name_template": "DeepSeek-R1-Distill-Qwen-32B-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "2bit",
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit"
+        ],
+        "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-32B-MLX-{quantization}",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
+    "stop_token_ids": [
+      151643
+    ],
+    "stop": [
+      "<|end▁of▁sentence|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "deepseek-r1-distill-llama",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "deepseek-r1-distill-llama is distilled from DeepSeek-R1 based on Llama",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0",
+          "F16"
+        ],
+        "model_id": "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF",
+        "model_file_name_template": "DeepSeek-R1-Distill-Llama-8B-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_id": "okwinds/DeepSeek-R1-Distill-Llama-8B-MLX-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0",
+          "F16"
+        ],
+        "quantization_parts": {
+          "Q6_K": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "Q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "F16": [
+            "00001-of-00003",
+            "00002-of-00003",
+            "00003-of-00003"
+          ]
+        },
+        "model_id": "unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF",
+        "model_file_name_template": "DeepSeek-R1-Distill-Qwen-7B-{quantization}.gguf",
+        "model_file_name_split_template": "DeepSeek-R1-Distill-Llama-70B-{quantization}/DeepSeek-R1-Distill-Llama-70B-{quantization}-{part}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit"
+        ],
+        "model_id": "okwinds/DeepSeek-R1-Distill-Llama-70B-MLX-{quantization}",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
+    "stop_token_ids": [
+      151643
+    ],
+    "stop": [
+      "<|end▁of▁sentence|>"
+    ]
+  },
   {
     "version": 1,
     "context_length": 8192,
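A notable detail of the DeepSeek-R1 chat template above: when re-serializing earlier assistant turns, it discards everything up to `</think>`, so prior reasoning traces are not fed back into the context. The core of that logic, extracted into plain Python for readability:

# The template's content.split('</think>')[-1] step, in isolation:
content = "<think>long chain of thought...</think>The answer is 42."
if "</think>" in content:
    content = content.split("</think>")[-1]
assert content == "The answer is 42."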
@@ -6723,7 +7135,7 @@
       "<|endoftext|>"
     ]
   },
-
+  {
     "version": 1,
     "context_length": 32768,
    "model_name": "marco-o1",
@@ -6821,5 +7233,85 @@
       "<|user|>",
       "<|observation|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "internlm3-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "InternLM3 has open-sourced an 8-billion parameter instruction model, InternLM3-8B-Instruct, designed for general-purpose usage and advanced reasoning.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct-gptq-int4",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct-awq",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct-gguf",
+        "model_file_name_template": "internlm3-8b-instruct-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "4bit"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/internlm3-8b-instruct-{quantization}"
+      }
+    ],
+    "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      2,
+      128131
+    ],
+    "stop": [
+      "</s>",
+      "<|im_end|>"
+    ]
   }
 ]
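The internlm3 chat template is simple enough to render by hand; a sketch with jinja2, with illustrative message content:

# Sketch: rendering the internlm3-instruct chat_template with jinja2.
from jinja2 import Template

template = Template(
    "{{ bos_token }}{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)
out = template.render(
    bos_token="<s>",
    messages=[{"role": "user", "content": "Hi"}],
    add_generation_prompt=True,
)
assert out == "<s><|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n"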
xinference/model/llm/memory.py CHANGED

xinference/model/llm/mlx/core.py CHANGED
@@ -31,7 +31,12 @@ from ....types import (
 )
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+    generate_completion_chunk,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -103,10 +108,10 @@ class MLXModel(LLM):
         # default config is adapted from
         # https://github.com/ml-explore/mlx-examples/blob/f212b770d8b5143e23102eda20400ae43340f844/llms/mlx_lm/utils.py#L129
         generate_config.setdefault("temperature", 0.0)
+        generate_config.setdefault("logit_bias", None)
         generate_config.setdefault("repetition_penalty", None)
         generate_config.setdefault("repetition_context_size", 20)
         generate_config.setdefault("top_p", 1.0)
-        generate_config.setdefault("logit_bias", None)
         return generate_config
 
     def _load_model(self, **kwargs):
@@ -199,14 +204,24 @@ class MLXModel(LLM):
         return prompt
 
     def _generate_stream_inner(self, **kwargs):
-        from mlx_lm.utils import make_sampler, stream_generate
+        from mlx_lm.utils import make_logits_processors, make_sampler, stream_generate
 
         sampler = make_sampler(
             temp=kwargs.pop("temperature"), top_p=kwargs.pop("top_p")
         )
         prompt_token_ids = kwargs.pop("prompt_token_ids")
+        logits_processors = make_logits_processors(
+            logit_bias=kwargs.pop("logits_bias", None),
+            repetition_penalty=kwargs.pop("repetition_penalty"),
+            repetition_context_size=kwargs.pop("repetition_context_size"),
+        )
         yield from stream_generate(
-            self._model,
+            self._model,
+            self._tokenizer,
+            prompt_token_ids,
+            sampler=sampler,
+            logits_processors=logits_processors,
+            **kwargs,
         )
 
     def _prepare_inputs(
@@ -414,8 +429,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
-        if tools
-
+        if tools:
+            if model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
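The reworked call above follows the newer mlx_lm streaming API, where sampling and logit-processing options travel as callables rather than as keyword arguments on stream_generate. A standalone sketch of the same pattern; the model repo and generation settings are assumptions, the import locations mirror the diff and may differ in other mlx_lm versions:

# Hedged sketch of the mlx_lm streaming pattern used above.
from mlx_lm import load
from mlx_lm.utils import make_logits_processors, make_sampler, stream_generate

model, tokenizer = load("mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-4bit")
sampler = make_sampler(temp=0.7, top_p=0.9)
logits_processors = make_logits_processors(
    logit_bias=None, repetition_penalty=1.1, repetition_context_size=20
)
prompt_token_ids = tokenizer.encode("Hello")
for response in stream_generate(
    model,
    tokenizer,
    prompt_token_ids,
    sampler=sampler,
    logits_processors=logits_processors,
):
    # The yielded object carries the decoded chunk; attribute shape varies
    # across mlx_lm versions (here assumed to expose .text).
    print(response.text, end="", flush=True)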
xinference/model/llm/transformers/core.py CHANGED

@@ -39,7 +39,12 @@ from ....types import (
 from ...utils import select_device
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    LLAMA3_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+)
 from .utils import get_context_length, get_max_src_len, pad_prefill_tokens
 
 logger = logging.getLogger(__name__)
@@ -62,6 +67,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "MiniCPM-V-2.6",
     "glm-4v",
     "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
     "qwen2-audio",
     "qwen2-audio-instruct",
     "deepseek-v2",
@@ -681,6 +687,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             or model_family in LLAMA3_TOOL_CALL_FAMILY
         ):
             full_context_kwargs["tools"] = tools
+        elif tools and model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+            self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages,
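`_tools_to_messages_for_deepseek` itself lands in `xinference/model/llm/utils.py` (part of the +115 lines there, not shown in this excerpt). Since the DeepSeek-R1 chat template takes no `tools` keyword, the general idea is to fold the tool schemas into the message list before templating; a purely hypothetical sketch of that shape, not xinference's actual code:

# Purely hypothetical: inject tool schemas into the system turn for a chat
# template that has no native `tools` parameter.
import json

def tools_to_messages(messages: list, tools: list) -> None:
    tool_text = "You may call the following tools:\n" + json.dumps(tools, indent=2)
    if messages and messages[0]["role"] == "system":
        messages[0]["content"] += "\n\n" + tool_text
    else:
        messages.insert(0, {"role": "system", "content": tool_text})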
xinference/model/llm/transformers/qwen2_audio.py CHANGED

@@ -55,9 +55,9 @@ class Qwen2AudioChatModel(PytorchChatModel):
 
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
-        self._device = device
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if device == "cuda" else device
+        self._device = device
 
         self._processor = AutoProcessor.from_pretrained(
             self.model_path,
@@ -105,6 +105,8 @@ class Qwen2AudioChatModel(PytorchChatModel):
         inputs = self._processor(
             text=text, audios=audios, return_tensors="pt", padding=True
         )
+        # Make sure that the inputs and the model are on the same device.
+        inputs.data = {k: v.to(self._device) for k, v in inputs.data.items()}
         inputs.input_ids = inputs.input_ids.to(self._device)
         generate_config = generate_config if generate_config else {}
         stream = generate_config.get("stream", False) if generate_config else False
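The two added lines close a device-mismatch gap: the processor's output dict can carry more tensors than `input_ids` (attention masks, audio features), and every one of them must live where the model's weights live. The same guard in isolation, with a hypothetical helper name:

# Hypothetical standalone version of the device guard added above.
import torch

def move_batch_to_device(batch: dict, device) -> dict:
    # Move every tensor in a processor batch to the target device, leaving
    # non-tensor entries untouched.
    return {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}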