xinference 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (124)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +4 -7
  3. xinference/client/handlers.py +3 -0
  4. xinference/core/chat_interface.py +6 -1
  5. xinference/core/model.py +2 -0
  6. xinference/core/scheduler.py +4 -7
  7. xinference/core/supervisor.py +114 -23
  8. xinference/core/worker.py +70 -4
  9. xinference/deploy/local.py +2 -1
  10. xinference/model/audio/core.py +11 -0
  11. xinference/model/audio/cosyvoice.py +16 -5
  12. xinference/model/audio/kokoro.py +139 -0
  13. xinference/model/audio/melotts.py +110 -0
  14. xinference/model/audio/model_spec.json +80 -0
  15. xinference/model/audio/model_spec_modelscope.json +18 -0
  16. xinference/model/audio/whisper.py +35 -10
  17. xinference/model/llm/llama_cpp/core.py +21 -14
  18. xinference/model/llm/llm_family.json +527 -1
  19. xinference/model/llm/llm_family.py +4 -1
  20. xinference/model/llm/llm_family_modelscope.json +495 -3
  21. xinference/model/llm/memory.py +1 -1
  22. xinference/model/llm/mlx/core.py +24 -6
  23. xinference/model/llm/transformers/core.py +9 -1
  24. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  25. xinference/model/llm/transformers/qwen2_vl.py +20 -3
  26. xinference/model/llm/transformers/utils.py +22 -11
  27. xinference/model/llm/utils.py +115 -1
  28. xinference/model/llm/vllm/core.py +14 -4
  29. xinference/model/llm/vllm/xavier/block.py +3 -4
  30. xinference/model/llm/vllm/xavier/block_tracker.py +71 -58
  31. xinference/model/llm/vllm/xavier/collective.py +74 -0
  32. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  33. xinference/model/llm/vllm/xavier/executor.py +18 -16
  34. xinference/model/llm/vllm/xavier/scheduler.py +79 -63
  35. xinference/model/llm/vllm/xavier/test/test_xavier.py +60 -35
  36. xinference/model/llm/vllm/xavier/transfer.py +53 -32
  37. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  38. xinference/thirdparty/melo/__init__.py +0 -0
  39. xinference/thirdparty/melo/api.py +135 -0
  40. xinference/thirdparty/melo/app.py +61 -0
  41. xinference/thirdparty/melo/attentions.py +459 -0
  42. xinference/thirdparty/melo/commons.py +160 -0
  43. xinference/thirdparty/melo/configs/config.json +94 -0
  44. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  45. xinference/thirdparty/melo/data_utils.py +413 -0
  46. xinference/thirdparty/melo/download_utils.py +67 -0
  47. xinference/thirdparty/melo/infer.py +25 -0
  48. xinference/thirdparty/melo/init_downloads.py +14 -0
  49. xinference/thirdparty/melo/losses.py +58 -0
  50. xinference/thirdparty/melo/main.py +36 -0
  51. xinference/thirdparty/melo/mel_processing.py +174 -0
  52. xinference/thirdparty/melo/models.py +1030 -0
  53. xinference/thirdparty/melo/modules.py +598 -0
  54. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  55. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  56. xinference/thirdparty/melo/preprocess_text.py +135 -0
  57. xinference/thirdparty/melo/split_utils.py +174 -0
  58. xinference/thirdparty/melo/text/__init__.py +35 -0
  59. xinference/thirdparty/melo/text/chinese.py +199 -0
  60. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  61. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  62. xinference/thirdparty/melo/text/cleaner.py +36 -0
  63. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  64. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  65. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  66. xinference/thirdparty/melo/text/english.py +284 -0
  67. xinference/thirdparty/melo/text/english_bert.py +39 -0
  68. xinference/thirdparty/melo/text/english_utils/__init__.py +0 -0
  69. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  70. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  71. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  72. xinference/thirdparty/melo/text/es_phonemizer/__init__.py +0 -0
  73. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  74. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  75. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  76. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  77. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  78. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  79. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  80. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  81. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  82. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  83. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  84. xinference/thirdparty/melo/text/fr_phonemizer/__init__.py +0 -0
  85. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  86. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  87. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  88. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  89. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  90. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  91. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  92. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  93. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  94. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  95. xinference/thirdparty/melo/text/french.py +94 -0
  96. xinference/thirdparty/melo/text/french_bert.py +39 -0
  97. xinference/thirdparty/melo/text/japanese.py +647 -0
  98. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  99. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  100. xinference/thirdparty/melo/text/korean.py +192 -0
  101. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  102. xinference/thirdparty/melo/text/spanish.py +122 -0
  103. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  104. xinference/thirdparty/melo/text/symbols.py +290 -0
  105. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  106. xinference/thirdparty/melo/train.py +635 -0
  107. xinference/thirdparty/melo/train.sh +19 -0
  108. xinference/thirdparty/melo/transforms.py +209 -0
  109. xinference/thirdparty/melo/utils.py +424 -0
  110. xinference/types.py +2 -0
  111. xinference/web/ui/build/asset-manifest.json +3 -3
  112. xinference/web/ui/build/index.html +1 -1
  113. xinference/web/ui/build/static/js/{main.1eb206d1.js → main.b0936c54.js} +3 -3
  114. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  115. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  116. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/METADATA +37 -27
  117. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/RECORD +122 -45
  118. xinference/web/ui/build/static/js/main.1eb206d1.js.map +0 -1
  119. xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +0 -1
  120. /xinference/web/ui/build/static/js/{main.1eb206d1.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  121. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/LICENSE +0 -0
  122. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/WHEEL +0 -0
  123. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/entry_points.txt +0 -0
  124. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/top_level.txt +0 -0
@@ -4769,10 +4769,11 @@
   "model_format":"mlx",
   "model_size_in_billions":2,
   "quantizations":[
+    "4bit",
     "8bit"
   ],
   "model_hub": "modelscope",
-  "model_id":"okwinds/Qwen2-VL-2B-Instruct-MLX-8bit",
+  "model_id":"mlx-community/Qwen2-VL-2B-Instruct-{quantization}",
   "model_revision":"master"
 },
 {
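The replacement model_id above uses a {quantization} placeholder, so a single spec row now covers both quantizations. A minimal sketch of how such a template expands; resolve_model_id below is an illustrative helper, not part of xinference's API:

    def resolve_model_id(template: str, quantization: str) -> str:
        # str.format substitutes the placeholder with the chosen quantization.
        return template.format(quantization=quantization)

    for q in ("4bit", "8bit"):
        print(resolve_model_id("mlx-community/Qwen2-VL-2B-Instruct-{quantization}", q))
    # mlx-community/Qwen2-VL-2B-Instruct-4bit
    # mlx-community/Qwen2-VL-2B-Instruct-8bit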
@@ -4825,6 +4826,97 @@
     "<|endoftext|>"
   ]
 },
+{
+  "version":1,
+  "context_length":128000,
+  "model_name":"qwen2.5-vl-instruct",
+  "model_lang":[
+    "en",
+    "zh"
+  ],
+  "model_ability":[
+    "chat",
+    "vision"
+  ],
+  "model_description":"Qwen2.5-VL: Qwen2.5-VL is the latest version of the vision language models in the Qwen model familities.",
+  "model_specs":[
+    {
+      "model_format":"pytorch",
+      "model_size_in_billions":3,
+      "quantizations":[
+        "none"
+      ],
+      "model_hub": "modelscope",
+      "model_id":"qwen/Qwen2.5-VL-3B-Instruct"
+    },
+    {
+      "model_format":"pytorch",
+      "model_size_in_billions":7,
+      "quantizations":[
+        "none"
+      ],
+      "model_hub": "modelscope",
+      "model_id":"qwen/Qwen2.5-VL-7B-Instruct"
+    },
+    {
+      "model_format":"pytorch",
+      "model_size_in_billions":72,
+      "quantizations":[
+        "none"
+      ],
+      "model_hub": "modelscope",
+      "model_id":"qwen/Qwen2.5-VL-72B-Instruct"
+    },
+    {
+      "model_format":"mlx",
+      "model_size_in_billions":3,
+      "quantizations":[
+        "3bit",
+        "4bit",
+        "6bit",
+        "8bit",
+        "bf16"
+      ],
+      "model_hub": "modelscope",
+      "model_id":"mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}"
+    },
+    {
+      "model_format":"mlx",
+      "model_size_in_billions":7,
+      "quantizations":[
+        "3bit",
+        "4bit",
+        "6bit",
+        "8bit",
+        "bf16"
+      ],
+      "model_hub": "modelscope",
+      "model_id":"mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}"
+    },
+    {
+      "model_format":"mlx",
+      "model_size_in_billions":72,
+      "quantizations":[
+        "3bit",
+        "4bit",
+        "6bit",
+        "8bit",
+        "bf16"
+      ],
+      "model_hub": "modelscope",
+      "model_id":"mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}"
+    }
+  ],
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+  "stop_token_ids": [
+    151645,
+    151643
+  ],
+  "stop": [
+    "<|im_end|>",
+    "<|endoftext|>"
+  ]
+},
 {
   "version": 1,
   "context_length": 32768,
@@ -5558,7 +5650,7 @@
     "q8_0"
   ],
   "model_id": "qwen/Qwen2.5-7B-Instruct-GGUF",
-  "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf",
+  "model_file_name_template": "qwen2.5-7b-instruct-{quantization}.gguf",
   "model_hub": "modelscope",
   "model_file_name_split_template": "qwen2.5-7b-instruct-{quantization}-{part}.gguf",
   "quantization_parts": {
@@ -6433,6 +6525,326 @@
     "<|im_end|>"
   ]
 },
+{
+  "version": 1,
+  "context_length": 131072,
+  "model_name": "deepseek-r1-distill-qwen",
+  "model_lang": [
+    "en",
+    "zh"
+  ],
+  "model_ability": [
+    "chat"
+  ],
+  "model_description": "deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen",
+  "model_specs": [
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": "1_5",
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "ggufv2",
+      "model_size_in_billions": "1_5",
+      "quantizations": [
+        "Q2_K",
+        "Q2_K_L",
+        "Q3_K_M",
+        "Q4_K_M",
+        "Q5_K_M",
+        "Q6_K",
+        "Q8_0"
+      ],
+      "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF",
+      "model_file_name_template": "DeepSeek-R1-Distill-Qwen-1.5B-{quantization}.gguf",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "mlx",
+      "model_size_in_billions": "1_5",
+      "quantizations": [
+        "3bit",
+        "4bit",
+        "6bit",
+        "8bit",
+        "bf16"
+      ],
+      "model_id": "mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-{quantization}",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": 7,
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "gptq",
+      "model_size_in_billions": 7,
+      "quantizations": [
+        "Int4"
+      ],
+      "model_id": "tclf90/deepseek-r1-distill-qwen-7b-gptq-int4",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "ggufv2",
+      "model_size_in_billions": 7,
+      "quantizations": [
+        "Q2_K",
+        "Q2_K_L",
+        "Q3_K_M",
+        "Q4_K_M",
+        "Q5_K_M",
+        "Q6_K",
+        "Q8_0",
+        "F16"
+      ],
+      "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF",
+      "model_file_name_template": "DeepSeek-R1-Distill-Qwen-7B-{quantization}.gguf",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "mlx",
+      "model_size_in_billions": 7,
+      "quantizations": [
+        "3bit",
+        "4bit",
+        "6bit",
+        "8bit"
+      ],
+      "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-7B-MLX-{quantization}",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": 14,
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "ggufv2",
+      "model_size_in_billions": 14,
+      "quantizations": [
+        "Q2_K",
+        "Q2_K_L",
+        "Q3_K_M",
+        "Q4_K_M",
+        "Q5_K_M",
+        "Q6_K",
+        "Q8_0",
+        "F16"
+      ],
+      "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF",
+      "model_file_name_template": "DeepSeek-R1-Distill-Qwen-14B-{quantization}.gguf",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "mlx",
+      "model_size_in_billions": 14,
+      "quantizations": [
+        "3bit",
+        "4bit",
+        "6bit",
+        "8bit"
+      ],
+      "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-14B-MLX-{quantization}",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "gptq",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "Int4"
+      ],
+      "model_id": "tclf90/deepseek-r1-distill-qwen-32b-gptq-int4",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "ggufv2",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "Q2_K",
+        "Q2_K_L",
+        "Q3_K_M",
+        "Q4_K_M",
+        "Q5_K_M",
+        "Q6_K",
+        "Q8_0",
+        "F16"
+      ],
+      "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF",
+      "model_file_name_template": "DeepSeek-R1-Distill-Qwen-32B-{quantization}.gguf",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "mlx",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "2bit",
+        "3bit",
+        "4bit",
+        "6bit",
+        "8bit"
+      ],
+      "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-32B-MLX-{quantization}",
+      "model_hub": "modelscope"
+    }
+  ],
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
+  "stop_token_ids": [
+    151643
+  ],
+  "stop": [
+    "<|end▁of▁sentence|>"
+  ]
+},
+{
+  "version": 1,
+  "context_length": 131072,
+  "model_name": "deepseek-r1-distill-llama",
+  "model_lang": [
+    "en",
+    "zh"
+  ],
+  "model_ability": [
+    "chat"
+  ],
+  "model_description": "deepseek-r1-distill-llama is distilled from DeepSeek-R1 based on Llama",
+  "model_specs": [
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": 8,
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "ggufv2",
+      "model_size_in_billions": 8,
+      "quantizations": [
+        "Q2_K",
+        "Q2_K_L",
+        "Q3_K_M",
+        "Q4_K_M",
+        "Q5_K_M",
+        "Q6_K",
+        "Q8_0",
+        "F16"
+      ],
+      "model_id": "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF",
+      "model_file_name_template": "DeepSeek-R1-Distill-Llama-8B-{quantization}.gguf",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "mlx",
+      "model_size_in_billions": 8,
+      "quantizations": [
+        "3bit",
+        "4bit",
+        "6bit",
+        "8bit",
+        "bf16"
+      ],
+      "model_id": "okwinds/DeepSeek-R1-Distill-Llama-8B-MLX-{quantization}",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": 70,
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "ggufv2",
+      "model_size_in_billions": 70,
+      "quantizations": [
+        "Q2_K",
+        "Q2_K_L",
+        "Q3_K_M",
+        "Q4_K_M",
+        "Q5_K_M",
+        "Q6_K",
+        "Q8_0",
+        "F16"
+      ],
+      "quantization_parts": {
+        "Q6_K": [
+          "00001-of-00002",
+          "00002-of-00002"
+        ],
+        "Q8_0": [
+          "00001-of-00002",
+          "00002-of-00002"
+        ],
+        "F16": [
+          "00001-of-00003",
+          "00002-of-00003",
+          "00003-of-00003"
+        ]
+      },
+      "model_id": "unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF",
+      "model_file_name_template": "DeepSeek-R1-Distill-Qwen-7B-{quantization}.gguf",
+      "model_file_name_split_template": "DeepSeek-R1-Distill-Llama-70B-{quantization}/DeepSeek-R1-Distill-Llama-70B-{quantization}-{part}.gguf",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "mlx",
+      "model_size_in_billions": 70,
+      "quantizations": [
+        "3bit",
+        "4bit",
+        "6bit",
+        "8bit"
+      ],
+      "model_id": "okwinds/DeepSeek-R1-Distill-Llama-70B-MLX-{quantization}",
+      "model_hub": "modelscope"
+    }
+  ],
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
+  "stop_token_ids": [
+    151643
+  ],
+  "stop": [
+    "<|end▁of▁sentence|>"
+  ]
+},
 {
   "version": 1,
   "context_length": 8192,
@@ -6723,7 +7135,7 @@
     "<|endoftext|>"
   ]
 },
- {
+ {
   "version": 1,
   "context_length": 32768,
   "model_name": "marco-o1",
@@ -6821,5 +7233,85 @@
     "<|user|>",
     "<|observation|>"
   ]
+},
+{
+  "version": 1,
+  "context_length": 32768,
+  "model_name": "internlm3-instruct",
+  "model_lang": [
+    "en",
+    "zh"
+  ],
+  "model_ability": [
+    "chat",
+    "tools"
+  ],
+  "model_description": "InternLM3 has open-sourced an 8-billion parameter instruction model, InternLM3-8B-Instruct, designed for general-purpose usage and advanced reasoning.",
+  "model_specs": [
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": 8,
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "gptq",
+      "model_size_in_billions": 8,
+      "quantizations": [
+        "Int4"
+      ],
+      "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct-gptq-int4",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "awq",
+      "model_size_in_billions": 8,
+      "quantizations": [
+        "Int4"
+      ],
+      "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct-awq",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format": "ggufv2",
+      "model_size_in_billions": 8,
+      "quantizations": [
+        "q2_k",
+        "q3_k_m",
+        "q4_0",
+        "q4_k_m",
+        "q5_0",
+        "q5_k_m",
+        "q6_k",
+        "q8_0"
+      ],
+      "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct-gguf",
+      "model_file_name_template": "internlm3-8b-instruct-{quantization}.gguf",
+      "model_hub": "modelscope"
+    },
+    {
+      "model_format":"mlx",
+      "model_size_in_billions":8,
+      "quantizations":[
+        "4bit"
+      ],
+      "model_hub": "modelscope",
+      "model_id":"mlx-community/internlm3-8b-instruct-{quantization}"
+    }
+  ],
+  "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "stop_token_ids": [
+    2,
+    128131
+  ],
+  "stop": [
+    "</s>",
+    "<|im_end|>"
+  ]
 }
 ]
@@ -14,7 +14,7 @@
 
 # NOTE:
 #
-# The algorithum is ported from https://github.com/RahulSChand/gpu_poor
+# The algorithm is ported from https://github.com/RahulSChand/gpu_poor
 #
 # Improvement:
 #
@@ -31,7 +31,12 @@ from ....types import (
 )
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin, generate_completion_chunk
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+    generate_completion_chunk,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -103,10 +108,10 @@ class MLXModel(LLM):
         # default config is adapted from
         # https://github.com/ml-explore/mlx-examples/blob/f212b770d8b5143e23102eda20400ae43340f844/llms/mlx_lm/utils.py#L129
         generate_config.setdefault("temperature", 0.0)
+        generate_config.setdefault("logit_bias", None)
         generate_config.setdefault("repetition_penalty", None)
         generate_config.setdefault("repetition_context_size", 20)
         generate_config.setdefault("top_p", 1.0)
-        generate_config.setdefault("logit_bias", None)
         return generate_config
 
     def _load_model(self, **kwargs):
@@ -199,14 +204,24 @@ class MLXModel(LLM):
         return prompt
 
     def _generate_stream_inner(self, **kwargs):
-        from mlx_lm.utils import make_sampler, stream_generate
+        from mlx_lm.utils import make_logits_processors, make_sampler, stream_generate
 
         sampler = make_sampler(
             temp=kwargs.pop("temperature"), top_p=kwargs.pop("top_p")
         )
         prompt_token_ids = kwargs.pop("prompt_token_ids")
+        logits_processors = make_logits_processors(
+            logit_bias=kwargs.pop("logits_bias", None),
+            repetition_penalty=kwargs.pop("repetition_penalty"),
+            repetition_context_size=kwargs.pop("repetition_context_size"),
+        )
         yield from stream_generate(
-            self._model, self._tokenizer, prompt_token_ids, sampler=sampler, **kwargs
+            self._model,
+            self._tokenizer,
+            prompt_token_ids,
+            sampler=sampler,
+            logits_processors=logits_processors,
+            **kwargs,
         )
 
     def _prepare_inputs(
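For reference, a hedged sketch of the mlx_lm pieces this patch wires together, assuming an mlx_lm version that exposes these helpers from mlx_lm.utils as the diff does; depending on the version, stream_generate yields plain text segments or response objects:

    from mlx_lm.utils import load, make_logits_processors, make_sampler, stream_generate

    model, tokenizer = load("mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-4bit")
    sampler = make_sampler(temp=0.0, top_p=1.0)
    logits_processors = make_logits_processors(
        logit_bias=None,             # optional {token_id: bias} map
        repetition_penalty=1.1,
        repetition_context_size=20,  # how many recent tokens the penalty inspects
    )
    for chunk in stream_generate(
        model, tokenizer, "Hello", sampler=sampler, logits_processors=logits_processors
    ):
        print(chunk, end="", flush=True)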
@@ -414,8 +429,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
-        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            full_context_kwargs["tools"] = tools
+        if tools:
+            if model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
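_tools_to_messages_for_deepseek is added in xinference/model/llm/utils.py (+115 lines, not shown in this diff). A hypothetical illustration of the idea only: DeepSeek's chat template takes no tools kwarg, so the tool schemas are folded into the message list before templating; the real helper's behavior may differ:

    import json

    def tools_to_messages(messages: list, tools: list) -> None:
        # Hypothetical: serialize the tool schemas and prepend them as a
        # system message so the chat template sees them as ordinary text.
        tool_text = "You may call the following tools:\n" + json.dumps(tools, indent=2)
        messages.insert(0, {"role": "system", "content": tool_text})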
@@ -39,7 +39,12 @@ from ....types import (
 from ...utils import select_device
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import LLAMA3_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelMixin
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    LLAMA3_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+)
 from .utils import get_context_length, get_max_src_len, pad_prefill_tokens
 
 logger = logging.getLogger(__name__)
@@ -62,6 +67,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "MiniCPM-V-2.6",
     "glm-4v",
     "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
     "qwen2-audio",
     "qwen2-audio-instruct",
     "deepseek-v2",
@@ -681,6 +687,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             or model_family in LLAMA3_TOOL_CALL_FAMILY
         ):
             full_context_kwargs["tools"] = tools
+        elif tools and model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+            self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages,
@@ -55,9 +55,9 @@ class Qwen2AudioChatModel(PytorchChatModel):
 
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
-        self._device = device
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if device == "cuda" else device
+        self._device = device
 
         self._processor = AutoProcessor.from_pretrained(
             self.model_path,
@@ -105,6 +105,8 @@ class Qwen2AudioChatModel(PytorchChatModel):
         inputs = self._processor(
             text=text, audios=audios, return_tensors="pt", padding=True
         )
+        # Make sure that the inputs and the model are on the same device.
+        inputs.data = {k: v.to(self._device) for k, v in inputs.data.items()}
         inputs.input_ids = inputs.input_ids.to(self._device)
         generate_config = generate_config if generate_config else {}
         stream = generate_config.get("stream", False) if generate_config else False
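The fix above is the standard transformers device-alignment pattern: a processor's BatchFeature tensors start on CPU, and every one of them, not just input_ids, must reach the model's device before generate(). A minimal sketch in isolation, assuming torch and transformers are installed:

    import torch
    from transformers.feature_extraction_utils import BatchFeature

    features = BatchFeature({"input_ids": torch.tensor([[1, 2, 3]])})
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # BatchFeature also exposes .to(device); the patch maps over .data explicitly.
    features.data = {k: v.to(device) for k, v in features.data.items()}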