xinference 0.11.3__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +143 -6
- xinference/client/restful/restful_client.py +144 -5
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +48 -28
- xinference/core/model.py +160 -19
- xinference/core/scheduler.py +446 -0
- xinference/core/supervisor.py +99 -24
- xinference/core/worker.py +68 -2
- xinference/deploy/cmdline.py +86 -2
- xinference/deploy/test/test_cmdline.py +19 -10
- xinference/isolation.py +9 -2
- xinference/model/audio/__init__.py +14 -1
- xinference/model/audio/chattts.py +84 -0
- xinference/model/audio/core.py +22 -4
- xinference/model/audio/custom.py +6 -4
- xinference/model/audio/model_spec.json +20 -0
- xinference/model/audio/model_spec_modelscope.json +20 -0
- xinference/model/llm/__init__.py +38 -2
- xinference/model/llm/llm_family.json +509 -1
- xinference/model/llm/llm_family.py +86 -1
- xinference/model/llm/llm_family_csghub.json +66 -0
- xinference/model/llm/llm_family_modelscope.json +411 -2
- xinference/model/llm/pytorch/chatglm.py +20 -13
- xinference/model/llm/pytorch/cogvlm2.py +76 -17
- xinference/model/llm/pytorch/core.py +141 -6
- xinference/model/llm/pytorch/glm4v.py +268 -0
- xinference/model/llm/pytorch/minicpmv25.py +232 -0
- xinference/model/llm/pytorch/qwen_vl.py +1 -1
- xinference/model/llm/pytorch/utils.py +405 -8
- xinference/model/llm/utils.py +14 -13
- xinference/model/llm/vllm/core.py +16 -4
- xinference/model/utils.py +8 -2
- xinference/thirdparty/ChatTTS/__init__.py +1 -0
- xinference/thirdparty/ChatTTS/core.py +200 -0
- xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/experimental/llm.py +40 -0
- xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/infer/api.py +125 -0
- xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/model/dvae.py +155 -0
- xinference/thirdparty/ChatTTS/model/gpt.py +265 -0
- xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/utils/gpu_utils.py +23 -0
- xinference/thirdparty/ChatTTS/utils/infer_utils.py +141 -0
- xinference/thirdparty/ChatTTS/utils/io_utils.py +14 -0
- xinference/types.py +3 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.074e2b31.css +2 -0
- xinference/web/ui/build/static/css/main.074e2b31.css.map +1 -0
- xinference/web/ui/build/static/js/main.a58ff436.js +3 -0
- xinference/web/ui/build/static/js/main.a58ff436.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10262a281dec3bc2b185f4385ceb6846626f52d41cb4d46c7c649e719f979d4d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/762a75a62daf3bec2cfc97ec8612798493fb34ef87087dcad6aad64ab7f14345.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/7f3bdb3a48fa00c046c8b185acd4da6f2e2940a20dbd77f9373d60de3fd6633e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f2f73bfdc13b12b02c8cbc4769b0b8e6367e9b6d8331c322d94318491a0b3653.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +1 -0
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/METADATA +26 -9
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/RECORD +65 -47
- xinference/web/ui/build/static/css/main.54bca460.css +0 -2
- xinference/web/ui/build/static/css/main.54bca460.css.map +0 -1
- xinference/web/ui/build/static/js/main.551aa479.js +0 -3
- xinference/web/ui/build/static/js/main.551aa479.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e737bcdbcbc407ccd65b90e199ef0c3214b261e8e41dbf14d921384a717d9ee.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +0 -1
- /xinference/web/ui/build/static/js/{main.551aa479.js.LICENSE.txt → main.a58ff436.js.LICENSE.txt} +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/LICENSE +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/WHEEL +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_csghub.json
@@ -0,0 +1,66 @@
+[
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "qwen2-instruct",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "tools"
+        ],
+        "model_description": "Qwen2 is the new series of Qwen large language models",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/Qwen2-0.5B-Instruct",
+                "model_hub": "csghub"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0",
+                    "fp16"
+                ],
+                "model_id": "qwen/Qwen2-0.5B-Instruct-GGUF",
+                "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf",
+                "model_hub": "csghub"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "QWEN",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "stop_token_ids": [
+                151643,
+                151644,
+                151645
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|im_start|>",
+                "<|im_end|>"
+            ]
+        }
+    }
+]
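The csghub specs above are consumed through the same launch path as every other hub. A minimal sketch of launching the GGUF spec via the Python client; the endpoint URL and the chosen quantization are assumptions, not part of this diff:

    from xinference.client import Client

    # assumes a local xinference supervisor on the default port
    client = Client("http://localhost:9997")
    model_uid = client.launch_model(
        model_name="qwen2-instruct",
        model_format="ggufv2",
        model_size_in_billions="0_5",
        quantization="q4_k_m",  # any entry from the spec's quantizations list
    )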
xinference/model/llm/llm_family_modelscope.json
@@ -522,6 +522,144 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 131072,
+        "model_name": "glm4-chat",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "tools"
+        ],
+        "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-4-9b-chat",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "CHATGLM3",
+            "system_prompt": "",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "stop_token_ids": [
+                151329,
+                151336,
+                151338
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|user|>",
+                "<|observation|>"
+            ]
+        }
+    },
+    {
+        "version": 1,
+        "context_length": 1048576,
+        "model_name": "glm4-chat-1m",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "tools"
+        ],
+        "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-4-9b-chat-1m",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "CHATGLM3",
+            "system_prompt": "",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "stop_token_ids": [
+                151329,
+                151336,
+                151338
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|user|>",
+                "<|observation|>"
+            ]
+        }
+    },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "glm-4v",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-4v-9b",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "CHATGLM3",
+            "system_prompt": "",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "stop_token_ids": [
+                151329,
+                151336,
+                151338
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|user|>",
+                "<|observation|>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 2048,
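With these entries registered, glm4-chat becomes launchable by name from ModelScope. A minimal usage sketch; the endpoint and generate_config values are assumptions:

    from xinference.client import Client

    client = Client("http://localhost:9997")  # assumed local endpoint
    uid = client.launch_model(
        model_name="glm4-chat",
        model_format="pytorch",
        model_size_in_billions=9,
        quantization="none",
    )
    model = client.get_model(uid)
    result = model.chat(
        prompt="Summarize the GLM-4 release in one sentence.",
        generate_config={"max_tokens": 256},
    )
    print(result["choices"][0]["message"]["content"])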
@@ -2648,6 +2786,233 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "qwen2-instruct",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "tools"
+        ],
+        "model_description": "Qwen2 is the new series of Qwen large language models",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2-0.5B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2-1.5B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2-7B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2-72B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2-0.5B-Instruct-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2-1.5B-Instruct-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2-7B-Instruct-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2-72B-Instruct-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2-0.5B-Instruct-AWQ",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2-1.5B-Instruct-AWQ",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2-7B-Instruct-AWQ",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2-72B-Instruct-AWQ",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0",
+                    "fp16"
+                ],
+                "model_id": "qwen/Qwen2-0.5B-Instruct-GGUF",
+                "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf",
+                "model_hub": "modelscope"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "QWEN",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "stop_token_ids": [
+                151643,
+                151644,
+                151645
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|im_start|>",
+                "<|im_end|>"
+            ]
+        }
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "qwen2-moe-instruct",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "Qwen2 is the new series of Qwen large language models. ",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2-57B-A14B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4",
+                "model_hub": "modelscope"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "QWEN",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "stop_token_ids": [
+                151643,
+                151644,
+                151645
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|im_start|>",
+                "<|im_end|>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 4096,
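Note the mixed types under model_size_in_billions: fractional sizes are encoded as strings with an underscore in place of the decimal point ("0_5" for 0.5B, "1_5" for 1.5B), while whole sizes stay plain integers. A sketch of how such a value can be normalized for display; the helper name here is ours, not the library's:

    from typing import Union

    def size_in_billions_to_float(size: Union[int, str]) -> float:
        # "0_5" -> 0.5, "1_5" -> 1.5; plain ints pass through unchanged
        if isinstance(size, str):
            return float(size.replace("_", "."))
        return float(size)

    assert size_in_billions_to_float("0_5") == 0.5
    assert size_in_billions_to_float(72) == 72.0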
@@ -3236,7 +3601,7 @@
             "chat",
             "vision"
         ],
-        "model_description":"
+        "model_description":"OmniLMM is a family of open-source large multimodal models (LMMs) adept at vision & language modeling.",
         "model_specs":[
             {
                 "model_format":"pytorch",
@@ -3468,6 +3833,50 @@
             ]
         }
     },
+    {
+        "version":1,
+        "context_length":2048,
+        "model_name":"MiniCPM-Llama3-V-2_5",
+        "model_lang":[
+            "en",
+            "zh"
+        ],
+        "model_ability":[
+            "chat",
+            "vision"
+        ],
+        "model_description":"MiniCPM-Llama3-V 2.5 is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters.",
+        "model_specs":[
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":8,
+                "quantizations":[
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id":"OpenBMB/MiniCPM-Llama3-V-2_5",
+                "model_revision":"master"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":8,
+                "quantizations":[
+                    "int4"
+                ],
+                "model_hub": "modelscope",
+                "model_id":"OpenBMB/MiniCPM-Llama3-V-2_5-{quantization}",
+                "model_revision":"master"
+            }
+        ],
+        "prompt_style":{
+            "style_name":"OmniLMM",
+            "system_prompt":"The role of first msg should be user",
+            "roles":[
+                "user",
+                "assistant"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 2048,
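MiniCPM-Llama3-V-2_5 is registered with the "vision" ability, so its chat endpoint takes OpenAI-style multimodal content. A hedged sketch, assuming the content-list prompt form that xinference's vision models accept, with placeholder endpoint and image URLs:

    from xinference.client import Client

    client = Client("http://localhost:9997")  # assumed local endpoint
    uid = client.launch_model(model_name="MiniCPM-Llama3-V-2_5", model_format="pytorch")
    model = client.get_model(uid)
    result = model.chat(
        prompt=[
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/sample.png"}},
        ],
    )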
@@ -3860,7 +4269,7 @@
                 "<|im_end|>"
             ]
         }
-    },
+    },
     {
         "version": 1,
         "context_length": 8192,
xinference/model/llm/pytorch/chatglm.py
@@ -82,30 +82,37 @@ class ChatglmPytorchChatModel(PytorchChatModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-        if "chatglm" not in llm_family.model_name:
+        model_family = llm_family.model_family or llm_family.model_name
+        if "chatglm" not in model_family and "glm4" not in model_family:
             return False
         if "chat" not in llm_family.model_ability:
             return False
         return True
 
-    @staticmethod
-    def _handle_tools(generate_config) -> Optional[dict]:
+    def _handle_tools(self, generate_config) -> Optional[dict]:
         """Convert openai tools to ChatGLM tools."""
         if generate_config is None:
             return None
         tools = generate_config.pop("tools", None)
         if tools is None:
             return None
-        chatglm_tools = []
-        for elem in tools:
-            if elem.get("type") != "function" or "function" not in elem:
-                raise ValueError("ChatGLM tools only support function type.")
-            chatglm_tools.append(elem["function"])
-        return {
-            "role": "system",
-            "content": f"Answer the following questions as best as you can. You have access to the following tools:",
-            "tools": chatglm_tools,
-        }
+        if self.model_family.model_name == "glm4-chat":
+            return {
+                "role": "system",
+                "content": None,
+                "tools": tools,
+            }
+        else:
+            chatglm_tools = []
+            for elem in tools:
+                if elem.get("type") != "function" or "function" not in elem:
+                    raise ValueError("ChatGLM tools only support function type.")
+                chatglm_tools.append(elem["function"])
+            return {
+                "role": "system",
+                "content": f"Answer the following questions as best as you can. You have access to the following tools:",
+                "tools": chatglm_tools,
+            }
 
     def chat(
         self,
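The reworked _handle_tools branches on the family: glm4-chat receives the OpenAI-style tool list unmodified inside a system message with content None, while older ChatGLM models get the unwrapped function objects plus the canned instruction. Roughly, a caller passes tools through generate_config like this; the function schema below is illustrative only:

    generate_config = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Look up the current weather for a city",
                    "parameters": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                },
            }
        ]
    }
    # For glm4-chat, _handle_tools returns
    #   {"role": "system", "content": None, "tools": [<tool dicts as-is>]};
    # for chatglm3-style models it returns the unwrapped "function" dicts
    # plus the fixed "Answer the following questions..." system content.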
xinference/model/llm/pytorch/cogvlm2.py
@@ -30,6 +30,7 @@ from ....types import (
     ChatCompletionMessage,
     Completion,
     CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -183,10 +184,7 @@ class CogVLM2Model(PytorchChatModel):
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         system_prompt = system_prompt if system_prompt else ""
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+        stream = generate_config.get("stream", False) if generate_config else False
 
         sanitized_config = {
             "pad_token_id": 128002,
@@ -234,24 +232,85 @@ class CogVLM2Model(PytorchChatModel):
             if image is not None
             else None,
         }
-        with torch.no_grad():
-            outputs = self._model.generate(**inputs, **sanitized_config)
-            outputs = outputs[:, inputs["input_ids"].shape[1] :]
-            response = self._tokenizer.decode(outputs[0])
-            response = response.split("<|end_of_text|>")[0]
 
-        chunk = Completion(
-            id=str(uuid.uuid1()),
+        if stream:
+            it = self._streaming_chat_response(inputs, sanitized_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            with torch.no_grad():
+                outputs = self._model.generate(**inputs, **sanitized_config)
+                outputs = outputs[:, inputs["input_ids"].shape[1] :]
+                response = self._tokenizer.decode(outputs[0])
+                response = response.split("<|end_of_text|>")[0]
+
+            chunk = Completion(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    CompletionChoice(
+                        index=0, text=response, finish_reason="stop", logprobs=None
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
+            return self._to_chat_completion(chunk)
+
+    def _streaming_chat_response(
+        self, inputs: Dict, config: Dict
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from transformers import TextIteratorStreamer
+
+        streamer = TextIteratorStreamer(
+            self._tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        generation_kwargs = {
+            "input_ids": inputs["input_ids"],
+            "attention_mask": inputs["attention_mask"],
+            "token_type_ids": inputs["token_type_ids"],
+            "images": inputs["images"],
+            "max_new_tokens": config["max_new_tokens"],
+            "pad_token_id": config["pad_token_id"],
+            "streamer": streamer,
+        }
+
+        thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    CompletionChoice(
+                        index=0, text=new_text, finish_reason=None, logprobs=None
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
             object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
+            choices=[completion_choice],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
-        return self._to_chat_completion(chunk)
+        yield chunk