xinference 0.11.3__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +69 -0
- xinference/client/restful/restful_client.py +70 -0
- xinference/constants.py +4 -0
- xinference/core/model.py +141 -12
- xinference/core/scheduler.py +428 -0
- xinference/core/supervisor.py +26 -0
- xinference/isolation.py +9 -2
- xinference/model/audio/chattts.py +84 -0
- xinference/model/audio/core.py +10 -3
- xinference/model/audio/model_spec.json +20 -0
- xinference/model/llm/__init__.py +4 -0
- xinference/model/llm/llm_family.json +507 -1
- xinference/model/llm/llm_family_modelscope.json +409 -2
- xinference/model/llm/pytorch/chatglm.py +2 -1
- xinference/model/llm/pytorch/cogvlm2.py +76 -17
- xinference/model/llm/pytorch/core.py +91 -6
- xinference/model/llm/pytorch/glm4v.py +258 -0
- xinference/model/llm/pytorch/minicpmv25.py +232 -0
- xinference/model/llm/pytorch/utils.py +386 -2
- xinference/model/llm/vllm/core.py +6 -0
- xinference/thirdparty/ChatTTS/__init__.py +1 -0
- xinference/thirdparty/ChatTTS/core.py +200 -0
- xinference/types.py +3 -0
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/METADATA +26 -9
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/RECORD +30 -24
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/LICENSE +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/WHEEL +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/top_level.txt +0 -0
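
Taken together, this release registers the GLM-4 family (glm4-chat, glm4-chat-1m, glm-4v), the Qwen2 family, and MiniCPM-Llama3-V 2.5 as built-in models, bundles a ChatTTS audio model, introduces a request scheduler, and adds streaming chat to the CogVLM2 implementation. A minimal usage sketch for the new models follows; it assumes a running xinference server on the default endpoint and the 0.12-era client signature (later releases also require a model_engine argument), so treat it as illustrative rather than definitive.

# Illustrative sketch, not part of the diff: launching one of the newly
# registered models through the RESTful client.
from xinference.client import Client

client = Client("http://localhost:9997")
model_uid = client.launch_model(
    model_name="glm4-chat",  # registered by this release
    model_format="pytorch",
    quantization="none",
)
model = client.get_model(model_uid)
# The chat handle follows the OpenAI-style response layout.
completion = model.chat(prompt="Hello!")
print(completion["choices"][0]["message"]["content"])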
xinference/model/llm/llm_family_modelscope.json
@@ -522,6 +522,142 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "glm4-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-4-9b-chat",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 1048576,
+    "model_name": "glm4-chat-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-4-9b-chat-1m",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "glm-4v",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-4v-9b",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
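
All three GLM-4 entries reuse the CHATGLM3 prompt style and declare the same stop token ids (151329, 151336, 151338, matching the declared stop strings <|endoftext|>, <|user|>, <|observation|>). Below is a sketch of how such stop ids are typically enforced at generation time with plain transformers; the local path is a placeholder for a ModelScope download of ZhipuAI/glm-4-9b-chat, and this mirrors, rather than reproduces, what xinference does internally.

# Sketch only: enforcing the stop_token_ids above with plain transformers.
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "/path/to/glm-4-9b-chat"  # placeholder for a local download

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, trust_remote_code=True)

input_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    add_generation_prompt=True,
    return_tensors="pt",
)
# generate() accepts a list of eos ids, so any of the three GLM-4 stop
# tokens terminates the reply.
output = model.generate(
    input_ids, max_new_tokens=128, eos_token_id=[151329, 151336, 151338]
)
print(tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True))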
@@ -2648,6 +2784,233 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-1.5B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-7B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-72B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-1.5B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-7B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-72B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-1.5B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-7B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-72B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-moe-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-57B-A14B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
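
Note the parameterized identifiers: the GPTQ model_id values and the GGUF model_file_name_template both carry a {quantization} placeholder that is filled with one entry from the quantizations list. A short sketch of the substitution (plain str.format, which is presumably all the expansion amounts to):

# Sketch: resolving the {quantization} placeholders declared above.
template = "qwen2-0_5b-instruct-{quantization}.gguf"
print(template.format(quantization="q4_k_m"))  # -> qwen2-0_5b-instruct-q4_k_m.gguf

gptq_id = "qwen/Qwen2-7B-Instruct-GPTQ-{quantization}"
print(gptq_id.format(quantization="Int4"))     # -> qwen/Qwen2-7B-Instruct-GPTQ-Int4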
@@ -3236,7 +3599,7 @@
       "chat",
       "vision"
     ],
-    "model_description":"
+    "model_description":"OmniLMM is a family of open-source large multimodal models (LMMs) adept at vision & language modeling.",
     "model_specs":[
       {
         "model_format":"pytorch",
@@ -3468,6 +3831,50 @@
       ]
     }
   },
+  {
+    "version":1,
+    "context_length":2048,
+    "model_name":"MiniCPM-Llama3-V-2_5",
+    "model_lang":[
+      "en",
+      "zh"
+    ],
+    "model_ability":[
+      "chat",
+      "vision"
+    ],
+    "model_description":"MiniCPM-Llama3-V 2.5 is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters.",
+    "model_specs":[
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"OpenBMB/MiniCPM-Llama3-V-2_5",
+        "model_revision":"master"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"OpenBMB/MiniCPM-Llama3-V-2_5-{quantization}",
+        "model_revision":"master"
+      }
+    ],
+    "prompt_style":{
+      "style_name":"OmniLMM",
+      "system_prompt":"The role of first msg should be user",
+      "roles":[
+        "user",
+        "assistant"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -3860,7 +4267,7 @@
         "<|im_end|>"
       ]
     }
-  },
+  },
   {
     "version": 1,
     "context_length": 8192,
xinference/model/llm/pytorch/chatglm.py
@@ -82,7 +82,8 @@ class ChatglmPytorchChatModel(PytorchChatModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-        if "chatglm" not in llm_family.model_name:
+        model_family = llm_family.model_family or llm_family.model_name
+        if "chatglm" not in model_family and "glm4" not in model_family:
             return False
         if "chat" not in llm_family.model_ability:
             return False
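
The pre-0.12 check matched on llm_family.model_name alone; the new code prefers the explicit model_family field and falls back to the name, so the GLM-4 entries added above route to the same ChatGLM implementation. A self-contained sketch of the rule, with stand-in types rather than the real LLMFamilyV1:

# Stand-in sketch of the new matching rule (not the actual LLMFamilyV1).
from dataclasses import dataclass
from typing import Optional

@dataclass
class Family:
    model_name: str
    model_family: Optional[str] = None

def matches_chatglm(family: Family) -> bool:
    # Prefer the explicit family; fall back to the model name.
    name = family.model_family or family.model_name
    return "chatglm" in name or "glm4" in name

assert matches_chatglm(Family(model_name="glm4-chat"))
assert matches_chatglm(Family(model_name="custom-glm", model_family="chatglm3"))
assert not matches_chatglm(Family(model_name="qwen2-instruct"))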
xinference/model/llm/pytorch/cogvlm2.py
@@ -30,6 +30,7 @@ from ....types import (
     ChatCompletionMessage,
     Completion,
     CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -183,10 +184,7 @@ class CogVLM2Model(PytorchChatModel):
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         system_prompt = system_prompt if system_prompt else ""
-
-        raise Exception(
-            f"Chat with model {self.model_family.model_name} does not support stream."
-        )
+        stream = generate_config.get("stream", False) if generate_config else False

         sanitized_config = {
             "pad_token_id": 128002,
@@ -234,24 +232,85 @@
             if image is not None
             else None,
         }
-        with torch.no_grad():
-            outputs = self._model.generate(**inputs, **sanitized_config)
-            outputs = outputs[:, inputs["input_ids"].shape[1] :]
-            response = self._tokenizer.decode(outputs[0])
-            response = response.split("<|end_of_text|>")[0]

-        chunk = Completion(
-            id=str(uuid.uuid1()),
+        if stream:
+            it = self._streaming_chat_response(inputs, sanitized_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            with torch.no_grad():
+                outputs = self._model.generate(**inputs, **sanitized_config)
+                outputs = outputs[:, inputs["input_ids"].shape[1] :]
+                response = self._tokenizer.decode(outputs[0])
+                response = response.split("<|end_of_text|>")[0]
+
+            chunk = Completion(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    CompletionChoice(
+                        index=0, text=response, finish_reason="stop", logprobs=None
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
+            return self._to_chat_completion(chunk)
+
+    def _streaming_chat_response(
+        self, inputs: Dict, config: Dict
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from transformers import TextIteratorStreamer
+
+        streamer = TextIteratorStreamer(
+            self._tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        generation_kwargs = {
+            "input_ids": inputs["input_ids"],
+            "attention_mask": inputs["attention_mask"],
+            "token_type_ids": inputs["token_type_ids"],
+            "images": inputs["images"],
+            "max_new_tokens": config["max_new_tokens"],
+            "pad_token_id": config["pad_token_id"],
+            "streamer": streamer,
+        }
+
+        thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    CompletionChoice(
+                        index=0, text=new_text, finish_reason=None, logprobs=None
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
             object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
+            choices=[completion_choice],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
-        return self._to_chat_completion(chunk)
+        yield chunk