xinference 0.11.3__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (75)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +143 -6
  3. xinference/client/restful/restful_client.py +144 -5
  4. xinference/constants.py +5 -0
  5. xinference/core/cache_tracker.py +48 -28
  6. xinference/core/model.py +160 -19
  7. xinference/core/scheduler.py +446 -0
  8. xinference/core/supervisor.py +99 -24
  9. xinference/core/worker.py +68 -2
  10. xinference/deploy/cmdline.py +86 -2
  11. xinference/deploy/test/test_cmdline.py +19 -10
  12. xinference/isolation.py +9 -2
  13. xinference/model/audio/__init__.py +14 -1
  14. xinference/model/audio/chattts.py +84 -0
  15. xinference/model/audio/core.py +22 -4
  16. xinference/model/audio/custom.py +6 -4
  17. xinference/model/audio/model_spec.json +20 -0
  18. xinference/model/audio/model_spec_modelscope.json +20 -0
  19. xinference/model/llm/__init__.py +38 -2
  20. xinference/model/llm/llm_family.json +509 -1
  21. xinference/model/llm/llm_family.py +86 -1
  22. xinference/model/llm/llm_family_csghub.json +66 -0
  23. xinference/model/llm/llm_family_modelscope.json +411 -2
  24. xinference/model/llm/pytorch/chatglm.py +20 -13
  25. xinference/model/llm/pytorch/cogvlm2.py +76 -17
  26. xinference/model/llm/pytorch/core.py +141 -6
  27. xinference/model/llm/pytorch/glm4v.py +268 -0
  28. xinference/model/llm/pytorch/minicpmv25.py +232 -0
  29. xinference/model/llm/pytorch/qwen_vl.py +1 -1
  30. xinference/model/llm/pytorch/utils.py +405 -8
  31. xinference/model/llm/utils.py +14 -13
  32. xinference/model/llm/vllm/core.py +16 -4
  33. xinference/model/utils.py +8 -2
  34. xinference/thirdparty/ChatTTS/__init__.py +1 -0
  35. xinference/thirdparty/ChatTTS/core.py +200 -0
  36. xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
  37. xinference/thirdparty/ChatTTS/experimental/llm.py +40 -0
  38. xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
  39. xinference/thirdparty/ChatTTS/infer/api.py +125 -0
  40. xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
  41. xinference/thirdparty/ChatTTS/model/dvae.py +155 -0
  42. xinference/thirdparty/ChatTTS/model/gpt.py +265 -0
  43. xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
  44. xinference/thirdparty/ChatTTS/utils/gpu_utils.py +23 -0
  45. xinference/thirdparty/ChatTTS/utils/infer_utils.py +141 -0
  46. xinference/thirdparty/ChatTTS/utils/io_utils.py +14 -0
  47. xinference/types.py +3 -0
  48. xinference/web/ui/build/asset-manifest.json +6 -6
  49. xinference/web/ui/build/index.html +1 -1
  50. xinference/web/ui/build/static/css/main.074e2b31.css +2 -0
  51. xinference/web/ui/build/static/css/main.074e2b31.css.map +1 -0
  52. xinference/web/ui/build/static/js/main.a58ff436.js +3 -0
  53. xinference/web/ui/build/static/js/main.a58ff436.js.map +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/10262a281dec3bc2b185f4385ceb6846626f52d41cb4d46c7c649e719f979d4d.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/762a75a62daf3bec2cfc97ec8612798493fb34ef87087dcad6aad64ab7f14345.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/7f3bdb3a48fa00c046c8b185acd4da6f2e2940a20dbd77f9373d60de3fd6633e.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/f2f73bfdc13b12b02c8cbc4769b0b8e6367e9b6d8331c322d94318491a0b3653.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +1 -0
  59. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/METADATA +26 -9
  60. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/RECORD +65 -47
  61. xinference/web/ui/build/static/css/main.54bca460.css +0 -2
  62. xinference/web/ui/build/static/css/main.54bca460.css.map +0 -1
  63. xinference/web/ui/build/static/js/main.551aa479.js +0 -3
  64. xinference/web/ui/build/static/js/main.551aa479.js.map +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/3e737bcdbcbc407ccd65b90e199ef0c3214b261e8e41dbf14d921384a717d9ee.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +0 -1
  71. /xinference/web/ui/build/static/js/{main.551aa479.js.LICENSE.txt → main.a58ff436.js.LICENSE.txt} +0 -0
  72. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/LICENSE +0 -0
  73. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/WHEEL +0 -0
  74. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/entry_points.txt +0 -0
  75. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/top_level.txt +0 -0

xinference/model/llm/llm_family_csghub.json
@@ -0,0 +1,66 @@
+[
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "qwen2-instruct",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "tools"
+        ],
+        "model_description": "Qwen2 is the new series of Qwen large language models",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/Qwen2-0.5B-Instruct",
+                "model_hub": "csghub"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0",
+                    "fp16"
+                ],
+                "model_id": "qwen/Qwen2-0.5B-Instruct-GGUF",
+                "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf",
+                "model_hub": "csghub"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "QWEN",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "stop_token_ids": [
+                151643,
+                151644,
+                151645
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|im_start|>",
+                "<|im_end|>"
+            ]
+        }
+    }
+]
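
Once a spec like the one above is registered, the model can be launched by name. A minimal sketch using the RESTful client (the endpoint URL and the exact set of keyword arguments are assumptions; depending on the xinference version, additional parameters such as model_engine may be required):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
    model_uid = client.launch_model(
        model_name="qwen2-instruct",
        model_format="ggufv2",
        model_size_in_billions="0_5",
        quantization="q4_k_m",
    )
    model = client.get_model(model_uid)
    print(model.chat("Briefly introduce the Qwen2 model family."))
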

xinference/model/llm/llm_family_modelscope.json
@@ -522,6 +522,144 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 131072,
+        "model_name": "glm4-chat",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "tools"
+        ],
+        "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-4-9b-chat",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "CHATGLM3",
+            "system_prompt": "",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "stop_token_ids": [
+                151329,
+                151336,
+                151338
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|user|>",
+                "<|observation|>"
+            ]
+        }
+    },
+    {
+        "version": 1,
+        "context_length": 1048576,
+        "model_name": "glm4-chat-1m",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "tools"
+        ],
+        "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-4-9b-chat-1m",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "CHATGLM3",
+            "system_prompt": "",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "stop_token_ids": [
+                151329,
+                151336,
+                151338
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|user|>",
+                "<|observation|>"
+            ]
+        }
+    },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "glm-4v",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-4v-9b",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "CHATGLM3",
+            "system_prompt": "",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "stop_token_ids": [
+                151329,
+                151336,
+                151338
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|user|>",
+                "<|observation|>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 2048,

@@ -2648,6 +2786,233 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "qwen2-instruct",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "tools"
+        ],
+        "model_description": "Qwen2 is the new series of Qwen large language models",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2-0.5B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2-1.5B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2-7B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2-72B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2-0.5B-Instruct-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2-1.5B-Instruct-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2-7B-Instruct-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2-72B-Instruct-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2-0.5B-Instruct-AWQ",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2-1.5B-Instruct-AWQ",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2-7B-Instruct-AWQ",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2-72B-Instruct-AWQ",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0",
+                    "fp16"
+                ],
+                "model_id": "qwen/Qwen2-0.5B-Instruct-GGUF",
+                "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf",
+                "model_hub": "modelscope"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "QWEN",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "stop_token_ids": [
+                151643,
+                151644,
+                151645
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|im_start|>",
+                "<|im_end|>"
+            ]
+        }
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "qwen2-moe-instruct",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "Qwen2 is the new series of Qwen large language models. ",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2-57B-A14B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4",
+                "model_hub": "modelscope"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "QWEN",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "stop_token_ids": [
+                151643,
+                151644,
+                151645
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|im_start|>",
+                "<|im_end|>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 4096,

@@ -3236,7 +3601,7 @@
             "chat",
             "vision"
         ],
-        "model_description":"mniLMM is a family of open-source large multimodal models (LMMs) adept at vision & language modeling.",
+        "model_description":"OmniLMM is a family of open-source large multimodal models (LMMs) adept at vision & language modeling.",
        "model_specs":[
            {
                "model_format":"pytorch",

@@ -3468,6 +3833,50 @@
             ]
         }
     },
+    {
+        "version":1,
+        "context_length":2048,
+        "model_name":"MiniCPM-Llama3-V-2_5",
+        "model_lang":[
+            "en",
+            "zh"
+        ],
+        "model_ability":[
+            "chat",
+            "vision"
+        ],
+        "model_description":"MiniCPM-Llama3-V 2.5 is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters.",
+        "model_specs":[
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":8,
+                "quantizations":[
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id":"OpenBMB/MiniCPM-Llama3-V-2_5",
+                "model_revision":"master"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":8,
+                "quantizations":[
+                    "int4"
+                ],
+                "model_hub": "modelscope",
+                "model_id":"OpenBMB/MiniCPM-Llama3-V-2_5-{quantization}",
+                "model_revision":"master"
+            }
+        ],
+        "prompt_style":{
+            "style_name":"OmniLMM",
+            "system_prompt":"The role of first msg should be user",
+            "roles":[
+                "user",
+                "assistant"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 2048,

@@ -3860,7 +4269,7 @@
                 "<|im_end|>"
             ]
         }
-    },
+    },
     {
         "version": 1,
         "context_length": 8192,

xinference/model/llm/pytorch/chatglm.py
@@ -82,30 +82,37 @@ class ChatglmPytorchChatModel(PytorchChatModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-        if "chatglm" not in llm_family.model_name:
+        model_family = llm_family.model_family or llm_family.model_name
+        if "chatglm" not in model_family and "glm4" not in model_family:
             return False
         if "chat" not in llm_family.model_ability:
             return False
         return True

-    @staticmethod
-    def _handle_tools(generate_config) -> Optional[dict]:
+    def _handle_tools(self, generate_config) -> Optional[dict]:
         """Convert openai tools to ChatGLM tools."""
         if generate_config is None:
             return None
         tools = generate_config.pop("tools", None)
         if tools is None:
             return None
-        chatglm_tools = []
-        for elem in tools:
-            if elem.get("type") != "function" or "function" not in elem:
-                raise ValueError("ChatGLM tools only support function type.")
-            chatglm_tools.append(elem["function"])
-        return {
-            "role": "system",
-            "content": f"Answer the following questions as best as you can. You have access to the following tools:",
-            "tools": chatglm_tools,
-        }
+        if self.model_family.model_name == "glm4-chat":
+            return {
+                "role": "system",
+                "content": None,
+                "tools": tools,
+            }
+        else:
+            chatglm_tools = []
+            for elem in tools:
+                if elem.get("type") != "function" or "function" not in elem:
+                    raise ValueError("ChatGLM tools only support function type.")
+                chatglm_tools.append(elem["function"])
+            return {
+                "role": "system",
+                "content": f"Answer the following questions as best as you can. You have access to the following tools:",
+                "tools": chatglm_tools,
+            }

     def chat(
         self,
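
For reference, the payload shapes involved in _handle_tools: it consumes an OpenAI-style tools list popped from generate_config and emits a ChatGLM-style system message. The tool definition below is a hypothetical example; only the structure matters:

    # OpenAI-style input, as passed in generate_config["tools"]
    openai_tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",  # hypothetical tool
                "description": "Query the current weather for a city.",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ]

    # glm4-chat receives the tools list untouched (content is None); older
    # chatglm families get only the inner "function" objects:
    system_message = {
        "role": "system",
        "content": "Answer the following questions as best as you can. "
        "You have access to the following tools:",
        "tools": [t["function"] for t in openai_tools],
    }
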

xinference/model/llm/pytorch/cogvlm2.py
@@ -30,6 +30,7 @@ from ....types import (
     ChatCompletionMessage,
     Completion,
     CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -183,10 +184,7 @@ class CogVLM2Model(PytorchChatModel):
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         system_prompt = system_prompt if system_prompt else ""
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+        stream = generate_config.get("stream", False) if generate_config else False

         sanitized_config = {
             "pad_token_id": 128002,

@@ -234,24 +232,85 @@
             if image is not None
             else None,
         }
-        with torch.no_grad():
-            outputs = self._model.generate(**inputs, **sanitized_config)
-            outputs = outputs[:, inputs["input_ids"].shape[1] :]
-            response = self._tokenizer.decode(outputs[0])
-            response = response.split("<|end_of_text|>")[0]

-        chunk = Completion(
-            id=str(uuid.uuid1()),
+        if stream:
+            it = self._streaming_chat_response(inputs, sanitized_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            with torch.no_grad():
+                outputs = self._model.generate(**inputs, **sanitized_config)
+                outputs = outputs[:, inputs["input_ids"].shape[1] :]
+                response = self._tokenizer.decode(outputs[0])
+                response = response.split("<|end_of_text|>")[0]
+
+            chunk = Completion(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    CompletionChoice(
+                        index=0, text=response, finish_reason="stop", logprobs=None
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
+            return self._to_chat_completion(chunk)
+
+    def _streaming_chat_response(
+        self, inputs: Dict, config: Dict
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from transformers import TextIteratorStreamer
+
+        streamer = TextIteratorStreamer(
+            self._tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        generation_kwargs = {
+            "input_ids": inputs["input_ids"],
+            "attention_mask": inputs["attention_mask"],
+            "token_type_ids": inputs["token_type_ids"],
+            "images": inputs["images"],
+            "max_new_tokens": config["max_new_tokens"],
+            "pad_token_id": config["pad_token_id"],
+            "streamer": streamer,
+        }
+
+        thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    CompletionChoice(
+                        index=0, text=new_text, finish_reason=None, logprobs=None
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
             object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
+            choices=[completion_choice],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
-        return self._to_chat_completion(chunk)
+        yield chunk
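
With stream support in place, cogvlm2 chat output can be consumed incrementally. A sketch of client-side consumption, assuming a chat model handle and OpenAI-style chunk dicts as produced by _to_chat_completion_chunks (the exact delta layout is an assumption):

    # `model` is a chat model handle obtained via client.get_model(...)
    for chunk in model.chat(
        prompt="Describe this image.",
        generate_config={"stream": True, "max_tokens": 256},
    ):
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)
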