xinference 0.11.3__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries; it is provided for informational purposes only.

Potentially problematic release: the registry flags this version of xinference as possibly problematic.

@@ -522,6 +522,142 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "glm4-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-4-9b-chat",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 1048576,
+    "model_name": "glm4-chat-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-4-9b-chat-1m",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "glm-4v",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-4v-9b",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -2648,6 +2784,233 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-1.5B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-7B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-72B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-1.5B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-7B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-72B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-1.5B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-7B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-72B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-moe-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-57B-A14B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
@@ -3236,7 +3599,7 @@
       "chat",
       "vision"
     ],
-    "model_description":"mniLMM is a family of open-source large multimodal models (LMMs) adept at vision & language modeling.",
+    "model_description":"OmniLMM is a family of open-source large multimodal models (LMMs) adept at vision & language modeling.",
     "model_specs":[
       {
         "model_format":"pytorch",
@@ -3468,6 +3831,50 @@
       ]
     }
   },
+  {
+    "version":1,
+    "context_length":2048,
+    "model_name":"MiniCPM-Llama3-V-2_5",
+    "model_lang":[
+      "en",
+      "zh"
+    ],
+    "model_ability":[
+      "chat",
+      "vision"
+    ],
+    "model_description":"MiniCPM-Llama3-V 2.5 is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters.",
+    "model_specs":[
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"OpenBMB/MiniCPM-Llama3-V-2_5",
+        "model_revision":"master"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"OpenBMB/MiniCPM-Llama3-V-2_5-{quantization}",
+        "model_revision":"master"
+      }
+    ],
+    "prompt_style":{
+      "style_name":"OmniLMM",
+      "system_prompt":"The role of first msg should be user",
+      "roles":[
+        "user",
+        "assistant"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -3860,7 +4267,7 @@
         "<|im_end|>"
       ]
     }
-  },
+  },
   {
     "version": 1,
     "context_length": 8192,
@@ -82,7 +82,8 @@ class ChatglmPytorchChatModel(PytorchChatModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-        if "chatglm" not in llm_family.model_name:
+        model_family = llm_family.model_family or llm_family.model_name
+        if "chatglm" not in model_family and "glm4" not in model_family:
             return False
         if "chat" not in llm_family.model_ability:
             return False
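
The matcher now prefers llm_family.model_family and falls back to model_name, and it accepts glm4 families as well, which is what routes the glm4-chat entries added above to ChatglmPytorchChatModel. A self-contained sketch of the predicate's behavior (the sample names below are hypothetical):

from typing import Optional

def matches_glm(model_name: str, model_family: Optional[str]) -> bool:
    # Mirrors the diff: prefer the declared family, fall back to the name.
    family = model_family or model_name
    return "chatglm" in family or "glm4" in family

print(matches_glm("glm4-chat", None))        # True  (built-in glm4 entry)
print(matches_glm("my-glm", "glm4"))         # True  (custom name, glm4 family)
print(matches_glm("qwen2-instruct", None))   # False
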
@@ -30,6 +30,7 @@ from ....types import (
     ChatCompletionMessage,
     Completion,
     CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -183,10 +184,7 @@ class CogVLM2Model(PytorchChatModel):
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         system_prompt = system_prompt if system_prompt else ""
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+        stream = generate_config.get("stream", False) if generate_config else False

         sanitized_config = {
             "pad_token_id": 128002,
@@ -234,24 +232,85 @@
             if image is not None
             else None,
         }
-        with torch.no_grad():
-            outputs = self._model.generate(**inputs, **sanitized_config)
-            outputs = outputs[:, inputs["input_ids"].shape[1] :]
-            response = self._tokenizer.decode(outputs[0])
-            response = response.split("<|end_of_text|>")[0]

-        chunk = Completion(
-            id=str(uuid.uuid1()),
+        if stream:
+            it = self._streaming_chat_response(inputs, sanitized_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            with torch.no_grad():
+                outputs = self._model.generate(**inputs, **sanitized_config)
+                outputs = outputs[:, inputs["input_ids"].shape[1] :]
+                response = self._tokenizer.decode(outputs[0])
+                response = response.split("<|end_of_text|>")[0]
+
+            chunk = Completion(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    CompletionChoice(
+                        index=0, text=response, finish_reason="stop", logprobs=None
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
+            return self._to_chat_completion(chunk)
+
+    def _streaming_chat_response(
+        self, inputs: Dict, config: Dict
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from transformers import TextIteratorStreamer
+
+        streamer = TextIteratorStreamer(
+            self._tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        generation_kwargs = {
+            "input_ids": inputs["input_ids"],
+            "attention_mask": inputs["attention_mask"],
+            "token_type_ids": inputs["token_type_ids"],
+            "images": inputs["images"],
+            "max_new_tokens": config["max_new_tokens"],
+            "pad_token_id": config["pad_token_id"],
+            "streamer": streamer,
+        }
+
+        thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    CompletionChoice(
+                        index=0, text=new_text, finish_reason=None, logprobs=None
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
             object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
+            choices=[completion_choice],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
-        return self._to_chat_completion(chunk)
+        yield chunk
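
The new _streaming_chat_response follows the standard transformers recipe: run generate on a worker thread and drain a TextIteratorStreamer on the calling thread. A standalone sketch of that recipe with a small stand-in model (gpt2 here is arbitrary and not part of the diff):

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs on a worker thread while we consume the stream.
thread = Thread(
    target=model.generate,
    kwargs={**inputs, "max_new_tokens": 32, "streamer": streamer},
)
thread.start()
for piece in streamer:  # yields decoded text incrementally, like the chunks above
    print(piece, end="", flush=True)
thread.join()

The trailing empty chunk with finish_reason="stop" emitted by the diff's generator mirrors the OpenAI streaming convention for signalling the end of a stream.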