xinference 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (42)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +31 -0
  3. xinference/client/common.py +2 -0
  4. xinference/client/restful/restful_client.py +25 -0
  5. xinference/core/supervisor.py +11 -1
  6. xinference/core/worker.py +16 -0
  7. xinference/deploy/cmdline.py +53 -1
  8. xinference/device_utils.py +0 -2
  9. xinference/model/core.py +13 -2
  10. xinference/model/image/core.py +16 -2
  11. xinference/model/image/stable_diffusion/core.py +25 -2
  12. xinference/model/llm/__init__.py +17 -0
  13. xinference/model/llm/core.py +18 -2
  14. xinference/model/llm/ggml/llamacpp.py +3 -19
  15. xinference/model/llm/llm_family.json +8 -3
  16. xinference/model/llm/llm_family.py +100 -29
  17. xinference/model/llm/llm_family_modelscope.json +7 -2
  18. xinference/model/llm/pytorch/baichuan.py +2 -0
  19. xinference/model/llm/pytorch/chatglm.py +2 -0
  20. xinference/model/llm/pytorch/core.py +23 -0
  21. xinference/model/llm/pytorch/falcon.py +4 -0
  22. xinference/model/llm/pytorch/internlm2.py +2 -0
  23. xinference/model/llm/pytorch/llama_2.py +4 -0
  24. xinference/model/llm/pytorch/qwen_vl.py +1 -0
  25. xinference/model/llm/pytorch/vicuna.py +2 -0
  26. xinference/model/llm/pytorch/yi_vl.py +1 -0
  27. xinference/web/ui/build/asset-manifest.json +3 -3
  28. xinference/web/ui/build/index.html +1 -1
  29. xinference/web/ui/build/static/js/{main.ebf7716d.js → main.78829790.js} +3 -3
  30. xinference/web/ui/build/static/js/main.78829790.js.map +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/e8687f75d2adacd34852b71c41ca17203d6fb4c8999ea55325bb2939f9d9ea90.json +1 -0
  33. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/METADATA +3 -1
  34. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/RECORD +39 -39
  35. xinference/web/ui/build/static/js/main.ebf7716d.js.map +0 -1
  36. xinference/web/ui/node_modules/.cache/babel-loader/0738899eefad7f90261125823d87ea9f0d53667b1479a0c1f398aff14f2bbd2a.json +0 -1
  37. xinference/web/ui/node_modules/.cache/babel-loader/77d4d795f078408fa2dd49da26d1ba1543d51b63cc253e736f4bef2e6014e888.json +0 -1
  38. /xinference/web/ui/build/static/js/{main.ebf7716d.js.LICENSE.txt → main.78829790.js.LICENSE.txt} +0 -0
  39. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/LICENSE +0 -0
  40. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/WHEEL +0 -0
  41. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/entry_points.txt +0 -0
  42. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py

@@ -59,6 +59,8 @@ class GgmlLLMSpecV1(BaseModel):
     quantizations: List[str]
     model_id: Optional[str]
     model_file_name_template: str
+    model_file_name_split_template: Optional[str]
+    quantization_parts: Optional[Dict[str, List[str]]]
     model_hub: str = "huggingface"
     model_uri: Optional[str]
     model_revision: Optional[str]
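Both new fields are optional, so existing single-file GGUF specs are unaffected: quantization_parts maps a quantization name to the ordered list of its part suffixes, and model_file_name_split_template names each individual part. A hypothetical spec fragment using them (the model and file names below are placeholders, not a built-in entry) could look like this:

# Hypothetical fragment of a GGUF spec; only the last two keys are new in 0.9.2.
spec_fragment = {
    "quantizations": ["q4_k_m"],
    "model_file_name_template": "example-chat-{quantization}.gguf",
    "model_file_name_split_template": "example-chat-{quantization}.gguf.{part}",
    "quantization_parts": {"q4_k_m": ["a", "b"]},
}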
@@ -210,6 +212,7 @@ CustomLLMFamilyV1.update_forward_refs()
 
 
 LLM_CLASSES: List[Type[LLM]] = []
+PEFT_SUPPORTED_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
@@ -522,6 +525,52 @@ def _generate_meta_file(
         json.dump(desc.to_dict(), f)
 
 
+def _generate_model_file_names(
+    llm_spec: "LLMSpecV1", quantization: Optional[str] = None
+) -> Tuple[List[str], str, bool]:
+    file_names = []
+    final_file_name = llm_spec.model_file_name_template.format(
+        quantization=quantization
+    )
+    need_merge = False
+
+    if llm_spec.quantization_parts is None:
+        file_names.append(final_file_name)
+    elif quantization is not None and quantization in llm_spec.quantization_parts:
+        parts = llm_spec.quantization_parts[quantization]
+        need_merge = True
+
+        logger.info(
+            f"Model {llm_spec.model_id} {llm_spec.model_format} {quantization} has {len(parts)} parts."
+        )
+
+        if llm_spec.model_file_name_split_template is None:
+            raise ValueError(
+                f"No model_file_name_split_template for model spec {llm_spec.model_id}"
+            )
+
+        for part in parts:
+            file_name = llm_spec.model_file_name_split_template.format(
+                quantization=quantization, part=part
+            )
+            file_names.append(file_name)
+
+    return file_names, final_file_name, need_merge
+
+
+def _merge_cached_files(
+    cache_dir: str, input_file_names: List[str], output_file_name: str
+):
+    with open(os.path.join(cache_dir, output_file_name), "wb") as output_file:
+        for file_name in input_file_names:
+            logger.info(f"Merging file {file_name} into {output_file_name} ...")
+
+            with open(os.path.join(cache_dir, file_name), "rb") as input_file:
+                shutil.copyfileobj(input_file, output_file)
+
+    logger.info(f"Merge complete.")
+
+
 def cache_from_modelscope(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
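_generate_model_file_names expands a spec into the list of files to download: a single file for ordinary quantizations, or one file per part (plus the merged target name) when the quantization appears in quantization_parts. _merge_cached_files then reassembles the split download by plain byte concatenation in part order. A minimal standalone sketch of that merge step, using throwaway files instead of real GGUF shards (all names below are made up for illustration):

import os
import shutil
import tempfile

# Scratch directory with two fake "parts" standing in for GGUF shards.
cache_dir = tempfile.mkdtemp()
part_names = ["example-q4_k_m.gguf.a", "example-q4_k_m.gguf.b"]
for name, payload in zip(part_names, [b"first-half ", b"second-half"]):
    with open(os.path.join(cache_dir, name), "wb") as f:
        f.write(payload)

# Concatenate the parts into the final file, in order, as raw byte streams.
output_name = "example-q4_k_m.gguf"
with open(os.path.join(cache_dir, output_name), "wb") as output_file:
    for name in part_names:
        with open(os.path.join(cache_dir, name), "rb") as input_file:
            shutil.copyfileobj(input_file, output_file)

with open(os.path.join(cache_dir, output_name), "rb") as f:
    print(f.read())  # b'first-half second-half'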
@@ -560,19 +609,26 @@ def cache_from_modelscope
             symlink_local_file(os.path.join(subdir, file), cache_dir, relpath)
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
-        filename = llm_spec.model_file_name_template.format(quantization=quantization)
-        download_path = retry_download(
-            model_file_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            filename,
-            revision=llm_spec.model_revision,
+        file_names, final_file_name, need_merge = _generate_model_file_names(
+            llm_spec, quantization
         )
-        symlink_local_file(download_path, cache_dir, filename)
+
+        for filename in file_names:
+            download_path = retry_download(
+                model_file_download,
+                llm_family.model_name,
+                {
+                    "model_size": llm_spec.model_size_in_billions,
+                    "model_format": llm_spec.model_format,
+                },
+                llm_spec.model_id,
+                filename,
+                revision=llm_spec.model_revision,
+            )
+            symlink_local_file(download_path, cache_dir, filename)
+
+        if need_merge:
+            _merge_cached_files(cache_dir, file_names, final_file_name)
     else:
         raise ValueError(f"Unsupported format: {llm_spec.model_format}")
 
@@ -621,20 +677,27 @@ def cache_from_huggingface(
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
         assert isinstance(llm_spec, GgmlLLMSpecV1)
-        file_name = llm_spec.model_file_name_template.format(quantization=quantization)
-        retry_download(
-            huggingface_hub.hf_hub_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            revision=llm_spec.model_revision,
-            filename=file_name,
-            local_dir=cache_dir,
-            local_dir_use_symlinks=True,
+        file_names, final_file_name, need_merge = _generate_model_file_names(
+            llm_spec, quantization
         )
+
+        for file_name in file_names:
+            retry_download(
+                huggingface_hub.hf_hub_download,
+                llm_family.model_name,
+                {
+                    "model_size": llm_spec.model_size_in_billions,
+                    "model_format": llm_spec.model_format,
+                },
+                llm_spec.model_id,
+                revision=llm_spec.model_revision,
+                filename=file_name,
+                local_dir=cache_dir,
+                local_dir_use_symlinks=True,
+            )
+
+        if need_merge:
+            _merge_cached_files(cache_dir, file_names, final_file_name)
     else:
         raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
 
@@ -873,12 +936,20 @@ def unregister_llm(model_name: str, raise_error: bool = True):
 
 
 def match_llm_cls(
-    family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: str
+    family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    quantization: str,
+    peft_model_path: Optional[str] = None,
 ) -> Optional[Type[LLM]]:
     """
     Find an LLM implementation for given LLM family and spec.
     """
-    for cls in LLM_CLASSES:
-        if cls.match(family, llm_spec, quantization):
-            return cls
+    if peft_model_path is not None:
+        for cls in PEFT_SUPPORTED_CLASSES:
+            if cls.match(family, llm_spec, quantization):
+                return cls
+    else:
+        for cls in LLM_CLASSES:
+            if cls.match(family, llm_spec, quantization):
+                return cls
     return None
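With this change, loading a model together with a PEFT (LoRA) adapter only considers implementations registered in PEFT_SUPPORTED_CLASSES, while plain loads keep using LLM_CLASSES. A rough sketch of the intended call, assuming family and spec are a matching LLMFamilyV1/LLMSpecV1 pair for an already-registered PyTorch model and the adapter path is a placeholder:

# Hypothetical call: pick an implementation class for a LoRA-adapted model.
cls = match_llm_cls(family, spec, "none", peft_model_path="/data/adapters/my-lora")
if cls is None:
    raise ValueError("no PEFT-capable implementation matches this model spec")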
xinference/model/llm/llm_family_modelscope.json

@@ -1909,11 +1909,16 @@
         "model_size_in_billions": 72,
         "quantizations": [
           "q2_k",
-          "q3_k_m"
+          "q3_k_m",
+          "q4_k_m"
         ],
         "model_id": "qwen/Qwen1.5-72B-Chat-GGUF",
         "model_hub": "modelscope",
-        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf"
+        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf",
+        "model_file_name_split_template": "qwen1_5-72b-chat-{quantization}.gguf.{part}",
+        "quantization_parts": {
+          "q4_k_m": ["a", "b"]
+        }
       }
     ],
     "prompt_style": {
xinference/model/llm/pytorch/baichuan.py

@@ -27,6 +27,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
         self._use_fast_tokenizer = False
 
xinference/model/llm/pytorch/chatglm.py

@@ -39,6 +39,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -47,6 +48,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/core.py

@@ -52,12 +52,14 @@ class PytorchModel(LLM):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
             pytorch_model_config
         )
+        self._peft_model_path = peft_model_path
 
     def _sanitize_model_config(
         self, pytorch_model_config: Optional[PytorchModelConfig]
@@ -112,6 +114,24 @@ class PytorchModel(LLM):
         )
         return model, tokenizer
 
+    def _apply_lora(self):
+        if self._peft_model_path is not None:
+            try:
+                from peft import PeftModel
+            except ImportError:
+                raise ImportError(
+                    f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
+                )
+
+            # Apply LoRA
+            self._model = PeftModel.from_pretrained(
+                self._model,
+                self._peft_model_path,
+            )
+            logger.info(
+                f"Successfully loaded the PEFT adaptor for model {self.model_uid}."
+            )
+
     def load(self):
         try:
             import torch
@@ -200,6 +220,7 @@ class PytorchModel(LLM):
             is_device_map_auto = True
 
         self._model, self._tokenizer = self._load_model(**kwargs)
+        self._apply_lora()
 
         if not is_device_map_auto:
             self._model.to(self._device)
@@ -391,6 +412,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -399,6 +421,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             quantization,
             model_path,
             pytorch_model_config,
+            peft_model_path,
         )
 
     def _sanitize_generate_config(
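The new _apply_lora hook wraps the already-loaded base model with the adapter via peft's PeftModel.from_pretrained, and PytorchChatModel (like the model subclasses below) simply threads peft_model_path through to PytorchModel. Outside of xinference, the same wrapping step looks roughly like this (the base model name and adapter path are placeholders; transformers and peft must be installed):

# Standalone sketch of applying a LoRA adapter to a loaded Hugging Face model.
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder base model
model = PeftModel.from_pretrained(base_model, "/path/to/lora-adapter")  # placeholder adapter dir
model.eval()  # the adapted model is then used for generation as usual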
xinference/model/llm/pytorch/falcon.py

@@ -27,6 +27,7 @@ class FalconPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class FalconPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
@@ -84,6 +86,7 @@ class FalconPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -92,6 +95,7 @@ class FalconPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/internlm2.py

@@ -38,6 +38,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -46,6 +47,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/llama_2.py

@@ -27,6 +27,7 @@ class LlamaPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class LlamaPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
@@ -67,6 +69,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
+        peft_model_path: Optional[str] = None,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
     ):
         super().__init__(
@@ -75,6 +78,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
             model_spec,
             quantization,
             model_path,
+            peft_model_path=peft_model_path,
             pytorch_model_config=pytorch_model_config,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/pytorch/qwen_vl.py

@@ -71,6 +71,7 @@ class QwenVLChatModel(PytorchChatModel):
             trust_remote_code=True,
             code_revision=self.model_spec.model_revision,
         )
+        self._apply_lora()
 
     def _message_content_to_qwen(self, content) -> str:
         def _ensure_url(_url):
xinference/model/llm/pytorch/vicuna.py

@@ -41,6 +41,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -49,6 +50,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
         self._use_fast_tokenizer = False
 
xinference/model/llm/pytorch/yi_vl.py

@@ -69,6 +69,7 @@ class YiVLChatModel(PytorchChatModel):
             self._image_processor,
             _,
         ) = load_pretrained_model(self.model_path, device_map=self._device)
+        self._apply_lora()
 
     @staticmethod
     def _message_content_to_yi(content) -> Union[str, tuple]:
xinference/web/ui/build/asset-manifest.json

@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.ebf7716d.js",
+    "main.js": "./static/js/main.78829790.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.ebf7716d.js.map": "./static/js/main.ebf7716d.js.map"
+    "main.78829790.js.map": "./static/js/main.78829790.js.map"
   },
   "entrypoints": [
-    "static/js/main.ebf7716d.js"
+    "static/js/main.78829790.js"
   ]
 }
xinference/web/ui/build/index.html

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.ebf7716d.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.78829790.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>