xinference 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Files changed (47)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +33 -0
  3. xinference/client/common.py +2 -0
  4. xinference/client/restful/restful_client.py +49 -17
  5. xinference/conftest.py +4 -1
  6. xinference/core/supervisor.py +11 -1
  7. xinference/core/worker.py +29 -9
  8. xinference/deploy/cmdline.py +73 -2
  9. xinference/deploy/utils.py +25 -1
  10. xinference/device_utils.py +0 -2
  11. xinference/model/core.py +13 -2
  12. xinference/model/image/core.py +16 -2
  13. xinference/model/image/stable_diffusion/core.py +25 -2
  14. xinference/model/llm/__init__.py +17 -0
  15. xinference/model/llm/core.py +18 -2
  16. xinference/model/llm/ggml/llamacpp.py +3 -19
  17. xinference/model/llm/llm_family.json +8 -3
  18. xinference/model/llm/llm_family.py +100 -29
  19. xinference/model/llm/llm_family_modelscope.json +57 -3
  20. xinference/model/llm/pytorch/baichuan.py +2 -0
  21. xinference/model/llm/pytorch/chatglm.py +2 -0
  22. xinference/model/llm/pytorch/core.py +23 -0
  23. xinference/model/llm/pytorch/falcon.py +4 -0
  24. xinference/model/llm/pytorch/internlm2.py +2 -0
  25. xinference/model/llm/pytorch/llama_2.py +4 -0
  26. xinference/model/llm/pytorch/qwen_vl.py +1 -0
  27. xinference/model/llm/pytorch/vicuna.py +2 -0
  28. xinference/model/llm/pytorch/yi_vl.py +1 -0
  29. xinference/types.py +5 -2
  30. xinference/web/ui/build/asset-manifest.json +3 -3
  31. xinference/web/ui/build/index.html +1 -1
  32. xinference/web/ui/build/static/js/{main.87d39ffb.js → main.78829790.js} +3 -3
  33. xinference/web/ui/build/static/js/main.78829790.js.map +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/e8687f75d2adacd34852b71c41ca17203d6fb4c8999ea55325bb2939f9d9ea90.json +1 -0
  37. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/METADATA +7 -5
  38. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/RECORD +43 -43
  39. xinference/web/ui/build/static/js/main.87d39ffb.js.map +0 -1
  40. xinference/web/ui/node_modules/.cache/babel-loader/0738899eefad7f90261125823d87ea9f0d53667b1479a0c1f398aff14f2bbd2a.json +0 -1
  41. xinference/web/ui/node_modules/.cache/babel-loader/64accc515dc6cd584a2873796cd7da6f93de57f7e465eb5423cca9a2f3fe3eff.json +0 -1
  42. xinference/web/ui/node_modules/.cache/babel-loader/77d4d795f078408fa2dd49da26d1ba1543d51b63cc253e736f4bef2e6014e888.json +0 -1
  43. /xinference/web/ui/build/static/js/{main.87d39ffb.js.LICENSE.txt → main.78829790.js.LICENSE.txt} +0 -0
  44. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/LICENSE +0 -0
  45. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/WHEEL +0 -0
  46. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/entry_points.txt +0 -0
  47. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/__init__.py CHANGED
@@ -31,6 +31,7 @@ from .llm_family import (
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLM_CLASSES,
+    PEFT_SUPPORTED_CLASSES,
     CustomLLMFamilyV1,
     GgmlLLMSpecV1,
     LLMFamilyV1,
@@ -95,6 +96,22 @@ def _install():
             PytorchModel,
         ]
     )
+    PEFT_SUPPORTED_CLASSES.extend(
+        [
+            BaichuanPytorchChatModel,
+            VicunaPytorchChatModel,
+            FalconPytorchChatModel,
+            ChatglmPytorchChatModel,
+            LlamaPytorchModel,
+            LlamaPytorchChatModel,
+            PytorchChatModel,
+            FalconPytorchModel,
+            Internlm2PytorchChatModel,
+            QwenVLChatModel,
+            YiVLChatModel,
+            PytorchModel,
+        ]
+    )
 
     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
xinference/model/llm/core.py CHANGED
@@ -180,6 +180,7 @@ def create_llm_model_instance(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[int] = None,
     quantization: Optional[str] = None,
+    peft_model_path: Optional[str] = None,
     is_local_deployment: bool = False,
     **kwargs,
 ) -> Tuple[LLM, LLMDescription]:
@@ -203,7 +204,9 @@ def create_llm_model_instance(
     assert quantization is not None
     save_path = cache(llm_family, llm_spec, quantization)
 
-    llm_cls = match_llm_cls(llm_family, llm_spec, quantization)
+    llm_cls = match_llm_cls(
+        llm_family, llm_spec, quantization, peft_model_path=peft_model_path
+    )
     if not llm_cls:
         raise ValueError(
             f"Model not supported, name: {model_name}, format: {model_format},"
@@ -211,7 +214,20 @@ def create_llm_model_instance(
         )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
 
-    model = llm_cls(model_uid, llm_family, llm_spec, quantization, save_path, kwargs)
+    if peft_model_path is not None:
+        model = llm_cls(
+            model_uid,
+            llm_family,
+            llm_spec,
+            quantization,
+            save_path,
+            kwargs,
+            peft_model_path,
+        )
+    else:
+        model = llm_cls(
+            model_uid, llm_family, llm_spec, quantization, save_path, kwargs
+        )
     return model, LLMDescription(
         subpool_addr, devices, llm_family, llm_spec, quantization
     )
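This release threads an optional PEFT (LoRA) adapter path from model launch all the way down to model instantiation. A minimal client-side sketch, assuming the new peft_model_path keyword is accepted by launch_model (the accompanying restful_client.py and cmdline.py changes suggest it is); the endpoint, model choice, and adapter directory below are placeholders:

    # Hedged sketch: peft_model_path mirrors the parameter threaded through this diff;
    # the endpoint, model choice, and adapter directory are placeholders.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    uid = client.launch_model(
        model_name="llama-2-chat",
        model_format="pytorch",
        model_size_in_billions=7,
        quantization="none",
        peft_model_path="/path/to/my-lora-adapter",  # directory containing adapter_config.json
    )
    model = client.get_model(uid)
    print(model.chat("Summarize what a LoRA adapter is."))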
xinference/model/llm/ggml/llamacpp.py CHANGED
@@ -35,15 +35,6 @@ from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL
 logger = logging.getLogger(__name__)
 
 
-SIZE_TO_GPU_LAYERS = {
-    3: 26,
-    7: 32,
-    13: 40,
-    30: 60,
-    65: 80,
-}
-
-
 class LlamaCppModel(LLM):
     def __init__(
         self,
@@ -56,13 +47,6 @@ class LlamaCppModel(LLM):
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
 
-        closest_size = min(
-            SIZE_TO_GPU_LAYERS.keys(),
-            key=lambda x: abs(
-                x - self.handle_model_size(model_spec.model_size_in_billions)
-            ),
-        )
-        self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size]
         self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
             llamacpp_model_config
         )
@@ -96,9 +80,9 @@ class LlamaCppModel(LLM):
 
         if self._is_darwin_and_apple_silicon() and self._can_apply_metal():
             # TODO: platform.processor() is not safe, need to be replaced to other method.
-            llamacpp_model_config.setdefault("n_gpu_layers", 1)
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
         elif self._is_linux() and self._can_apply_cublas():
-            llamacpp_model_config.setdefault("n_gpu_layers", self._gpu_layers)
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
 
         return llamacpp_model_config
 
@@ -313,7 +297,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
             generate_config["stop"] = [stop, "Observation:"]
         elif isinstance(stop, Iterable):
             assert not isinstance(stop, str)
-            generate_config["stop"] = stop + ["Observation:"]
+            generate_config["stop"] = stop + ["Observation:"]  # type: ignore
         else:
             generate_config["stop"] = "Observation:"
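With the hard-coded SIZE_TO_GPU_LAYERS table removed, both the Metal and cuBLAS paths now default n_gpu_layers to -1, which llama-cpp-python treats as "offload every layer to the GPU". A minimal sketch of the equivalent direct call; the model path is a placeholder:

    # n_gpu_layers=-1 offloads all layers to the GPU; 0 keeps inference on the CPU.
    from llama_cpp import Llama

    llm = Llama(
        model_path="/path/to/qwen1_5-72b-chat-q4_k_m.gguf",  # placeholder path
        n_gpu_layers=-1,
    )
    result = llm("Q: What does n_gpu_layers control? A:", max_tokens=32)
    print(result["choices"][0]["text"])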
 
xinference/model/llm/llm_family.json CHANGED
@@ -1599,10 +1599,15 @@
         "model_size_in_billions": 72,
         "quantizations": [
           "q2_k",
-          "q3_k_m"
+          "q3_k_m",
+          "q4_k_m"
         ],
         "model_id": "Qwen/Qwen1.5-72B-Chat-GGUF",
-        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf"
+        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf",
+        "model_file_name_split_template": "qwen1_5-72b-chat-{quantization}.gguf.{part}",
+        "quantization_parts": {
+          "q4_k_m": ["a", "b"]
+        }
       }
     ],
     "prompt_style": {
@@ -2967,7 +2972,7 @@
   },
   {
     "version": 1,
-    "context_length": 100000,
+    "context_length": 16384,
     "model_name": "glaive-coder",
     "model_description": "A code model trained on a dataset of ~140k programming related problems and solutions generated from Glaive’s synthetic data generation platform.",
     "model_lang": [
xinference/model/llm/llm_family.py CHANGED
@@ -59,6 +59,8 @@ class GgmlLLMSpecV1(BaseModel):
     quantizations: List[str]
     model_id: Optional[str]
     model_file_name_template: str
+    model_file_name_split_template: Optional[str]
+    quantization_parts: Optional[Dict[str, List[str]]]
     model_hub: str = "huggingface"
     model_uri: Optional[str]
     model_revision: Optional[str]
@@ -210,6 +212,7 @@ CustomLLMFamilyV1.update_forward_refs()
 
 
 LLM_CLASSES: List[Type[LLM]] = []
+PEFT_SUPPORTED_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
@@ -522,6 +525,52 @@ def _generate_meta_file(
         json.dump(desc.to_dict(), f)
 
 
+def _generate_model_file_names(
+    llm_spec: "LLMSpecV1", quantization: Optional[str] = None
+) -> Tuple[List[str], str, bool]:
+    file_names = []
+    final_file_name = llm_spec.model_file_name_template.format(
+        quantization=quantization
+    )
+    need_merge = False
+
+    if llm_spec.quantization_parts is None:
+        file_names.append(final_file_name)
+    elif quantization is not None and quantization in llm_spec.quantization_parts:
+        parts = llm_spec.quantization_parts[quantization]
+        need_merge = True
+
+        logger.info(
+            f"Model {llm_spec.model_id} {llm_spec.model_format} {quantization} has {len(parts)} parts."
+        )
+
+        if llm_spec.model_file_name_split_template is None:
+            raise ValueError(
+                f"No model_file_name_split_template for model spec {llm_spec.model_id}"
+            )
+
+        for part in parts:
+            file_name = llm_spec.model_file_name_split_template.format(
+                quantization=quantization, part=part
+            )
+            file_names.append(file_name)
+
+    return file_names, final_file_name, need_merge
+
+
+def _merge_cached_files(
+    cache_dir: str, input_file_names: List[str], output_file_name: str
+):
+    with open(os.path.join(cache_dir, output_file_name), "wb") as output_file:
+        for file_name in input_file_names:
+            logger.info(f"Merging file {file_name} into {output_file_name} ...")
+
+            with open(os.path.join(cache_dir, file_name), "rb") as input_file:
+                shutil.copyfileobj(input_file, output_file)
+
+    logger.info(f"Merge complete.")
+
+
 def cache_from_modelscope(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
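Combined with the new model_file_name_split_template and quantization_parts spec fields, these helpers let one quantization ship as several downloadable parts that are concatenated into a single file after caching. A standalone sketch of the same idea for the Qwen1.5-72B q4_k_m entry shown earlier; the cache directory is a placeholder and the code below is illustrative, not an xinference API:

    # Illustrative: mirrors what _generate_model_file_names/_merge_cached_files do
    # for the "q4_k_m": ["a", "b"] entry in the spec above.
    import os
    import shutil

    split_template = "qwen1_5-72b-chat-{quantization}.gguf.{part}"
    final_template = "qwen1_5-72b-chat-{quantization}.gguf"

    part_names = [split_template.format(quantization="q4_k_m", part=p) for p in ["a", "b"]]
    final_name = final_template.format(quantization="q4_k_m")
    # part_names == ["qwen1_5-72b-chat-q4_k_m.gguf.a", "qwen1_5-72b-chat-q4_k_m.gguf.b"]

    cache_dir = "/path/to/cache"  # placeholder
    with open(os.path.join(cache_dir, final_name), "wb") as merged:
        for name in part_names:
            with open(os.path.join(cache_dir, name), "rb") as part_file:
                shutil.copyfileobj(part_file, merged)  # byte-wise concatenation of the parts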
@@ -560,19 +609,26 @@ def cache_from_modelscope(
                 symlink_local_file(os.path.join(subdir, file), cache_dir, relpath)
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
-        filename = llm_spec.model_file_name_template.format(quantization=quantization)
-        download_path = retry_download(
-            model_file_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            filename,
-            revision=llm_spec.model_revision,
+        file_names, final_file_name, need_merge = _generate_model_file_names(
+            llm_spec, quantization
         )
-        symlink_local_file(download_path, cache_dir, filename)
+
+        for filename in file_names:
+            download_path = retry_download(
+                model_file_download,
+                llm_family.model_name,
+                {
+                    "model_size": llm_spec.model_size_in_billions,
+                    "model_format": llm_spec.model_format,
+                },
+                llm_spec.model_id,
+                filename,
+                revision=llm_spec.model_revision,
+            )
+            symlink_local_file(download_path, cache_dir, filename)
+
+        if need_merge:
+            _merge_cached_files(cache_dir, file_names, final_file_name)
     else:
         raise ValueError(f"Unsupported format: {llm_spec.model_format}")
 
@@ -621,20 +677,27 @@ def cache_from_huggingface(
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
         assert isinstance(llm_spec, GgmlLLMSpecV1)
-        file_name = llm_spec.model_file_name_template.format(quantization=quantization)
-        retry_download(
-            huggingface_hub.hf_hub_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            revision=llm_spec.model_revision,
-            filename=file_name,
-            local_dir=cache_dir,
-            local_dir_use_symlinks=True,
+        file_names, final_file_name, need_merge = _generate_model_file_names(
+            llm_spec, quantization
         )
+
+        for file_name in file_names:
+            retry_download(
+                huggingface_hub.hf_hub_download,
+                llm_family.model_name,
+                {
+                    "model_size": llm_spec.model_size_in_billions,
+                    "model_format": llm_spec.model_format,
+                },
+                llm_spec.model_id,
+                revision=llm_spec.model_revision,
+                filename=file_name,
+                local_dir=cache_dir,
+                local_dir_use_symlinks=True,
+            )
+
+        if need_merge:
+            _merge_cached_files(cache_dir, file_names, final_file_name)
     else:
         raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
 
@@ -873,12 +936,20 @@ def unregister_llm(model_name: str, raise_error: bool = True):
 
 
 def match_llm_cls(
-    family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: str
+    family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    quantization: str,
+    peft_model_path: Optional[str] = None,
 ) -> Optional[Type[LLM]]:
     """
     Find an LLM implementation for given LLM family and spec.
     """
-    for cls in LLM_CLASSES:
-        if cls.match(family, llm_spec, quantization):
-            return cls
+    if peft_model_path is not None:
+        for cls in PEFT_SUPPORTED_CLASSES:
+            if cls.match(family, llm_spec, quantization):
+                return cls
+    else:
+        for cls in LLM_CLASSES:
+            if cls.match(family, llm_spec, quantization):
+                return cls
     return None
xinference/model/llm/llm_family_modelscope.json CHANGED
@@ -1124,6 +1124,7 @@
         "quantizations": [
           "8bits"
         ],
+        "model_hub": "modelscope",
        "model_id": "01ai/Yi-34B-Chat-{quantization}",
        "model_revision": "master"
       },
@@ -1646,7 +1647,8 @@
           "8-bit",
           "none"
         ],
-        "model_id": "qwen/Qwen1.5-0.5B-Chat"
+        "model_id": "qwen/Qwen1.5-0.5B-Chat",
+        "model_hub": "modelscope"
       },
       {
         "model_format": "pytorch",
@@ -1907,11 +1909,16 @@
         "model_size_in_billions": 72,
         "quantizations": [
           "q2_k",
-          "q3_k_m"
+          "q3_k_m",
+          "q4_k_m"
         ],
         "model_id": "qwen/Qwen1.5-72B-Chat-GGUF",
         "model_hub": "modelscope",
-        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf"
+        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf",
+        "model_file_name_split_template": "qwen1_5-72b-chat-{quantization}.gguf.{part}",
+        "quantization_parts": {
+          "q4_k_m": ["a", "b"]
+        }
       }
     ],
     "prompt_style": {
@@ -2329,5 +2336,52 @@
         "<|im_sep|>"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "gemma-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/gemma-2b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/gemma-7b-it"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "gemma",
+      "roles": [
+        "user",
+        "model"
+      ],
+      "stop": [
+        "<end_of_turn>",
+        "<start_of_turn>"
+      ]
+    }
   }
 ]
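The ModelScope registry also gains the gemma-it family (2B and 7B, PyTorch format) with a new "gemma" prompt style. The snippet below is an illustrative rendering of that style using the registered roles and stop tokens; xinference's internal prompt assembly may differ in minor details:

    # Illustrative rendering of the "gemma" prompt style: "user"/"model" roles with
    # <start_of_turn>/<end_of_turn> delimiters. Not taken from xinference source.
    def render_gemma_prompt(history, user_message):
        turns = []
        for user_msg, model_msg in history:
            turns.append(f"<start_of_turn>user\n{user_msg}<end_of_turn>\n")
            turns.append(f"<start_of_turn>model\n{model_msg}<end_of_turn>\n")
        turns.append(f"<start_of_turn>user\n{user_message}<end_of_turn>\n")
        turns.append("<start_of_turn>model\n")
        return "".join(turns)

    print(render_gemma_prompt([("Hi!", "Hello! How can I help?")], "Why is the sky blue?"))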
xinference/model/llm/pytorch/baichuan.py CHANGED
@@ -27,6 +27,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
         self._use_fast_tokenizer = False
 
xinference/model/llm/pytorch/chatglm.py CHANGED
@@ -39,6 +39,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -47,6 +48,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/core.py CHANGED
@@ -52,12 +52,14 @@ class PytorchModel(LLM):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
             pytorch_model_config
         )
+        self._peft_model_path = peft_model_path
 
     def _sanitize_model_config(
         self, pytorch_model_config: Optional[PytorchModelConfig]
@@ -112,6 +114,24 @@ class PytorchModel(LLM):
         )
         return model, tokenizer
 
+    def _apply_lora(self):
+        if self._peft_model_path is not None:
+            try:
+                from peft import PeftModel
+            except ImportError:
+                raise ImportError(
+                    f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
+                )
+
+            # Apply LoRA
+            self._model = PeftModel.from_pretrained(
+                self._model,
+                self._peft_model_path,
+            )
+            logger.info(
+                f"Successfully loaded the PEFT adaptor for model {self.model_uid}."
+            )
+
     def load(self):
         try:
             import torch
@@ -200,6 +220,7 @@ class PytorchModel(LLM):
             is_device_map_auto = True
 
         self._model, self._tokenizer = self._load_model(**kwargs)
+        self._apply_lora()
 
         if not is_device_map_auto:
             self._model.to(self._device)
@@ -391,6 +412,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -399,6 +421,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             quantization,
             model_path,
             pytorch_model_config,
+            peft_model_path,
         )
 
     def _sanitize_generate_config(
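_apply_lora wraps the freshly loaded base model with a PEFT adapter immediately after _load_model returns. A minimal standalone sketch of the same peft call outside xinference; the base model name and adapter directory are placeholders:

    # Minimal sketch of what _apply_lora does with peft.
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")  # placeholder base
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
    model = PeftModel.from_pretrained(base_model, "/path/to/my-lora-adapter")  # placeholder adapter dir
    # Forward passes now go through the LoRA weights layered on top of the frozen base model.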
xinference/model/llm/pytorch/falcon.py CHANGED
@@ -27,6 +27,7 @@ class FalconPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class FalconPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
@@ -84,6 +86,7 @@ class FalconPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -92,6 +95,7 @@ class FalconPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/internlm2.py CHANGED
@@ -38,6 +38,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -46,6 +47,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/llama_2.py CHANGED
@@ -27,6 +27,7 @@ class LlamaPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class LlamaPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
@@ -67,6 +69,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
+        peft_model_path: Optional[str] = None,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
     ):
         super().__init__(
@@ -75,6 +78,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
             model_spec,
             quantization,
             model_path,
+            peft_model_path=peft_model_path,
             pytorch_model_config=pytorch_model_config,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/pytorch/qwen_vl.py CHANGED
@@ -71,6 +71,7 @@ class QwenVLChatModel(PytorchChatModel):
             trust_remote_code=True,
             code_revision=self.model_spec.model_revision,
         )
+        self._apply_lora()
 
     def _message_content_to_qwen(self, content) -> str:
         def _ensure_url(_url):
xinference/model/llm/pytorch/vicuna.py CHANGED
@@ -41,6 +41,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -49,6 +50,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
         self._use_fast_tokenizer = False
 
xinference/model/llm/pytorch/yi_vl.py CHANGED
@@ -69,6 +69,7 @@ class YiVLChatModel(PytorchChatModel):
             self._image_processor,
             _,
         ) = load_pretrained_model(self.model_path, device_map=self._device)
+        self._apply_lora()
 
     @staticmethod
     def _message_content_to_yi(content) -> Union[str, tuple]:
xinference/types.py CHANGED
@@ -346,8 +346,11 @@ try:
 
     CreateCompletionLlamaCpp = get_pydantic_model_from_method(
         Llama.create_completion,
-        exclude_fields=["model", "prompt", "grammar"],
-        include_fields={"grammar": (Optional[Any], None)},
+        exclude_fields=["model", "prompt", "grammar", "max_tokens"],
+        include_fields={
+            "grammar": (Optional[Any], None),
+            "max_tokens": (Optional[int], max_tokens_field),
+        },
     )
 except ImportError:
     CreateCompletionLlamaCpp = create_model("CreateCompletionLlamaCpp")
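max_tokens is now excluded from the fields harvested off Llama.create_completion and re-added explicitly with xinference's own default (max_tokens_field), so the request schema no longer inherits llama-cpp-python's default. A tiny sketch of the same pydantic pattern; the default value below is an illustrative stand-in for max_tokens_field:

    # Illustrative pydantic sketch: re-declare an optional max_tokens with an explicit
    # default, as the include_fields override above does.
    from typing import Optional
    from pydantic import Field, create_model

    CompletionSketch = create_model(
        "CompletionSketch",
        max_tokens=(Optional[int], Field(default=256, ge=1)),  # stand-in for max_tokens_field
    )
    print(CompletionSketch().max_tokens)  # 256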
xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.87d39ffb.js",
+    "main.js": "./static/js/main.78829790.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.87d39ffb.js.map": "./static/js/main.87d39ffb.js.map"
+    "main.78829790.js.map": "./static/js/main.78829790.js.map"
   },
   "entrypoints": [
-    "static/js/main.87d39ffb.js"
+    "static/js/main.78829790.js"
   ]
 }
xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.87d39ffb.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.78829790.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>