xinference 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +33 -0
- xinference/client/common.py +2 -0
- xinference/client/restful/restful_client.py +49 -17
- xinference/conftest.py +4 -1
- xinference/core/supervisor.py +11 -1
- xinference/core/worker.py +29 -9
- xinference/deploy/cmdline.py +73 -2
- xinference/deploy/utils.py +25 -1
- xinference/device_utils.py +0 -2
- xinference/model/core.py +13 -2
- xinference/model/image/core.py +16 -2
- xinference/model/image/stable_diffusion/core.py +25 -2
- xinference/model/llm/__init__.py +17 -0
- xinference/model/llm/core.py +18 -2
- xinference/model/llm/ggml/llamacpp.py +3 -19
- xinference/model/llm/llm_family.json +8 -3
- xinference/model/llm/llm_family.py +100 -29
- xinference/model/llm/llm_family_modelscope.json +57 -3
- xinference/model/llm/pytorch/baichuan.py +2 -0
- xinference/model/llm/pytorch/chatglm.py +2 -0
- xinference/model/llm/pytorch/core.py +23 -0
- xinference/model/llm/pytorch/falcon.py +4 -0
- xinference/model/llm/pytorch/internlm2.py +2 -0
- xinference/model/llm/pytorch/llama_2.py +4 -0
- xinference/model/llm/pytorch/qwen_vl.py +1 -0
- xinference/model/llm/pytorch/vicuna.py +2 -0
- xinference/model/llm/pytorch/yi_vl.py +1 -0
- xinference/types.py +5 -2
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.87d39ffb.js → main.78829790.js} +3 -3
- xinference/web/ui/build/static/js/main.78829790.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e8687f75d2adacd34852b71c41ca17203d6fb4c8999ea55325bb2939f9d9ea90.json +1 -0
- {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/METADATA +7 -5
- {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/RECORD +43 -43
- xinference/web/ui/build/static/js/main.87d39ffb.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0738899eefad7f90261125823d87ea9f0d53667b1479a0c1f398aff14f2bbd2a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/64accc515dc6cd584a2873796cd7da6f93de57f7e465eb5423cca9a2f3fe3eff.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/77d4d795f078408fa2dd49da26d1ba1543d51b63cc253e736f4bef2e6014e888.json +0 -1
- /xinference/web/ui/build/static/js/{main.87d39ffb.js.LICENSE.txt → main.78829790.js.LICENSE.txt} +0 -0
- {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/LICENSE +0 -0
- {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/WHEEL +0 -0
- {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/__init__.py
CHANGED
@@ -31,6 +31,7 @@ from .llm_family import (
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLM_CLASSES,
+    PEFT_SUPPORTED_CLASSES,
     CustomLLMFamilyV1,
     GgmlLLMSpecV1,
     LLMFamilyV1,
@@ -95,6 +96,22 @@ def _install():
             PytorchModel,
         ]
     )
+    PEFT_SUPPORTED_CLASSES.extend(
+        [
+            BaichuanPytorchChatModel,
+            VicunaPytorchChatModel,
+            FalconPytorchChatModel,
+            ChatglmPytorchChatModel,
+            LlamaPytorchModel,
+            LlamaPytorchChatModel,
+            PytorchChatModel,
+            FalconPytorchModel,
+            Internlm2PytorchChatModel,
+            QwenVLChatModel,
+            YiVLChatModel,
+            PytorchModel,
+        ]
+    )
 
     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
xinference/model/llm/core.py
CHANGED
@@ -180,6 +180,7 @@ def create_llm_model_instance(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[int] = None,
     quantization: Optional[str] = None,
+    peft_model_path: Optional[str] = None,
     is_local_deployment: bool = False,
     **kwargs,
 ) -> Tuple[LLM, LLMDescription]:
@@ -203,7 +204,9 @@ def create_llm_model_instance(
     assert quantization is not None
     save_path = cache(llm_family, llm_spec, quantization)
 
-    llm_cls = match_llm_cls(llm_family, llm_spec, quantization)
+    llm_cls = match_llm_cls(
+        llm_family, llm_spec, quantization, peft_model_path=peft_model_path
+    )
     if not llm_cls:
         raise ValueError(
             f"Model not supported, name: {model_name}, format: {model_format},"
@@ -211,7 +214,20 @@ def create_llm_model_instance(
     )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
 
-    model = llm_cls(model_uid, llm_family, llm_spec, quantization, save_path, kwargs)
+    if peft_model_path is not None:
+        model = llm_cls(
+            model_uid,
+            llm_family,
+            llm_spec,
+            quantization,
+            save_path,
+            kwargs,
+            peft_model_path,
+        )
+    else:
+        model = llm_cls(
+            model_uid, llm_family, llm_spec, quantization, save_path, kwargs
+        )
     return model, LLMDescription(
         subpool_addr, devices, llm_family, llm_spec, quantization
     )
xinference/model/llm/ggml/llamacpp.py
CHANGED
@@ -35,15 +35,6 @@ from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL
 logger = logging.getLogger(__name__)
 
 
-SIZE_TO_GPU_LAYERS = {
-    3: 26,
-    7: 32,
-    13: 40,
-    30: 60,
-    65: 80,
-}
-
-
 class LlamaCppModel(LLM):
     def __init__(
         self,
@@ -56,13 +47,6 @@ class LlamaCppModel(LLM):
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
 
-        closest_size = min(
-            SIZE_TO_GPU_LAYERS.keys(),
-            key=lambda x: abs(
-                x - self.handle_model_size(model_spec.model_size_in_billions)
-            ),
-        )
-        self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size]
         self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
             llamacpp_model_config
         )
@@ -96,9 +80,9 @@ class LlamaCppModel(LLM):
 
         if self._is_darwin_and_apple_silicon() and self._can_apply_metal():
             # TODO: platform.processor() is not safe, need to be replaced to other method.
-            llamacpp_model_config.setdefault("n_gpu_layers", 1)
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
         elif self._is_linux() and self._can_apply_cublas():
-            llamacpp_model_config.setdefault("n_gpu_layers", self._gpu_layers)
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
 
         return llamacpp_model_config
 
@@ -313,7 +297,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
             generate_config["stop"] = [stop, "Observation:"]
         elif isinstance(stop, Iterable):
             assert not isinstance(stop, str)
-            generate_config["stop"] = stop + ["Observation:"]
+            generate_config["stop"] = stop + ["Observation:"]  # type: ignore
         else:
             generate_config["stop"] = "Observation:"
 
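Note on the `n_gpu_layers` change above: the hard-coded size-to-layer table is dropped and `-1` is used instead, which llama-cpp-python treats as "offload all layers". A minimal stand-alone sketch of what that setting means when constructing a `llama_cpp.Llama` directly (the model path and context size below are placeholders, not part of this diff):

from llama_cpp import Llama

# n_gpu_layers=-1 offloads every layer to the GPU (Metal on Apple Silicon,
# cuBLAS/CUDA on Linux) instead of a fixed count derived from model size.
llm = Llama(
    model_path="/path/to/qwen1_5-72b-chat-q4_k_m.gguf",  # placeholder path
    n_gpu_layers=-1,
    n_ctx=4096,
)
print(llm("Q: 2 + 2 = ", max_tokens=8)["choices"][0]["text"])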
xinference/model/llm/llm_family.json
CHANGED
@@ -1599,10 +1599,15 @@
         "model_size_in_billions": 72,
         "quantizations": [
           "q2_k",
-          "q3_k_m"
+          "q3_k_m",
+          "q4_k_m"
         ],
         "model_id": "Qwen/Qwen1.5-72B-Chat-GGUF",
-        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf"
+        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf",
+        "model_file_name_split_template": "qwen1_5-72b-chat-{quantization}.gguf.{part}",
+        "quantization_parts": {
+          "q4_k_m": ["a", "b"]
+        }
       }
     ],
     "prompt_style": {
@@ -2967,7 +2972,7 @@
   },
   {
     "version": 1,
-    "context_length":
+    "context_length": 16384,
     "model_name": "glaive-coder",
     "model_description": "A code model trained on a dataset of ~140k programming related problems and solutions generated from Glaive’s synthetic data generation platform.",
     "model_lang": [
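The new `model_file_name_split_template` and `quantization_parts` fields describe quantizations whose GGUF weights ship as multiple pieces. A small illustration (plain Python mirroring the spec above, not calling xinference code) of how the templates expand into the part files to download and the single file they are merged into:

quantization = "q4_k_m"
file_template = "qwen1_5-72b-chat-{quantization}.gguf"
split_template = "qwen1_5-72b-chat-{quantization}.gguf.{part}"
quantization_parts = {"q4_k_m": ["a", "b"]}

# Part files to download for a split quantization ...
part_files = [
    split_template.format(quantization=quantization, part=p)
    for p in quantization_parts[quantization]
]
# ... and the merged file name used after download.
merged_file = file_template.format(quantization=quantization)

print(part_files)   # ['qwen1_5-72b-chat-q4_k_m.gguf.a', 'qwen1_5-72b-chat-q4_k_m.gguf.b']
print(merged_file)  # qwen1_5-72b-chat-q4_k_m.gguf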
xinference/model/llm/llm_family.py
CHANGED
@@ -59,6 +59,8 @@ class GgmlLLMSpecV1(BaseModel):
     quantizations: List[str]
     model_id: Optional[str]
     model_file_name_template: str
+    model_file_name_split_template: Optional[str]
+    quantization_parts: Optional[Dict[str, List[str]]]
     model_hub: str = "huggingface"
     model_uri: Optional[str]
     model_revision: Optional[str]
@@ -210,6 +212,7 @@ CustomLLMFamilyV1.update_forward_refs()
 
 
 LLM_CLASSES: List[Type[LLM]] = []
+PEFT_SUPPORTED_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
@@ -522,6 +525,52 @@ def _generate_meta_file(
         json.dump(desc.to_dict(), f)
 
 
+def _generate_model_file_names(
+    llm_spec: "LLMSpecV1", quantization: Optional[str] = None
+) -> Tuple[List[str], str, bool]:
+    file_names = []
+    final_file_name = llm_spec.model_file_name_template.format(
+        quantization=quantization
+    )
+    need_merge = False
+
+    if llm_spec.quantization_parts is None:
+        file_names.append(final_file_name)
+    elif quantization is not None and quantization in llm_spec.quantization_parts:
+        parts = llm_spec.quantization_parts[quantization]
+        need_merge = True
+
+        logger.info(
+            f"Model {llm_spec.model_id} {llm_spec.model_format} {quantization} has {len(parts)} parts."
+        )
+
+        if llm_spec.model_file_name_split_template is None:
+            raise ValueError(
+                f"No model_file_name_split_template for model spec {llm_spec.model_id}"
+            )
+
+        for part in parts:
+            file_name = llm_spec.model_file_name_split_template.format(
+                quantization=quantization, part=part
+            )
+            file_names.append(file_name)
+
+    return file_names, final_file_name, need_merge
+
+
+def _merge_cached_files(
+    cache_dir: str, input_file_names: List[str], output_file_name: str
+):
+    with open(os.path.join(cache_dir, output_file_name), "wb") as output_file:
+        for file_name in input_file_names:
+            logger.info(f"Merging file {file_name} into {output_file_name} ...")
+
+            with open(os.path.join(cache_dir, file_name), "rb") as input_file:
+                shutil.copyfileobj(input_file, output_file)
+
+    logger.info(f"Merge complete.")
+
+
 def cache_from_modelscope(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
@@ -560,19 +609,26 @@ def cache_from_modelscope(
             symlink_local_file(os.path.join(subdir, file), cache_dir, relpath)
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
-        filename = llm_spec.model_file_name_template.format(quantization=quantization)
-        download_path = retry_download(
-            model_file_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            filename,
-            revision=llm_spec.model_revision,
+        file_names, final_file_name, need_merge = _generate_model_file_names(
+            llm_spec, quantization
         )
-        symlink_local_file(download_path, cache_dir, filename)
+
+        for filename in file_names:
+            download_path = retry_download(
+                model_file_download,
+                llm_family.model_name,
+                {
+                    "model_size": llm_spec.model_size_in_billions,
+                    "model_format": llm_spec.model_format,
+                },
+                llm_spec.model_id,
+                filename,
+                revision=llm_spec.model_revision,
+            )
+            symlink_local_file(download_path, cache_dir, filename)
+
+        if need_merge:
+            _merge_cached_files(cache_dir, file_names, final_file_name)
     else:
         raise ValueError(f"Unsupported format: {llm_spec.model_format}")
 
@@ -621,20 +677,27 @@ def cache_from_huggingface(
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
         assert isinstance(llm_spec, GgmlLLMSpecV1)
-        file_name = llm_spec.model_file_name_template.format(quantization=quantization)
-        retry_download(
-            huggingface_hub.hf_hub_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            revision=llm_spec.model_revision,
-            filename=file_name,
-            local_dir=cache_dir,
-            local_dir_use_symlinks=True,
+        file_names, final_file_name, need_merge = _generate_model_file_names(
+            llm_spec, quantization
         )
+
+        for file_name in file_names:
+            retry_download(
+                huggingface_hub.hf_hub_download,
+                llm_family.model_name,
+                {
+                    "model_size": llm_spec.model_size_in_billions,
+                    "model_format": llm_spec.model_format,
+                },
+                llm_spec.model_id,
+                revision=llm_spec.model_revision,
+                filename=file_name,
+                local_dir=cache_dir,
+                local_dir_use_symlinks=True,
+            )
+
+        if need_merge:
+            _merge_cached_files(cache_dir, file_names, final_file_name)
     else:
         raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
 
@@ -873,12 +936,20 @@ def unregister_llm(model_name: str, raise_error: bool = True):
 
 
 def match_llm_cls(
-    family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: str
+    family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    quantization: str,
+    peft_model_path: Optional[str] = None,
 ) -> Optional[Type[LLM]]:
     """
     Find an LLM implementation for given LLM family and spec.
     """
-    for cls in LLM_CLASSES:
-        if cls.match(family, llm_spec, quantization):
-            return cls
+    if peft_model_path is not None:
+        for cls in PEFT_SUPPORTED_CLASSES:
+            if cls.match(family, llm_spec, quantization):
+                return cls
+    else:
+        for cls in LLM_CLASSES:
+            if cls.match(family, llm_spec, quantization):
+                return cls
     return None
xinference/model/llm/llm_family_modelscope.json
CHANGED
@@ -1124,6 +1124,7 @@
         "quantizations": [
           "8bits"
         ],
+        "model_hub": "modelscope",
         "model_id": "01ai/Yi-34B-Chat-{quantization}",
         "model_revision": "master"
       },
@@ -1646,7 +1647,8 @@
           "8-bit",
           "none"
         ],
-        "model_id": "qwen/Qwen1.5-0.5B-Chat"
+        "model_id": "qwen/Qwen1.5-0.5B-Chat",
+        "model_hub": "modelscope"
       },
       {
         "model_format": "pytorch",
@@ -1907,11 +1909,16 @@
         "model_size_in_billions": 72,
         "quantizations": [
           "q2_k",
-          "q3_k_m"
+          "q3_k_m",
+          "q4_k_m"
        ],
         "model_id": "qwen/Qwen1.5-72B-Chat-GGUF",
         "model_hub": "modelscope",
-        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf"
+        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf",
+        "model_file_name_split_template": "qwen1_5-72b-chat-{quantization}.gguf.{part}",
+        "quantization_parts": {
+          "q4_k_m": ["a", "b"]
+        }
       }
     ],
     "prompt_style": {
@@ -2329,5 +2336,52 @@
         "<|im_sep|>"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "gemma-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/gemma-2b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/gemma-7b-it"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "gemma",
+      "roles": [
+        "user",
+        "model"
+      ],
+      "stop": [
+        "<end_of_turn>",
+        "<start_of_turn>"
+      ]
+    }
   }
 ]
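For context, the new "gemma" prompt style (roles `user`/`model`, stop tokens `<end_of_turn>` and `<start_of_turn>`) corresponds to Gemma's turn-based chat format. A rough sketch of how such a prompt is typically rendered; the exact rendering lives in xinference's prompt-style code, which is not part of this diff:

def render_gemma_prompt(messages):
    # messages: list of {"role": "user" | "model", "content": "..."}
    prompt = ""
    for message in messages:
        prompt += f"<start_of_turn>{message['role']}\n{message['content']}<end_of_turn>\n"
    # Leave an open model turn; generation stops at <end_of_turn>.
    prompt += "<start_of_turn>model\n"
    return prompt

print(render_gemma_prompt([{"role": "user", "content": "Hi!"}]))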
xinference/model/llm/pytorch/baichuan.py
CHANGED
@@ -27,6 +27,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
         self._use_fast_tokenizer = False
 
xinference/model/llm/pytorch/chatglm.py
CHANGED
@@ -39,6 +39,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -47,6 +48,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/core.py
CHANGED
@@ -52,12 +52,14 @@ class PytorchModel(LLM):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
             pytorch_model_config
         )
+        self._peft_model_path = peft_model_path
 
     def _sanitize_model_config(
         self, pytorch_model_config: Optional[PytorchModelConfig]
@@ -112,6 +114,24 @@ class PytorchModel(LLM):
         )
         return model, tokenizer
 
+    def _apply_lora(self):
+        if self._peft_model_path is not None:
+            try:
+                from peft import PeftModel
+            except ImportError:
+                raise ImportError(
+                    f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
+                )
+
+            # Apply LoRA
+            self._model = PeftModel.from_pretrained(
+                self._model,
+                self._peft_model_path,
+            )
+            logger.info(
+                f"Successfully loaded the PEFT adaptor for model {self.model_uid}."
+            )
+
     def load(self):
         try:
             import torch
@@ -200,6 +220,7 @@ class PytorchModel(LLM):
             is_device_map_auto = True
 
         self._model, self._tokenizer = self._load_model(**kwargs)
+        self._apply_lora()
 
         if not is_device_map_auto:
             self._model.to(self._device)
@@ -391,6 +412,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -399,6 +421,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             quantization,
             model_path,
             pytorch_model_config,
+            peft_model_path,
         )
 
     def _sanitize_generate_config(
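The `_apply_lora` hook added above wraps the already-loaded base model with a PEFT (LoRA) adapter. A minimal stand-alone sketch of the same peft API call outside of xinference (the model id and adapter directory are placeholders):

from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load any transformers causal LM as the base model (placeholder model id).
base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")

# Attach a LoRA adapter from a local directory; this is the same call
# PytorchModel._apply_lora makes right after _load_model().
model = PeftModel.from_pretrained(base_model, "/path/to/lora-adapter")  # placeholder path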
xinference/model/llm/pytorch/falcon.py
CHANGED
@@ -27,6 +27,7 @@ class FalconPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class FalconPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
@@ -84,6 +86,7 @@ class FalconPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -92,6 +95,7 @@ class FalconPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/internlm2.py
CHANGED
@@ -38,6 +38,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -46,6 +47,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/llama_2.py
CHANGED
@@ -27,6 +27,7 @@ class LlamaPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class LlamaPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
@@ -67,6 +69,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
+        peft_model_path: Optional[str] = None,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
     ):
         super().__init__(
@@ -75,6 +78,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
             model_spec,
             quantization,
             model_path,
+            peft_model_path=peft_model_path,
             pytorch_model_config=pytorch_model_config,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/pytorch/vicuna.py
CHANGED
@@ -41,6 +41,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -49,6 +50,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
         self._use_fast_tokenizer = False
 
xinference/types.py
CHANGED
@@ -346,8 +346,11 @@ try:
 
     CreateCompletionLlamaCpp = get_pydantic_model_from_method(
         Llama.create_completion,
-        exclude_fields=["model", "prompt", "grammar"],
-        include_fields={
+        exclude_fields=["model", "prompt", "grammar", "max_tokens"],
+        include_fields={
+            "grammar": (Optional[Any], None),
+            "max_tokens": (Optional[int], max_tokens_field),
+        },
     )
 except ImportError:
     CreateCompletionLlamaCpp = create_model("CreateCompletionLlamaCpp")
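The types.py change excludes llama-cpp-python's own `max_tokens` field from the generated model and re-adds it with xinference's `max_tokens_field` default. `get_pydantic_model_from_method` is internal to xinference, but the override pattern it relies on can be sketched with plain pydantic `create_model` (field names and defaults below are illustrative only):

from typing import Optional
from pydantic import Field, create_model

# A project-level default that should win over the wrapped method's own default.
max_tokens_field = Field(default=1024, ge=1)

CreateCompletionSketch = create_model(
    "CreateCompletionSketch",
    temperature=(Optional[float], 0.8),
    max_tokens=(Optional[int], max_tokens_field),
)

print(CreateCompletionSketch().max_tokens)  # 1024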
xinference/web/ui/build/asset-manifest.json
CHANGED
@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.87d39ffb.js",
+    "main.js": "./static/js/main.78829790.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.87d39ffb.js.map": "./static/js/main.87d39ffb.js.map"
+    "main.78829790.js.map": "./static/js/main.78829790.js.map"
   },
   "entrypoints": [
-    "static/js/main.87d39ffb.js"
+    "static/js/main.78829790.js"
   ]
 }
xinference/web/ui/build/index.html
CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.87d39ffb.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.78829790.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>