xinference 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +123 -3
- xinference/client/restful/restful_client.py +131 -2
- xinference/core/model.py +93 -24
- xinference/core/supervisor.py +132 -15
- xinference/core/worker.py +165 -8
- xinference/deploy/cmdline.py +5 -0
- xinference/model/audio/chattts.py +46 -14
- xinference/model/audio/core.py +23 -15
- xinference/model/core.py +12 -3
- xinference/model/embedding/core.py +25 -16
- xinference/model/flexible/__init__.py +40 -0
- xinference/model/flexible/core.py +228 -0
- xinference/model/flexible/launchers/__init__.py +15 -0
- xinference/model/flexible/launchers/transformers_launcher.py +63 -0
- xinference/model/flexible/utils.py +33 -0
- xinference/model/image/core.py +21 -14
- xinference/model/image/custom.py +1 -1
- xinference/model/image/model_spec.json +14 -0
- xinference/model/image/stable_diffusion/core.py +43 -6
- xinference/model/llm/__init__.py +0 -2
- xinference/model/llm/core.py +3 -2
- xinference/model/llm/ggml/llamacpp.py +1 -10
- xinference/model/llm/llm_family.json +292 -36
- xinference/model/llm/llm_family.py +97 -52
- xinference/model/llm/llm_family_modelscope.json +220 -27
- xinference/model/llm/pytorch/core.py +0 -80
- xinference/model/llm/sglang/core.py +7 -2
- xinference/model/llm/utils.py +4 -2
- xinference/model/llm/vllm/core.py +3 -0
- xinference/model/rerank/core.py +24 -25
- xinference/types.py +0 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.0fb6f3ab.js → main.95c1d652.js} +3 -3
- xinference/web/ui/build/static/js/main.95c1d652.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/70fa8c07463a5fe57c68bf92502910105a8f647371836fe8c3a7408246ca7ba0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +1 -0
- {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/METADATA +9 -11
- {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/RECORD +49 -58
- xinference/model/llm/ggml/chatglm.py +0 -457
- xinference/thirdparty/ChatTTS/__init__.py +0 -1
- xinference/thirdparty/ChatTTS/core.py +0 -200
- xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/experimental/llm.py +0 -40
- xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/infer/api.py +0 -125
- xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/model/dvae.py +0 -155
- xinference/thirdparty/ChatTTS/model/gpt.py +0 -265
- xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/utils/gpu_utils.py +0 -23
- xinference/thirdparty/ChatTTS/utils/infer_utils.py +0 -141
- xinference/thirdparty/ChatTTS/utils/io_utils.py +0 -14
- xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +0 -1
- /xinference/web/ui/build/static/js/{main.0fb6f3ab.js.LICENSE.txt → main.95c1d652.js.LICENSE.txt} +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/LICENSE +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/WHEEL +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py CHANGED

@@ -14,7 +14,6 @@

 import logging
 import os
-import platform
 import shutil
 from threading import Lock
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
@@ -541,25 +540,50 @@ def _get_cache_dir_for_model_mem(
 def _get_cache_dir(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
     create_if_not_exist=True,
 ):
     # If the model id contains quantization, then we should give each
     # quantization a dedicated cache dir.
     quant_suffix = ""
-
-
-
-
-
+    if llm_spec.model_id and "{" in llm_spec.model_id and quantization is not None:
+        quant_suffix = quantization
+    else:
+        for q in llm_spec.quantizations:
+            if llm_spec.model_id and q in llm_spec.model_id:
+                quant_suffix = q
+                break
+
+    # some model name includes ".", e.g. qwen1.5-chat
+    # if the model does not require trust_remote_code, it's OK
+    # because no need to import modeling_xxx.py from the path
+    # but when the model need to trust_remote_code,
+    # e.g. internlm2.5-chat, the import will fail,
+    # but before the model may have been downloaded,
+    # thus we check it first, if exist, return it,
+    # otherwise, we replace the "." with "_" in model name
+    old_cache_dir_name = (
         f"{llm_family.model_name}-{llm_spec.model_format}"
         f"-{llm_spec.model_size_in_billions}b"
     )
     if quant_suffix:
-
-
-
-
-
+        old_cache_dir_name += f"-{quant_suffix}"
+    old_cache_dir = os.path.realpath(
+        os.path.join(XINFERENCE_CACHE_DIR, old_cache_dir_name)
+    )
+    if os.path.exists(old_cache_dir):
+        return old_cache_dir
+    else:
+        cache_dir_name = (
+            f"{llm_family.model_name.replace('.', '_')}-{llm_spec.model_format}"
+            f"-{llm_spec.model_size_in_billions}b"
+        )
+        if quant_suffix:
+            cache_dir_name += f"-{quant_suffix}"
+        cache_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name))
+        if create_if_not_exist and not os.path.exists(cache_dir):
+            os.makedirs(cache_dir, exist_ok=True)
+        return cache_dir


 def _get_meta_path(
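Aside (not part of the diff): the hunk above makes the cache directory quantization-aware and falls back to a "."-free directory name for models such as internlm2.5-chat, whose dotted names break trust_remote_code imports. A minimal, hypothetical sketch of the resulting naming, with made-up values for XINFERENCE_CACHE_DIR and the model fields:

import os

XINFERENCE_CACHE_DIR = "/tmp/xinference/cache"  # assumed location, for illustration only

model_name = "internlm2.5-chat"  # note the "." in the name
model_format = "pytorch"
size_in_billions = 7
quant_suffix = ""  # stays empty unless a quantization is baked into the model id

# Old-style name keeps the "."; it is only reused if that directory already exists.
old_name = f"{model_name}-{model_format}-{size_in_billions}b"
# New-style name replaces "." with "_" so modeling_xxx.py can be imported from it.
new_name = f"{model_name.replace('.', '_')}-{model_format}-{size_in_billions}b"
if quant_suffix:
    old_name += f"-{quant_suffix}"
    new_name += f"-{quant_suffix}"

old_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, old_name))
new_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, new_name))
print(old_dir if os.path.exists(old_dir) else new_dir)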
@@ -900,6 +924,7 @@ def _check_revision(
     llm_spec: "LLMSpecV1",
     builtin: list,
     meta_path: str,
+    quantization: Optional[str] = None,
 ) -> bool:
     for family in builtin:
         if llm_family.model_name == family.model_name:
@@ -908,59 +933,63 @@ def _check_revision(
             if (
                 spec.model_format == "pytorch"
                 and spec.model_size_in_billions == llm_spec.model_size_in_billions
+                and (quantization is None or quantization in spec.quantizations)
             ):
                 return valid_model_revision(meta_path, spec.model_revision)
     return False


 def get_cache_status(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
+    llm_family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: Optional[str] = None
 ) -> Union[bool, List[bool]]:
     """
-
-
+    Checks if a model's cache status is available based on the model format and quantization.
+    Supports different directories and model formats.
     """
-    cache_dir = _get_cache_dir(llm_family, llm_spec, create_if_not_exist=False)
-    # check revision for pytorch model
-    if llm_spec.model_format == "pytorch":
-        hf_meta_path = _get_meta_path(cache_dir, "pytorch", "huggingface", "none")
-        ms_meta_path = _get_meta_path(cache_dir, "pytorch", "modelscope", "none")
-        revisions = [
-            _check_revision(llm_family, llm_spec, BUILTIN_LLM_FAMILIES, hf_meta_path),
-            _check_revision(
-                llm_family, llm_spec, BUILTIN_MODELSCOPE_LLM_FAMILIES, ms_meta_path
-            ),
-        ]
-        return any(revisions)
-    # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
-        ret = []
-        for q in llm_spec.quantizations:
-            assert q is not None
-            hf_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "huggingface", q
-            )
-            ms_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "modelscope", q
-            )
-            results = [os.path.exists(hf_meta_path), os.path.exists(ms_meta_path)]
-            ret.append(any(results))
-        return ret
-    else:
-        raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-

-def
-
+    def check_file_status(meta_path: str) -> bool:
+        return os.path.exists(meta_path)

+    def check_revision_status(
+        meta_path: str, families: list, quantization: Optional[str] = None
+    ) -> bool:
+        return _check_revision(llm_family, llm_spec, families, meta_path, quantization)

-def
-
-
-
+    def handle_quantization(q: Union[str, None]) -> bool:
+        specific_cache_dir = _get_cache_dir(
+            llm_family, llm_spec, q, create_if_not_exist=False
+        )
+        meta_paths = {
+            "huggingface": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "huggingface", q
+            ),
+            "modelscope": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "modelscope", q
+            ),
+        }
+        if llm_spec.model_format == "pytorch":
+            return check_revision_status(
+                meta_paths["huggingface"], BUILTIN_LLM_FAMILIES, q
+            ) or check_revision_status(
+                meta_paths["modelscope"], BUILTIN_MODELSCOPE_LLM_FAMILIES, q
+            )
+        else:
+            return check_file_status(meta_paths["huggingface"]) or check_file_status(
+                meta_paths["modelscope"]
+            )

-
+    if llm_spec.model_id and "{" in llm_spec.model_id:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if quantization is None
+            else handle_quantization(quantization)
+        )
+    else:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if llm_spec.model_format != "pytorch"
+            else handle_quantization(None)
+        )


 def get_user_defined_llm_families():
@@ -1006,6 +1035,7 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
+    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.
@@ -1029,7 +1059,22 @@
             spec.model_id = spec.model_id.format(quantization=q)
         return spec

-
+    # priority: download_hub > download_from_modelscope() and download_from_csghub()
+    if download_hub == "modelscope":
+        all_families = (
+            BUILTIN_MODELSCOPE_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "csghub":
+        all_families = (
+            BUILTIN_CSGHUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "huggingface":
+        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    elif download_from_modelscope():
         all_families = (
             BUILTIN_MODELSCOPE_LLM_FAMILIES
             + BUILTIN_LLM_FAMILIES
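Aside (not part of the diff): in the hunk above, an explicit download_hub argument now takes priority over the download_from_modelscope()/download_from_csghub() environment checks when choosing which builtin family registry to search first. A standalone, hypothetical sketch of that selection order, with plain lists standing in for the real registries:

from typing import List, Optional

# Hypothetical stand-ins for the registries referenced in the diff.
BUILTIN_LLM_FAMILIES: List[str] = ["huggingface families"]
BUILTIN_MODELSCOPE_LLM_FAMILIES: List[str] = ["modelscope families"]
BUILTIN_CSGHUB_LLM_FAMILIES: List[str] = ["csghub families"]
user_defined_llm_families: List[str] = []


def pick_families(download_hub: Optional[str], modelscope_env: bool) -> List[str]:
    # The explicit hub choice wins; only then does the environment check apply,
    # mirroring the branch order added to match_llm(). The final fallback to the
    # Hugging Face registry is assumed here, not shown in the hunk.
    if download_hub == "modelscope":
        return BUILTIN_MODELSCOPE_LLM_FAMILIES + BUILTIN_LLM_FAMILIES + user_defined_llm_families
    if download_hub == "csghub":
        return BUILTIN_CSGHUB_LLM_FAMILIES + BUILTIN_LLM_FAMILIES + user_defined_llm_families
    if download_hub == "huggingface":
        return BUILTIN_LLM_FAMILIES + user_defined_llm_families
    if modelscope_env:
        return BUILTIN_MODELSCOPE_LLM_FAMILIES + BUILTIN_LLM_FAMILIES + user_defined_llm_families
    return BUILTIN_LLM_FAMILIES + user_defined_llm_families


print(pick_families("csghub", modelscope_env=True))  # the csghub registry is searched first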
xinference/model/llm/llm_family_modelscope.json CHANGED

@@ -304,21 +304,6 @@
     ],
     "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_hub": "modelscope",
-        "model_id": "Xorbits/chatglm2-6B-GGML",
-        "model_revision": "v1.0.0",
-        "model_file_name_template": "chatglm2-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -392,17 +377,6 @@
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0"
-        ],
-        "model_hub": "modelscope",
-        "model_id": "Xorbits/chatglm3-ggml",
-        "model_revision": "v1.0.0",
-        "model_file_name_template": "chatglm3-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -547,6 +521,33 @@
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/glm-4-9b-chat",
         "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat.{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/glm-4-9b-chat-GGUF",
+        "model_revision": "master"
       }
     ],
     "prompt_style": {
@@ -593,6 +594,33 @@
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/glm-4-9b-chat-1m",
         "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat-1m.{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/glm-4-9b-chat-1m-GGUF",
+        "model_revision": "master"
       }
     ],
     "prompt_style": {
@@ -660,6 +688,66 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "codegeex4",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "the open-source version of the latest CodeGeeX4 model series",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "ZhipuAI/codegeex4-all-9b",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "IQ2_M",
+          "IQ3_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K_L",
+          "Q8_0"
+        ],
+        "model_file_name_template": "codegeex4-all-9b-{quantization}.gguf",
+        "model_id": "ZhipuAI/codegeex4-all-9b-GGUF",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -900,6 +988,88 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "internlm2.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "InternLM2.5 series of the InternLM model.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "stop_token_ids": [
+        2,
+        92542
+      ],
+      "stop": [
+        "</s>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 262144,
+    "model_name": "internlm2.5-chat-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "InternLM2.5 series of the InternLM model supports 1M long-context",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat-1m",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "stop_token_ids": [
+        2,
+        92542
+      ],
+      "stop": [
+        "</s>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 100000,
@@ -3771,6 +3941,29 @@
         ],
         "model_id": "AI-ModelScope/gemma-2-27b-it",
         "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "f32"
+        ],
+        "model_id": "LLM-Research/gemma-2-9b-it-GGUF",
+        "model_file_name_template": "gemma-2-9b-it-{quantization}.gguf",
+        "model_hub": "modelscope"
       }
     ],
     "prompt_style": {
@@ -4115,7 +4308,7 @@
       "zh"
     ],
     "model_ability": [
-      "
+      "chat"
     ],
     "model_description": "Aquila2-chat series models are the chat models",
     "model_specs": [
xinference/model/llm/pytorch/core.py CHANGED

@@ -34,9 +34,6 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CreateCompletionTorch,
-    Embedding,
-    EmbeddingData,
-    EmbeddingUsage,
     LoRA,
     PytorchGenerateConfig,
     PytorchModelConfig,
@@ -673,83 +670,6 @@ class PytorchModel(LLM):
         )
         self.handle_batch_inference_results(req_list)

-    def create_embedding(self, input: Union[str, List[str]]) -> Embedding:
-        try:
-            import torch
-            import torch.nn.functional as F
-        except ImportError as e:
-            raise ImportError(
-                "Could not import torch. Please install it with `pip install torch`."
-            ) from e
-
-        if isinstance(input, str):
-            inputs = [input]
-        else:
-            inputs = input
-
-        tokenizer = self._tokenizer
-        tokenizer.pad_token = tokenizer.eos_token
-        is_llama = "llama" in str(type(self._model))  # llama supports batch inference
-        is_chatglm = "chatglm" in str(type(self._model))
-        if is_llama:
-            encoding = tokenizer.batch_encode_plus(
-                inputs, padding=True, return_tensors="pt"
-            )
-            input_ids = encoding["input_ids"].to(self._device)
-            attention_mask = encoding["attention_mask"].to(self._device)
-            model_output = self._model(
-                input_ids, attention_mask, output_hidden_states=True
-            )
-            data = model_output.hidden_states[-1]
-            mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
-            masked_embeddings = data * mask
-            sum_embeddings = torch.sum(masked_embeddings, dim=1)
-            seq_length = torch.sum(mask, dim=1)
-            embedding = sum_embeddings / seq_length
-            normalized_embeddings = F.normalize(embedding, p=2, dim=1)
-            normalized_embeddings = normalized_embeddings.tolist()
-            token_num = torch.sum(attention_mask).item()
-
-            embedding_list = []
-            for index, data in enumerate(normalized_embeddings):
-                embedding_list.append(
-                    EmbeddingData(index=index, object="embedding", embedding=data)
-                )
-
-            usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-
-            ret = Embedding(
-                object="list",
-                model=self.model_uid,
-                data=embedding_list,
-                usage=usage,
-            )
-
-        else:
-            embedding = []
-            token_num = 0
-            for index, text in enumerate(inputs):
-                input_ids = tokenizer.encode(text, return_tensors="pt").to(self._device)
-                model_output = self._model(input_ids, output_hidden_states=True)
-                if is_chatglm:
-                    data = (model_output.hidden_states[-1].transpose(0, 1))[0]
-                else:
-                    data = model_output.hidden_states[-1][0]
-                data = F.normalize(torch.mean(data, dim=0), p=2, dim=0)
-                data = data.tolist()
-
-                embedding.append(
-                    EmbeddingData(index=index, object="embedding", embedding=data)
-                )
-                token_num += len(input_ids[0])
-
-            usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-            ret = Embedding(
-                object="list", model=self.model_uid, data=embedding, usage=usage
-            )
-
-        return ret
-

 class PytorchChatModel(PytorchModel, ChatModelMixin):
     def __init__(
xinference/model/llm/sglang/core.py CHANGED

@@ -269,8 +269,13 @@ class SGLANGModel(LLM):
         )
         stream = sanitized_generate_config.pop("stream")
         stream_options = sanitized_generate_config.pop("stream_options")
-
-
+
+        include_usage = (
+            stream_options.pop("include_usage")
+            if isinstance(stream_options, dict)
+            else False
+        )
+

         request_id = str(uuid.uuid1())
         state = pipeline.run(
             question=prompt,
xinference/model/llm/utils.py CHANGED

@@ -779,8 +779,10 @@ Begin!"""
 def get_file_location(
     llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str
 ) -> Tuple[str, bool]:
-    cache_dir = _get_cache_dir(
-
+    cache_dir = _get_cache_dir(
+        llm_family, spec, quantization, create_if_not_exist=False
+    )
+    cache_status = get_cache_status(llm_family, spec, quantization)
     if isinstance(cache_status, list):
         is_cached = None
         for q, cs in zip(spec.quantizations, cache_status):
xinference/model/llm/vllm/core.py CHANGED

@@ -112,6 +112,8 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm-chat-8k",
     "internlm-chat-20b",
     "internlm2-chat",
+    "internlm2.5-chat",
+    "internlm2.5-chat-1m",
     "qwen-chat",
     "Yi-chat",
     "Yi-1.5-chat",
@@ -127,6 +129,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "chatglm3-128k",
     "glm4-chat",
     "glm4-chat-1m",
+    "codegeex4",
     "deepseek-chat",
     "deepseek-coder-instruct",
 ]