xinference 0.13.0__py3-none-any.whl → 0.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +50 -2
- xinference/client/restful/restful_client.py +49 -2
- xinference/core/model.py +15 -0
- xinference/core/supervisor.py +132 -15
- xinference/core/worker.py +165 -8
- xinference/deploy/cmdline.py +5 -0
- xinference/model/audio/chattts.py +6 -6
- xinference/model/audio/core.py +23 -15
- xinference/model/core.py +12 -3
- xinference/model/embedding/core.py +25 -16
- xinference/model/flexible/__init__.py +40 -0
- xinference/model/flexible/core.py +228 -0
- xinference/model/flexible/launchers/__init__.py +15 -0
- xinference/model/flexible/launchers/transformers_launcher.py +63 -0
- xinference/model/flexible/utils.py +33 -0
- xinference/model/image/core.py +18 -14
- xinference/model/image/custom.py +1 -1
- xinference/model/llm/__init__.py +0 -2
- xinference/model/llm/core.py +3 -2
- xinference/model/llm/ggml/llamacpp.py +1 -10
- xinference/model/llm/llm_family.json +52 -35
- xinference/model/llm/llm_family.py +71 -46
- xinference/model/llm/llm_family_modelscope.json +55 -27
- xinference/model/llm/pytorch/core.py +0 -80
- xinference/model/llm/utils.py +4 -2
- xinference/model/rerank/core.py +24 -25
- xinference/types.py +0 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.0fb6f3ab.js → main.95c1d652.js} +3 -3
- xinference/web/ui/build/static/js/main.95c1d652.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/70fa8c07463a5fe57c68bf92502910105a8f647371836fe8c3a7408246ca7ba0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +1 -0
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/METADATA +7 -11
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/RECORD +45 -54
- xinference/model/llm/ggml/chatglm.py +0 -457
- xinference/thirdparty/ChatTTS/__init__.py +0 -1
- xinference/thirdparty/ChatTTS/core.py +0 -200
- xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/experimental/llm.py +0 -40
- xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/infer/api.py +0 -125
- xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/model/dvae.py +0 -155
- xinference/thirdparty/ChatTTS/model/gpt.py +0 -265
- xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/utils/gpu_utils.py +0 -23
- xinference/thirdparty/ChatTTS/utils/infer_utils.py +0 -141
- xinference/thirdparty/ChatTTS/utils/io_utils.py +0 -14
- xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +0 -1
- /xinference/web/ui/build/static/js/{main.0fb6f3ab.js.LICENSE.txt → main.95c1d652.js.LICENSE.txt} +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/LICENSE +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/WHEEL +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/top_level.txt +0 -0

xinference/model/llm/llm_family.py
CHANGED

@@ -14,7 +14,6 @@

 import logging
 import os
-import platform
 import shutil
 from threading import Lock
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
@@ -541,15 +540,20 @@ def _get_cache_dir_for_model_mem(
 def _get_cache_dir(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
     create_if_not_exist=True,
 ):
     # If the model id contains quantization, then we should give each
     # quantization a dedicated cache dir.
     quant_suffix = ""
-
-
-
-
+    if llm_spec.model_id and "{" in llm_spec.model_id and quantization is not None:
+        quant_suffix = quantization
+    else:
+        for q in llm_spec.quantizations:
+            if llm_spec.model_id and q in llm_spec.model_id:
+                quant_suffix = q
+                break
+
     cache_dir_name = (
         f"{llm_family.model_name}-{llm_spec.model_format}"
         f"-{llm_spec.model_size_in_billions}b"

@@ -900,6 +904,7 @@ def _check_revision(
     llm_spec: "LLMSpecV1",
     builtin: list,
     meta_path: str,
+    quantization: Optional[str] = None,
 ) -> bool:
     for family in builtin:
         if llm_family.model_name == family.model_name:

@@ -908,59 +913,63 @@ def _check_revision(
             if (
                 spec.model_format == "pytorch"
                 and spec.model_size_in_billions == llm_spec.model_size_in_billions
+                and (quantization is None or quantization in spec.quantizations)
             ):
                 return valid_model_revision(meta_path, spec.model_revision)
     return False


 def get_cache_status(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
+    llm_family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: Optional[str] = None
 ) -> Union[bool, List[bool]]:
     """
-
-
+    Checks if a model's cache status is available based on the model format and quantization.
+    Supports different directories and model formats.
     """
-    cache_dir = _get_cache_dir(llm_family, llm_spec, create_if_not_exist=False)
-    # check revision for pytorch model
-    if llm_spec.model_format == "pytorch":
-        hf_meta_path = _get_meta_path(cache_dir, "pytorch", "huggingface", "none")
-        ms_meta_path = _get_meta_path(cache_dir, "pytorch", "modelscope", "none")
-        revisions = [
-            _check_revision(llm_family, llm_spec, BUILTIN_LLM_FAMILIES, hf_meta_path),
-            _check_revision(
-                llm_family, llm_spec, BUILTIN_MODELSCOPE_LLM_FAMILIES, ms_meta_path
-            ),
-        ]
-        return any(revisions)
-    # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
-        ret = []
-        for q in llm_spec.quantizations:
-            assert q is not None
-            hf_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "huggingface", q
-            )
-            ms_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "modelscope", q
-            )
-            results = [os.path.exists(hf_meta_path), os.path.exists(ms_meta_path)]
-            ret.append(any(results))
-        return ret
-    else:
-        raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-

-    def
-
+    def check_file_status(meta_path: str) -> bool:
+        return os.path.exists(meta_path)

-    def
-
-
-
+    def check_revision_status(
+        meta_path: str, families: list, quantization: Optional[str] = None
+    ) -> bool:
+        return _check_revision(llm_family, llm_spec, families, meta_path, quantization)

+    def handle_quantization(q: Union[str, None]) -> bool:
+        specific_cache_dir = _get_cache_dir(
+            llm_family, llm_spec, q, create_if_not_exist=False
+        )
+        meta_paths = {
+            "huggingface": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "huggingface", q
+            ),
+            "modelscope": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "modelscope", q
+            ),
+        }
+        if llm_spec.model_format == "pytorch":
+            return check_revision_status(
+                meta_paths["huggingface"], BUILTIN_LLM_FAMILIES, q
+            ) or check_revision_status(
+                meta_paths["modelscope"], BUILTIN_MODELSCOPE_LLM_FAMILIES, q
+            )
+        else:
+            return check_file_status(meta_paths["huggingface"]) or check_file_status(
+                meta_paths["modelscope"]
+            )

-
+    if llm_spec.model_id and "{" in llm_spec.model_id:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if quantization is None
+            else handle_quantization(quantization)
+        )
+    else:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if llm_spec.model_format != "pytorch"
+            else handle_quantization(None)
+        )


 def get_user_defined_llm_families():
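
The hunks above make the LLM cache layout quantization-aware: _get_cache_dir() and get_cache_status() now accept an optional quantization, so specs whose model_id is templated with {quantization} get one cache directory per quantization. A minimal sketch of how these helpers can be exercised, assuming an already-resolved family/spec pair (the objects themselves are placeholders; only the signatures come from the diff above):

# Hedged sketch: only the call signatures follow this release's diff.
from xinference.model.llm.llm_family import _get_cache_dir, get_cache_status

def show_cache_layout(llm_family, llm_spec):
    # One dedicated cache dir per quantization when model_id contains "{quantization}".
    for q in llm_spec.quantizations:
        path = _get_cache_dir(llm_family, llm_spec, q, create_if_not_exist=False)
        print(q, "->", path)
    # get_cache_status() also takes the quantization now; for non-pytorch formats
    # it may still return one boolean per quantization when none is given.
    print(get_cache_status(llm_family, llm_spec, llm_spec.quantizations[0]))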
@@ -1006,6 +1015,7 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
+    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.

@@ -1029,7 +1039,22 @@ def match_llm(
             spec.model_id = spec.model_id.format(quantization=q)
         return spec

-
+    # priority: download_hub > download_from_modelscope() and download_from_csghub()
+    if download_hub == "modelscope":
+        all_families = (
+            BUILTIN_MODELSCOPE_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "csghub":
+        all_families = (
+            BUILTIN_CSGHUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "huggingface":
+        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    elif download_from_modelscope():
         all_families = (
             BUILTIN_MODELSCOPE_LLM_FAMILIES
             + BUILTIN_LLM_FAMILIES
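
match_llm() gains a download_hub argument that takes priority over the download_from_modelscope()/download_from_csghub() environment checks when choosing which registries to search. A hedged usage sketch; the model name and quantization are illustrative values taken from specs in this release, and only the parameter names come from the diff:

from xinference.model.llm.llm_family import match_llm

# Assumption: the first positional parameter is the model name, as in earlier releases.
result = match_llm(
    "glm-4-9b-chat",
    model_format="ggufv2",
    quantization="Q4_K",
    download_hub="modelscope",  # search the ModelScope registry first
)
if result is not None:
    llm_family, llm_spec, quantization = result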
xinference/model/llm/llm_family_modelscope.json
CHANGED

@@ -304,21 +304,6 @@
         ],
         "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
         "model_specs": [
-            {
-                "model_format": "ggmlv3",
-                "model_size_in_billions": 6,
-                "quantizations": [
-                    "q4_0",
-                    "q4_1",
-                    "q5_0",
-                    "q5_1",
-                    "q8_0"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "Xorbits/chatglm2-6B-GGML",
-                "model_revision": "v1.0.0",
-                "model_file_name_template": "chatglm2-ggml-{quantization}.bin"
-            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 6,

@@ -392,17 +377,6 @@
         ],
         "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
         "model_specs": [
-            {
-                "model_format": "ggmlv3",
-                "model_size_in_billions": 6,
-                "quantizations": [
-                    "q4_0"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "Xorbits/chatglm3-ggml",
-                "model_revision": "v1.0.0",
-                "model_file_name_template": "chatglm3-ggml-{quantization}.bin"
-            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 6,

@@ -547,6 +521,33 @@
                 "model_hub": "modelscope",
                 "model_id": "ZhipuAI/glm-4-9b-chat",
                 "model_revision": "master"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "Q2_K",
+                    "IQ3_XS",
+                    "IQ3_S",
+                    "IQ3_M",
+                    "Q3_K_S",
+                    "Q3_K_L",
+                    "Q3_K",
+                    "IQ4_XS",
+                    "IQ4_NL",
+                    "Q4_K_S",
+                    "Q4_K",
+                    "Q5_K_S",
+                    "Q5_K",
+                    "Q6_K",
+                    "Q8_0",
+                    "BF16",
+                    "FP16"
+                ],
+                "model_file_name_template": "glm-4-9b-chat.{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "LLM-Research/glm-4-9b-chat-GGUF",
+                "model_revision": "master"
             }
         ],
         "prompt_style": {

@@ -593,6 +594,33 @@
                 "model_hub": "modelscope",
                 "model_id": "ZhipuAI/glm-4-9b-chat-1m",
                 "model_revision": "master"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "Q2_K",
+                    "IQ3_XS",
+                    "IQ3_S",
+                    "IQ3_M",
+                    "Q3_K_S",
+                    "Q3_K_L",
+                    "Q3_K",
+                    "IQ4_XS",
+                    "IQ4_NL",
+                    "Q4_K_S",
+                    "Q4_K",
+                    "Q5_K_S",
+                    "Q5_K",
+                    "Q6_K",
+                    "Q8_0",
+                    "BF16",
+                    "FP16"
+                ],
+                "model_file_name_template": "glm-4-9b-chat-1m.{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "LLM-Research/glm-4-9b-chat-1m-GGUF",
+                "model_revision": "master"
             }
         ],
         "prompt_style": {

@@ -4115,7 +4143,7 @@
             "zh"
         ],
         "model_ability": [
-            "
+            "chat"
         ],
         "model_description": "Aquila2-chat series models are the chat models",
         "model_specs": [
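
For reference, the model_file_name_template in the GGUF specs added above expands once per quantization; a small illustration of that resolution, with the template and quantization labels copied from the glm-4-9b-chat spec:

template = "glm-4-9b-chat.{quantization}.gguf"
for quantization in ("Q4_K", "Q8_0", "FP16"):
    print(template.format(quantization=quantization))
# glm-4-9b-chat.Q4_K.gguf
# glm-4-9b-chat.Q8_0.gguf
# glm-4-9b-chat.FP16.gguf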
xinference/model/llm/pytorch/core.py
CHANGED

@@ -34,9 +34,6 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CreateCompletionTorch,
-    Embedding,
-    EmbeddingData,
-    EmbeddingUsage,
     LoRA,
     PytorchGenerateConfig,
     PytorchModelConfig,

@@ -673,83 +670,6 @@ class PytorchModel(LLM):
             )
         self.handle_batch_inference_results(req_list)

-    def create_embedding(self, input: Union[str, List[str]]) -> Embedding:
-        try:
-            import torch
-            import torch.nn.functional as F
-        except ImportError as e:
-            raise ImportError(
-                "Could not import torch. Please install it with `pip install torch`."
-            ) from e
-
-        if isinstance(input, str):
-            inputs = [input]
-        else:
-            inputs = input
-
-        tokenizer = self._tokenizer
-        tokenizer.pad_token = tokenizer.eos_token
-        is_llama = "llama" in str(type(self._model))  # llama supports batch inference
-        is_chatglm = "chatglm" in str(type(self._model))
-        if is_llama:
-            encoding = tokenizer.batch_encode_plus(
-                inputs, padding=True, return_tensors="pt"
-            )
-            input_ids = encoding["input_ids"].to(self._device)
-            attention_mask = encoding["attention_mask"].to(self._device)
-            model_output = self._model(
-                input_ids, attention_mask, output_hidden_states=True
-            )
-            data = model_output.hidden_states[-1]
-            mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
-            masked_embeddings = data * mask
-            sum_embeddings = torch.sum(masked_embeddings, dim=1)
-            seq_length = torch.sum(mask, dim=1)
-            embedding = sum_embeddings / seq_length
-            normalized_embeddings = F.normalize(embedding, p=2, dim=1)
-            normalized_embeddings = normalized_embeddings.tolist()
-            token_num = torch.sum(attention_mask).item()
-
-            embedding_list = []
-            for index, data in enumerate(normalized_embeddings):
-                embedding_list.append(
-                    EmbeddingData(index=index, object="embedding", embedding=data)
-                )
-
-            usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-
-            ret = Embedding(
-                object="list",
-                model=self.model_uid,
-                data=embedding_list,
-                usage=usage,
-            )
-
-        else:
-            embedding = []
-            token_num = 0
-            for index, text in enumerate(inputs):
-                input_ids = tokenizer.encode(text, return_tensors="pt").to(self._device)
-                model_output = self._model(input_ids, output_hidden_states=True)
-                if is_chatglm:
-                    data = (model_output.hidden_states[-1].transpose(0, 1))[0]
-                else:
-                    data = model_output.hidden_states[-1][0]
-                data = F.normalize(torch.mean(data, dim=0), p=2, dim=0)
-                data = data.tolist()
-
-                embedding.append(
-                    EmbeddingData(index=index, object="embedding", embedding=data)
-                )
-                token_num += len(input_ids[0])
-
-            usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-            ret = Embedding(
-                object="list", model=self.model_uid, data=embedding, usage=usage
-            )
-
-        return ret
-

 class PytorchChatModel(PytorchModel, ChatModelMixin):
     def __init__(
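
With PytorchModel.create_embedding() removed, embeddings are expected to come from dedicated embedding models rather than from an LLM handle. A minimal client-side sketch, assuming a running local endpoint and the built-in bge-small-en-v1.5 embedding model (both are assumptions, not part of this diff):

# Hedged sketch, not part of the diff above.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local supervisor endpoint
model_uid = client.launch_model(model_name="bge-small-en-v1.5", model_type="embedding")
embedding_model = client.get_model(model_uid)
print(embedding_model.create_embedding("What is the capital of France?"))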
xinference/model/llm/utils.py
CHANGED

@@ -779,8 +779,10 @@ Begin!"""
 def get_file_location(
     llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str
 ) -> Tuple[str, bool]:
-    cache_dir = _get_cache_dir(
-
+    cache_dir = _get_cache_dir(
+        llm_family, spec, quantization, create_if_not_exist=False
+    )
+    cache_status = get_cache_status(llm_family, spec, quantization)
     if isinstance(cache_status, list):
         is_cached = None
         for q, cs in zip(spec.quantizations, cache_status):
xinference/model/rerank/core.py
CHANGED

@@ -18,7 +18,7 @@ import os
 import uuid
 from collections import defaultdict
 from collections.abc import Sequence
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Literal, Optional, Tuple

 import numpy as np
 import torch

@@ -285,7 +285,12 @@ def cache(model_spec: RerankModelSpec):


 def create_rerank_model_instance(
-    subpool_addr: str,
+    subpool_addr: str,
+    devices: List[str],
+    model_uid: str,
+    model_name: str,
+    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    **kwargs,
 ) -> Tuple[RerankModel, RerankModelDescription]:
     from ..utils import download_from_modelscope
     from . import BUILTIN_RERANK_MODELS, MODELSCOPE_RERANK_MODELS

@@ -298,30 +303,24 @@ def create_rerank_model_instance(
             break

     if model_spec is None:
-        if
-
-
-
-
-
-
-
-
-
-
-        raise ValueError(
-            f"Rerank model {model_name} not found, available"
-            f"model list: {BUILTIN_RERANK_MODELS.keys()}"
-        )
+        if download_hub == "huggingface" and model_name in BUILTIN_RERANK_MODELS:
+            logger.debug(f"Rerank model {model_name} found in Huggingface.")
+            model_spec = BUILTIN_RERANK_MODELS[model_name]
+        elif download_hub == "modelscope" and model_name in MODELSCOPE_RERANK_MODELS:
+            logger.debug(f"Rerank model {model_name} found in ModelScope.")
+            model_spec = MODELSCOPE_RERANK_MODELS[model_name]
+        elif download_from_modelscope() and model_name in MODELSCOPE_RERANK_MODELS:
+            logger.debug(f"Rerank model {model_name} found in ModelScope.")
+            model_spec = MODELSCOPE_RERANK_MODELS[model_name]
+        elif model_name in BUILTIN_RERANK_MODELS:
+            logger.debug(f"Rerank model {model_name} found in Huggingface.")
+            model_spec = BUILTIN_RERANK_MODELS[model_name]
         else:
-
-
-
-
-            f"model list: {BUILTIN_RERANK_MODELS.keys()}"
-            )
+            raise ValueError(
+                f"Rerank model {model_name} not found, available"
+                f"Huggingface: {BUILTIN_RERANK_MODELS.keys()}"
+                f"ModelScope: {MODELSCOPE_RERANK_MODELS.keys()}"
+            )

     model_path = cache(model_spec)
     use_fp16 = kwargs.pop("use_fp16", False)
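
create_rerank_model_instance() now spells out its full signature and accepts a download_hub override that is consulted before the download_from_modelscope() auto-detection. A hedged sketch of a direct call; every value below is a placeholder, only the parameter names follow the diff above:

from xinference.model.rerank.core import create_rerank_model_instance

model, description = create_rerank_model_instance(
    subpool_addr="127.0.0.1:37567",  # hypothetical worker subpool address
    devices=["cuda:0"],
    model_uid="my-reranker",
    model_name="bge-reranker-base",
    download_hub="modelscope",  # pin the registry instead of relying on auto-detection
)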
xinference/types.py
CHANGED

xinference/web/ui/build/asset-manifest.json
CHANGED

@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.4bafd904.css",
-    "main.js": "./static/js/main.0fb6f3ab.js",
+    "main.js": "./static/js/main.95c1d652.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.4bafd904.css.map": "./static/css/main.4bafd904.css.map",
-    "main.0fb6f3ab.js.map": "./static/js/main.0fb6f3ab.js.map"
+    "main.95c1d652.js.map": "./static/js/main.95c1d652.js.map"
   },
   "entrypoints": [
     "static/css/main.4bafd904.css",
-    "static/js/main.0fb6f3ab.js"
+    "static/js/main.95c1d652.js"
   ]
 }

xinference/web/ui/build/index.html
CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.0fb6f3ab.js"></script><link href="./static/css/main.4bafd904.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.95c1d652.js"></script><link href="./static/css/main.4bafd904.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>