xinference 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release has been flagged as potentially problematic by the registry.
- xinference/_version.py +3 -3
- xinference/client/restful/async_restful_client.py +8 -13
- xinference/client/restful/restful_client.py +6 -2
- xinference/core/chat_interface.py +6 -4
- xinference/core/media_interface.py +5 -0
- xinference/core/model.py +1 -5
- xinference/core/supervisor.py +117 -68
- xinference/core/worker.py +49 -37
- xinference/deploy/test/test_cmdline.py +2 -6
- xinference/model/audio/__init__.py +26 -23
- xinference/model/audio/chattts.py +3 -2
- xinference/model/audio/core.py +49 -98
- xinference/model/audio/cosyvoice.py +3 -2
- xinference/model/audio/custom.py +28 -73
- xinference/model/audio/f5tts.py +3 -2
- xinference/model/audio/f5tts_mlx.py +3 -2
- xinference/model/audio/fish_speech.py +3 -2
- xinference/model/audio/funasr.py +17 -4
- xinference/model/audio/kokoro.py +3 -2
- xinference/model/audio/megatts.py +3 -2
- xinference/model/audio/melotts.py +3 -2
- xinference/model/audio/model_spec.json +572 -171
- xinference/model/audio/utils.py +0 -6
- xinference/model/audio/whisper.py +3 -2
- xinference/model/audio/whisper_mlx.py +3 -2
- xinference/model/cache_manager.py +141 -0
- xinference/model/core.py +6 -49
- xinference/model/custom.py +174 -0
- xinference/model/embedding/__init__.py +67 -56
- xinference/model/embedding/cache_manager.py +35 -0
- xinference/model/embedding/core.py +104 -84
- xinference/model/embedding/custom.py +55 -78
- xinference/model/embedding/embed_family.py +80 -31
- xinference/model/embedding/flag/core.py +21 -5
- xinference/model/embedding/llama_cpp/__init__.py +0 -0
- xinference/model/embedding/llama_cpp/core.py +234 -0
- xinference/model/embedding/model_spec.json +968 -103
- xinference/model/embedding/sentence_transformers/core.py +30 -20
- xinference/model/embedding/vllm/core.py +11 -5
- xinference/model/flexible/__init__.py +8 -2
- xinference/model/flexible/core.py +26 -119
- xinference/model/flexible/custom.py +69 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -0
- xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
- xinference/model/flexible/launchers/transformers_launcher.py +15 -3
- xinference/model/flexible/launchers/yolo_launcher.py +5 -1
- xinference/model/image/__init__.py +20 -20
- xinference/model/image/cache_manager.py +62 -0
- xinference/model/image/core.py +70 -182
- xinference/model/image/custom.py +28 -72
- xinference/model/image/model_spec.json +402 -119
- xinference/model/image/ocr/got_ocr2.py +3 -2
- xinference/model/image/stable_diffusion/core.py +22 -7
- xinference/model/image/stable_diffusion/mlx.py +6 -6
- xinference/model/image/utils.py +2 -2
- xinference/model/llm/__init__.py +71 -94
- xinference/model/llm/cache_manager.py +292 -0
- xinference/model/llm/core.py +37 -111
- xinference/model/llm/custom.py +88 -0
- xinference/model/llm/llama_cpp/core.py +5 -7
- xinference/model/llm/llm_family.json +16260 -8151
- xinference/model/llm/llm_family.py +138 -839
- xinference/model/llm/lmdeploy/core.py +5 -7
- xinference/model/llm/memory.py +3 -4
- xinference/model/llm/mlx/core.py +6 -8
- xinference/model/llm/reasoning_parser.py +3 -1
- xinference/model/llm/sglang/core.py +32 -14
- xinference/model/llm/transformers/chatglm.py +3 -7
- xinference/model/llm/transformers/core.py +49 -27
- xinference/model/llm/transformers/deepseek_v2.py +2 -2
- xinference/model/llm/transformers/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
- xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
- xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
- xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
- xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
- xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
- xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
- xinference/model/llm/transformers/opt.py +3 -7
- xinference/model/llm/utils.py +34 -49
- xinference/model/llm/vllm/core.py +77 -27
- xinference/model/llm/vllm/xavier/engine.py +5 -3
- xinference/model/llm/vllm/xavier/scheduler.py +10 -6
- xinference/model/llm/vllm/xavier/transfer.py +1 -1
- xinference/model/rerank/__init__.py +26 -25
- xinference/model/rerank/core.py +47 -87
- xinference/model/rerank/custom.py +25 -71
- xinference/model/rerank/model_spec.json +158 -33
- xinference/model/rerank/utils.py +2 -2
- xinference/model/utils.py +115 -54
- xinference/model/video/__init__.py +13 -17
- xinference/model/video/core.py +44 -102
- xinference/model/video/diffusers.py +4 -3
- xinference/model/video/model_spec.json +90 -21
- xinference/types.py +5 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
- xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
- xinference/web/ui/src/locales/en.json +0 -1
- xinference/web/ui/src/locales/ja.json +0 -1
- xinference/web/ui/src/locales/ko.json +0 -1
- xinference/web/ui/src/locales/zh.json +0 -1
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
- xinference/model/audio/model_spec_modelscope.json +0 -231
- xinference/model/embedding/model_spec_modelscope.json +0 -293
- xinference/model/embedding/utils.py +0 -18
- xinference/model/image/model_spec_modelscope.json +0 -375
- xinference/model/llm/llama_cpp/memory.py +0 -457
- xinference/model/llm/llm_family_csghub.json +0 -56
- xinference/model/llm/llm_family_modelscope.json +0 -8700
- xinference/model/llm/llm_family_openmind_hub.json +0 -1019
- xinference/model/rerank/model_spec_modelscope.json +0 -85
- xinference/model/video/model_spec_modelscope.json +0 -184
- xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
- /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/embedding/__init__.py
@@ -13,57 +13,99 @@
 # limitations under the License.

 import logging
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Type, Union

 if TYPE_CHECKING:
-    from .core import EmbeddingModel,
+    from .core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1

 FLAG_EMBEDDER_CLASSES: List[Type["EmbeddingModel"]] = []
 SENTENCE_TRANSFORMER_CLASSES: List[Type["EmbeddingModel"]] = []
 VLLM_CLASSES: List[Type["EmbeddingModel"]] = []
+LLAMA_CPP_CLASSES: List[Type["EmbeddingModel"]] = []

-BUILTIN_EMBEDDING_MODELS: Dict[str,
-MODELSCOPE_EMBEDDING_MODELS: Dict[str, Any] = {}
+BUILTIN_EMBEDDING_MODELS: Dict[str, "EmbeddingModelFamilyV2"] = {}

 logger = logging.getLogger(__name__)


-# Desc: this file used to manage embedding models information.
 def match_embedding(
     model_name: str,
+    model_format: Optional[str] = None,
+    quantization: Optional[str] = None,
     download_hub: Optional[
         Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
     ] = None,
-) -> "
+) -> "EmbeddingModelFamilyV2":
     from ..utils import download_from_modelscope
-
-    # The model info has benn init by __init__.py with model_spec.json file
     from .custom import get_user_defined_embeddings

-
-
-
-
-
-    if download_hub == "modelscope" and model_name in MODELSCOPE_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in ModelScope.")
-        return MODELSCOPE_EMBEDDING_MODELS[model_name]
-    elif download_hub == "huggingface" and model_name in BUILTIN_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in Huggingface.")
-        return BUILTIN_EMBEDDING_MODELS[model_name]
-    elif download_from_modelscope() and model_name in MODELSCOPE_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in ModelScope.")
-        return MODELSCOPE_EMBEDDING_MODELS[model_name]
-    elif model_name in BUILTIN_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in Huggingface.")
-        return BUILTIN_EMBEDDING_MODELS[model_name]
+    target_family = None
+
+    if model_name in BUILTIN_EMBEDDING_MODELS:
+        target_family = BUILTIN_EMBEDDING_MODELS[model_name]
     else:
+        for model_family in get_user_defined_embeddings():
+            if model_name == model_family.model_name:
+                target_family = model_family
+                break
+
+    if target_family is None:
         raise ValueError(
-            f"Embedding model {model_name} not found, available"
-            f"
-            f"ModelScope: {MODELSCOPE_EMBEDDING_MODELS.keys()}"
+            f"Embedding model {model_name} not found, available "
+            f"models: {BUILTIN_EMBEDDING_MODELS.keys()}"
         )

+    if download_hub == "modelscope" or download_from_modelscope():
+        specs = [
+            x for x in target_family.model_specs if x.model_hub == "modelscope"
+        ] + [x for x in target_family.model_specs if x.model_hub == "huggingface"]
+    else:
+        specs = [x for x in target_family.model_specs if x.model_hub == "huggingface"]
+
+    def _match_quantization(q: Union[str, None], _quantization: str):
+        # Currently, the quantization name could include both uppercase and lowercase letters,
+        # so it is necessary to ensure that the case sensitivity does not
+        # affect the matching results.
+        if q is None:
+            return None
+        return _quantization if q.lower() == _quantization.lower() else None
+
+    def _apply_format_to_model_id(
+        _spec: "EmbeddingSpecV1", q: str
+    ) -> "EmbeddingSpecV1":
+        # Different quantized versions of some models use different model ids,
+        # Here we check the `{}` in the model id to format the id.
+        if _spec.model_id and "{" in _spec.model_id:
+            _spec.model_id = _spec.model_id.format(quantization=q)
+        return _spec
+
+    for spec in specs:
+        matched_quantization = _match_quantization(quantization, spec.quantization)
+        if (
+            model_format
+            and model_format != spec.model_format
+            or quantization
+            and matched_quantization is None
+        ):
+            continue
+        # Copy spec to avoid _apply_format_to_model_id modify the original spec.
+        spec = spec.copy()
+        _family = target_family.copy()
+        if quantization:
+            _family.model_specs = [
+                _apply_format_to_model_id(spec, matched_quantization)
+            ]
+            return _family
+        else:
+            # TODO: If user does not specify quantization, just use the first one
+            _q = "none" if spec.model_format == "pytorch" else spec.quantization
+            _family.model_specs = [_apply_format_to_model_id(spec, _q)]
+            return _family
+
+    raise ValueError(
+        f"Embedding model {model_name} with format {model_format} and quantization {quantization} not found."
+    )
+

 # { embedding model name -> { engine name -> engine params } }
 EMBEDDING_ENGINES: Dict[str, Dict[str, List[Dict[str, Type["EmbeddingModel"]]]]] = {}
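Reading the hunk above: match_embedding now resolves a single family and narrows family.model_specs down to the one spec matching the requested format and quantization, instead of returning per-hub dicts. A minimal sketch of how the new signature might be exercised; the model name and quantization label are placeholders, not values taken from this diff:

from xinference.model.embedding import match_embedding

# Placeholder family name and quantization label; any built-in embedding
# family and one of its quantizations would do here.
family = match_embedding(
    "bge-large-zh-v1.5",
    model_format="ggufv2",
    quantization="Q4_K_M",
)
spec = family.model_specs[0]  # narrowed to the single matching spec
print(spec.model_hub, spec.model_format, spec.model_id)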
@@ -71,8 +113,10 @@ SUPPORTED_ENGINES: Dict[str, List[Type["EmbeddingModel"]]] = {}


 def check_engine_by_model_name_and_engine(
-    model_name: str,
     model_engine: str,
+    model_name: str,
+    model_format: Optional[str],
+    quantization: Optional[str],
 ) -> Type["EmbeddingModel"]:
     def get_model_engine_from_spell(engine_str: str) -> str:
         for engine in EMBEDDING_ENGINES[model_name].keys():
@@ -87,6 +131,11 @@ def check_engine_by_model_name_and_engine(
         raise ValueError(f"Model {model_name} cannot be run on engine {model_engine}.")
     match_params = EMBEDDING_ENGINES[model_name][model_engine]
     for param in match_params:
-        if model_name
-
+        if model_name != param["model_name"]:
+            continue
+        if (model_format and model_format != param["model_format"]) or (
+            quantization and quantization != param["quantization"]
+        ):
+            continue
+        return param["embedding_class"]
     raise ValueError(f"Model {model_name} cannot be run on engine {model_engine}.")
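Engine dispatch for embeddings now filters on format and quantization as well. A sketch of calling the updated helper; the engine key and model name are illustrative assumptions, and passing None leaves that dimension unfiltered, per the loop above:

from xinference.model.embedding import check_engine_by_model_name_and_engine

embedding_cls = check_engine_by_model_name_and_engine(
    "llama.cpp",          # model_engine (illustrative key)
    "bge-large-zh-v1.5",  # model_name (placeholder)
    "ggufv2",             # model_format; None would skip this filter
    None,                 # quantization; None matches any
)
print(embedding_cls.__name__)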
xinference/model/embedding/flag/core.py
@@ -30,7 +30,7 @@ except ImportError:

 from ....device_utils import get_available_device
 from ....types import Embedding, EmbeddingData, EmbeddingUsage
-from ..core import EmbeddingModel,
+from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1

 FLAG_EMBEDDER_MODEL_LIST = support_native_bge_model_list() if flag_installed else []
 logger = logging.getLogger(__name__)
@@ -41,12 +41,20 @@ class FlagEmbeddingModel(EmbeddingModel):
         self,
         model_uid: str,
         model_path: str,
-
+        model_family: EmbeddingModelFamilyV2,
+        quantization: Optional[str] = None,
         device: Optional[str] = None,
         return_sparse: bool = False,
         **kwargs,
     ):
-        super().__init__(
+        super().__init__(
+            model_uid,
+            model_path,
+            model_family,
+            quantization,
+            device,
+            **kwargs,
+        )
         self._return_sparse = return_sparse

     def load(self):
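For context on the constructor change: engine backends now forward a model family and a quantization to the base class instead of a bare spec. A hypothetical subclass showing the positional order FlagEmbeddingModel uses above (every name other than EmbeddingModel is made up for illustration):

from typing import Optional

from xinference.model.embedding.core import EmbeddingModel

class MyEngineEmbeddingModel(EmbeddingModel):
    def __init__(
        self,
        model_uid: str,
        model_path: str,
        model_family,  # an EmbeddingModelFamilyV2
        quantization: Optional[str] = None,
        device: Optional[str] = None,
        **kwargs,
    ):
        # Forward positionally in the same order FlagEmbeddingModel does.
        super().__init__(
            model_uid, model_path, model_family, quantization, device, **kwargs
        )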
@@ -276,7 +284,15 @@ class FlagEmbeddingModel(EmbeddingModel):
         return importlib.util.find_spec("FlagEmbedding") is not None

     @classmethod
-    def match_json(
-
+    def match_json(
+        cls,
+        model_family: EmbeddingModelFamilyV2,
+        model_spec: EmbeddingSpecV1,
+        quantization: str,
+    ) -> bool:
+        if (
+            model_spec.model_format in ["pytorch"]
+            and model_family.model_name in FLAG_EMBEDDER_MODEL_LIST
+        ):
             return True
         return False
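Taken together with check_lib, match_json lets each engine declare which (family, spec, quantization) triples it can serve. A simplified illustration of how the per-engine class registries could be scanned; the actual selection in this release goes through embed_family.py and check_engine_by_model_name_and_engine:

from xinference.model.embedding import (
    FLAG_EMBEDDER_CLASSES,
    LLAMA_CPP_CLASSES,
    SENTENCE_TRANSFORMER_CLASSES,
    VLLM_CLASSES,
)

def pick_embedding_class(family, spec, quantization):
    # First registered engine class whose library imports and whose
    # match_json accepts the spec wins.
    for cls in (
        FLAG_EMBEDDER_CLASSES
        + SENTENCE_TRANSFORMER_CLASSES
        + VLLM_CLASSES
        + LLAMA_CPP_CLASSES
    ):
        if cls.check_lib() and cls.match_json(family, spec, quantization):
            return cls
    raise ValueError("no installed engine matches this embedding spec")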
xinference/model/embedding/llama_cpp/__init__.py
File without changes
xinference/model/embedding/llama_cpp/core.py
@@ -0,0 +1,234 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import concurrent.futures
+import importlib.util
+import logging
+import os
+import platform
+import pprint
+import queue
+import sys
+from typing import List, Optional, Union
+
+import orjson
+
+from ....types import Embedding
+from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
+
+logger = logging.getLogger(__name__)
+
+
+class _Done:
+    pass
+
+
+class _Error:
+    def __init__(self, msg):
+        self.msg = msg
+
+
+class XllamaCppEmbeddingModel(EmbeddingModel):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self._llm = None
+        self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
+        llamacpp_model_config = self._kwargs.get("llamacpp_model_config")
+        self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config)
+
+    def _sanitize_model_config(self, llamacpp_model_config: Optional[dict]) -> dict:
+        if llamacpp_model_config is None:
+            llamacpp_model_config = {}
+
+        llamacpp_model_config.setdefault("embedding", True)
+        llamacpp_model_config.setdefault("use_mmap", False)
+        llamacpp_model_config.setdefault("use_mlock", True)
+
+        if self._is_darwin_and_apple_silicon():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+        elif self._is_linux():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+
+        return llamacpp_model_config
+
+    def _is_darwin_and_apple_silicon(self):
+        return sys.platform == "darwin" and platform.processor() == "arm"
+
+    def _is_linux(self):
+        return sys.platform.startswith("linux")
+
+    def load(self):
+        try:
+            from xllamacpp import (
+                CommonParams,
+                Server,
+                estimate_gpu_layers,
+                get_device_info,
+                ggml_backend_dev_type,
+                llama_pooling_type,
+            )
+        except ImportError:
+            error_message = "Failed to import module 'xllamacpp'"
+            installation_guide = ["Please make sure 'xllamacpp' is installed. "]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        # handle legacy cache.
+        if (
+            self._model_spec.model_file_name_split_template
+            and self._quantization in self._model_spec.quantization_parts
+        ):
+            part = self._model_spec.quantization_parts[self._quantization]
+            model_path = os.path.join(
+                self._model_path,
+                self._model_spec.model_file_name_split_template.format(
+                    quantization=self._quantization, part=part[0]
+                ),
+            )
+        else:
+            model_path = os.path.join(
+                self._model_path,
+                self._model_spec.model_file_name_template.format(
+                    quantization=self._quantization
+                ),
+            )
+
+        try:
+            params = CommonParams()
+            params.embedding = True
+            # Compatible with xllamacpp changes
+            try:
+                params.model = model_path
+            except Exception:
+                params.model.path = model_path
+
+            # This is the default value, could be overwritten by _llamacpp_model_config
+            params.n_parallel = min(8, os.cpu_count() or 1)
+            params.pooling_type = llama_pooling_type.LLAMA_POOLING_TYPE_LAST
+            for k, v in self._llamacpp_model_config.items():
+                try:
+                    if "." in k:
+                        parts = k.split(".")
+                        sub_param = params
+                        for p in parts[:-1]:
+                            sub_param = getattr(sub_param, p)
+                        setattr(sub_param, parts[-1], v)
+                    else:
+                        setattr(params, k, v)
+                except Exception as e:
+                    logger.error("Failed to set the param %s = %s, error: %s", k, v, e)
+            n_threads = self._llamacpp_model_config.get("n_threads", os.cpu_count())
+            params.cpuparams.n_threads = n_threads
+            params.cpuparams_batch.n_threads = n_threads
+            if params.n_gpu_layers == -1:
+                # Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
+                # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+                params.n_gpu_layers = 0x7FFFFFFF
+                try:
+                    device_info = get_device_info()
+                    gpus = [
+                        info
+                        for info in device_info
+                        if info["type"]
+                        == ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_GPU
+                    ]
+                    if gpus:
+                        logger.info(
+                            "Try to estimate num gpu layers, n_ctx: %s, n_batch: %s, n_parallel: %s, gpus:\n%s",
+                            params.n_ctx,
+                            params.n_batch,
+                            params.n_parallel,
+                            pprint.pformat(gpus),
+                        )
+                        estimate = estimate_gpu_layers(
+                            gpus=gpus,
+                            model_path=model_path,
+                            projectors=[],
+                            context_length=params.n_ctx,
+                            batch_size=params.n_batch,
+                            num_parallel=params.n_parallel,
+                            kv_cache_type="",
+                        )
+                        logger.info("Estimate num gpu layers: %s", estimate)
+                        if estimate.tensor_split:
+                            params.tensor_split = estimate.tensor_split
+                        else:
+                            params.n_gpu_layers = estimate.layers
+                except Exception as e:
+                    logger.exception(
+                        "Estimate num gpu layers for llama.cpp backend failed: %s", e
+                    )
+
+            self._llm = Server(params)
+            self._executor = concurrent.futures.ThreadPoolExecutor(
+                max_workers=max(10, n_threads)
+            )
+        except AssertionError:
+            raise RuntimeError(f"Load model {self._model_name} failed")
+
+    def create_embedding(self, sentences: Union[str, List[str]], **kwargs) -> Embedding:
+        if self._llm is None:
+            raise RuntimeError("Model is not loaded.")
+
+        q: queue.Queue = queue.Queue()
+        if isinstance(sentences, str):
+            sentences = [sentences]
+
+        def _handle_embedding():
+            data = {"input": sentences}
+            prompt_json = orjson.dumps(data)
+
+            def _error_callback(err):
+                try:
+                    msg = orjson.loads(err)
+                    q.put(_Error(msg))
+                except Exception as e:
+                    q.put(_Error(str(e)))
+
+            def _ok_callback(ok):
+                try:
+                    res = orjson.loads(ok)
+                    q.put(res)
+                except Exception as e:
+                    q.put(_Error(str(e)))
+
+            try:
+                self._llm.handle_embeddings(prompt_json, _error_callback, _ok_callback)
+            except Exception as ex:
+                q.put(_Error(str(ex)))
+            q.put(_Done)
+
+        assert self._executor
+        self._executor.submit(_handle_embedding)
+
+        r = q.get()
+        if type(r) is _Error:
+            raise Exception(f"Failed to create embedding: {r.msg}")
+        r["model_replica"] = self._model_uid
+        return Embedding(**r)  # type: ignore
+
+    @classmethod
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("xllamacpp") is not None
+
+    @classmethod
+    def match_json(
+        cls,
+        model_family: EmbeddingModelFamilyV2,
+        model_spec: EmbeddingSpecV1,
+        quantization: str,
+    ) -> bool:
+        if model_spec.model_format not in ["ggufv2"]:
+            return False
+        return True