xinference 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference has been flagged as possibly problematic.
Files changed (136)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/async_restful_client.py +8 -13
  3. xinference/client/restful/restful_client.py +6 -2
  4. xinference/core/chat_interface.py +6 -4
  5. xinference/core/media_interface.py +5 -0
  6. xinference/core/model.py +1 -5
  7. xinference/core/supervisor.py +117 -68
  8. xinference/core/worker.py +49 -37
  9. xinference/deploy/test/test_cmdline.py +2 -6
  10. xinference/model/audio/__init__.py +26 -23
  11. xinference/model/audio/chattts.py +3 -2
  12. xinference/model/audio/core.py +49 -98
  13. xinference/model/audio/cosyvoice.py +3 -2
  14. xinference/model/audio/custom.py +28 -73
  15. xinference/model/audio/f5tts.py +3 -2
  16. xinference/model/audio/f5tts_mlx.py +3 -2
  17. xinference/model/audio/fish_speech.py +3 -2
  18. xinference/model/audio/funasr.py +17 -4
  19. xinference/model/audio/kokoro.py +3 -2
  20. xinference/model/audio/megatts.py +3 -2
  21. xinference/model/audio/melotts.py +3 -2
  22. xinference/model/audio/model_spec.json +572 -171
  23. xinference/model/audio/utils.py +0 -6
  24. xinference/model/audio/whisper.py +3 -2
  25. xinference/model/audio/whisper_mlx.py +3 -2
  26. xinference/model/cache_manager.py +141 -0
  27. xinference/model/core.py +6 -49
  28. xinference/model/custom.py +174 -0
  29. xinference/model/embedding/__init__.py +67 -56
  30. xinference/model/embedding/cache_manager.py +35 -0
  31. xinference/model/embedding/core.py +104 -84
  32. xinference/model/embedding/custom.py +55 -78
  33. xinference/model/embedding/embed_family.py +80 -31
  34. xinference/model/embedding/flag/core.py +21 -5
  35. xinference/model/embedding/llama_cpp/__init__.py +0 -0
  36. xinference/model/embedding/llama_cpp/core.py +234 -0
  37. xinference/model/embedding/model_spec.json +968 -103
  38. xinference/model/embedding/sentence_transformers/core.py +30 -20
  39. xinference/model/embedding/vllm/core.py +11 -5
  40. xinference/model/flexible/__init__.py +8 -2
  41. xinference/model/flexible/core.py +26 -119
  42. xinference/model/flexible/custom.py +69 -0
  43. xinference/model/flexible/launchers/image_process_launcher.py +1 -0
  44. xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
  45. xinference/model/flexible/launchers/transformers_launcher.py +15 -3
  46. xinference/model/flexible/launchers/yolo_launcher.py +5 -1
  47. xinference/model/image/__init__.py +20 -20
  48. xinference/model/image/cache_manager.py +62 -0
  49. xinference/model/image/core.py +70 -182
  50. xinference/model/image/custom.py +28 -72
  51. xinference/model/image/model_spec.json +402 -119
  52. xinference/model/image/ocr/got_ocr2.py +3 -2
  53. xinference/model/image/stable_diffusion/core.py +22 -7
  54. xinference/model/image/stable_diffusion/mlx.py +6 -6
  55. xinference/model/image/utils.py +2 -2
  56. xinference/model/llm/__init__.py +71 -94
  57. xinference/model/llm/cache_manager.py +292 -0
  58. xinference/model/llm/core.py +37 -111
  59. xinference/model/llm/custom.py +88 -0
  60. xinference/model/llm/llama_cpp/core.py +5 -7
  61. xinference/model/llm/llm_family.json +16260 -8151
  62. xinference/model/llm/llm_family.py +138 -839
  63. xinference/model/llm/lmdeploy/core.py +5 -7
  64. xinference/model/llm/memory.py +3 -4
  65. xinference/model/llm/mlx/core.py +6 -8
  66. xinference/model/llm/reasoning_parser.py +3 -1
  67. xinference/model/llm/sglang/core.py +32 -14
  68. xinference/model/llm/transformers/chatglm.py +3 -7
  69. xinference/model/llm/transformers/core.py +49 -27
  70. xinference/model/llm/transformers/deepseek_v2.py +2 -2
  71. xinference/model/llm/transformers/gemma3.py +2 -2
  72. xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
  73. xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
  74. xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
  75. xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
  76. xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
  77. xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
  78. xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
  79. xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
  80. xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
  81. xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
  82. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  83. xinference/model/llm/transformers/opt.py +3 -7
  84. xinference/model/llm/utils.py +34 -49
  85. xinference/model/llm/vllm/core.py +77 -27
  86. xinference/model/llm/vllm/xavier/engine.py +5 -3
  87. xinference/model/llm/vllm/xavier/scheduler.py +10 -6
  88. xinference/model/llm/vllm/xavier/transfer.py +1 -1
  89. xinference/model/rerank/__init__.py +26 -25
  90. xinference/model/rerank/core.py +47 -87
  91. xinference/model/rerank/custom.py +25 -71
  92. xinference/model/rerank/model_spec.json +158 -33
  93. xinference/model/rerank/utils.py +2 -2
  94. xinference/model/utils.py +115 -54
  95. xinference/model/video/__init__.py +13 -17
  96. xinference/model/video/core.py +44 -102
  97. xinference/model/video/diffusers.py +4 -3
  98. xinference/model/video/model_spec.json +90 -21
  99. xinference/types.py +5 -3
  100. xinference/web/ui/build/asset-manifest.json +3 -3
  101. xinference/web/ui/build/index.html +1 -1
  102. xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
  103. xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
  109. xinference/web/ui/src/locales/en.json +0 -1
  110. xinference/web/ui/src/locales/ja.json +0 -1
  111. xinference/web/ui/src/locales/ko.json +0 -1
  112. xinference/web/ui/src/locales/zh.json +0 -1
  113. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
  114. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
  115. xinference/model/audio/model_spec_modelscope.json +0 -231
  116. xinference/model/embedding/model_spec_modelscope.json +0 -293
  117. xinference/model/embedding/utils.py +0 -18
  118. xinference/model/image/model_spec_modelscope.json +0 -375
  119. xinference/model/llm/llama_cpp/memory.py +0 -457
  120. xinference/model/llm/llm_family_csghub.json +0 -56
  121. xinference/model/llm/llm_family_modelscope.json +0 -8700
  122. xinference/model/llm/llm_family_openmind_hub.json +0 -1019
  123. xinference/model/rerank/model_spec_modelscope.json +0 -85
  124. xinference/model/video/model_spec_modelscope.json +0 -184
  125. xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
  126. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
  132. /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
  133. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
  134. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
  135. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
  136. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/embedding/embed_family.py

@@ -13,57 +13,99 @@
 # limitations under the License.
 
 import logging
-from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Type
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Type, Union
 
 if TYPE_CHECKING:
-    from .core import EmbeddingModel, EmbeddingModelSpec
+    from .core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
 
 FLAG_EMBEDDER_CLASSES: List[Type["EmbeddingModel"]] = []
 SENTENCE_TRANSFORMER_CLASSES: List[Type["EmbeddingModel"]] = []
 VLLM_CLASSES: List[Type["EmbeddingModel"]] = []
+LLAMA_CPP_CLASSES: List[Type["EmbeddingModel"]] = []
 
-BUILTIN_EMBEDDING_MODELS: Dict[str, Any] = {}
-MODELSCOPE_EMBEDDING_MODELS: Dict[str, Any] = {}
+BUILTIN_EMBEDDING_MODELS: Dict[str, "EmbeddingModelFamilyV2"] = {}
 
 logger = logging.getLogger(__name__)
 
 
-# Desc: this file used to manage embedding models information.
 def match_embedding(
     model_name: str,
+    model_format: Optional[str] = None,
+    quantization: Optional[str] = None,
     download_hub: Optional[
         Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
     ] = None,
-) -> "EmbeddingModelSpec":
+) -> "EmbeddingModelFamilyV2":
     from ..utils import download_from_modelscope
-
-    # The model info has benn init by __init__.py with model_spec.json file
     from .custom import get_user_defined_embeddings
 
-    # first, check whether it is a user-defined embedding model
-    for model_spec in get_user_defined_embeddings():
-        if model_name == model_spec.model_name:
-            return model_spec
-
-    if download_hub == "modelscope" and model_name in MODELSCOPE_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in ModelScope.")
-        return MODELSCOPE_EMBEDDING_MODELS[model_name]
-    elif download_hub == "huggingface" and model_name in BUILTIN_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in Huggingface.")
-        return BUILTIN_EMBEDDING_MODELS[model_name]
-    elif download_from_modelscope() and model_name in MODELSCOPE_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in ModelScope.")
-        return MODELSCOPE_EMBEDDING_MODELS[model_name]
-    elif model_name in BUILTIN_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in Huggingface.")
-        return BUILTIN_EMBEDDING_MODELS[model_name]
+    target_family = None
+
+    if model_name in BUILTIN_EMBEDDING_MODELS:
+        target_family = BUILTIN_EMBEDDING_MODELS[model_name]
     else:
+        for model_family in get_user_defined_embeddings():
+            if model_name == model_family.model_name:
+                target_family = model_family
+                break
+
+    if target_family is None:
         raise ValueError(
-            f"Embedding model {model_name} not found, available"
-            f"Huggingface: {BUILTIN_EMBEDDING_MODELS.keys()}"
-            f"ModelScope: {MODELSCOPE_EMBEDDING_MODELS.keys()}"
+            f"Embedding model {model_name} not found, available "
+            f"models: {BUILTIN_EMBEDDING_MODELS.keys()}"
         )
 
+    if download_hub == "modelscope" or download_from_modelscope():
+        specs = [
+            x for x in target_family.model_specs if x.model_hub == "modelscope"
+        ] + [x for x in target_family.model_specs if x.model_hub == "huggingface"]
+    else:
+        specs = [x for x in target_family.model_specs if x.model_hub == "huggingface"]
+
+    def _match_quantization(q: Union[str, None], _quantization: str):
+        # Currently, the quantization name could include both uppercase and lowercase letters,
+        # so it is necessary to ensure that the case sensitivity does not
+        # affect the matching results.
+        if q is None:
+            return None
+        return _quantization if q.lower() == _quantization.lower() else None
+
+    def _apply_format_to_model_id(
+        _spec: "EmbeddingSpecV1", q: str
+    ) -> "EmbeddingSpecV1":
+        # Different quantized versions of some models use different model ids,
+        # Here we check the `{}` in the model id to format the id.
+        if _spec.model_id and "{" in _spec.model_id:
+            _spec.model_id = _spec.model_id.format(quantization=q)
+        return _spec
+
+    for spec in specs:
+        matched_quantization = _match_quantization(quantization, spec.quantization)
+        if (
+            model_format
+            and model_format != spec.model_format
+            or quantization
+            and matched_quantization is None
+        ):
+            continue
+        # Copy spec to avoid _apply_format_to_model_id modify the original spec.
+        spec = spec.copy()
+        _family = target_family.copy()
+        if quantization:
+            _family.model_specs = [
+                _apply_format_to_model_id(spec, matched_quantization)
+            ]
+            return _family
+        else:
+            # TODO: If user does not specify quantization, just use the first one
+            _q = "none" if spec.model_format == "pytorch" else spec.quantization
+            _family.model_specs = [_apply_format_to_model_id(spec, _q)]
+            return _family
+
+    raise ValueError(
+        f"Embedding model {model_name} with format {model_format} and quantization {quantization} not found."
+    )
+
 
 # { embedding model name -> { engine name -> engine params } }
 EMBEDDING_ENGINES: Dict[str, Dict[str, List[Dict[str, Type["EmbeddingModel"]]]]] = {}
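Where 1.7.1 returned the first spec found in a per-hub dictionary, match_embedding now resolves a family and narrows it to one concrete spec by format and quantization. A minimal sketch of the new call shape; the model name and quantization below are illustrative and only work if a matching spec exists in the installed model_spec.json:

    from xinference.model.embedding.embed_family import match_embedding

    # Default resolution: first matching spec for the family.
    family = match_embedding("bge-large-en-v1.5")

    # Pin a concrete format/quantization; quantization matching is
    # case-insensitive per _match_quantization above.
    gguf_family = match_embedding(
        "bge-large-en-v1.5", model_format="ggufv2", quantization="Q4_K_M"
    )
    print(gguf_family.model_specs[0].model_id)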
@@ -71,8 +113,10 @@ SUPPORTED_ENGINES: Dict[str, List[Type["EmbeddingModel"]]] = {}
 
 
 def check_engine_by_model_name_and_engine(
-    model_name: str,
     model_engine: str,
+    model_name: str,
+    model_format: Optional[str],
+    quantization: Optional[str],
 ) -> Type["EmbeddingModel"]:
     def get_model_engine_from_spell(engine_str: str) -> str:
         for engine in EMBEDDING_ENGINES[model_name].keys():
@@ -87,6 +131,11 @@ def check_engine_by_model_name_and_engine(
         raise ValueError(f"Model {model_name} cannot be run on engine {model_engine}.")
     match_params = EMBEDDING_ENGINES[model_name][model_engine]
     for param in match_params:
-        if model_name == param["model_name"]:
-            return param["embedding_class"]
+        if model_name != param["model_name"]:
+            continue
+        if (model_format and model_format != param["model_format"]) or (
+            quantization and quantization != param["quantization"]
+        ):
+            continue
+        return param["embedding_class"]
     raise ValueError(f"Model {model_name} cannot be run on engine {model_engine}.")
xinference/model/embedding/flag/core.py

@@ -30,7 +30,7 @@ except ImportError:
 
 from ....device_utils import get_available_device
 from ....types import Embedding, EmbeddingData, EmbeddingUsage
-from ..core import EmbeddingModel, EmbeddingModelSpec
+from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
 
 FLAG_EMBEDDER_MODEL_LIST = support_native_bge_model_list() if flag_installed else []
 logger = logging.getLogger(__name__)
@@ -41,12 +41,20 @@ class FlagEmbeddingModel(EmbeddingModel):
         self,
         model_uid: str,
         model_path: str,
-        model_spec: EmbeddingModelSpec,
+        model_family: EmbeddingModelFamilyV2,
+        quantization: Optional[str] = None,
         device: Optional[str] = None,
         return_sparse: bool = False,
         **kwargs,
     ):
-        super().__init__(model_uid, model_path, model_spec, device, **kwargs)
+        super().__init__(
+            model_uid,
+            model_path,
+            model_family,
+            quantization,
+            device,
+            **kwargs,
+        )
         self._return_sparse = return_sparse
 
     def load(self):
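The constructor now takes a whole family plus a quantization instead of a single spec. Direct construction under the new signature might look like the sketch below; the uid, path, and values are placeholders, and in practice the worker builds instances through the engine-selection machinery rather than by hand:

    model = FlagEmbeddingModel(
        model_uid="bge-m3-0",
        model_path="/path/to/bge-m3",
        model_family=family,  # an EmbeddingModelFamilyV2 from match_embedding
        quantization="none",
        device="cpu",
        return_sparse=True,
    )
    model.load()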
@@ -276,7 +284,15 @@ class FlagEmbeddingModel(EmbeddingModel):
         return importlib.util.find_spec("FlagEmbedding") is not None
 
     @classmethod
-    def match_json(cls, model_spec: EmbeddingModelSpec) -> bool:
-        if model_spec.model_name in FLAG_EMBEDDER_MODEL_LIST:
+    def match_json(
+        cls,
+        model_family: EmbeddingModelFamilyV2,
+        model_spec: EmbeddingSpecV1,
+        quantization: str,
+    ) -> bool:
+        if (
+            model_spec.model_format in ["pytorch"]
+            and model_family.model_name in FLAG_EMBEDDER_MODEL_LIST
+        ):
             return True
         return False
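Every engine's match_json now shares a three-argument contract (family, spec, quantization), so a selector can filter candidate classes by format before dispatch. A sketch of such a selector; the SUPPORTED_ENGINES key names and the first-match ordering are assumptions for illustration:

    def pick_embedding_class(engine, family, spec, quantization):
        # First registered class whose library is importable and whose
        # match_json accepts this family/spec/quantization wins.
        for cls in SUPPORTED_ENGINES.get(engine, []):
            if cls.check_lib() and cls.match_json(family, spec, quantization):
                return cls
        raise ValueError(f"No {engine} class matches {family.model_name}")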
xinference/model/embedding/llama_cpp/__init__.py

File without changes
xinference/model/embedding/llama_cpp/core.py

@@ -0,0 +1,234 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import concurrent.futures
+import importlib.util
+import logging
+import os
+import platform
+import pprint
+import queue
+import sys
+from typing import List, Optional, Union
+
+import orjson
+
+from ....types import Embedding
+from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
+
+logger = logging.getLogger(__name__)
+
+
+class _Done:
+    pass
+
+
+class _Error:
+    def __init__(self, msg):
+        self.msg = msg
+
+
+class XllamaCppEmbeddingModel(EmbeddingModel):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self._llm = None
+        self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
+        llamacpp_model_config = self._kwargs.get("llamacpp_model_config")
+        self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config)
+
+    def _sanitize_model_config(self, llamacpp_model_config: Optional[dict]) -> dict:
+        if llamacpp_model_config is None:
+            llamacpp_model_config = {}
+
+        llamacpp_model_config.setdefault("embedding", True)
+        llamacpp_model_config.setdefault("use_mmap", False)
+        llamacpp_model_config.setdefault("use_mlock", True)
+
+        if self._is_darwin_and_apple_silicon():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+        elif self._is_linux():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+
+        return llamacpp_model_config
+
+    def _is_darwin_and_apple_silicon(self):
+        return sys.platform == "darwin" and platform.processor() == "arm"
+
+    def _is_linux(self):
+        return sys.platform.startswith("linux")
+
+    def load(self):
+        try:
+            from xllamacpp import (
+                CommonParams,
+                Server,
+                estimate_gpu_layers,
+                get_device_info,
+                ggml_backend_dev_type,
+                llama_pooling_type,
+            )
+        except ImportError:
+            error_message = "Failed to import module 'xllamacpp'"
+            installation_guide = ["Please make sure 'xllamacpp' is installed. "]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        # handle legacy cache.
+        if (
+            self._model_spec.model_file_name_split_template
+            and self._quantization in self._model_spec.quantization_parts
+        ):
+            part = self._model_spec.quantization_parts[self._quantization]
+            model_path = os.path.join(
+                self._model_path,
+                self._model_spec.model_file_name_split_template.format(
+                    quantization=self._quantization, part=part[0]
+                ),
+            )
+        else:
+            model_path = os.path.join(
+                self._model_path,
+                self._model_spec.model_file_name_template.format(
+                    quantization=self._quantization
+                ),
+            )
+
+        try:
+            params = CommonParams()
+            params.embedding = True
+            # Compatible with xllamacpp changes
+            try:
+                params.model = model_path
+            except Exception:
+                params.model.path = model_path
+
+            # This is the default value, could be overwritten by _llamacpp_model_config
+            params.n_parallel = min(8, os.cpu_count() or 1)
+            params.pooling_type = llama_pooling_type.LLAMA_POOLING_TYPE_LAST
+            for k, v in self._llamacpp_model_config.items():
+                try:
+                    if "." in k:
+                        parts = k.split(".")
+                        sub_param = params
+                        for p in parts[:-1]:
+                            sub_param = getattr(sub_param, p)
+                        setattr(sub_param, parts[-1], v)
+                    else:
+                        setattr(params, k, v)
+                except Exception as e:
+                    logger.error("Failed to set the param %s = %s, error: %s", k, v, e)
+            n_threads = self._llamacpp_model_config.get("n_threads", os.cpu_count())
+            params.cpuparams.n_threads = n_threads
+            params.cpuparams_batch.n_threads = n_threads
+            if params.n_gpu_layers == -1:
+                # Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
+                # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+                params.n_gpu_layers = 0x7FFFFFFF
+                try:
+                    device_info = get_device_info()
+                    gpus = [
+                        info
+                        for info in device_info
+                        if info["type"]
+                        == ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_GPU
+                    ]
+                    if gpus:
+                        logger.info(
+                            "Try to estimate num gpu layers, n_ctx: %s, n_batch: %s, n_parallel: %s, gpus:\n%s",
+                            params.n_ctx,
+                            params.n_batch,
+                            params.n_parallel,
+                            pprint.pformat(gpus),
+                        )
+                        estimate = estimate_gpu_layers(
+                            gpus=gpus,
+                            model_path=model_path,
+                            projectors=[],
+                            context_length=params.n_ctx,
+                            batch_size=params.n_batch,
+                            num_parallel=params.n_parallel,
+                            kv_cache_type="",
+                        )
+                        logger.info("Estimate num gpu layers: %s", estimate)
+                        if estimate.tensor_split:
+                            params.tensor_split = estimate.tensor_split
+                        else:
+                            params.n_gpu_layers = estimate.layers
+                except Exception as e:
+                    logger.exception(
+                        "Estimate num gpu layers for llama.cpp backend failed: %s", e
+                    )
+
+            self._llm = Server(params)
+            self._executor = concurrent.futures.ThreadPoolExecutor(
+                max_workers=max(10, n_threads)
+            )
+        except AssertionError:
+            raise RuntimeError(f"Load model {self._model_name} failed")
+
+    def create_embedding(self, sentences: Union[str, List[str]], **kwargs) -> Embedding:
+        if self._llm is None:
+            raise RuntimeError("Model is not loaded.")
+
+        q: queue.Queue = queue.Queue()
+        if isinstance(sentences, str):
+            sentences = [sentences]
+
+        def _handle_embedding():
+            data = {"input": sentences}
+            prompt_json = orjson.dumps(data)
+
+            def _error_callback(err):
+                try:
+                    msg = orjson.loads(err)
+                    q.put(_Error(msg))
+                except Exception as e:
+                    q.put(_Error(str(e)))
+
+            def _ok_callback(ok):
+                try:
+                    res = orjson.loads(ok)
+                    q.put(res)
+                except Exception as e:
+                    q.put(_Error(str(e)))
+
+            try:
+                self._llm.handle_embeddings(prompt_json, _error_callback, _ok_callback)
+            except Exception as ex:
+                q.put(_Error(str(ex)))
+            q.put(_Done)
+
+        assert self._executor
+        self._executor.submit(_handle_embedding)
+
+        r = q.get()
+        if type(r) is _Error:
+            raise Exception(f"Failed to create embedding: {r.msg}")
+        r["model_replica"] = self._model_uid
+        return Embedding(**r)  # type: ignore
+
+    @classmethod
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("xllamacpp") is not None
+
+    @classmethod
+    def match_json(
+        cls,
+        model_family: EmbeddingModelFamilyV2,
+        model_spec: EmbeddingSpecV1,
+        quantization: str,
+    ) -> bool:
+        if model_spec.model_format not in ["ggufv2"]:
+            return False
+        return True
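With this backend registered, a GGUF embedding model can be launched like any other engine. A hedged end-to-end sketch against a locally running supervisor; the model name, quantization, and the llamacpp_model_config keys are illustrative, the pass-through of llamacpp_model_config via launch kwargs is an assumption based on the self._kwargs lookup in __init__ above, and nested params such as cpuparams.n_threads use the dotted-key convention handled in load:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    model_uid = client.launch_model(
        model_name="bge-large-en-v1.5",
        model_type="embedding",
        model_engine="llama.cpp",
        model_format="ggufv2",
        quantization="Q4_K_M",
        llamacpp_model_config={"n_ctx": 2048, "cpuparams.n_threads": 8},
    )
    model = client.get_model(model_uid)
    print(model.create_embedding("What is the capital of France?"))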