xinference 1.7.1.post1__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.

Note: this version of xinference has been flagged as potentially problematic.

Files changed (136)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/async_restful_client.py +8 -13
  3. xinference/client/restful/restful_client.py +6 -2
  4. xinference/core/chat_interface.py +6 -4
  5. xinference/core/media_interface.py +5 -0
  6. xinference/core/model.py +1 -5
  7. xinference/core/supervisor.py +117 -68
  8. xinference/core/worker.py +49 -37
  9. xinference/deploy/test/test_cmdline.py +2 -6
  10. xinference/model/audio/__init__.py +26 -23
  11. xinference/model/audio/chattts.py +3 -2
  12. xinference/model/audio/core.py +49 -98
  13. xinference/model/audio/cosyvoice.py +3 -2
  14. xinference/model/audio/custom.py +28 -73
  15. xinference/model/audio/f5tts.py +3 -2
  16. xinference/model/audio/f5tts_mlx.py +3 -2
  17. xinference/model/audio/fish_speech.py +3 -2
  18. xinference/model/audio/funasr.py +17 -4
  19. xinference/model/audio/kokoro.py +3 -2
  20. xinference/model/audio/megatts.py +3 -2
  21. xinference/model/audio/melotts.py +3 -2
  22. xinference/model/audio/model_spec.json +572 -171
  23. xinference/model/audio/utils.py +0 -6
  24. xinference/model/audio/whisper.py +3 -2
  25. xinference/model/audio/whisper_mlx.py +3 -2
  26. xinference/model/cache_manager.py +141 -0
  27. xinference/model/core.py +6 -49
  28. xinference/model/custom.py +174 -0
  29. xinference/model/embedding/__init__.py +67 -56
  30. xinference/model/embedding/cache_manager.py +35 -0
  31. xinference/model/embedding/core.py +104 -84
  32. xinference/model/embedding/custom.py +55 -78
  33. xinference/model/embedding/embed_family.py +80 -31
  34. xinference/model/embedding/flag/core.py +21 -5
  35. xinference/model/embedding/llama_cpp/__init__.py +0 -0
  36. xinference/model/embedding/llama_cpp/core.py +234 -0
  37. xinference/model/embedding/model_spec.json +968 -103
  38. xinference/model/embedding/sentence_transformers/core.py +30 -20
  39. xinference/model/embedding/vllm/core.py +11 -5
  40. xinference/model/flexible/__init__.py +8 -2
  41. xinference/model/flexible/core.py +26 -119
  42. xinference/model/flexible/custom.py +69 -0
  43. xinference/model/flexible/launchers/image_process_launcher.py +1 -0
  44. xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
  45. xinference/model/flexible/launchers/transformers_launcher.py +15 -3
  46. xinference/model/flexible/launchers/yolo_launcher.py +5 -1
  47. xinference/model/image/__init__.py +20 -20
  48. xinference/model/image/cache_manager.py +62 -0
  49. xinference/model/image/core.py +70 -182
  50. xinference/model/image/custom.py +28 -72
  51. xinference/model/image/model_spec.json +402 -119
  52. xinference/model/image/ocr/got_ocr2.py +3 -2
  53. xinference/model/image/stable_diffusion/core.py +22 -7
  54. xinference/model/image/stable_diffusion/mlx.py +6 -6
  55. xinference/model/image/utils.py +2 -2
  56. xinference/model/llm/__init__.py +71 -94
  57. xinference/model/llm/cache_manager.py +292 -0
  58. xinference/model/llm/core.py +37 -111
  59. xinference/model/llm/custom.py +88 -0
  60. xinference/model/llm/llama_cpp/core.py +5 -7
  61. xinference/model/llm/llm_family.json +16260 -8151
  62. xinference/model/llm/llm_family.py +138 -839
  63. xinference/model/llm/lmdeploy/core.py +5 -7
  64. xinference/model/llm/memory.py +3 -4
  65. xinference/model/llm/mlx/core.py +6 -8
  66. xinference/model/llm/reasoning_parser.py +3 -1
  67. xinference/model/llm/sglang/core.py +32 -14
  68. xinference/model/llm/transformers/chatglm.py +3 -7
  69. xinference/model/llm/transformers/core.py +49 -27
  70. xinference/model/llm/transformers/deepseek_v2.py +2 -2
  71. xinference/model/llm/transformers/gemma3.py +2 -2
  72. xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
  73. xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
  74. xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
  75. xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
  76. xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
  77. xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
  78. xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
  79. xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
  80. xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
  81. xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
  82. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  83. xinference/model/llm/transformers/opt.py +3 -7
  84. xinference/model/llm/utils.py +34 -49
  85. xinference/model/llm/vllm/core.py +77 -27
  86. xinference/model/llm/vllm/xavier/engine.py +5 -3
  87. xinference/model/llm/vllm/xavier/scheduler.py +10 -6
  88. xinference/model/llm/vllm/xavier/transfer.py +1 -1
  89. xinference/model/rerank/__init__.py +26 -25
  90. xinference/model/rerank/core.py +47 -87
  91. xinference/model/rerank/custom.py +25 -71
  92. xinference/model/rerank/model_spec.json +158 -33
  93. xinference/model/rerank/utils.py +2 -2
  94. xinference/model/utils.py +115 -54
  95. xinference/model/video/__init__.py +13 -17
  96. xinference/model/video/core.py +44 -102
  97. xinference/model/video/diffusers.py +4 -3
  98. xinference/model/video/model_spec.json +90 -21
  99. xinference/types.py +5 -3
  100. xinference/web/ui/build/asset-manifest.json +3 -3
  101. xinference/web/ui/build/index.html +1 -1
  102. xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
  103. xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
  109. xinference/web/ui/src/locales/en.json +0 -1
  110. xinference/web/ui/src/locales/ja.json +0 -1
  111. xinference/web/ui/src/locales/ko.json +0 -1
  112. xinference/web/ui/src/locales/zh.json +0 -1
  113. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
  114. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
  115. xinference/model/audio/model_spec_modelscope.json +0 -231
  116. xinference/model/embedding/model_spec_modelscope.json +0 -293
  117. xinference/model/embedding/utils.py +0 -18
  118. xinference/model/image/model_spec_modelscope.json +0 -375
  119. xinference/model/llm/llama_cpp/memory.py +0 -457
  120. xinference/model/llm/llm_family_csghub.json +0 -56
  121. xinference/model/llm/llm_family_modelscope.json +0 -8700
  122. xinference/model/llm/llm_family_openmind_hub.json +0 -1019
  123. xinference/model/rerank/model_spec_modelscope.json +0 -85
  124. xinference/model/video/model_spec_modelscope.json +0 -184
  125. xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
  126. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
  132. /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
  133. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
  134. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
  135. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
  136. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/core.py
@@ -22,35 +22,32 @@ from abc import abstractmethod
 from collections import defaultdict
 from contextvars import ContextVar
 from functools import lru_cache
-from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union
 
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
-from ..core import ModelDescription
 from .reasoning_parser import ReasoningParser
 
 if TYPE_CHECKING:
-    from .llm_family import LLMFamilyV1, LLMSpecV1
+    from .llm_family import LLMFamilyV2, LLMSpecV1
 
 logger = logging.getLogger(__name__)
 
 
-LLM_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
+LLM_VERSION_INFOS: Dict[str, List[Dict]] = defaultdict(list)
 
 
-def get_llm_model_descriptions():
+def get_llm_version_infos():
     import copy
 
-    return copy.deepcopy(LLM_MODEL_DESCRIPTIONS)
+    return copy.deepcopy(LLM_VERSION_INFOS)
 
 
 class LLM(abc.ABC):
     def __init__(
         self,
         replica_model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         *args,
         **kwargs,
@@ -58,8 +55,8 @@ class LLM(abc.ABC):
         self.model_uid, self.rep_id = parse_replica_model_uid(replica_model_uid)
         self.raw_model_uid = replica_model_uid
         self.model_family = model_family
-        self.model_spec = model_spec
-        self.quantization = quantization
+        self.model_spec = model_family.model_specs[0]
+        self.quantization = model_family.model_specs[0].quantization
         self.model_path = model_path
         self.reasoning_parser = None
         if args:
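
With this change, engine implementations no longer receive `model_spec` and `quantization` as separate constructor arguments: both are derived from the single spec carried by the matched `LLMFamilyV2`. A minimal sketch of the new contract (the engine class is hypothetical and its remaining abstract methods are elided):

    # Hypothetical engine subclass; only match_json is shown.
    class MyEngine(LLM):
        @classmethod
        def match_json(cls, llm_family, llm_spec, quantization) -> bool:
            return llm_spec.model_format == "pytorch"

    # `family` is an LLMFamilyV2 already narrowed to the single matched
    # spec, as match_llm() now returns (see the hunks below).
    model = MyEngine("my-model-0", family, "/path/to/model")
    # Populated by LLM.__init__ from model_family.model_specs[0]:
    assert model.model_spec is family.model_specs[0]
    assert model.quantization == family.model_specs[0].quantization
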
@@ -128,7 +125,7 @@ class LLM(abc.ABC):
 
     @classmethod
     def match(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls.check_lib():
             return False
@@ -137,7 +134,7 @@ class LLM(abc.ABC):
     @classmethod
     @abstractmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         raise NotImplementedError
 
@@ -169,89 +166,26 @@ class LLM(abc.ABC):
 chat_context_var: ContextVar[dict] = ContextVar("chat_context_var", default={})
 
 
-class LLMDescription(ModelDescription):
-    def __init__(
-        self,
-        address: Optional[str],
-        devices: Optional[List[str]],
-        llm_family: "LLMFamilyV1",
-        llm_spec: "LLMSpecV1",
-        quantization: Optional[str],
-        multimodal_projector: Optional[str] = None,
-        model_path: Optional[str] = None,
-    ):
-        super().__init__(address, devices, model_path=model_path)
-        self._llm_family = llm_family
-        self._llm_spec = llm_spec
-        self._quantization = quantization
-        self._multimodal_projector = multimodal_projector
-
-    @property
-    def spec(self):
-        return self._llm_family
-
-    def to_dict(self):
-        return {
-            "model_type": "LLM",
-            "address": self.address,
-            "accelerators": self.devices,
-            "model_name": self._llm_family.model_name,
-            "model_lang": self._llm_family.model_lang,
-            "model_ability": self._llm_family.model_ability,
-            "model_description": self._llm_family.model_description,
-            "model_format": self._llm_spec.model_format,
-            "model_size_in_billions": self._llm_spec.model_size_in_billions,
-            "model_family": self._llm_family.model_family
-            or self._llm_family.model_name,
-            "quantization": self._quantization,
-            "multimodal_projector": self._multimodal_projector,
-            "model_hub": self._llm_spec.model_hub,
-            "revision": self._llm_spec.model_revision,
-            "context_length": self._llm_family.context_length,
-        }
-
-    def to_version_info(self):
-        from .utils import get_file_location, get_model_version
-
-        model_file_location, cache_status = get_file_location(
-            self._llm_family, self._llm_spec, self._quantization
-        )
-
-        return {
-            "model_version": get_model_version(
-                self._llm_family, self._llm_spec, self._quantization
-            ),
-            "model_file_location": model_file_location,
-            "cache_status": cache_status,
-            "quantization": self._quantization,
-            "multimodal_projector": self._multimodal_projector,
-            "model_format": self._llm_spec.model_format,
-            "model_size_in_billions": self._llm_spec.model_size_in_billions,
-        }
-
-
-def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
+def generate_llm_version_info(llm_family: "LLMFamilyV2") -> Dict[str, List[Dict]]:
     res = defaultdict(list)
-    for spec in llm_family.model_specs:
+    # Use model_specs from huggingface, as HuggingFace is the most comprehensive.
+    hf_specs = [
+        spec for spec in llm_family.model_specs if spec.model_hub == "huggingface"
+    ]
+    for spec in hf_specs:
+        _llm_family = llm_family.copy()
+        _llm_family.model_specs = [spec]
         multimodal_projectors = getattr(spec, "multimodal_projectors", None)
-        for q in spec.quantizations:
-            if multimodal_projectors:
-                for mmproj in multimodal_projectors:
-                    res[llm_family.model_name].append(
-                        LLMDescription(
-                            None, None, llm_family, spec, q, mmproj
-                        ).to_version_info()
-                    )
-            else:
-                res[llm_family.model_name].append(
-                    LLMDescription(None, None, llm_family, spec, q).to_version_info()
-                )
+        if multimodal_projectors:
+            for mmproj in multimodal_projectors:
+                _llm_family.multimodal_projector = mmproj
+                res[_llm_family.model_name].append(_llm_family.to_version_info())
+        else:
+            res[_llm_family.model_name].append(_llm_family.to_version_info())
     return res
 
 
 def create_llm_model_instance(
-    subpool_addr: str,
-    devices: List[str],
     model_uid: str,
     model_name: str,
     model_engine: Optional[str],
@@ -264,35 +198,35 @@ def create_llm_model_instance(
     ] = None,
     model_path: Optional[str] = None,
     **kwargs,
-) -> Tuple[LLM, LLMDescription]:
-    from .llm_family import cache, check_engine_by_spec_parameters, match_llm
+) -> LLM:
+    from .cache_manager import LLMCacheManager
+    from .llm_family import check_engine_by_spec_parameters, match_llm
 
     if model_engine is None:
         raise ValueError("model_engine is required for LLM model")
-    match_result = match_llm(
+    llm_family = match_llm(
         model_name, model_format, model_size_in_billions, quantization, download_hub
     )
 
-    if not match_result:
+    if not llm_family:
         raise ValueError(
             f"Model not found, name: {model_name}, format: {model_format},"
             f" size: {model_size_in_billions}, quantization: {quantization}"
         )
-    llm_family, llm_spec, quantization = match_result
-    assert quantization is not None
 
     llm_cls = check_engine_by_spec_parameters(
         model_engine,
         llm_family.model_name,
-        llm_spec.model_format,
-        llm_spec.model_size_in_billions,
-        quantization,
+        llm_family.model_specs[0].model_format,
+        llm_family.model_specs[0].model_size_in_billions,
+        llm_family.model_specs[0].quantization,
     )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
 
     multimodal_projector = kwargs.get("multimodal_projector")
     if not model_path:
-        model_path = cache(llm_family, llm_spec, quantization, multimodal_projector)
+        cache_manager = LLMCacheManager(llm_family, multimodal_projector)
+        model_path = cache_manager.cache()
 
     peft_model = peft_model_config.peft_model if peft_model_config else None
     if peft_model is not None:
@@ -300,8 +234,6 @@ def create_llm_model_instance(
         model = llm_cls(
             model_uid,
             llm_family,
-            llm_spec,
-            quantization,
             model_path,
             kwargs,
             peft_model,
@@ -311,13 +243,7 @@ def create_llm_model_instance(
             f"Model not supported with lora, name: {model_name}, format: {model_format}, engine: {model_engine}. "
             f"Load this without lora."
         )
-        model = llm_cls(
-            model_uid, llm_family, llm_spec, quantization, model_path, kwargs
-        )
+        model = llm_cls(model_uid, llm_family, model_path, kwargs)
     else:
-        model = llm_cls(
-            model_uid, llm_family, llm_spec, quantization, model_path, kwargs
-        )
-    return model, LLMDescription(
-        subpool_addr, devices, llm_family, llm_spec, quantization, multimodal_projector
-    )
+        model = llm_cls(model_uid, llm_family, model_path, kwargs)
+    return model
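
After these hunks, `create_llm_model_instance` returns only the model object: the `(model, LLMDescription)` tuple and the `subpool_addr`/`devices` parameters are gone, and caching goes through the new `LLMCacheManager`. A sketch of how a 1.7.x call site migrates (argument values are illustrative):

    # 1.7.x: placement info went in, a (model, description) pair came out.
    # model, desc = create_llm_model_instance(
    #     subpool_addr, devices, model_uid, "qwen2.5-instruct", "vllm"
    # )

    # 1.8.0: only the model comes back; version info is produced from the
    # LLMFamilyV2 itself (see generate_llm_version_info above).
    model = create_llm_model_instance(
        model_uid="my-model-0",
        model_name="qwen2.5-instruct",
        model_engine="vllm",
    )
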
xinference/model/llm/custom.py (new file)
@@ -0,0 +1,88 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import TYPE_CHECKING, List
+
+from ..custom import ModelRegistry
+
+if TYPE_CHECKING:
+    from .llm_family import LLMFamilyV2
+
+
+logger = logging.getLogger(__name__)
+
+
+UD_LLM_FAMILIES: List["LLMFamilyV2"] = []
+
+
+class LLMModelRegistry(ModelRegistry):
+    model_type = "llm"
+
+    def __init__(self):
+        from .llm_family import BUILTIN_LLM_FAMILIES
+
+        super().__init__()
+        self.models = UD_LLM_FAMILIES
+        self.builtin_models = [x.model_name for x in BUILTIN_LLM_FAMILIES]
+
+    def add_ud_model(self, model_spec):
+        from . import generate_engine_config_by_model_family
+
+        self.models.append(model_spec)
+        generate_engine_config_by_model_family(model_spec)
+
+    def check_model_uri(self, llm_family: "LLMFamilyV2"):
+        from ..utils import is_valid_model_uri
+
+        for spec in llm_family.model_specs:
+            model_uri = spec.model_uri
+            if model_uri and not is_valid_model_uri(model_uri):
+                raise ValueError(f"Invalid model URI {model_uri}.")
+
+    def remove_ud_model(self, llm_family: "LLMFamilyV2"):
+        from .llm_family import LLM_ENGINES
+
+        UD_LLM_FAMILIES.remove(llm_family)
+        del LLM_ENGINES[llm_family.model_name]
+
+    def remove_ud_model_files(self, llm_family: "LLMFamilyV2"):
+        from .cache_manager import LLMCacheManager
+
+        _llm_family = llm_family.copy()
+        for spec in llm_family.model_specs:
+            _llm_family.model_specs = [spec]
+            cache_manager = LLMCacheManager(_llm_family)
+            cache_manager.unregister_custom_model(self.model_type)
+
+
+def get_user_defined_llm_families():
+    from ..custom import RegistryManager
+
+    registry = RegistryManager.get_registry("llm")
+    return registry.get_custom_models()
+
+
+def register_llm(llm_family: "LLMFamilyV2", persist: bool):
+    from ..custom import RegistryManager
+
+    registry = RegistryManager.get_registry("llm")
+    registry.register(llm_family, persist)
+
+
+def unregister_llm(model_name: str, raise_error: bool = True):
+    from ..custom import RegistryManager
+
+    registry = RegistryManager.get_registry("llm")
+    registry.unregister(model_name, raise_error)
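
This new module routes custom-model bookkeeping through the shared `ModelRegistry`/`RegistryManager` machinery in `xinference/model/custom.py` (also added in this release, file 28 above). A sketch of the intended flow, assuming `my_family` is an `LLMFamilyV2` for a user-defined model (construction not shown):

    from xinference.model.llm.custom import (
        get_user_defined_llm_families,
        register_llm,
        unregister_llm,
    )

    register_llm(my_family, persist=True)  # register; persist=True keeps it across restarts
    assert my_family in get_user_defined_llm_families()
    unregister_llm(my_family.model_name)   # raises on unknown names by default
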
xinference/model/llm/llama_cpp/core.py
@@ -23,9 +23,8 @@ import orjson
 
 from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
 from ..core import LLM
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin
-from .memory import estimate_gpu_layers
 
 logger = logging.getLogger(__name__)
 
@@ -43,13 +42,11 @@ class XllamaCppModel(LLM, ChatModelMixin):
     def __init__(
         self,
         model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         llamacpp_model_config: Optional[dict] = None,
     ):
-        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+        super().__init__(model_uid, model_family, model_path)
         self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config)
         self._llm = None
         self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
@@ -84,7 +81,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
 
     @classmethod
     def match_json(
-        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
+        cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["ggufv2"]:
             return False
@@ -100,6 +97,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
         from xllamacpp import (
             CommonParams,
             Server,
+            estimate_gpu_layers,
             get_device_info,
             ggml_backend_dev_type,
         )
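
This last hunk pairs with the removal of `xinference/model/llm/llama_cpp/memory.py` (file 119 in the list above, -457 lines): the GPU-layer estimator is no longer vendored inside xinference and is imported from the `xllamacpp` package instead.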