xinference 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
This release of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +15 -34
- xinference/client/restful/restful_client.py +2 -2
- xinference/core/chat_interface.py +45 -10
- xinference/core/image_interface.py +9 -0
- xinference/core/model.py +8 -5
- xinference/core/scheduler.py +1 -2
- xinference/core/worker.py +49 -42
- xinference/deploy/cmdline.py +2 -2
- xinference/deploy/test/test_cmdline.py +7 -7
- xinference/model/audio/chattts.py +24 -9
- xinference/model/audio/core.py +8 -2
- xinference/model/audio/fish_speech.py +228 -0
- xinference/model/audio/model_spec.json +8 -0
- xinference/model/embedding/core.py +23 -1
- xinference/model/image/model_spec.json +2 -1
- xinference/model/image/model_spec_modelscope.json +2 -1
- xinference/model/image/stable_diffusion/core.py +49 -1
- xinference/model/llm/__init__.py +26 -27
- xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
- xinference/model/llm/llm_family.json +606 -1266
- xinference/model/llm/llm_family.py +16 -139
- xinference/model/llm/llm_family_modelscope.json +276 -313
- xinference/model/llm/lmdeploy/__init__.py +0 -0
- xinference/model/llm/lmdeploy/core.py +557 -0
- xinference/model/llm/memory.py +9 -9
- xinference/model/llm/sglang/core.py +2 -2
- xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
- xinference/model/llm/{pytorch → transformers}/cogvlm2.py +4 -45
- xinference/model/llm/transformers/cogvlm2_video.py +524 -0
- xinference/model/llm/{pytorch → transformers}/core.py +3 -10
- xinference/model/llm/{pytorch → transformers}/glm4v.py +2 -23
- xinference/model/llm/transformers/intern_vl.py +540 -0
- xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
- xinference/model/llm/{pytorch → transformers}/minicpmv25.py +2 -23
- xinference/model/llm/{pytorch → transformers}/minicpmv26.py +66 -41
- xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
- xinference/model/llm/{pytorch → transformers}/yi_vl.py +2 -24
- xinference/model/llm/utils.py +85 -70
- xinference/model/llm/vllm/core.py +110 -11
- xinference/model/utils.py +1 -95
- xinference/thirdparty/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
- xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
- xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/api.py +495 -0
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
- xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
- xinference/thirdparty/fish_speech/tools/file.py +108 -0
- xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
- xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
- xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
- xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
- xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
- xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
- xinference/thirdparty/fish_speech/tools/webui.py +619 -0
- xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
- xinference/thirdparty/internvl/__init__.py +0 -0
- xinference/thirdparty/internvl/conversation.py +393 -0
- xinference/thirdparty/omnilmm/model/utils.py +16 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.661c7b0a.js +3 -0
- xinference/web/ui/build/static/js/{main.17ca0398.js.map → main.661c7b0a.js.map} +1 -1
- xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/METADATA +22 -13
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/RECORD +170 -79
- xinference/locale/utils.py +0 -39
- xinference/locale/zh_CN.json +0 -26
- xinference/model/llm/ggml/tools/__init__.py +0 -15
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
- xinference/model/llm/ggml/tools/gguf.py +0 -884
- xinference/model/llm/pytorch/__init__.py +0 -13
- xinference/model/llm/pytorch/baichuan.py +0 -81
- xinference/model/llm/pytorch/falcon.py +0 -138
- xinference/model/llm/pytorch/intern_vl.py +0 -352
- xinference/model/llm/pytorch/vicuna.py +0 -69
- xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
- xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
- /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
- /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
- /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/LICENSE +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/WHEEL +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py

@@ -14,7 +14,6 @@
 
 import logging
 import os
-import shutil
 from threading import Lock
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
 
@@ -59,8 +58,8 @@ BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set()
 BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES: Set[str] = set()
 
 
-class GgmlLLMSpecV1(BaseModel):
-    model_format: Literal["ggmlv3", "ggufv2"]
+class LlamaCppLLMSpecV1(BaseModel):
+    model_format: Literal["ggufv2"]
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
     quantizations: List[str]
@@ -85,7 +84,7 @@ class GgmlLLMSpecV1(BaseModel):
 
 
 class PytorchLLMSpecV1(BaseModel):
-    model_format: Literal["pytorch", "gptq", "awq"]
+    model_format: Literal["pytorch", "gptq", "awq", "fp8"]
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
     quantizations: List[str]
@@ -247,7 +246,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):
 
 
 LLMSpecV1 = Annotated[
-    Union[GgmlLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
+    Union[LlamaCppLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
     Field(discriminator="model_format"),
 ]
 
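Note: `LLMSpecV1` is a pydantic discriminated union, so `Field(discriminator="model_format")` routes each raw spec dict to the member class whose `Literal` covers its `model_format`; after this change `"ggufv2"` parses as `LlamaCppLLMSpecV1` and the new `"fp8"` format as `PytorchLLMSpecV1`. A minimal sketch of the mechanism, with trimmed field sets and illustrative class names rather than the real ones:

    from typing import List, Literal, Union

    from pydantic import BaseModel, Field
    from typing_extensions import Annotated

    class LlamaCppSpec(BaseModel):
        model_format: Literal["ggufv2"]
        quantizations: List[str]

    class PytorchSpec(BaseModel):
        model_format: Literal["pytorch", "gptq", "awq", "fp8"]
        quantizations: List[str]

    # pydantic picks the union member whose Literal matches model_format.
    Spec = Annotated[Union[LlamaCppSpec, PytorchSpec], Field(discriminator="model_format")]

    class Family(BaseModel):
        specs: List[Spec]

    family = Family(specs=[
        {"model_format": "ggufv2", "quantizations": ["Q4_K_M"]},
        {"model_format": "fp8", "quantizations": ["fp8"]},
    ])
    print([type(s).__name__ for s in family.specs])  # ['LlamaCppSpec', 'PytorchSpec']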
@@ -272,6 +271,8 @@ VLLM_CLASSES: List[Type[LLM]] = []
 
 MLX_CLASSES: List[Type[LLM]] = []
 
+LMDEPLOY_CLASSES: List[Type[LLM]] = []
+
 LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
 
@@ -308,13 +309,10 @@ def cache(
     if os.path.exists(legacy_cache_path):
         logger.info("Legacy cache path exists: %s", legacy_cache_path)
         return os.path.dirname(legacy_cache_path)
-    elif download_from_self_hosted_storage() and is_self_hosted(llm_family, llm_spec):
-        logger.info(f"Caching from self-hosted storage")
-        return cache_from_self_hosted_storage(llm_family, llm_spec, quantization)
     else:
         if llm_spec.model_uri is not None:
             logger.info(f"Caching from URI: {llm_spec.model_uri}")
-            return cache_from_uri(llm_family, llm_spec, quantization)
+            return cache_from_uri(llm_family, llm_spec)
         else:
             if llm_spec.model_hub == "huggingface":
                 logger.info(f"Caching from Hugging Face: {llm_spec.model_id}")
@@ -329,68 +327,10 @@ def cache(
                 raise ValueError(f"Unknown model hub: {llm_spec.model_hub}")
 
 
-SUPPORTED_SCHEMES = ["s3"]
-
-
-class AWSRegion:
-    def __init__(self, region: str):
-        self.region = region
-        self.original_aws_default_region = None
-
-    def __enter__(self):
-        if "AWS_DEFAULT_REGION" in os.environ:
-            self.original_aws_default_region = os.environ["AWS_DEFAULT_REGION"]
-        os.environ["AWS_DEFAULT_REGION"] = self.region
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        if self.original_aws_default_region:
-            os.environ["AWS_DEFAULT_REGION"] = self.original_aws_default_region
-        else:
-            del os.environ["AWS_DEFAULT_REGION"]
-
-
-def is_self_hosted(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-):
-    from fsspec import AbstractFileSystem, filesystem
-
-    with AWSRegion("cn-northwest-1"):
-        src_fs: AbstractFileSystem = filesystem("s3", anon=True)
-        model_dir = (
-            f"/xinference-models/llm/"
-            f"{llm_family.model_name}-{llm_spec.model_format}-{llm_spec.model_size_in_billions}b"
-        )
-        return src_fs.exists(model_dir)
-
-
-def cache_from_self_hosted_storage(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-) -> str:
-    with AWSRegion("cn-northwest-1"):
-        llm_spec = llm_spec.copy()
-        llm_spec.model_uri = (
-            f"s3://xinference-models/llm/"
-            f"{llm_family.model_name}-{llm_spec.model_format}-{llm_spec.model_size_in_billions}b"
-        )
-
-        return cache_from_uri(
-            llm_family, llm_spec, quantization, self_hosted_storage=True
-        )
-
-
 def cache_from_uri(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-    self_hosted_storage: bool = False,
 ) -> str:
-    from fsspec import AbstractFileSystem, filesystem
-
-    from ..utils import copy_from_src_to_dst
-
     cache_dir_name = (
         f"{llm_family.model_name}-{llm_spec.model_format}"
         f"-{llm_spec.model_size_in_billions}b"
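Note: the removed `AWSRegion` class is the standard save/set/restore idiom for a process environment variable, packaged as a context manager. A contextlib-based sketch of the same behavior (a hypothetical helper, not xinference API):

    import os
    from contextlib import contextmanager

    @contextmanager
    def aws_region(region: str):
        # Save any existing region, point AWS clients at `region`, restore on exit.
        original = os.environ.get("AWS_DEFAULT_REGION")
        os.environ["AWS_DEFAULT_REGION"] = region
        try:
            yield
        finally:
            if original is None:
                os.environ.pop("AWS_DEFAULT_REGION", None)
            else:
                os.environ["AWS_DEFAULT_REGION"] = original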
@@ -415,69 +355,6 @@ def cache_from_uri(
         else:
             os.symlink(src_root, cache_dir, target_is_directory=True)
         return cache_dir
-    elif src_scheme in SUPPORTED_SCHEMES:
-        # use anonymous connection for self-hosted storage.
-        src_fs: AbstractFileSystem = filesystem(src_scheme, anon=self_hosted_storage)
-        local_fs: AbstractFileSystem = filesystem("file")
-
-        files_to_download = []
-        if llm_spec.model_format == "pytorch":
-            if os.path.exists(cache_dir):
-                logger.info(f"Cache {cache_dir} exists")
-                return cache_dir
-            else:
-                os.makedirs(cache_dir, exist_ok=True)
-
-            for path, _, files in src_fs.walk(llm_spec.model_uri):
-                for file in files:
-                    src_path = f"{path}/{file}"
-                    local_path = src_path.replace(src_root, cache_dir)
-                    files_to_download.append((src_path, local_path))
-        elif llm_spec.model_format == "ggmlv3":
-            file = llm_spec.model_file_name_template.format(quantization=quantization)
-            if os.path.exists(os.path.join(cache_dir, file)):
-                logger.info(f"Cache {os.path.join(cache_dir, file)} exists")
-                return cache_dir
-            else:
-                os.makedirs(cache_dir, exist_ok=True)
-
-            src_path = f"{src_root}/{file}"
-            local_path = f"{cache_dir}/{file}"
-            files_to_download.append((src_path, local_path))
-        else:
-            raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-
-        from concurrent.futures import ThreadPoolExecutor
-
-        failed = False
-        with ThreadPoolExecutor(max_workers=min(len(files_to_download), 4)) as executor:
-            futures = [
-                (
-                    src_path,
-                    executor.submit(
-                        copy_from_src_to_dst, src_fs, src_path, local_fs, local_path
-                    ),
-                )
-                for src_path, local_path in files_to_download
-            ]
-            for src_path, future in futures:
-                if failed:
-                    future.cancel()
-                else:
-                    try:
-                        future.result()
-                    except:
-                        logger.error(f"Download {src_path} failed", exc_info=True)
-                        failed = True
-
-        if failed:
-            logger.warning(f"Removing cache directory: {cache_dir}")
-            shutil.rmtree(cache_dir, ignore_errors=True)
-            raise RuntimeError(
-                f"Failed to download model '{llm_family.model_name}' "
-                f"(size: {llm_spec.model_size_in_billions}, format: {llm_spec.model_format})"
-            )
-        return cache_dir
     else:
         raise ValueError(f"Unsupported URL scheme: {src_scheme}")
 
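Note: the block deleted above fanned the per-file copies out to a thread pool and aborted everything on the first failure. The same fail-fast pattern in a compact, self-contained form; `copy_one` is a hypothetical stand-in for the removed `copy_from_src_to_dst` helper:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def download_all(pairs, copy_one, max_workers=4):
        # pairs: [(src_path, local_path), ...]; copy_one(src, dst) raises on error.
        if not pairs:
            return
        with ThreadPoolExecutor(max_workers=min(len(pairs), max_workers)) as executor:
            futures = {executor.submit(copy_one, src, dst): src for src, dst in pairs}
            for future in as_completed(futures):
                exc = future.exception()
                if exc is not None:
                    # Cancel anything not yet started, then surface the first error.
                    for f in futures:
                        f.cancel()
                    raise RuntimeError(f"Download {futures[future]} failed") from exc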
@@ -597,7 +474,7 @@ def _get_meta_path(
             return os.path.join(cache_dir, "__valid_download")
         else:
             return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
+    elif model_format in ["ggufv2", "gptq", "awq", "fp8", "mlx"]:
         assert quantization is not None
         if model_hub == "huggingface":
             return os.path.join(cache_dir, f"__valid_download_{quantization}")
@@ -636,7 +513,7 @@ def _skip_download(
                 logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
                 return True
         return False
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
+    elif model_format in ["ggufv2", "gptq", "awq", "fp8", "mlx"]:
         assert quantization is not None
         return os.path.exists(
             _get_meta_path(cache_dir, model_format, model_hub, quantization)
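Note: both functions above implement the cache-validity convention: a finished download is recorded as an empty marker file inside the cache directory, and `_skip_download` simply re-derives that path and checks for it. Roughly, for the Hugging Face branch shown here (hypothetical helpers, simplified to the visible branches):

    import os

    def mark_complete(cache_dir: str, quantization: str) -> None:
        # A ggufv2/gptq/awq/fp8/mlx cache is considered valid once this marker exists.
        open(os.path.join(cache_dir, f"__valid_download_{quantization}"), "w").close()

    def is_complete(cache_dir: str, quantization: str) -> bool:
        return os.path.exists(os.path.join(cache_dir, f"__valid_download_{quantization}"))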
@@ -731,7 +608,7 @@ def cache_from_csghub(
     ):
         return cache_dir
 
-    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -745,7 +622,7 @@ def cache_from_csghub(
         )
         create_symlink(download_dir, cache_dir)
 
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
+    elif llm_spec.model_format in ["ggufv2"]:
         file_names, final_file_name, need_merge = _generate_model_file_names(
             llm_spec, quantization
         )
@@ -799,7 +676,7 @@ def cache_from_modelscope(
     ):
         return cache_dir
 
-    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -812,7 +689,7 @@ def cache_from_modelscope(
         )
         create_symlink(download_dir, cache_dir)
 
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
+    elif llm_spec.model_format in ["ggufv2"]:
         file_names, final_file_name, need_merge = _generate_model_file_names(
             llm_spec, quantization
         )
@@ -868,7 +745,7 @@ def cache_from_huggingface(
     if not IS_NEW_HUGGINGFACE_HUB:
         use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
 
-    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
         assert isinstance(llm_spec, (PytorchLLMSpecV1, MLXLLMSpecV1))
         download_dir = retry_download(
             huggingface_hub.snapshot_download,
@@ -884,8 +761,8 @@ def cache_from_huggingface(
         if IS_NEW_HUGGINGFACE_HUB:
             create_symlink(download_dir, cache_dir)
 
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
-        assert isinstance(llm_spec, GgmlLLMSpecV1)
+    elif llm_spec.model_format in ["ggufv2"]:
+        assert isinstance(llm_spec, LlamaCppLLMSpecV1)
         file_names, final_file_name, need_merge = _generate_model_file_names(
             llm_spec, quantization
         )
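Note: each hub branch above follows the same shape: wrap the hub's `snapshot_download` in a retry helper, then expose the downloaded directory under xinference's own cache layout via `create_symlink`. A generic retry wrapper in the spirit of `retry_download` (a hypothetical sketch, not the actual helper):

    import time

    def retry(fn, *args, attempts=3, delay=1.0, **kwargs):
        # Retry a flaky download callable a few times with a growing pause.
        for i in range(attempts):
            try:
                return fn(*args, **kwargs)
            except Exception:
                if i == attempts - 1:
                    raise
                time.sleep(delay * (i + 1))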