xinference 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (194)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +15 -34
  3. xinference/client/restful/restful_client.py +2 -2
  4. xinference/core/chat_interface.py +45 -10
  5. xinference/core/image_interface.py +9 -0
  6. xinference/core/model.py +8 -5
  7. xinference/core/scheduler.py +1 -2
  8. xinference/core/worker.py +49 -42
  9. xinference/deploy/cmdline.py +2 -2
  10. xinference/deploy/test/test_cmdline.py +7 -7
  11. xinference/model/audio/chattts.py +24 -9
  12. xinference/model/audio/core.py +8 -2
  13. xinference/model/audio/fish_speech.py +228 -0
  14. xinference/model/audio/model_spec.json +8 -0
  15. xinference/model/embedding/core.py +23 -1
  16. xinference/model/image/model_spec.json +2 -1
  17. xinference/model/image/model_spec_modelscope.json +2 -1
  18. xinference/model/image/stable_diffusion/core.py +49 -1
  19. xinference/model/llm/__init__.py +26 -27
  20. xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
  21. xinference/model/llm/llm_family.json +606 -1266
  22. xinference/model/llm/llm_family.py +16 -139
  23. xinference/model/llm/llm_family_modelscope.json +276 -313
  24. xinference/model/llm/lmdeploy/__init__.py +0 -0
  25. xinference/model/llm/lmdeploy/core.py +557 -0
  26. xinference/model/llm/memory.py +9 -9
  27. xinference/model/llm/sglang/core.py +2 -2
  28. xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
  29. xinference/model/llm/{pytorch → transformers}/cogvlm2.py +4 -45
  30. xinference/model/llm/transformers/cogvlm2_video.py +524 -0
  31. xinference/model/llm/{pytorch → transformers}/core.py +3 -10
  32. xinference/model/llm/{pytorch → transformers}/glm4v.py +2 -23
  33. xinference/model/llm/transformers/intern_vl.py +540 -0
  34. xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
  35. xinference/model/llm/{pytorch → transformers}/minicpmv25.py +2 -23
  36. xinference/model/llm/{pytorch → transformers}/minicpmv26.py +66 -41
  37. xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
  38. xinference/model/llm/{pytorch → transformers}/yi_vl.py +2 -24
  39. xinference/model/llm/utils.py +85 -70
  40. xinference/model/llm/vllm/core.py +110 -11
  41. xinference/model/utils.py +1 -95
  42. xinference/thirdparty/fish_speech/__init__.py +0 -0
  43. xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
  44. xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
  45. xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
  46. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  47. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  48. xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
  49. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  50. xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
  51. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  52. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
  53. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
  54. xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
  55. xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
  56. xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
  57. xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
  58. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  59. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
  60. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
  61. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
  62. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
  63. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
  64. xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
  65. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  66. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
  67. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
  68. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
  69. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
  70. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
  71. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
  72. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  73. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
  74. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
  75. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
  76. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
  77. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
  78. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
  79. xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
  80. xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
  81. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
  82. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
  83. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
  84. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
  85. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
  86. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
  87. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
  88. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
  89. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
  90. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
  91. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
  92. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
  93. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
  94. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
  95. xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
  96. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
  97. xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
  98. xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
  99. xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
  100. xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
  101. xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
  102. xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
  103. xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
  104. xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
  105. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  107. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
  108. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
  109. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  110. xinference/thirdparty/fish_speech/tools/api.py +495 -0
  111. xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
  112. xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
  113. xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
  114. xinference/thirdparty/fish_speech/tools/file.py +108 -0
  115. xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
  116. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  117. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
  118. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
  122. xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
  123. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
  124. xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
  126. xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
  127. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
  128. xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
  129. xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
  130. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  131. xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
  132. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
  133. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
  134. xinference/thirdparty/fish_speech/tools/webui.py +619 -0
  135. xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
  136. xinference/thirdparty/internvl/__init__.py +0 -0
  137. xinference/thirdparty/internvl/conversation.py +393 -0
  138. xinference/thirdparty/omnilmm/model/utils.py +16 -1
  139. xinference/web/ui/build/asset-manifest.json +3 -3
  140. xinference/web/ui/build/index.html +1 -1
  141. xinference/web/ui/build/static/js/main.661c7b0a.js +3 -0
  142. xinference/web/ui/build/static/js/{main.17ca0398.js.map → main.661c7b0a.js.map} +1 -1
  143. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
  144. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
  145. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
  146. xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
  147. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
  148. xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
  149. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
  150. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
  151. xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
  152. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
  153. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
  154. xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
  155. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
  156. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/METADATA +22 -13
  157. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/RECORD +170 -79
  158. xinference/locale/utils.py +0 -39
  159. xinference/locale/zh_CN.json +0 -26
  160. xinference/model/llm/ggml/tools/__init__.py +0 -15
  161. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
  162. xinference/model/llm/ggml/tools/gguf.py +0 -884
  163. xinference/model/llm/pytorch/__init__.py +0 -13
  164. xinference/model/llm/pytorch/baichuan.py +0 -81
  165. xinference/model/llm/pytorch/falcon.py +0 -138
  166. xinference/model/llm/pytorch/intern_vl.py +0 -352
  167. xinference/model/llm/pytorch/vicuna.py +0 -69
  168. xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
  169. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
  170. xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
  171. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
  172. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
  173. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
  174. xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
  175. xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
  176. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
  177. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
  178. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
  179. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
  180. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
  181. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
  182. /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
  183. /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
  184. /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
  185. /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
  186. /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
  187. /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
  188. /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
  189. /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
  190. /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
  191. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/LICENSE +0 -0
  192. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/WHEEL +0 -0
  193. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/entry_points.txt +0 -0
  194. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,6 @@
14
14
 
15
15
  import logging
16
16
  import os
17
- import shutil
18
17
  from threading import Lock
19
18
  from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
20
19
 
@@ -59,8 +58,8 @@ BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set()
59
58
  BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES: Set[str] = set()
60
59
 
61
60
 
62
- class GgmlLLMSpecV1(BaseModel):
63
- model_format: Literal["ggmlv3", "ggufv2"]
61
+ class LlamaCppLLMSpecV1(BaseModel):
62
+ model_format: Literal["ggufv2"]
64
63
  # Must in order that `str` first, then `int`
65
64
  model_size_in_billions: Union[str, int]
66
65
  quantizations: List[str]
@@ -85,7 +84,7 @@ class GgmlLLMSpecV1(BaseModel):
85
84
 
86
85
 
87
86
  class PytorchLLMSpecV1(BaseModel):
88
- model_format: Literal["pytorch", "gptq", "awq"]
87
+ model_format: Literal["pytorch", "gptq", "awq", "fp8"]
89
88
  # Must in order that `str` first, then `int`
90
89
  model_size_in_billions: Union[str, int]
91
90
  quantizations: List[str]
@@ -247,7 +246,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):
247
246
 
248
247
 
249
248
  LLMSpecV1 = Annotated[
250
- Union[GgmlLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
249
+ Union[LlamaCppLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
251
250
  Field(discriminator="model_format"),
252
251
  ]
253
252
 
@@ -272,6 +271,8 @@ VLLM_CLASSES: List[Type[LLM]] = []
272
271
 
273
272
  MLX_CLASSES: List[Type[LLM]] = []
274
273
 
274
+ LMDEPLOY_CLASSES: List[Type[LLM]] = []
275
+
275
276
  LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
276
277
  SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
277
278
 
@@ -308,13 +309,10 @@ def cache(
308
309
  if os.path.exists(legacy_cache_path):
309
310
  logger.info("Legacy cache path exists: %s", legacy_cache_path)
310
311
  return os.path.dirname(legacy_cache_path)
311
- elif download_from_self_hosted_storage() and is_self_hosted(llm_family, llm_spec):
312
- logger.info(f"Caching from self-hosted storage")
313
- return cache_from_self_hosted_storage(llm_family, llm_spec, quantization)
314
312
  else:
315
313
  if llm_spec.model_uri is not None:
316
314
  logger.info(f"Caching from URI: {llm_spec.model_uri}")
317
- return cache_from_uri(llm_family, llm_spec, quantization)
315
+ return cache_from_uri(llm_family, llm_spec)
318
316
  else:
319
317
  if llm_spec.model_hub == "huggingface":
320
318
  logger.info(f"Caching from Hugging Face: {llm_spec.model_id}")
@@ -329,68 +327,10 @@ def cache(
329
327
  raise ValueError(f"Unknown model hub: {llm_spec.model_hub}")
330
328
 
331
329
 
332
- SUPPORTED_SCHEMES = ["s3"]
333
-
334
-
335
- class AWSRegion:
336
- def __init__(self, region: str):
337
- self.region = region
338
- self.original_aws_default_region = None
339
-
340
- def __enter__(self):
341
- if "AWS_DEFAULT_REGION" in os.environ:
342
- self.original_aws_default_region = os.environ["AWS_DEFAULT_REGION"]
343
- os.environ["AWS_DEFAULT_REGION"] = self.region
344
-
345
- def __exit__(self, exc_type, exc_value, traceback):
346
- if self.original_aws_default_region:
347
- os.environ["AWS_DEFAULT_REGION"] = self.original_aws_default_region
348
- else:
349
- del os.environ["AWS_DEFAULT_REGION"]
350
-
351
-
352
- def is_self_hosted(
353
- llm_family: LLMFamilyV1,
354
- llm_spec: "LLMSpecV1",
355
- ):
356
- from fsspec import AbstractFileSystem, filesystem
357
-
358
- with AWSRegion("cn-northwest-1"):
359
- src_fs: AbstractFileSystem = filesystem("s3", anon=True)
360
- model_dir = (
361
- f"/xinference-models/llm/"
362
- f"{llm_family.model_name}-{llm_spec.model_format}-{llm_spec.model_size_in_billions}b"
363
- )
364
- return src_fs.exists(model_dir)
365
-
366
-
367
- def cache_from_self_hosted_storage(
368
- llm_family: LLMFamilyV1,
369
- llm_spec: "LLMSpecV1",
370
- quantization: Optional[str] = None,
371
- ) -> str:
372
- with AWSRegion("cn-northwest-1"):
373
- llm_spec = llm_spec.copy()
374
- llm_spec.model_uri = (
375
- f"s3://xinference-models/llm/"
376
- f"{llm_family.model_name}-{llm_spec.model_format}-{llm_spec.model_size_in_billions}b"
377
- )
378
-
379
- return cache_from_uri(
380
- llm_family, llm_spec, quantization, self_hosted_storage=True
381
- )
382
-
383
-
384
330
  def cache_from_uri(
385
331
  llm_family: LLMFamilyV1,
386
332
  llm_spec: "LLMSpecV1",
387
- quantization: Optional[str] = None,
388
- self_hosted_storage: bool = False,
389
333
  ) -> str:
390
- from fsspec import AbstractFileSystem, filesystem
391
-
392
- from ..utils import copy_from_src_to_dst
393
-
394
334
  cache_dir_name = (
395
335
  f"{llm_family.model_name}-{llm_spec.model_format}"
396
336
  f"-{llm_spec.model_size_in_billions}b"
@@ -415,69 +355,6 @@ def cache_from_uri(
415
355
  else:
416
356
  os.symlink(src_root, cache_dir, target_is_directory=True)
417
357
  return cache_dir
418
- elif src_scheme in SUPPORTED_SCHEMES:
419
- # use anonymous connection for self-hosted storage.
420
- src_fs: AbstractFileSystem = filesystem(src_scheme, anon=self_hosted_storage)
421
- local_fs: AbstractFileSystem = filesystem("file")
422
-
423
- files_to_download = []
424
- if llm_spec.model_format == "pytorch":
425
- if os.path.exists(cache_dir):
426
- logger.info(f"Cache {cache_dir} exists")
427
- return cache_dir
428
- else:
429
- os.makedirs(cache_dir, exist_ok=True)
430
-
431
- for path, _, files in src_fs.walk(llm_spec.model_uri):
432
- for file in files:
433
- src_path = f"{path}/{file}"
434
- local_path = src_path.replace(src_root, cache_dir)
435
- files_to_download.append((src_path, local_path))
436
- elif llm_spec.model_format == "ggmlv3":
437
- file = llm_spec.model_file_name_template.format(quantization=quantization)
438
- if os.path.exists(os.path.join(cache_dir, file)):
439
- logger.info(f"Cache {os.path.join(cache_dir, file)} exists")
440
- return cache_dir
441
- else:
442
- os.makedirs(cache_dir, exist_ok=True)
443
-
444
- src_path = f"{src_root}/{file}"
445
- local_path = f"{cache_dir}/{file}"
446
- files_to_download.append((src_path, local_path))
447
- else:
448
- raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
449
-
450
- from concurrent.futures import ThreadPoolExecutor
451
-
452
- failed = False
453
- with ThreadPoolExecutor(max_workers=min(len(files_to_download), 4)) as executor:
454
- futures = [
455
- (
456
- src_path,
457
- executor.submit(
458
- copy_from_src_to_dst, src_fs, src_path, local_fs, local_path
459
- ),
460
- )
461
- for src_path, local_path in files_to_download
462
- ]
463
- for src_path, future in futures:
464
- if failed:
465
- future.cancel()
466
- else:
467
- try:
468
- future.result()
469
- except:
470
- logger.error(f"Download {src_path} failed", exc_info=True)
471
- failed = True
472
-
473
- if failed:
474
- logger.warning(f"Removing cache directory: {cache_dir}")
475
- shutil.rmtree(cache_dir, ignore_errors=True)
476
- raise RuntimeError(
477
- f"Failed to download model '{llm_family.model_name}' "
478
- f"(size: {llm_spec.model_size_in_billions}, format: {llm_spec.model_format})"
479
- )
480
- return cache_dir
481
358
  else:
482
359
  raise ValueError(f"Unsupported URL scheme: {src_scheme}")
483
360
 
@@ -597,7 +474,7 @@ def _get_meta_path(
597
474
  return os.path.join(cache_dir, "__valid_download")
598
475
  else:
599
476
  return os.path.join(cache_dir, f"__valid_download_{model_hub}")
600
- elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
477
+ elif model_format in ["ggufv2", "gptq", "awq", "fp8", "mlx"]:
601
478
  assert quantization is not None
602
479
  if model_hub == "huggingface":
603
480
  return os.path.join(cache_dir, f"__valid_download_{quantization}")
@@ -636,7 +513,7 @@ def _skip_download(
636
513
  logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
637
514
  return True
638
515
  return False
639
- elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
516
+ elif model_format in ["ggufv2", "gptq", "awq", "fp8", "mlx"]:
640
517
  assert quantization is not None
641
518
  return os.path.exists(
642
519
  _get_meta_path(cache_dir, model_format, model_hub, quantization)
@@ -731,7 +608,7 @@ def cache_from_csghub(
731
608
  ):
732
609
  return cache_dir
733
610
 
734
- if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
611
+ if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
735
612
  download_dir = retry_download(
736
613
  snapshot_download,
737
614
  llm_family.model_name,
@@ -745,7 +622,7 @@ def cache_from_csghub(
745
622
  )
746
623
  create_symlink(download_dir, cache_dir)
747
624
 
748
- elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
625
+ elif llm_spec.model_format in ["ggufv2"]:
749
626
  file_names, final_file_name, need_merge = _generate_model_file_names(
750
627
  llm_spec, quantization
751
628
  )
@@ -799,7 +676,7 @@ def cache_from_modelscope(
799
676
  ):
800
677
  return cache_dir
801
678
 
802
- if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
679
+ if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
803
680
  download_dir = retry_download(
804
681
  snapshot_download,
805
682
  llm_family.model_name,
@@ -812,7 +689,7 @@ def cache_from_modelscope(
812
689
  )
813
690
  create_symlink(download_dir, cache_dir)
814
691
 
815
- elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
692
+ elif llm_spec.model_format in ["ggufv2"]:
816
693
  file_names, final_file_name, need_merge = _generate_model_file_names(
817
694
  llm_spec, quantization
818
695
  )
@@ -868,7 +745,7 @@ def cache_from_huggingface(
868
745
  if not IS_NEW_HUGGINGFACE_HUB:
869
746
  use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
870
747
 
871
- if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
748
+ if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
872
749
  assert isinstance(llm_spec, (PytorchLLMSpecV1, MLXLLMSpecV1))
873
750
  download_dir = retry_download(
874
751
  huggingface_hub.snapshot_download,
@@ -884,8 +761,8 @@ def cache_from_huggingface(
884
761
  if IS_NEW_HUGGINGFACE_HUB:
885
762
  create_symlink(download_dir, cache_dir)
886
763
 
887
- elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
888
- assert isinstance(llm_spec, GgmlLLMSpecV1)
764
+ elif llm_spec.model_format in ["ggufv2"]:
765
+ assert isinstance(llm_spec, LlamaCppLLMSpecV1)
889
766
  file_names, final_file_name, need_merge = _generate_model_file_names(
890
767
  llm_spec, quantization
891
768
  )