xinference 0.14.1.post1__py3-none-any.whl → 0.14.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +15 -34
  3. xinference/client/restful/restful_client.py +2 -2
  4. xinference/core/chat_interface.py +44 -9
  5. xinference/core/model.py +4 -4
  6. xinference/core/scheduler.py +1 -2
  7. xinference/core/worker.py +1 -1
  8. xinference/deploy/cmdline.py +2 -2
  9. xinference/deploy/test/test_cmdline.py +7 -7
  10. xinference/model/llm/__init__.py +20 -27
  11. xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
  12. xinference/model/llm/llm_family.json +448 -1153
  13. xinference/model/llm/llm_family.py +14 -139
  14. xinference/model/llm/llm_family_modelscope.json +230 -313
  15. xinference/model/llm/memory.py +9 -9
  16. xinference/model/llm/sglang/core.py +2 -2
  17. xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
  18. xinference/model/llm/{pytorch → transformers}/core.py +2 -10
  19. xinference/model/llm/transformers/intern_vl.py +457 -0
  20. xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
  21. xinference/model/llm/{pytorch → transformers}/minicpmv26.py +67 -22
  22. xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
  23. xinference/model/llm/utils.py +76 -70
  24. xinference/model/llm/vllm/core.py +110 -11
  25. xinference/model/utils.py +1 -95
  26. xinference/thirdparty/internvl/__init__.py +0 -0
  27. xinference/thirdparty/internvl/conversation.py +393 -0
  28. xinference/thirdparty/omnilmm/model/utils.py +16 -1
  29. xinference/web/ui/build/asset-manifest.json +3 -3
  30. xinference/web/ui/build/index.html +1 -1
  31. xinference/web/ui/build/static/js/main.ffc26121.js +3 -0
  32. xinference/web/ui/build/static/js/main.ffc26121.js.map +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
  45. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/METADATA +5 -8
  46. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/RECORD +63 -70
  47. xinference/locale/utils.py +0 -39
  48. xinference/locale/zh_CN.json +0 -26
  49. xinference/model/llm/ggml/tools/__init__.py +0 -15
  50. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
  51. xinference/model/llm/ggml/tools/gguf.py +0 -884
  52. xinference/model/llm/pytorch/__init__.py +0 -13
  53. xinference/model/llm/pytorch/baichuan.py +0 -81
  54. xinference/model/llm/pytorch/falcon.py +0 -138
  55. xinference/model/llm/pytorch/intern_vl.py +0 -352
  56. xinference/model/llm/pytorch/vicuna.py +0 -69
  57. xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
  58. xinference/web/ui/build/static/js/main.17ca0398.js.map +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
  71. /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
  72. /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
  73. /xinference/model/llm/{pytorch → transformers}/cogvlm2.py +0 -0
  74. /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
  75. /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
  76. /xinference/model/llm/{pytorch → transformers}/glm4v.py +0 -0
  77. /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
  78. /xinference/model/llm/{pytorch → transformers}/minicpmv25.py +0 -0
  79. /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
  80. /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
  81. /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
  82. /xinference/model/llm/{pytorch → transformers}/yi_vl.py +0 -0
  83. /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.ffc26121.js.LICENSE.txt} +0 -0
  84. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/LICENSE +0 -0
  85. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/WHEEL +0 -0
  86. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/entry_points.txt +0 -0
  87. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py
@@ -14,7 +14,6 @@
 
 import logging
 import os
-import shutil
 from threading import Lock
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
 
@@ -59,8 +58,8 @@ BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set()
 BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES: Set[str] = set()
 
 
-class GgmlLLMSpecV1(BaseModel):
-    model_format: Literal["ggmlv3", "ggufv2"]
+class LlamaCppLLMSpecV1(BaseModel):
+    model_format: Literal["ggufv2"]
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
     quantizations: List[str]
@@ -85,7 +84,7 @@ class GgmlLLMSpecV1(BaseModel):
 
 
 class PytorchLLMSpecV1(BaseModel):
-    model_format: Literal["pytorch", "gptq", "awq"]
+    model_format: Literal["pytorch", "gptq", "awq", "fp8"]
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
     quantizations: List[str]
@@ -247,7 +246,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):
 
 
 LLMSpecV1 = Annotated[
-    Union[GgmlLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
+    Union[LlamaCppLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
     Field(discriminator="model_format"),
 ]
 
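Context for this hunk: model_format is the union's pydantic discriminator, so dropping "ggmlv3" from the Literal means old GGML specs now fail validation instead of loading. A minimal illustrative sketch of that dispatch, assuming pydantic v1-style parse_obj_as (class and field names mirror the diff; everything else is made up for the example):

from typing import List, Union

from pydantic import BaseModel, Field, parse_obj_as
from typing_extensions import Annotated, Literal


class LlamaCppLLMSpecV1(BaseModel):
    model_format: Literal["ggufv2"]
    model_size_in_billions: Union[str, int]
    quantizations: List[str]


class PytorchLLMSpecV1(BaseModel):
    model_format: Literal["pytorch", "gptq", "awq", "fp8"]
    model_size_in_billions: Union[str, int]
    quantizations: List[str]


LLMSpecV1 = Annotated[
    Union[LlamaCppLLMSpecV1, PytorchLLMSpecV1],
    Field(discriminator="model_format"),
]

# "ggufv2" routes to LlamaCppLLMSpecV1 via the discriminator ...
spec = parse_obj_as(
    LLMSpecV1,
    {"model_format": "ggufv2", "model_size_in_billions": 7, "quantizations": ["Q4_K_M"]},
)
assert isinstance(spec, LlamaCppLLMSpecV1)
# ... while "ggmlv3" matches no Literal and now raises a ValidationError.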
@@ -308,13 +307,10 @@ def cache(
     if os.path.exists(legacy_cache_path):
         logger.info("Legacy cache path exists: %s", legacy_cache_path)
         return os.path.dirname(legacy_cache_path)
-    elif download_from_self_hosted_storage() and is_self_hosted(llm_family, llm_spec):
-        logger.info(f"Caching from self-hosted storage")
-        return cache_from_self_hosted_storage(llm_family, llm_spec, quantization)
     else:
         if llm_spec.model_uri is not None:
             logger.info(f"Caching from URI: {llm_spec.model_uri}")
-            return cache_from_uri(llm_family, llm_spec, quantization)
+            return cache_from_uri(llm_family, llm_spec)
         else:
             if llm_spec.model_hub == "huggingface":
                 logger.info(f"Caching from Hugging Face: {llm_spec.model_id}")
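With the self-hosted branch deleted, cache() falls back through three steps. A condensed, runnable sketch of the remaining control flow (the downloaders are stubs; only the branch order is taken from the diff):

import os
from dataclasses import dataclass
from typing import Optional


@dataclass
class Spec:
    model_uri: Optional[str] = None
    model_hub: str = "huggingface"


def cache_from_uri(spec: Spec) -> str:  # stub
    return f"/cache/from-uri/{spec.model_uri}"


def cache_from_hub(spec: Spec, quantization) -> str:  # stub
    return f"/cache/from-{spec.model_hub}"


def resolve_cache(spec: Spec, quantization=None, legacy_cache_path="/nonexistent") -> str:
    # 1. A pre-existing legacy cache directory always wins.
    if os.path.exists(legacy_cache_path):
        return os.path.dirname(legacy_cache_path)
    # 2. An explicit model_uri is materialized directly; note that
    #    cache_from_uri no longer receives the quantization argument.
    if spec.model_uri is not None:
        return cache_from_uri(spec)
    # 3. Otherwise the spec's model_hub selects the downloader.
    return cache_from_hub(spec, quantization)


assert resolve_cache(Spec(model_uri="file:///models/x")) == "/cache/from-uri/file:///models/x"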
@@ -329,68 +325,10 @@ def cache(
                 raise ValueError(f"Unknown model hub: {llm_spec.model_hub}")
 
 
-SUPPORTED_SCHEMES = ["s3"]
-
-
-class AWSRegion:
-    def __init__(self, region: str):
-        self.region = region
-        self.original_aws_default_region = None
-
-    def __enter__(self):
-        if "AWS_DEFAULT_REGION" in os.environ:
-            self.original_aws_default_region = os.environ["AWS_DEFAULT_REGION"]
-        os.environ["AWS_DEFAULT_REGION"] = self.region
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        if self.original_aws_default_region:
-            os.environ["AWS_DEFAULT_REGION"] = self.original_aws_default_region
-        else:
-            del os.environ["AWS_DEFAULT_REGION"]
-
-
-def is_self_hosted(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-):
-    from fsspec import AbstractFileSystem, filesystem
-
-    with AWSRegion("cn-northwest-1"):
-        src_fs: AbstractFileSystem = filesystem("s3", anon=True)
-        model_dir = (
-            f"/xinference-models/llm/"
-            f"{llm_family.model_name}-{llm_spec.model_format}-{llm_spec.model_size_in_billions}b"
-        )
-        return src_fs.exists(model_dir)
-
-
-def cache_from_self_hosted_storage(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-) -> str:
-    with AWSRegion("cn-northwest-1"):
-        llm_spec = llm_spec.copy()
-        llm_spec.model_uri = (
-            f"s3://xinference-models/llm/"
-            f"{llm_family.model_name}-{llm_spec.model_format}-{llm_spec.model_size_in_billions}b"
-        )
-
-    return cache_from_uri(
-        llm_family, llm_spec, quantization, self_hosted_storage=True
-    )
-
-
 def cache_from_uri(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-    self_hosted_storage: bool = False,
 ) -> str:
-    from fsspec import AbstractFileSystem, filesystem
-
-    from ..utils import copy_from_src_to_dst
-
     cache_dir_name = (
         f"{llm_family.model_name}-{llm_spec.model_format}"
         f"-{llm_spec.model_size_in_billions}b"
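The deleted AWSRegion helper is the classic save-and-restore environment-variable context manager. For reference, an equivalent written with contextlib (illustrative only, not code from the package):

import os
from contextlib import contextmanager


@contextmanager
def aws_region(region: str):
    # Remember the previous value (None if the variable was unset).
    original = os.environ.get("AWS_DEFAULT_REGION")
    os.environ["AWS_DEFAULT_REGION"] = region
    try:
        yield
    finally:
        # Restore the previous value, or remove the variable if it was unset.
        if original is not None:
            os.environ["AWS_DEFAULT_REGION"] = original
        else:
            del os.environ["AWS_DEFAULT_REGION"]

Using os.environ.get with an is-not-None check also restores a value that was originally an empty string, which the truthiness test in the removed class would silently drop.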
@@ -415,69 +353,6 @@ def cache_from_uri(
     else:
         os.symlink(src_root, cache_dir, target_is_directory=True)
         return cache_dir
-    elif src_scheme in SUPPORTED_SCHEMES:
-        # use anonymous connection for self-hosted storage.
-        src_fs: AbstractFileSystem = filesystem(src_scheme, anon=self_hosted_storage)
-        local_fs: AbstractFileSystem = filesystem("file")
-
-        files_to_download = []
-        if llm_spec.model_format == "pytorch":
-            if os.path.exists(cache_dir):
-                logger.info(f"Cache {cache_dir} exists")
-                return cache_dir
-            else:
-                os.makedirs(cache_dir, exist_ok=True)
-
-            for path, _, files in src_fs.walk(llm_spec.model_uri):
-                for file in files:
-                    src_path = f"{path}/{file}"
-                    local_path = src_path.replace(src_root, cache_dir)
-                    files_to_download.append((src_path, local_path))
-        elif llm_spec.model_format == "ggmlv3":
-            file = llm_spec.model_file_name_template.format(quantization=quantization)
-            if os.path.exists(os.path.join(cache_dir, file)):
-                logger.info(f"Cache {os.path.join(cache_dir, file)} exists")
-                return cache_dir
-            else:
-                os.makedirs(cache_dir, exist_ok=True)
-
-            src_path = f"{src_root}/{file}"
-            local_path = f"{cache_dir}/{file}"
-            files_to_download.append((src_path, local_path))
-        else:
-            raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-
-        from concurrent.futures import ThreadPoolExecutor
-
-        failed = False
-        with ThreadPoolExecutor(max_workers=min(len(files_to_download), 4)) as executor:
-            futures = [
-                (
-                    src_path,
-                    executor.submit(
-                        copy_from_src_to_dst, src_fs, src_path, local_fs, local_path
-                    ),
-                )
-                for src_path, local_path in files_to_download
-            ]
-            for src_path, future in futures:
-                if failed:
-                    future.cancel()
-                else:
-                    try:
-                        future.result()
-                    except:
-                        logger.error(f"Download {src_path} failed", exc_info=True)
-                        failed = True
-
-        if failed:
-            logger.warning(f"Removing cache directory: {cache_dir}")
-            shutil.rmtree(cache_dir, ignore_errors=True)
-            raise RuntimeError(
-                f"Failed to download model '{llm_family.model_name}' "
-                f"(size: {llm_spec.model_size_in_billions}, format: {llm_spec.model_format})"
-            )
-        return cache_dir
     else:
         raise ValueError(f"Unsupported URL scheme: {src_scheme}")
 
@@ -597,7 +472,7 @@ def _get_meta_path(
             return os.path.join(cache_dir, "__valid_download")
         else:
             return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
+    elif model_format in ["ggufv2", "gptq", "awq", "fp8", "mlx"]:
         assert quantization is not None
         if model_hub == "huggingface":
             return os.path.join(cache_dir, f"__valid_download_{quantization}")
@@ -636,7 +511,7 @@ def _skip_download(
             logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
             return True
         return False
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
+    elif model_format in ["ggufv2", "gptq", "awq", "fp8", "mlx"]:
         assert quantization is not None
         return os.path.exists(
             _get_meta_path(cache_dir, model_format, model_hub, quantization)
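Both helpers implement a completion-marker pattern: a marker file is written only after a download fully succeeds, and _skip_download short-circuits only when it exists, so interrupted fetches get retried. A minimal sketch under that assumption (the huggingface branches mirror the diff; the naming for other hubs is inferred, not shown in the hunk):

import os


def meta_path(cache_dir, model_format, model_hub, quantization=None):
    if model_format == "pytorch":
        suffix = "" if model_hub == "huggingface" else f"_{model_hub}"
        return os.path.join(cache_dir, f"__valid_download{suffix}")
    elif model_format in ["ggufv2", "gptq", "awq", "fp8", "mlx"]:
        assert quantization is not None
        suffix = "" if model_hub == "huggingface" else f"_{model_hub}"
        return os.path.join(cache_dir, f"__valid_download{suffix}_{quantization}")
    raise ValueError(f"Unsupported format: {model_format}")


def skip_download(cache_dir, model_format, model_hub, quantization=None):
    # Skip only when the completion marker is present.
    return os.path.exists(meta_path(cache_dir, model_format, model_hub, quantization))


def mark_valid(cache_dir, model_format, model_hub, quantization=None):
    # Written only after the fetch fully succeeds.
    with open(meta_path(cache_dir, model_format, model_hub, quantization), "w"):
        pass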
@@ -731,7 +606,7 @@ def cache_from_csghub(
     ):
         return cache_dir
 
-    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -745,7 +620,7 @@
         )
         create_symlink(download_dir, cache_dir)
 
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
+    elif llm_spec.model_format in ["ggufv2"]:
         file_names, final_file_name, need_merge = _generate_model_file_names(
             llm_spec, quantization
         )
@@ -799,7 +674,7 @@ def cache_from_modelscope(
     ):
         return cache_dir
 
-    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -812,7 +687,7 @@
         )
         create_symlink(download_dir, cache_dir)
 
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
+    elif llm_spec.model_format in ["ggufv2"]:
         file_names, final_file_name, need_merge = _generate_model_file_names(
             llm_spec, quantization
         )
@@ -868,7 +743,7 @@ def cache_from_huggingface(
     if not IS_NEW_HUGGINGFACE_HUB:
         use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
 
-    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
         assert isinstance(llm_spec, (PytorchLLMSpecV1, MLXLLMSpecV1))
         download_dir = retry_download(
             huggingface_hub.snapshot_download,
@@ -884,8 +759,8 @@
         if IS_NEW_HUGGINGFACE_HUB:
             create_symlink(download_dir, cache_dir)
 
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
-        assert isinstance(llm_spec, GgmlLLMSpecV1)
+    elif llm_spec.model_format in ["ggufv2"]:
+        assert isinstance(llm_spec, LlamaCppLLMSpecV1)
         file_names, final_file_name, need_merge = _generate_model_file_names(
             llm_spec, quantization
         )