xinference 0.11.1__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +30 -0
  3. xinference/client/restful/restful_client.py +29 -0
  4. xinference/core/cache_tracker.py +12 -1
  5. xinference/core/supervisor.py +30 -2
  6. xinference/core/utils.py +12 -0
  7. xinference/core/worker.py +4 -1
  8. xinference/deploy/cmdline.py +126 -0
  9. xinference/deploy/test/test_cmdline.py +24 -0
  10. xinference/model/llm/__init__.py +2 -0
  11. xinference/model/llm/llm_family.json +501 -6
  12. xinference/model/llm/llm_family.py +84 -10
  13. xinference/model/llm/llm_family_modelscope.json +198 -7
  14. xinference/model/llm/memory.py +332 -0
  15. xinference/model/llm/pytorch/core.py +2 -0
  16. xinference/model/llm/pytorch/intern_vl.py +387 -0
  17. xinference/model/llm/utils.py +13 -0
  18. xinference/model/llm/vllm/core.py +5 -2
  19. xinference/model/rerank/core.py +23 -1
  20. xinference/model/utils.py +17 -7
  21. xinference/thirdparty/deepseek_vl/models/processing_vlm.py +1 -1
  22. xinference/thirdparty/deepseek_vl/models/siglip_vit.py +2 -2
  23. xinference/thirdparty/llava/mm_utils.py +3 -2
  24. xinference/thirdparty/llava/model/llava_arch.py +1 -1
  25. xinference/thirdparty/omnilmm/chat.py +6 -5
  26. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/METADATA +8 -7
  27. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/RECORD +31 -29
  28. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/LICENSE +0 -0
  29. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/WHEEL +0 -0
  30. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/entry_points.txt +0 -0
  31. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/top_level.txt +0 -0
@@ -34,6 +34,8 @@ from ..._compat import (
 )
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
 from ..utils import (
+    IS_NEW_HUGGINGFACE_HUB,
+    create_symlink,
     download_from_modelscope,
     is_valid_model_uri,
     parse_uri,
@@ -447,6 +449,61 @@ def cache_from_uri(
         raise ValueError(f"Unsupported URL scheme: {src_scheme}")
 
 
+def cache_model_config(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+):
+    """Download model config.json into cache_dir,
+    returns local filepath
+    """
+    cache_dir = _get_cache_dir_for_model_mem(llm_family, llm_spec)
+    config_file = os.path.join(cache_dir, "config.json")
+    if not os.path.islink(config_file) and not os.path.exists(config_file):
+        os.makedirs(cache_dir, exist_ok=True)
+        if llm_spec.model_hub == "huggingface":
+            from huggingface_hub import hf_hub_download
+
+            hf_hub_download(
+                repo_id=llm_spec.model_id, filename="config.json", local_dir=cache_dir
+            )
+        else:
+            from modelscope.hub.file_download import model_file_download
+
+            download_path = model_file_download(
+                model_id=llm_spec.model_id, file_path="config.json"
+            )
+            os.symlink(download_path, config_file)
+    return config_file
+
+
+def _get_cache_dir_for_model_mem(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    create_if_not_exist=True,
+):
+    """
+    For cal-model-mem only. (might called from supervisor / cli)
+    Temporary use separate dir from worker's cache_dir, due to issue of different style of symlink.
+    """
+    quant_suffix = ""
+    for q in llm_spec.quantizations:
+        if llm_spec.model_id and q in llm_spec.model_id:
+            quant_suffix = q
+            break
+    cache_dir_name = (
+        f"{llm_family.model_name}-{llm_spec.model_format}"
+        f"-{llm_spec.model_size_in_billions}b"
+    )
+    if quant_suffix:
+        cache_dir_name += f"-{quant_suffix}"
+    cache_dir = os.path.realpath(
+        os.path.join(XINFERENCE_CACHE_DIR, "model_mem", cache_dir_name)
+    )
+    if create_if_not_exist and not os.path.exists(cache_dir):
+        os.makedirs(cache_dir, exist_ok=True)
+    return cache_dir
+
+
 def _get_cache_dir(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
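cache_model_config and _get_cache_dir_for_model_mem back the new model-memory estimation feature in this release (the "cal-model-mem" flow mentioned in the docstring, implemented in xinference/model/llm/memory.py and wired into cmdline.py): only config.json is fetched, into a dedicated model_mem cache, so memory can be estimated without downloading weights. A hedged sketch of how a caller might drive these helpers — the match_llm lookup and its (family, spec, quantization) return shape follow the existing API, and the model name here is only an example:

    from xinference.model.llm.llm_family import cache_model_config, match_llm

    # Illustrative only: look up a registered family/spec, then fetch just its
    # config.json so memory usage can be estimated without pulling the weights.
    match_result = match_llm("qwen1.5-chat", model_format="pytorch")
    if match_result is not None:
        llm_family, llm_spec, _quantization = match_result
        config_path = cache_model_config(llm_family, llm_spec)
        # e.g. <XINFERENCE_CACHE_DIR>/model_mem/qwen1.5-chat-pytorch-7b/config.json
        print(config_path)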
@@ -625,10 +682,7 @@ def cache_from_modelscope(
             llm_spec.model_id,
             revision=llm_spec.model_revision,
         )
-        for subdir, dirs, files in os.walk(download_dir):
-            for file in files:
-                relpath = os.path.relpath(os.path.join(subdir, file), download_dir)
-                symlink_local_file(os.path.join(subdir, file), cache_dir, relpath)
+        create_symlink(download_dir, cache_dir)
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
         file_names, final_file_name, need_merge = _generate_model_file_names(
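create_symlink replaces the inline os.walk loop removed above and is imported from ..utils (see the import hunk at the top of this diff). Judging from the deleted code, it is presumably a straight extraction of that loop into xinference/model/utils.py; the sketch below is an assumption based on the removed lines, not the actual implementation, and the symlink_local_file helper shown here is likewise an approximation:

    import os

    def symlink_local_file(path: str, cache_dir: str, relpath: str) -> str:
        # Assumed helper: link cache_dir/relpath -> path, creating parent dirs first.
        full_path = os.path.join(cache_dir, relpath)
        os.makedirs(os.path.dirname(full_path), exist_ok=True)
        if os.path.lexists(full_path):
            os.remove(full_path)
        os.symlink(os.path.abspath(path), full_path)
        return full_path

    def create_symlink(download_dir: str, cache_dir: str):
        # Mirror every file under download_dir into cache_dir as symlinks,
        # preserving the relative layout -- the same behaviour as the loop
        # this release removed from cache_from_modelscope.
        for subdir, _dirs, files in os.walk(download_dir):
            for file in files:
                relpath = os.path.relpath(os.path.join(subdir, file), download_dir)
                symlink_local_file(os.path.join(subdir, file), cache_dir, relpath)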
@@ -682,9 +736,13 @@ def cache_from_huggingface(
     ):
         return cache_dir
 
+    use_symlinks = {}
+    if not IS_NEW_HUGGINGFACE_HUB:
+        use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
+
     if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
         assert isinstance(llm_spec, PytorchLLMSpecV1)
-        retry_download(
+        download_dir = retry_download(
            huggingface_hub.snapshot_download,
            llm_family.model_name,
            {
@@ -693,9 +751,10 @@
            },
            llm_spec.model_id,
            revision=llm_spec.model_revision,
-           local_dir=cache_dir,
-           local_dir_use_symlinks=True,
+           **use_symlinks,
        )
+        if IS_NEW_HUGGINGFACE_HUB:
+            create_symlink(download_dir, cache_dir)
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
         assert isinstance(llm_spec, GgmlLLMSpecV1)
@@ -704,7 +763,7 @@
         )
 
         for file_name in file_names:
-            retry_download(
+            download_file_path = retry_download(
                huggingface_hub.hf_hub_download,
                llm_family.model_name,
                {
@@ -714,9 +773,10 @@
                llm_spec.model_id,
                revision=llm_spec.model_revision,
                filename=file_name,
-               local_dir=cache_dir,
-               local_dir_use_symlinks=True,
+               **use_symlinks,
            )
+            if IS_NEW_HUGGINGFACE_HUB:
+                symlink_local_file(download_file_path, cache_dir, file_name)
 
     if need_merge:
         _merge_cached_files(cache_dir, file_names, final_file_name)
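The use_symlinks indirection above exists because recent huggingface_hub releases reworked local_dir downloads: local_dir_use_symlinks is deprecated there and the download call returns a real path instead, so on new versions xinference downloads into the hub cache and symlinks into its own cache afterwards via create_symlink / symlink_local_file. This hunk does not show how IS_NEW_HUGGINGFACE_HUB is computed (it comes from the xinference/model/utils.py changes in this release); one plausible shape, offered purely as an assumption, is a simple version gate:

    # Assumption: gate on the huggingface_hub version that changed local_dir
    # handling. The exact threshold xinference uses may differ; see
    # xinference/model/utils.py in this release.
    import huggingface_hub
    from packaging import version

    IS_NEW_HUGGINGFACE_HUB = version.parse(huggingface_hub.__version__) >= version.parse(
        "0.23.0"
    )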
@@ -823,6 +883,20 @@ def match_model_size(
     return False
 
 
+def convert_model_size_to_float(
+    model_size_in_billions: Union[float, int, str]
+) -> float:
+    if isinstance(model_size_in_billions, str):
+        if "_" in model_size_in_billions:
+            ms = model_size_in_billions.replace("_", ".")
+            return float(ms)
+        elif "." in model_size_in_billions:
+            return float(model_size_in_billions)
+        else:
+            return int(model_size_in_billions)
+    return model_size_in_billions
+
+
 def match_llm(
     model_name: str,
     model_format: Optional[str] = None,
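convert_model_size_to_float normalizes the model_size_in_billions field, which the family JSON stores either as a number or as an underscore-separated string (for example the "1_3" and "6_7" deepseek-coder sizes added further down), so the memory estimator can work with a plain numeric value. For instance:

    convert_model_size_to_float("1_3")  # -> 1.3
    convert_model_size_to_float("6_7")  # -> 6.7
    convert_model_size_to_float("7")    # -> 7
    convert_model_size_to_float(67)     # -> 67 (non-strings pass through unchanged)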
@@ -2430,6 +2430,32 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 65536,
+        "model_name": "codeqwen1.5",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/CodeQwen1.5-7B",
+                "model_hub": "modelscope"
+            }
+        ]
+    },
     {
         "version": 1,
         "context_length": 65536,
@@ -2548,6 +2574,43 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 4096,
+        "model_name": "deepseek",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "DDeepSeek LLM, trained from scratch on a vast dataset of 2 trillion tokens in both English and Chinese. ",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-llm-7b-base",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 67,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-llm-67b-base",
+                "model_hub": "modelscope"
+            }
+        ]
+    },
     {
         "version": 1,
         "context_length": 4096,
@@ -2600,7 +2663,55 @@
     },
     {
         "version": 1,
-        "context_length": 4096,
+        "context_length": 16384,
+        "model_name": "deepseek-coder",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "Deepseek Coder is composed of a series of code language models, each trained from scratch on 2T tokens, with a composition of 87% code and 13% natural language in both English and Chinese.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "1_3",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-coder-1.3b-base",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "6_7",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-coder-6.7b-base",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 33,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-coder-33b-base",
+                "model_hub": "modelscope"
+            }
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 16384,
         "model_name": "deepseek-coder-instruct",
         "model_lang": [
             "en",
@@ -3389,7 +3500,7 @@
             "ar"
         ],
         "model_ability": [
-            "generate"
+            "chat"
         ],
         "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
         "model_specs": [
@@ -3408,11 +3519,12 @@
                 "model_size_in_billions": 35,
                 "quantizations": [
                     "Q2_K",
+                    "Q3_K_M",
                     "Q4_K_M",
                     "Q5_K_M"
                 ],
                 "model_id": "mirror013/C4AI-Command-R-v01-GGUF",
-                "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf",
+                "model_file_name_template": "c4ai-command-r-v01-{quantization}.gguf",
                 "model_hub": "modelscope",
                 "model_revision": "master"
             },
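The model_file_name_template change above matters because the GGUF file name is produced by substituting the chosen quantization into the template, so the old dot-separated form presumably pointed at file names that are not present in the mirror013/C4AI-Command-R-v01-GGUF repository. With the corrected template:

    template = "c4ai-command-r-v01-{quantization}.gguf"
    template.format(quantization="Q4_K_M")  # -> "c4ai-command-r-v01-Q4_K_M.gguf"
    template.format(quantization="Q3_K_M")  # -> "c4ai-command-r-v01-Q3_K_M.gguf" (newly added quantization)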
@@ -3426,7 +3538,21 @@
                 "model_id": "AI-ModelScope/c4ai-command-r-plus",
                 "model_revision": "master"
             }
-        ]
+        ],
+        "prompt_style": {
+            "style_name": "c4ai-command-r",
+            "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.",
+            "roles": [
+                "<|USER_TOKEN|>",
+                "<|CHATBOT_TOKEN|>"
+            ],
+            "intra_message_sep": "",
+            "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>",
+            "stop_token_ids": [
+                6,
+                255001
+            ]
+        }
     },
     {
         "version": 1,
@@ -3445,7 +3571,7 @@
             "ar"
         ],
         "model_ability": [
-            "generate"
+            "chat"
         ],
         "model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
         "model_specs": [
@@ -3459,7 +3585,21 @@
                 "model_id": "mirror013/c4ai-command-r-v01-4bit",
                 "model_revision": "master"
             }
-        ]
+        ],
+        "prompt_style": {
+            "style_name": "c4ai-command-r",
+            "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.",
+            "roles": [
+                "<|USER_TOKEN|>",
+                "<|CHATBOT_TOKEN|>"
+            ],
+            "intra_message_sep": "",
+            "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>",
+            "stop_token_ids": [
+                6,
+                255001
+            ]
+        }
     },
     {
         "version": 1,
@@ -3548,5 +3688,56 @@
                 "<|end|>"
             ]
         }
-    }
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "internvl-chat",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "InternVL 1.5 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 26,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "AI-ModelScope/InternVL-Chat-V1-5",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 26,
+                "quantizations": [
+                    "Int8"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "AI-ModelScope/InternVL-Chat-V1-5-{quantization}",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "INTERNLM2",
+            "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+            "roles": [
+                "<|im_start|>user",
+                "<|im_start|>assistant"
+            ],
+            "intra_message_sep": "<|im_end|>",
+            "stop_token_ids": [
+                92542
+            ],
+            "stop": [
+                "<|im_end|>"
+            ]
+        }
+    }
 ]
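With the internvl-chat entry registered (and the matching intern_vl.py implementation added under xinference/model/llm/pytorch/ in this release), the model should be launchable like any other built-in model. A hedged example using the existing RESTful client API — the endpoint and keyword names are assumed unchanged from prior 0.11.x releases:

    from xinference.client import Client

    # Assumes a locally running xinference supervisor on the default port.
    client = Client("http://localhost:9997")
    model_uid = client.launch_model(
        model_name="internvl-chat",
        model_format="pytorch",
        model_size_in_billions=26,
        quantization="none",
    )
    model = client.get_model(model_uid)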