xinference 0.11.1__py3-none-any.whl → 0.11.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +30 -0
- xinference/client/restful/restful_client.py +29 -0
- xinference/core/cache_tracker.py +12 -1
- xinference/core/supervisor.py +30 -2
- xinference/core/utils.py +12 -0
- xinference/core/worker.py +4 -1
- xinference/deploy/cmdline.py +126 -0
- xinference/deploy/test/test_cmdline.py +24 -0
- xinference/model/llm/__init__.py +2 -0
- xinference/model/llm/llm_family.json +501 -6
- xinference/model/llm/llm_family.py +84 -10
- xinference/model/llm/llm_family_modelscope.json +198 -7
- xinference/model/llm/memory.py +332 -0
- xinference/model/llm/pytorch/core.py +2 -0
- xinference/model/llm/pytorch/intern_vl.py +387 -0
- xinference/model/llm/utils.py +13 -0
- xinference/model/llm/vllm/core.py +5 -2
- xinference/model/rerank/core.py +23 -1
- xinference/model/utils.py +17 -7
- xinference/thirdparty/deepseek_vl/models/processing_vlm.py +1 -1
- xinference/thirdparty/deepseek_vl/models/siglip_vit.py +2 -2
- xinference/thirdparty/llava/mm_utils.py +3 -2
- xinference/thirdparty/llava/model/llava_arch.py +1 -1
- xinference/thirdparty/omnilmm/chat.py +6 -5
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/METADATA +8 -7
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/RECORD +31 -29
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/LICENSE +0 -0
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/WHEEL +0 -0
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py

@@ -34,6 +34,8 @@ from ..._compat import (
 )
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
 from ..utils import (
+    IS_NEW_HUGGINGFACE_HUB,
+    create_symlink,
     download_from_modelscope,
     is_valid_model_uri,
     parse_uri,
@@ -447,6 +449,61 @@ def cache_from_uri(
         raise ValueError(f"Unsupported URL scheme: {src_scheme}")
 
 
+def cache_model_config(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+):
+    """Download model config.json into cache_dir,
+    returns local filepath
+    """
+    cache_dir = _get_cache_dir_for_model_mem(llm_family, llm_spec)
+    config_file = os.path.join(cache_dir, "config.json")
+    if not os.path.islink(config_file) and not os.path.exists(config_file):
+        os.makedirs(cache_dir, exist_ok=True)
+        if llm_spec.model_hub == "huggingface":
+            from huggingface_hub import hf_hub_download
+
+            hf_hub_download(
+                repo_id=llm_spec.model_id, filename="config.json", local_dir=cache_dir
+            )
+        else:
+            from modelscope.hub.file_download import model_file_download
+
+            download_path = model_file_download(
+                model_id=llm_spec.model_id, file_path="config.json"
+            )
+            os.symlink(download_path, config_file)
+    return config_file
+
+
+def _get_cache_dir_for_model_mem(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    create_if_not_exist=True,
+):
+    """
+    For cal-model-mem only. (might called from supervisor / cli)
+    Temporary use separate dir from worker's cache_dir, due to issue of different style of symlink.
+    """
+    quant_suffix = ""
+    for q in llm_spec.quantizations:
+        if llm_spec.model_id and q in llm_spec.model_id:
+            quant_suffix = q
+            break
+    cache_dir_name = (
+        f"{llm_family.model_name}-{llm_spec.model_format}"
+        f"-{llm_spec.model_size_in_billions}b"
+    )
+    if quant_suffix:
+        cache_dir_name += f"-{quant_suffix}"
+    cache_dir = os.path.realpath(
+        os.path.join(XINFERENCE_CACHE_DIR, "model_mem", cache_dir_name)
+    )
+    if create_if_not_exist and not os.path.exists(cache_dir):
+        os.makedirs(cache_dir, exist_ok=True)
+    return cache_dir
+
+
 def _get_cache_dir(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
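The two helpers above back the new cal-model-mem feature (xinference/model/llm/memory.py and the cmdline additions in this release): only config.json is fetched, so memory use can be estimated without downloading weights. Below is a minimal sketch of the directory naming that _get_cache_dir_for_model_mem produces; the cache-root default and the spec values are assumptions, and mem_cache_dir is a stand-in helper, not xinference API.

import os

XINFERENCE_CACHE_DIR = os.path.expanduser("~/.xinference/cache")  # assumed default

def mem_cache_dir(model_name, model_format, size_in_billions, quantizations, model_id):
    # Mirrors the naming above: a quantization becomes a suffix only when it
    # is embedded in the spec's model_id.
    quant = next((q for q in quantizations if model_id and q in model_id), "")
    name = f"{model_name}-{model_format}-{size_in_billions}b"
    if quant:
        name += f"-{quant}"
    return os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, "model_mem", name))

# Hypothetical GPTQ spec whose model_id embeds its quantization:
print(mem_cache_dir("qwen1.5-chat", "gptq", 7, ["Int4", "Int8"], "qwen/Qwen1.5-7B-Chat-GPTQ-Int4"))
# -> ~/.xinference/cache/model_mem/qwen1.5-chat-gptq-7b-Int4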
@@ -625,10 +682,7 @@ def cache_from_modelscope(
             llm_spec.model_id,
             revision=llm_spec.model_revision,
         )
-        for subdir, dirs, files in os.walk(download_dir):
-            for file in files:
-                relpath = os.path.relpath(os.path.join(subdir, file), download_dir)
-                symlink_local_file(os.path.join(subdir, file), cache_dir, relpath)
+        create_symlink(download_dir, cache_dir)
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
         file_names, final_file_name, need_merge = _generate_model_file_names(
@@ -682,9 +736,13 @@ def cache_from_huggingface(
     ):
         return cache_dir
 
+    use_symlinks = {}
+    if not IS_NEW_HUGGINGFACE_HUB:
+        use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
+
     if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
         assert isinstance(llm_spec, PytorchLLMSpecV1)
-        retry_download(
+        download_dir = retry_download(
             huggingface_hub.snapshot_download,
             llm_family.model_name,
             {
@@ -693,9 +751,10 @@
             },
             llm_spec.model_id,
             revision=llm_spec.model_revision,
-            local_dir=cache_dir,
-            local_dir_use_symlinks=True,
+            **use_symlinks,
         )
+        if IS_NEW_HUGGINGFACE_HUB:
+            create_symlink(download_dir, cache_dir)
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
         assert isinstance(llm_spec, GgmlLLMSpecV1)
@@ -704,7 +763,7 @@
         )
 
         for file_name in file_names:
-            retry_download(
+            download_file_path = retry_download(
                 huggingface_hub.hf_hub_download,
                 llm_family.model_name,
                 {
@@ -714,9 +773,10 @@
                 llm_spec.model_id,
                 revision=llm_spec.model_revision,
                 filename=file_name,
-                local_dir=cache_dir,
-                local_dir_use_symlinks=True,
+                **use_symlinks,
             )
+            if IS_NEW_HUGGINGFACE_HUB:
+                symlink_local_file(download_file_path, cache_dir, file_name)
 
         if need_merge:
             _merge_cached_files(cache_dir, file_names, final_file_name)
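The symlink changes in the last few hunks share one motive: newer huggingface_hub releases deprecate local_dir_use_symlinks, so when IS_NEW_HUGGINGFACE_HUB is set the download lands in the hub's own cache and is then mirrored into cache_dir; the modelscope branch (hunk at -625 above) reuses the same helper in place of its hand-rolled walk. create_symlink itself lives in xinference/model/utils.py and its body is not shown in this diff; judging from the loop it replaces, it presumably looks something like this sketch:

import os

def create_symlink(download_dir: str, cache_dir: str):
    # Mirror every file of a downloaded snapshot into cache_dir as a symlink,
    # preserving the relative layout. This reproduces the os.walk loop removed
    # in the modelscope hunk; the real implementation may differ.
    for subdir, _dirs, files in os.walk(download_dir):
        for file in files:
            src = os.path.join(subdir, file)
            dst = os.path.join(cache_dir, os.path.relpath(src, download_dir))
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            if not os.path.lexists(dst):
                os.symlink(src, dst)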
@@ -823,6 +883,20 @@ def match_model_size(
     return False
 
 
+def convert_model_size_to_float(
+    model_size_in_billions: Union[float, int, str]
+) -> float:
+    if isinstance(model_size_in_billions, str):
+        if "_" in model_size_in_billions:
+            ms = model_size_in_billions.replace("_", ".")
+            return float(ms)
+        elif "." in model_size_in_billions:
+            return float(model_size_in_billions)
+        else:
+            return int(model_size_in_billions)
+    return model_size_in_billions
+
+
 def match_llm(
     model_name: str,
     model_format: Optional[str] = None,
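convert_model_size_to_float normalizes the string sizes used in the family JSON, where an underscore stands in for the decimal point (see the "1_3" and "6_7" deepseek-coder specs added further down); the memory estimator needs plain numbers. Expected behavior, assuming the function as defined above is in scope:

assert convert_model_size_to_float("1_3") == 1.3   # "1_3" encodes 1.3 billion
assert convert_model_size_to_float("6_7") == 6.7
assert convert_model_size_to_float("70") == 70     # integer strings come back as int
assert convert_model_size_to_float(13.0) == 13.0   # numbers pass through unchanged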
xinference/model/llm/llm_family_modelscope.json

@@ -2430,6 +2430,32 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 65536,
+        "model_name": "codeqwen1.5",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/CodeQwen1.5-7B",
+                "model_hub": "modelscope"
+            }
+        ]
+    },
     {
         "version": 1,
         "context_length": 65536,
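Once registered, the family can be launched from ModelScope like any other built-in model. A sketch against a local endpoint; the URL is illustrative, and model_engine="transformers" is an assumption for a pytorch-format spec:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # illustrative endpoint
model_uid = client.launch_model(
    model_name="codeqwen1.5",
    model_engine="transformers",  # assumed engine for the pytorch format
    model_format="pytorch",
    model_size_in_billions=7,
    quantization="none",
)
model = client.get_model(model_uid)
print(model.generate("def quicksort(arr):", {"max_tokens": 128}))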
@@ -2548,6 +2574,43 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 4096,
+        "model_name": "deepseek",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "DeepSeek LLM, trained from scratch on a vast dataset of 2 trillion tokens in both English and Chinese.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-llm-7b-base",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 67,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-llm-67b-base",
+                "model_hub": "modelscope"
+            }
+        ]
+    },
     {
         "version": 1,
         "context_length": 4096,
@@ -2600,7 +2663,55 @@
     },
     {
         "version": 1,
-        "context_length":
+        "context_length": 16384,
+        "model_name": "deepseek-coder",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "Deepseek Coder is composed of a series of code language models, each trained from scratch on 2T tokens, with a composition of 87% code and 13% natural language in both English and Chinese.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "1_3",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-coder-1.3b-base",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "6_7",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-coder-6.7b-base",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 33,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-coder-33b-base",
+                "model_hub": "modelscope"
+            }
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 16384,
         "model_name": "deepseek-coder-instruct",
         "model_lang": [
             "en",
@@ -3389,7 +3500,7 @@
             "ar"
         ],
         "model_ability": [
-            "generate"
+            "chat"
         ],
         "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
         "model_specs": [
@@ -3408,11 +3519,12 @@
                "model_size_in_billions": 35,
                "quantizations": [
                    "Q2_K",
+                   "Q3_K_M",
                    "Q4_K_M",
                    "Q5_K_M"
                ],
                "model_id": "mirror013/C4AI-Command-R-v01-GGUF",
-               "model_file_name_template": "c4ai-command-r-v01
+               "model_file_name_template": "c4ai-command-r-v01-{quantization}.gguf",
                "model_hub": "modelscope",
                "model_revision": "master"
            },
@@ -3426,7 +3538,21 @@
                "model_id": "AI-ModelScope/c4ai-command-r-plus",
                "model_revision": "master"
            }
-        ]
+        ],
+        "prompt_style": {
+            "style_name": "c4ai-command-r",
+            "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.",
+            "roles": [
+                "<|USER_TOKEN|>",
+                "<|CHATBOT_TOKEN|>"
+            ],
+            "intra_message_sep": "",
+            "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>",
+            "stop_token_ids": [
+                6,
+                255001
+            ]
+        }
     },
     {
         "version": 1,
@@ -3445,7 +3571,7 @@
             "ar"
         ],
         "model_ability": [
-            "generate"
+            "chat"
        ],
        "model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
        "model_specs": [
@@ -3459,7 +3585,21 @@
                "model_id": "mirror013/c4ai-command-r-v01-4bit",
                "model_revision": "master"
            }
-        ]
+        ],
+        "prompt_style": {
+            "style_name": "c4ai-command-r",
+            "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.",
+            "roles": [
+                "<|USER_TOKEN|>",
+                "<|CHATBOT_TOKEN|>"
+            ],
+            "intra_message_sep": "",
+            "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>",
+            "stop_token_ids": [
+                6,
+                255001
+            ]
+        }
     },
     {
         "version": 1,
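The prompt_style blocks added in the two hunks above are what allow these entries to move from "generate" to "chat": xinference's prompt assembly (xinference/model/llm/utils.py, +13 in this release) stitches turns together from these fields. A rough illustration of how one exchange might render under this style; the exact assembly logic is not shown in this diff and may differ:

# Fields taken from the prompt_style above
system_prompt = (
    "You are Command-R, a brilliant, sophisticated, AI-assistant trained to "
    "assist human users by providing thorough responses. You are trained by Cohere."
)
roles = ["<|USER_TOKEN|>", "<|CHATBOT_TOKEN|>"]
inter_message_sep = "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>"

# One user turn, then the assistant turn is opened for generation;
# decoding stops at token ids 6 / 255001 per stop_token_ids.
prompt = inter_message_sep.join(
    [
        system_prompt,
        roles[0] + "What is the capital of France?",
        roles[1],
    ]
)
print(prompt)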
@@ -3548,5 +3688,56 @@
                "<|end|>"
            ]
        }
-    }
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "internvl-chat",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "InternVL 1.5 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 26,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "AI-ModelScope/InternVL-Chat-V1-5",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 26,
+                "quantizations": [
+                    "Int8"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "AI-ModelScope/InternVL-Chat-V1-5-{quantization}",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "INTERNLM2",
+            "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+            "roles": [
+                "<|im_start|>user",
+                "<|im_start|>assistant"
+            ],
+            "intra_message_sep": "<|im_end|>",
+            "stop_token_ids": [
+                92542
+            ],
+            "stop": [
+                "<|im_end|>"
+            ]
+        }
+    }
 ]