xinference 0.11.3__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +143 -6
- xinference/client/restful/restful_client.py +144 -5
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +48 -28
- xinference/core/model.py +160 -19
- xinference/core/scheduler.py +446 -0
- xinference/core/supervisor.py +99 -24
- xinference/core/worker.py +68 -2
- xinference/deploy/cmdline.py +86 -2
- xinference/deploy/test/test_cmdline.py +19 -10
- xinference/isolation.py +9 -2
- xinference/model/audio/__init__.py +14 -1
- xinference/model/audio/chattts.py +84 -0
- xinference/model/audio/core.py +22 -4
- xinference/model/audio/custom.py +6 -4
- xinference/model/audio/model_spec.json +20 -0
- xinference/model/audio/model_spec_modelscope.json +20 -0
- xinference/model/llm/__init__.py +38 -2
- xinference/model/llm/llm_family.json +509 -1
- xinference/model/llm/llm_family.py +86 -1
- xinference/model/llm/llm_family_csghub.json +66 -0
- xinference/model/llm/llm_family_modelscope.json +411 -2
- xinference/model/llm/pytorch/chatglm.py +20 -13
- xinference/model/llm/pytorch/cogvlm2.py +76 -17
- xinference/model/llm/pytorch/core.py +141 -6
- xinference/model/llm/pytorch/glm4v.py +268 -0
- xinference/model/llm/pytorch/minicpmv25.py +232 -0
- xinference/model/llm/pytorch/qwen_vl.py +1 -1
- xinference/model/llm/pytorch/utils.py +405 -8
- xinference/model/llm/utils.py +14 -13
- xinference/model/llm/vllm/core.py +16 -4
- xinference/model/utils.py +8 -2
- xinference/thirdparty/ChatTTS/__init__.py +1 -0
- xinference/thirdparty/ChatTTS/core.py +200 -0
- xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/experimental/llm.py +40 -0
- xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/infer/api.py +125 -0
- xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/model/dvae.py +155 -0
- xinference/thirdparty/ChatTTS/model/gpt.py +265 -0
- xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/utils/gpu_utils.py +23 -0
- xinference/thirdparty/ChatTTS/utils/infer_utils.py +141 -0
- xinference/thirdparty/ChatTTS/utils/io_utils.py +14 -0
- xinference/types.py +3 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.074e2b31.css +2 -0
- xinference/web/ui/build/static/css/main.074e2b31.css.map +1 -0
- xinference/web/ui/build/static/js/main.a58ff436.js +3 -0
- xinference/web/ui/build/static/js/main.a58ff436.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10262a281dec3bc2b185f4385ceb6846626f52d41cb4d46c7c649e719f979d4d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/762a75a62daf3bec2cfc97ec8612798493fb34ef87087dcad6aad64ab7f14345.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/7f3bdb3a48fa00c046c8b185acd4da6f2e2940a20dbd77f9373d60de3fd6633e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f2f73bfdc13b12b02c8cbc4769b0b8e6367e9b6d8331c322d94318491a0b3653.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +1 -0
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/METADATA +26 -9
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/RECORD +65 -47
- xinference/web/ui/build/static/css/main.54bca460.css +0 -2
- xinference/web/ui/build/static/css/main.54bca460.css.map +0 -1
- xinference/web/ui/build/static/js/main.551aa479.js +0 -3
- xinference/web/ui/build/static/js/main.551aa479.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e737bcdbcbc407ccd65b90e199ef0c3214b261e8e41dbf14d921384a717d9ee.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +0 -1
- /xinference/web/ui/build/static/js/{main.551aa479.js.LICENSE.txt → main.a58ff436.js.LICENSE.txt} +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/LICENSE +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/WHEEL +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.json

@@ -831,6 +831,141 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "glm4-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/glm-4-9b-chat",
+        "model_revision": "b84dc74294ccd507a3d78bde8aebf628221af9bd"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 1048576,
+    "model_name": "glm4-chat-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/glm-4-9b-chat-1m",
+        "model_revision": "715ddbe91082f976ff6a4ca06d59e5bbff6c3642"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "glm-4v",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/glm-4v-9b",
+        "model_revision": "e8b84fefc07e58a90c8489337675573fda95e289"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
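The three entries above register `glm4-chat`, `glm4-chat-1m`, and `glm-4v` as built-in models, all reusing the `CHATGLM3` prompt style. A minimal sketch of launching one of them against a locally running xinference supervisor; the endpoint, engine name, and quantization below are illustrative assumptions, not mandated by the diff:

```python
from xinference.client import RESTfulClient

# Assumption: a supervisor started via `xinference-local` on the default port.
client = RESTfulClient("http://127.0.0.1:9997")

model_uid = client.launch_model(
    model_name="glm4-chat",        # name registered by the JSON entry above
    model_engine="transformers",   # assumption: PyTorch spec -> transformers engine
    model_format="pytorch",
    model_size_in_billions=9,
    quantization="4-bit",
)

# In the 0.12.x client, chat() takes the prompt string directly.
model = client.get_model(model_uid)
print(model.chat("Summarize what GLM-4 is in one sentence."))
```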
@@ -2291,6 +2426,218 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-moe-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-57B-A14B-Instruct"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 8192,
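Two conventions in the qwen2 specs are worth calling out: fractional sizes are encoded as strings with an underscore in place of the decimal point ("0_5" for 0.5B, which keeps the value usable in file and cache paths), and `{quantization}` inside `model_id` or `model_file_name_template` is filled in at download time with the chosen quantization. A hedged sketch of that substitution (the helper name is hypothetical; xinference does this internally):

```python
# Hypothetical helper illustrating how the spec fields above are resolved.
def resolve_model_id(template: str, quantization: str) -> str:
    # "{quantization}" placeholders are plain str.format substitutions.
    return template.format(quantization=quantization)

assert (
    resolve_model_id("Qwen/Qwen2-0.5B-Instruct-GPTQ-{quantization}", "Int4")
    == "Qwen/Qwen2-0.5B-Instruct-GPTQ-Int4"
)

# "0_5" encodes 0.5 billion parameters: the underscore stands in for the dot.
size_in_billions = float("0_5".replace("_", "."))  # 0.5
```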
@@ -3251,6 +3598,125 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "mistral-instruct-v0.3",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an improved instruct fine-tuned version of Mistral-7B-Instruct-v0.1.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
+        "model_revision": "83e9aa141f2e28c82232fea5325f54edf17c43de"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "neuralmagic/Mistral-7B-Instruct-v0.3-GPTQ-4bit"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "solidrust/Mistral-7B-Instruct-v0.3-AWQ"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0",
+          "fp16"
+        ],
+        "model_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF",
+        "model_file_name_template": "Mistral-7B-Instruct-v0.3.{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "LLAMA2",
+      "system_prompt": "[INST] ",
+      "roles": [
+        "[INST]",
+        "[/INST]"
+      ],
+      "intra_message_sep": " ",
+      "inter_message_sep": "<s>",
+      "stop_token_ids": [
+        2
+      ],
+      "stop": [
+        "</s>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "codestral-v0.1",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Codestrall-22B-v0.1 is trained on a diverse dataset of 80+ programming languages, including the most popular ones, such as Python, Java, C, C++, JavaScript, and Bash",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 22,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
+        "model_revision": "9552e7b1d9b2d5bbd87a5aa7221817285dbb6366"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 22,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "bartowski/Codestral-22B-v0.1-GGUF",
+        "model_file_name_template": "Codestral-22B-v0.1-{quantization}.gguf"
+      }
+    ]
+  },
   {
     "version": 1,
     "context_length": 8192,
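`codestral-v0.1` declares only the `generate` ability and no `prompt_style`, so it is driven through the completion API rather than chat. A sketch under the same local-supervisor assumption as above; the engine name for GGUF specs is an assumption:

```python
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumption: local supervisor

uid = client.launch_model(
    model_name="codestral-v0.1",
    model_engine="llama.cpp",      # assumption: ggufv2 spec -> llama.cpp engine
    model_format="ggufv2",
    model_size_in_billions=22,
    quantization="Q4_K_M",
)

# Only "generate" is declared for this family, so use generate(), not chat().
model = client.get_model(uid)
completion = model.generate(
    "def fibonacci(n):", generate_config={"max_tokens": 128}
)
print(completion["choices"][0]["text"])
```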
@@ -5258,6 +5724,48 @@
       ]
     }
   },
+  {
+    "version":1,
+    "context_length":2048,
+    "model_name":"MiniCPM-Llama3-V-2_5",
+    "model_lang":[
+      "en",
+      "zh"
+    ],
+    "model_ability":[
+      "chat",
+      "vision"
+    ],
+    "model_description":"MiniCPM-Llama3-V 2.5 is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters.",
+    "model_specs":[
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "none"
+        ],
+        "model_id":"openbmb/MiniCPM-Llama3-V-2_5",
+        "model_revision":"285a637ba8a30a0660dfcccad16f9a864f75abfd"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "int4"
+        ],
+        "model_id":"openbmb/MiniCPM-Llama3-V-2_5-{quantization}",
+        "model_revision":"f92aff28552de35de3be204e8fe292dd4824e544"
+      }
+    ],
+    "prompt_style":{
+      "style_name":"OmniLMM",
+      "system_prompt":"The role of first msg should be user",
+      "roles":[
+        "user",
+        "assistant"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
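The MiniCPM entry pairs the `vision` ability with the new `OmniLMM` prompt style handled by the added `pytorch/minicpmv25.py`. A hedged sketch of a vision chat call; whether the 0.12.x client accepts OpenAI-style content parts exactly like this is an assumption:

```python
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumption: local supervisor

uid = client.launch_model(
    model_name="MiniCPM-Llama3-V-2_5",
    model_engine="transformers",   # assumption
    model_format="pytorch",
    quantization="none",
)

model = client.get_model(uid)
# Assumption: vision-capable chat handles accept OpenAI-style content parts.
response = model.chat(
    prompt=[
        {"type": "text", "text": "What is in this picture?"},
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    ]
)
print(response)
```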
@@ -6277,7 +6785,7 @@
         "quantizations": [
           "int4"
         ],
-        "model_id": "THUDM/cogvlm2-llama3-chinese-chat-19B-{
+        "model_id": "THUDM/cogvlm2-llama3-chinese-chat-19B-{quantization}",
         "model_revision": "7863e362174f4718c2fe9cba4befd0b580a3194f"
       }
     ],
xinference/model/llm/llm_family.py

@@ -32,10 +32,15 @@ from ..._compat import (
     load_str_bytes,
     validator,
 )
-from ...constants import
+from ...constants import (
+    XINFERENCE_CACHE_DIR,
+    XINFERENCE_ENV_CSG_TOKEN,
+    XINFERENCE_MODEL_DIR,
+)
 from ..utils import (
     IS_NEW_HUGGINGFACE_HUB,
     create_symlink,
+    download_from_csghub,
     download_from_modelscope,
     is_valid_model_uri,
     parse_uri,
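The single-line `constants` import becomes a parenthesized block that also pulls in `XINFERENCE_ENV_CSG_TOKEN`, and `download_from_csghub` joins the hub helpers. Following the naming pattern of the other `XINFERENCE_ENV_*` constants, the new constant presumably holds the *name* of an environment variable carrying the CSGHub access token; the exact string below is an assumption:

```python
# Sketch of the likely addition to xinference/constants.py (one of the
# "+5" lines listed for that file above). The exact string is an assumption.
XINFERENCE_ENV_CSG_TOKEN = "XINFERENCE_CSG_TOKEN"

# Usage mirrors cache_from_csghub() further down:
import os
token = os.environ.get(XINFERENCE_ENV_CSG_TOKEN)  # None unless the user exported it
```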
@@ -232,6 +237,7 @@ LLAMA_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
+BUILTIN_CSGHUB_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
 SGLANG_CLASSES: List[Type[LLM]] = []
 TRANSFORMERS_CLASSES: List[Type[LLM]] = []
@@ -292,6 +298,9 @@ def cache(
     elif llm_spec.model_hub == "modelscope":
         logger.info(f"Caching from Modelscope: {llm_spec.model_id}")
         return cache_from_modelscope(llm_family, llm_spec, quantization)
+    elif llm_spec.model_hub == "csghub":
+        logger.info(f"Caching from CSGHub: {llm_spec.model_id}")
+        return cache_from_csghub(llm_family, llm_spec, quantization)
     else:
         raise ValueError(f"Unknown model hub: {llm_spec.model_hub}")
 
@@ -566,6 +575,7 @@ def _skip_download(
         "modelscope": _get_meta_path(
             cache_dir, model_format, "modelscope", quantization
         ),
+        "csghub": _get_meta_path(cache_dir, model_format, "csghub", quantization),
     }
     if valid_model_revision(model_hub_to_meta_path[model_hub], model_revision):
         logger.info(f"Cache {cache_dir} exists")
@@ -650,6 +660,75 @@ def _merge_cached_files(
     logger.info(f"Merge complete.")
 
 
+def cache_from_csghub(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
+) -> str:
+    """
+    Cache model from CSGHub. Return the cache directory.
+    """
+    from pycsghub.file_download import file_download
+    from pycsghub.snapshot_download import snapshot_download
+
+    cache_dir = _get_cache_dir(llm_family, llm_spec)
+
+    if _skip_download(
+        cache_dir,
+        llm_spec.model_format,
+        llm_spec.model_hub,
+        llm_spec.model_revision,
+        quantization,
+    ):
+        return cache_dir
+
+    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+        download_dir = retry_download(
+            snapshot_download,
+            llm_family.model_name,
+            {
+                "model_size": llm_spec.model_size_in_billions,
+                "model_format": llm_spec.model_format,
+            },
+            llm_spec.model_id,
+            endpoint="https://hub-stg.opencsg.com",
+            token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+        )
+        create_symlink(download_dir, cache_dir)
+
+    elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
+        file_names, final_file_name, need_merge = _generate_model_file_names(
+            llm_spec, quantization
+        )
+
+        for filename in file_names:
+            download_path = retry_download(
+                file_download,
+                llm_family.model_name,
+                {
+                    "model_size": llm_spec.model_size_in_billions,
+                    "model_format": llm_spec.model_format,
+                },
+                llm_spec.model_id,
+                file_name=filename,
+                endpoint="https://hub-stg.opencsg.com",
+                token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+            )
+            symlink_local_file(download_path, cache_dir, filename)
+
+        if need_merge:
+            _merge_cached_files(cache_dir, file_names, final_file_name)
+    else:
+        raise ValueError(f"Unsupported format: {llm_spec.model_format}")
+
+    meta_path = _get_meta_path(
+        cache_dir, llm_spec.model_format, llm_spec.model_hub, quantization
+    )
+    _generate_meta_file(meta_path, llm_family, llm_spec, quantization)
+
+    return cache_dir
+
+
 def cache_from_modelscope(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
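Both download paths go through `retry_download`, an existing helper in `llm_family.py` that retries the wrapped download callable and attaches model context to the failure. A simplified sketch of that contract; the retry count and error message here are assumptions, not the real implementation:

```python
# Simplified sketch of the retry_download contract used above; the real
# helper in llm_family.py may differ in retry count and error reporting.
def retry_download(download_func, model_name, info, *args, **kwargs):
    max_attempts = 3  # assumption
    for attempt in range(max_attempts):
        try:
            return download_func(*args, **kwargs)
        except Exception:
            if attempt == max_attempts - 1:
                # model_name/info presumably feed the final error report
                raise RuntimeError(
                    f"Failed to download {model_name} after {max_attempts} attempts"
                )
```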
@@ -931,6 +1010,12 @@ def match_llm(
             + BUILTIN_LLM_FAMILIES
             + user_defined_llm_families
         )
+    elif download_from_csghub():
+        all_families = (
+            BUILTIN_CSGHUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
     else:
         all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
 
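`download_from_csghub()` gates the new branch the same way `download_from_modelscope()` gates the ModelScope one; given the `xinference/model/utils.py` changes listed above, it most likely keys off the same `XINFERENCE_MODEL_SRC` environment variable. A hedged sketch:

```python
import os

# Sketch of the gating helper, assuming it mirrors download_from_modelscope(),
# which checks the XINFERENCE_MODEL_SRC environment variable.
def download_from_csghub() -> bool:
    return os.environ.get("XINFERENCE_MODEL_SRC") == "csghub"
```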