xinference 0.11.3__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (75)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +143 -6
  3. xinference/client/restful/restful_client.py +144 -5
  4. xinference/constants.py +5 -0
  5. xinference/core/cache_tracker.py +48 -28
  6. xinference/core/model.py +160 -19
  7. xinference/core/scheduler.py +446 -0
  8. xinference/core/supervisor.py +99 -24
  9. xinference/core/worker.py +68 -2
  10. xinference/deploy/cmdline.py +86 -2
  11. xinference/deploy/test/test_cmdline.py +19 -10
  12. xinference/isolation.py +9 -2
  13. xinference/model/audio/__init__.py +14 -1
  14. xinference/model/audio/chattts.py +84 -0
  15. xinference/model/audio/core.py +22 -4
  16. xinference/model/audio/custom.py +6 -4
  17. xinference/model/audio/model_spec.json +20 -0
  18. xinference/model/audio/model_spec_modelscope.json +20 -0
  19. xinference/model/llm/__init__.py +38 -2
  20. xinference/model/llm/llm_family.json +509 -1
  21. xinference/model/llm/llm_family.py +86 -1
  22. xinference/model/llm/llm_family_csghub.json +66 -0
  23. xinference/model/llm/llm_family_modelscope.json +411 -2
  24. xinference/model/llm/pytorch/chatglm.py +20 -13
  25. xinference/model/llm/pytorch/cogvlm2.py +76 -17
  26. xinference/model/llm/pytorch/core.py +141 -6
  27. xinference/model/llm/pytorch/glm4v.py +268 -0
  28. xinference/model/llm/pytorch/minicpmv25.py +232 -0
  29. xinference/model/llm/pytorch/qwen_vl.py +1 -1
  30. xinference/model/llm/pytorch/utils.py +405 -8
  31. xinference/model/llm/utils.py +14 -13
  32. xinference/model/llm/vllm/core.py +16 -4
  33. xinference/model/utils.py +8 -2
  34. xinference/thirdparty/ChatTTS/__init__.py +1 -0
  35. xinference/thirdparty/ChatTTS/core.py +200 -0
  36. xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
  37. xinference/thirdparty/ChatTTS/experimental/llm.py +40 -0
  38. xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
  39. xinference/thirdparty/ChatTTS/infer/api.py +125 -0
  40. xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
  41. xinference/thirdparty/ChatTTS/model/dvae.py +155 -0
  42. xinference/thirdparty/ChatTTS/model/gpt.py +265 -0
  43. xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
  44. xinference/thirdparty/ChatTTS/utils/gpu_utils.py +23 -0
  45. xinference/thirdparty/ChatTTS/utils/infer_utils.py +141 -0
  46. xinference/thirdparty/ChatTTS/utils/io_utils.py +14 -0
  47. xinference/types.py +3 -0
  48. xinference/web/ui/build/asset-manifest.json +6 -6
  49. xinference/web/ui/build/index.html +1 -1
  50. xinference/web/ui/build/static/css/main.074e2b31.css +2 -0
  51. xinference/web/ui/build/static/css/main.074e2b31.css.map +1 -0
  52. xinference/web/ui/build/static/js/main.a58ff436.js +3 -0
  53. xinference/web/ui/build/static/js/main.a58ff436.js.map +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/10262a281dec3bc2b185f4385ceb6846626f52d41cb4d46c7c649e719f979d4d.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/762a75a62daf3bec2cfc97ec8612798493fb34ef87087dcad6aad64ab7f14345.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/7f3bdb3a48fa00c046c8b185acd4da6f2e2940a20dbd77f9373d60de3fd6633e.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/f2f73bfdc13b12b02c8cbc4769b0b8e6367e9b6d8331c322d94318491a0b3653.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +1 -0
  59. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/METADATA +26 -9
  60. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/RECORD +65 -47
  61. xinference/web/ui/build/static/css/main.54bca460.css +0 -2
  62. xinference/web/ui/build/static/css/main.54bca460.css.map +0 -1
  63. xinference/web/ui/build/static/js/main.551aa479.js +0 -3
  64. xinference/web/ui/build/static/js/main.551aa479.js.map +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/3e737bcdbcbc407ccd65b90e199ef0c3214b261e8e41dbf14d921384a717d9ee.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +0 -1
  71. /xinference/web/ui/build/static/js/{main.551aa479.js.LICENSE.txt → main.a58ff436.js.LICENSE.txt} +0 -0
  72. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/LICENSE +0 -0
  73. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/WHEEL +0 -0
  74. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/entry_points.txt +0 -0
  75. {xinference-0.11.3.dist-info → xinference-0.12.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.json
@@ -831,6 +831,141 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "glm4-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/glm-4-9b-chat",
+        "model_revision": "b84dc74294ccd507a3d78bde8aebf628221af9bd"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 1048576,
+    "model_name": "glm4-chat-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/glm-4-9b-chat-1m",
+        "model_revision": "715ddbe91082f976ff6a4ca06d59e5bbff6c3642"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "glm-4v",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/glm-4v-9b",
+        "model_revision": "e8b84fefc07e58a90c8489337675573fda95e289"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -2291,6 +2426,218 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-moe-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-57B-A14B-Instruct"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 8192,
@@ -3251,6 +3598,125 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "mistral-instruct-v0.3",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an improved instruct fine-tuned version of Mistral-7B-Instruct-v0.1.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
+        "model_revision": "83e9aa141f2e28c82232fea5325f54edf17c43de"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "neuralmagic/Mistral-7B-Instruct-v0.3-GPTQ-4bit"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "solidrust/Mistral-7B-Instruct-v0.3-AWQ"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0",
+          "fp16"
+        ],
+        "model_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF",
+        "model_file_name_template": "Mistral-7B-Instruct-v0.3.{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "LLAMA2",
+      "system_prompt": "[INST] ",
+      "roles": [
+        "[INST]",
+        "[/INST]"
+      ],
+      "intra_message_sep": " ",
+      "inter_message_sep": "<s>",
+      "stop_token_ids": [
+        2
+      ],
+      "stop": [
+        "</s>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "codestral-v0.1",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Codestrall-22B-v0.1 is trained on a diverse dataset of 80+ programming languages, including the most popular ones, such as Python, Java, C, C++, JavaScript, and Bash",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 22,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
+        "model_revision": "9552e7b1d9b2d5bbd87a5aa7221817285dbb6366"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 22,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "bartowski/Codestral-22B-v0.1-GGUF",
+        "model_file_name_template": "Codestral-22B-v0.1-{quantization}.gguf"
+      }
+    ]
+  },
   {
     "version": 1,
     "context_length": 8192,
@@ -5258,6 +5724,48 @@
       ]
     }
   },
+  {
+    "version":1,
+    "context_length":2048,
+    "model_name":"MiniCPM-Llama3-V-2_5",
+    "model_lang":[
+      "en",
+      "zh"
+    ],
+    "model_ability":[
+      "chat",
+      "vision"
+    ],
+    "model_description":"MiniCPM-Llama3-V 2.5 is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters.",
+    "model_specs":[
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "none"
+        ],
+        "model_id":"openbmb/MiniCPM-Llama3-V-2_5",
+        "model_revision":"285a637ba8a30a0660dfcccad16f9a864f75abfd"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "int4"
+        ],
+        "model_id":"openbmb/MiniCPM-Llama3-V-2_5-{quantization}",
+        "model_revision":"f92aff28552de35de3be204e8fe292dd4824e544"
+      }
+    ],
+    "prompt_style":{
+      "style_name":"OmniLMM",
+      "system_prompt":"The role of first msg should be user",
+      "roles":[
+        "user",
+        "assistant"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
@@ -6277,7 +6785,7 @@
         "quantizations": [
           "int4"
         ],
-        "model_id": "THUDM/cogvlm2-llama3-chinese-chat-19B-{quantizations}",
+        "model_id": "THUDM/cogvlm2-llama3-chinese-chat-19B-{quantization}",
         "model_revision": "7863e362174f4718c2fe9cba4befd0b580a3194f"
       }
     ],
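
This one-character fix matters because the {quantization} placeholder is filled via str.format when a spec is resolved to a concrete repository id; the misspelled {quantizations} key would raise a KeyError at download time. A sketch of the substitution (the helper name is illustrative, not the library's):

def resolve_model_id(model_id_template: str, quantization: str) -> str:
    # Substitutes the chosen quantization into the templated model id.
    return model_id_template.format(quantization=quantization)

assert (
    resolve_model_id("THUDM/cogvlm2-llama3-chinese-chat-19B-{quantization}", "int4")
    == "THUDM/cogvlm2-llama3-chinese-chat-19B-int4"
)
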
xinference/model/llm/llm_family.py
@@ -32,10 +32,15 @@ from ..._compat import (
     load_str_bytes,
     validator,
 )
-from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
+from ...constants import (
+    XINFERENCE_CACHE_DIR,
+    XINFERENCE_ENV_CSG_TOKEN,
+    XINFERENCE_MODEL_DIR,
+)
 from ..utils import (
     IS_NEW_HUGGINGFACE_HUB,
     create_symlink,
+    download_from_csghub,
     download_from_modelscope,
     is_valid_model_uri,
     parse_uri,
@@ -232,6 +237,7 @@ LLAMA_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
+BUILTIN_CSGHUB_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
 SGLANG_CLASSES: List[Type[LLM]] = []
 TRANSFORMERS_CLASSES: List[Type[LLM]] = []
@@ -292,6 +298,9 @@ def cache(
     elif llm_spec.model_hub == "modelscope":
         logger.info(f"Caching from Modelscope: {llm_spec.model_id}")
         return cache_from_modelscope(llm_family, llm_spec, quantization)
+    elif llm_spec.model_hub == "csghub":
+        logger.info(f"Caching from CSGHub: {llm_spec.model_id}")
+        return cache_from_csghub(llm_family, llm_spec, quantization)
     else:
         raise ValueError(f"Unknown model hub: {llm_spec.model_hub}")
 
@@ -566,6 +575,7 @@ def _skip_download(
         "modelscope": _get_meta_path(
             cache_dir, model_format, "modelscope", quantization
         ),
+        "csghub": _get_meta_path(cache_dir, model_format, "csghub", quantization),
     }
     if valid_model_revision(model_hub_to_meta_path[model_hub], model_revision):
         logger.info(f"Cache {cache_dir} exists")
@@ -650,6 +660,75 @@ def _merge_cached_files(
     logger.info(f"Merge complete.")
 
 
+def cache_from_csghub(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
+) -> str:
+    """
+    Cache model from CSGHub. Return the cache directory.
+    """
+    from pycsghub.file_download import file_download
+    from pycsghub.snapshot_download import snapshot_download
+
+    cache_dir = _get_cache_dir(llm_family, llm_spec)
+
+    if _skip_download(
+        cache_dir,
+        llm_spec.model_format,
+        llm_spec.model_hub,
+        llm_spec.model_revision,
+        quantization,
+    ):
+        return cache_dir
+
+    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+        download_dir = retry_download(
+            snapshot_download,
+            llm_family.model_name,
+            {
+                "model_size": llm_spec.model_size_in_billions,
+                "model_format": llm_spec.model_format,
+            },
+            llm_spec.model_id,
+            endpoint="https://hub-stg.opencsg.com",
+            token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+        )
+        create_symlink(download_dir, cache_dir)
+
+    elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
+        file_names, final_file_name, need_merge = _generate_model_file_names(
+            llm_spec, quantization
+        )
+
+        for filename in file_names:
+            download_path = retry_download(
+                file_download,
+                llm_family.model_name,
+                {
+                    "model_size": llm_spec.model_size_in_billions,
+                    "model_format": llm_spec.model_format,
+                },
+                llm_spec.model_id,
+                file_name=filename,
+                endpoint="https://hub-stg.opencsg.com",
+                token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+            )
+            symlink_local_file(download_path, cache_dir, filename)
+
+        if need_merge:
+            _merge_cached_files(cache_dir, file_names, final_file_name)
+    else:
+        raise ValueError(f"Unsupported format: {llm_spec.model_format}")
+
+    meta_path = _get_meta_path(
+        cache_dir, llm_spec.model_format, llm_spec.model_hub, quantization
+    )
+    _generate_meta_file(meta_path, llm_family, llm_spec, quantization)
+
+    return cache_dir
+
+
 def cache_from_modelscope(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
@@ -931,6 +1010,12 @@ def match_llm(
             + BUILTIN_LLM_FAMILIES
             + user_defined_llm_families
         )
+    elif download_from_csghub():
+        all_families = (
+            BUILTIN_CSGHUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
     else:
         all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
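
Putting the CSGHub pieces together: match_llm() prefers BUILTIN_CSGHUB_LLM_FAMILIES when download_from_csghub() is true, and cache_from_csghub() reads an access token from the environment via XINFERENCE_ENV_CSG_TOKEN. A hedged sketch of wiring this up before starting the server, assuming download_from_csghub() mirrors the existing download_from_modelscope() switch on XINFERENCE_MODEL_SRC and that XINFERENCE_ENV_CSG_TOKEN resolves to the variable name "XINFERENCE_CSG_TOKEN" (both values are assumptions, not confirmed by this diff):

import os

# Route built-in model downloads to CSGHub instead of Hugging Face.
# The "csghub" value mirrors the "modelscope" switch and is an assumption.
os.environ["XINFERENCE_MODEL_SRC"] = "csghub"

# Token consumed by cache_from_csghub() via XINFERENCE_ENV_CSG_TOKEN;
# the variable name here is an assumption based on constants.py.
os.environ["XINFERENCE_CSG_TOKEN"] = "<your-access-token>"

# Subsequent launches now match against BUILTIN_CSGHUB_LLM_FAMILIES first,
# falling back to the built-in Hugging Face families.
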