xinference 0.12.3__py3-none-any.whl → 0.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +56 -8
- xinference/client/restful/restful_client.py +49 -4
- xinference/core/model.py +36 -4
- xinference/core/scheduler.py +2 -0
- xinference/core/supervisor.py +132 -15
- xinference/core/worker.py +239 -53
- xinference/deploy/cmdline.py +5 -0
- xinference/deploy/utils.py +33 -2
- xinference/model/audio/chattts.py +6 -6
- xinference/model/audio/core.py +23 -15
- xinference/model/core.py +12 -3
- xinference/model/embedding/core.py +25 -16
- xinference/model/flexible/__init__.py +40 -0
- xinference/model/flexible/core.py +228 -0
- xinference/model/flexible/launchers/__init__.py +15 -0
- xinference/model/flexible/launchers/transformers_launcher.py +63 -0
- xinference/model/flexible/utils.py +33 -0
- xinference/model/image/core.py +18 -14
- xinference/model/image/custom.py +1 -1
- xinference/model/llm/__init__.py +5 -2
- xinference/model/llm/core.py +3 -2
- xinference/model/llm/ggml/llamacpp.py +1 -10
- xinference/model/llm/llm_family.json +292 -36
- xinference/model/llm/llm_family.py +102 -53
- xinference/model/llm/llm_family_modelscope.json +247 -27
- xinference/model/llm/mlx/__init__.py +13 -0
- xinference/model/llm/mlx/core.py +408 -0
- xinference/model/llm/pytorch/chatglm.py +2 -9
- xinference/model/llm/pytorch/cogvlm2.py +206 -21
- xinference/model/llm/pytorch/core.py +213 -120
- xinference/model/llm/pytorch/glm4v.py +171 -15
- xinference/model/llm/pytorch/qwen_vl.py +168 -7
- xinference/model/llm/pytorch/utils.py +53 -62
- xinference/model/llm/utils.py +28 -7
- xinference/model/rerank/core.py +29 -25
- xinference/thirdparty/deepseek_vl/serve/__init__.py +13 -0
- xinference/thirdparty/deepseek_vl/serve/app_deepseek.py +510 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/__init__.py +13 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/gradio_utils.py +94 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/overwrites.py +81 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/presets.py +96 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/utils.py +229 -0
- xinference/thirdparty/deepseek_vl/serve/inference.py +170 -0
- xinference/types.py +0 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.95c1d652.js +3 -0
- xinference/web/ui/build/static/js/main.95c1d652.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63090c842376cdd368c3ded88a333ef40d94785747651343040a6f7872a223.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/70fa8c07463a5fe57c68bf92502910105a8f647371836fe8c3a7408246ca7ba0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +1 -0
- {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/METADATA +10 -11
- {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/RECORD +71 -69
- xinference/model/llm/ggml/chatglm.py +0 -457
- xinference/thirdparty/ChatTTS/__init__.py +0 -1
- xinference/thirdparty/ChatTTS/core.py +0 -200
- xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/experimental/llm.py +0 -40
- xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/infer/api.py +0 -125
- xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/model/dvae.py +0 -155
- xinference/thirdparty/ChatTTS/model/gpt.py +0 -265
- xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/utils/gpu_utils.py +0 -23
- xinference/thirdparty/ChatTTS/utils/infer_utils.py +0 -141
- xinference/thirdparty/ChatTTS/utils/io_utils.py +0 -14
- xinference/web/ui/build/static/js/main.77dd47c3.js +0 -3
- xinference/web/ui/build/static/js/main.77dd47c3.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0cd591866aa345566e0b63fb51ff2043e163a770af6fdc2f3bad395d046353e2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/37c1476717199863bbba1530e3513a9368f8f73001b75b4a85c2075956308027.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/46edc1fe657dfedb2e673148332bb442c6eb98f09f2592c389209e376510afa5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/82db357f3fd5b32215d747ee593f69ff06c95ad6cde37f71a96c8290aaab64c0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bc6da27195ec4607bb472bf61f97c928ad4966fa64e4c2247661bedb7400abba.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f118f99c22b713c678c1209c4e1dd43fe86e3f6e801a4c0c35d3bbf41fd05fe6.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +0 -1
- /xinference/web/ui/build/static/js/{main.77dd47c3.js.LICENSE.txt → main.95c1d652.js.LICENSE.txt} +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/LICENSE +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/WHEEL +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.json
@@ -574,19 +574,6 @@
         ],
         "model_description": "ChatGLM is an open-source General Language Model (GLM) based LLM trained on both Chinese and English data.",
         "model_specs": [
-            {
-                "model_format": "ggmlv3",
-                "model_size_in_billions": 6,
-                "quantizations": [
-                    "q4_0",
-                    "q4_1",
-                    "q5_0",
-                    "q5_1",
-                    "q8_0"
-                ],
-                "model_id": "Xorbits/chatglm-6B-GGML",
-                "model_file_name_template": "chatglm-ggml-{quantization}.bin"
-            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 6,
@@ -622,19 +609,6 @@
         ],
         "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
         "model_specs": [
-            {
-                "model_format": "ggmlv3",
-                "model_size_in_billions": 6,
-                "quantizations": [
-                    "q4_0",
-                    "q4_1",
-                    "q5_0",
-                    "q5_1",
-                    "q8_0"
-                ],
-                "model_id": "Xorbits/chatglm2-6B-GGML",
-                "model_file_name_template": "chatglm2-ggml-{quantization}.bin"
-            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 6,
@@ -706,15 +680,6 @@
         ],
         "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
         "model_specs": [
-            {
-                "model_format": "ggmlv3",
-                "model_size_in_billions": 6,
-                "quantizations": [
-                    "q4_0"
-                ],
-                "model_id": "Xorbits/chatglm3-6B-GGML",
-                "model_file_name_template": "chatglm3-ggml-{quantization}.bin"
-            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 6,
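With the ggmlv3 specs above removed, the ChatGLM families are served only through their remaining formats. A hedged sketch of launching chatglm3 from the surviving pytorch spec (local supervisor endpoint, builtin family name "chatglm3", and engine name "Transformers" are all assumptions, not shown in this diff):

from xinference.client import Client

client = Client("http://localhost:9997")  # assumed local supervisor
uid = client.launch_model(
    model_name="chatglm3",        # assumed builtin family name
    model_engine="Transformers",  # assumed engine label
    model_format="pytorch",
    model_size_in_billions=6,
    quantization="none",
)
print(client.get_model(uid).chat("Hello"))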
@@ -855,6 +820,32 @@
                 ],
                 "model_id": "THUDM/glm-4-9b-chat",
                 "model_revision": "b84dc74294ccd507a3d78bde8aebf628221af9bd"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "Q2_K",
+                    "IQ3_XS",
+                    "IQ3_S",
+                    "IQ3_M",
+                    "Q3_K_S",
+                    "Q3_K_L",
+                    "Q3_K",
+                    "IQ4_XS",
+                    "IQ4_NL",
+                    "Q4_K_S",
+                    "Q4_K",
+                    "Q5_K_S",
+                    "Q5_K",
+                    "Q6_K",
+                    "Q8_0",
+                    "BF16",
+                    "FP16"
+                ],
+                "model_file_name_template": "glm-4-9b-chat.{quantization}.gguf",
+                "model_id": "legraphista/glm-4-9b-chat-GGUF",
+                "model_revision": "0155a14edf0176863e9a003cdd78ce599e4d62c0"
             }
         ],
         "prompt_style": {
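The new spec makes the 9B chat model loadable from GGUF files. A hedged launch sketch; the builtin family name "glm4-chat", the engine label "llama.cpp", and the local endpoint are assumptions, while the format, size, and quantization values come straight from the spec above:

from xinference.client import Client

client = Client("http://localhost:9997")  # assumed local supervisor
uid = client.launch_model(
    model_name="glm4-chat",       # assumed builtin family name
    model_engine="llama.cpp",     # assumed engine label
    model_format="ggufv2",
    model_size_in_billions=9,
    quantization="Q4_K",          # one of the quantizations listed above
)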
@@ -900,6 +891,32 @@
                 ],
                 "model_id": "THUDM/glm-4-9b-chat-1m",
                 "model_revision": "715ddbe91082f976ff6a4ca06d59e5bbff6c3642"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "Q2_K",
+                    "IQ3_XS",
+                    "IQ3_S",
+                    "IQ3_M",
+                    "Q3_K_S",
+                    "Q3_K_L",
+                    "Q3_K",
+                    "IQ4_XS",
+                    "IQ4_NL",
+                    "Q4_K_S",
+                    "Q4_K",
+                    "Q5_K_S",
+                    "Q5_K",
+                    "Q6_K",
+                    "Q8_0",
+                    "BF16",
+                    "FP16"
+                ],
+                "model_file_name_template": "glm-4-9b-chat-1m.{quantization}.gguf",
+                "model_id": "legraphista/glm-4-9b-chat-1m-GGUF",
+                "model_revision": "782e28bd5eee3c514c07108da15e0b5e06dcf776"
             }
         ],
         "prompt_style": {
@@ -944,7 +961,7 @@
                     "none"
                 ],
                 "model_id": "THUDM/glm-4v-9b",
-                "model_revision": "
+                "model_revision": "6c2e4732db8443f64a48d5af04b74425a7d169c4"
             }
         ],
         "prompt_style": {
@@ -2549,6 +2566,38 @@
                 ],
                 "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
             },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "4-bit"
+                ],
+                "model_id": "Qwen/Qwen2-0.5B-Instruct-MLX"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "4-bit"
+                ],
+                "model_id": "Qwen/Qwen2-1.5B-Instruct-MLX"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit"
+                ],
+                "model_id": "Qwen/Qwen2-7B-Instruct-MLX"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "4-bit"
+                ],
+                "model_id": "mlx-community/Qwen2-72B-Instruct-4bit"
+            },
             {
                 "model_format": "ggufv2",
                 "model_size_in_billions": "0_5",
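These specs back the new MLX format (see xinference/model/llm/mlx/core.py in the file list). A hedged launch sketch; the endpoint and engine label "MLX" are assumptions, and note that fractional sizes use an underscore radix, matching the "0_5"/"1_5" values above:

from xinference.client import Client

client = Client("http://localhost:9997")   # assumed local supervisor
uid = client.launch_model(
    model_name="qwen2-instruct",
    model_engine="MLX",                    # assumed label for the new engine
    model_format="mlx",
    model_size_in_billions="1_5",          # underscore radix, per the spec
    quantization="4-bit",
)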
@@ -2565,6 +2614,82 @@
                 ],
                 "model_id": "Qwen/Qwen2-0.5B-Instruct-GGUF",
                 "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0",
+                    "fp16"
+                ],
+                "model_id": "Qwen/Qwen2-1.5B-Instruct-GGUF",
+                "model_file_name_template": "qwen2-1_5b-instruct-{quantization}.gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0",
+                    "fp16"
+                ],
+                "model_id": "Qwen/Qwen2-7B-Instruct-GGUF",
+                "model_file_name_template": "qwen2-7b-instruct-{quantization}.gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0",
+                    "fp16"
+                ],
+                "model_id": "Qwen/Qwen2-72B-Instruct-GGUF",
+                "model_file_name_template": "qwen2-72b-instruct-{quantization}.gguf",
+                "model_file_name_split_template": "qwen2-72b-instruct-{quantization}-{part}.gguf",
+                "quantization_parts": {
+                    "q5_0": [
+                        "00001-of-00002",
+                        "00002-of-00002"
+                    ],
+                    "q5_k_m": [
+                        "00001-of-00002",
+                        "00002-of-00002"
+                    ],
+                    "q6_k": [
+                        "00001-of-00002",
+                        "00002-of-00002"
+                    ],
+                    "q8_0": [
+                        "00001-of-00002",
+                        "00002-of-00002"
+                    ],
+                    "fp16": [
+                        "00001-of-00004",
+                        "00002-of-00004",
+                        "00003-of-00004",
+                        "00004-of-00004"
+                    ]
+                }
             }
         ],
         "prompt_style": {
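For the 72B entry, quantizations listed under quantization_parts are stored as multiple files, and each file name comes from model_file_name_split_template. A self-contained sketch of that resolution, using only the values shown in the spec above:

# Pure string formatting that mirrors the spec fields above.
split_template = "qwen2-72b-instruct-{quantization}-{part}.gguf"
quantization_parts = {"q5_0": ["00001-of-00002", "00002-of-00002"]}

files = [
    split_template.format(quantization="q5_0", part=part)
    for part in quantization_parts["q5_0"]
]
print(files)
# ['qwen2-72b-instruct-q5_0-00001-of-00002.gguf',
#  'qwen2-72b-instruct-q5_0-00002-of-00002.gguf']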
@@ -2618,6 +2743,34 @@
                     "Int4"
                 ],
                 "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0",
+                    "fp16"
+                ],
+                "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GGUF",
+                "model_file_name_template": "qwen2-57b-a14b-instruct-{quantization}.gguf",
+                "model_file_name_split_template": "qwen2-57b-a14b-instruct-{quantization}-{part}.gguf",
+                "quantization_parts": {
+                    "q8_0": [
+                        "00001-of-00002",
+                        "00002-of-00002"
+                    ],
+                    "fp16": [
+                        "00001-of-00003",
+                        "00002-of-00003",
+                        "00003-of-00003"
+                    ]
+                }
             }
         ],
         "prompt_style": {
@@ -5809,6 +5962,16 @@
             "roles": [
                 "user",
                 "assistant"
+            ],
+            "stop_token_ids": [
+                151643,
+                151644,
+                151645
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|im_start|>",
+                "<|im_end|>"
             ]
         }
     },
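The prompt style now carries default stop strings and stop-token ids, so generation halts at these control tokens without any client-side configuration. If a caller wants to override them anyway, a hedged sketch (the model uid is a placeholder, and generate_config key names are assumed to follow xinference's OpenAI-style options):

from xinference.client import Client

client = Client("http://localhost:9997")     # assumed local supervisor
model = client.get_model("my-model-uid")     # placeholder uid
print(model.chat(
    "Hello",
    generate_config={"stop": ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]},
))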
@@ -5997,6 +6160,99 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "gemma-2-it",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "none",
+                    "4-bit",
+                    "8-bit"
+                ],
+                "model_id": "google/gemma-2-9b-it"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 27,
+                "quantizations": [
+                    "none",
+                    "4-bit",
+                    "8-bit"
+                ],
+                "model_id": "google/gemma-2-27b-it"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "4-bit"
+                ],
+                "model_id": "mlx-community/gemma-2-9b-it-4bit"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "8-bit"
+                ],
+                "model_id": "mlx-community/gemma-2-9b-it-8bit"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "None"
+                ],
+                "model_id": "mlx-community/gemma-2-9b-it-fp16"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 27,
+                "quantizations": [
+                    "4-bit"
+                ],
+                "model_id": "mlx-community/gemma-2-27b-it-4bit"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 27,
+                "quantizations": [
+                    "8-bit"
+                ],
+                "model_id": "mlx-community/gemma-2-27b-it-8bit"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 27,
+                "quantizations": [
+                    "None"
+                ],
+                "model_id": "mlx-community/gemma-2-27b-it-fp16"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "gemma",
+            "roles": [
+                "user",
+                "model"
+            ],
+            "stop": [
+                "<end_of_turn>",
+                "<start_of_turn>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 4096,
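The new gemma-2-it family ships both pytorch and MLX specs; note the MLX fp16 builds use the literal quantization name "None" (capitalized), unlike the pytorch specs' "none". A hedged launch sketch (local endpoint and the "Transformers" engine label are assumptions; the family name and spec values come from the JSON above):

from xinference.client import Client

client = Client("http://localhost:9997")   # assumed local supervisor
uid = client.launch_model(
    model_name="gemma-2-it",
    model_engine="Transformers",           # assumed engine label
    model_format="pytorch",
    model_size_in_billions=9,
    quantization="none",
)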
xinference/model/llm/llm_family.py
@@ -14,7 +14,6 @@

 import logging
 import os
-import platform
 import shutil
 from threading import Lock
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
@@ -107,6 +106,28 @@ class PytorchLLMSpecV1(BaseModel):
         return v


+class MLXLLMSpecV1(BaseModel):
+    model_format: Literal["mlx"]
+    # Must in order that `str` first, then `int`
+    model_size_in_billions: Union[str, int]
+    quantizations: List[str]
+    model_id: Optional[str]
+    model_hub: str = "huggingface"
+    model_uri: Optional[str]
+    model_revision: Optional[str]
+
+    @validator("model_size_in_billions", pre=False)
+    def validate_model_size_with_radix(cls, v: object) -> object:
+        if isinstance(v, str):
+            if (
+                "_" in v
+            ):  # for example, "1_8" just returns "1_8", otherwise int("1_8") returns 18
+                return v
+            else:
+                return int(v)
+        return v
+
+
 class PromptStyleV1(BaseModel):
     style_name: str
     system_prompt: str = ""
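The validator normalizes sizes: strings with an underscore radix such as "1_8" are kept verbatim (int("1_8") would silently become 18), while plain numeric strings are coerced to int. A standalone sketch of the same rule, assuming pydantic v1 semantics as in the file above, with illustrative values:

# Standalone sketch of the radix rule (assumes pydantic v1).
from typing import List, Literal, Optional, Union

from pydantic import BaseModel, validator


class MLXSpecSketch(BaseModel):
    model_format: Literal["mlx"]
    model_size_in_billions: Union[str, int]  # str first, so "1_8" survives
    quantizations: List[str]
    model_id: Optional[str]

    @validator("model_size_in_billions", pre=False)
    def validate_model_size_with_radix(cls, v: object) -> object:
        if isinstance(v, str):
            return v if "_" in v else int(v)
        return v


print(MLXSpecSketch(model_format="mlx", model_size_in_billions="1_5",
                    quantizations=["4-bit"],
                    model_id="Qwen/Qwen2-1.5B-Instruct-MLX").model_size_in_billions)  # "1_5"
print(MLXSpecSketch(model_format="mlx", model_size_in_billions="7",
                    quantizations=["4-bit"], model_id=None).model_size_in_billions)   # 7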
@@ -226,7 +247,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):


 LLMSpecV1 = Annotated[
-    Union[GgmlLLMSpecV1, PytorchLLMSpecV1],
+    Union[GgmlLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
     Field(discriminator="model_format"),
 ]

@@ -249,6 +270,8 @@ UD_LLM_FAMILIES_LOCK = Lock()

 VLLM_CLASSES: List[Type[LLM]] = []

+MLX_CLASSES: List[Type[LLM]] = []
+
 LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}

@@ -517,15 +540,20 @@ def _get_cache_dir_for_model_mem(
 def _get_cache_dir(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
     create_if_not_exist=True,
 ):
     # If the model id contains quantization, then we should give each
     # quantization a dedicated cache dir.
     quant_suffix = ""
-    for q in llm_spec.quantizations:
-        if llm_spec.model_id and q in llm_spec.model_id:
-            quant_suffix = q
-            break
+    if llm_spec.model_id and "{" in llm_spec.model_id and quantization is not None:
+        quant_suffix = quantization
+    else:
+        for q in llm_spec.quantizations:
+            if llm_spec.model_id and q in llm_spec.model_id:
+                quant_suffix = q
+                break
+
     cache_dir_name = (
         f"{llm_family.model_name}-{llm_spec.model_format}"
         f"-{llm_spec.model_size_in_billions}b"
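The new branch gives templated model ids (those containing "{") a per-quantization cache directory, while the old behavior of matching a quantization embedded in the id remains the fallback. A self-contained sketch of the suffix selection; the templated id below is illustrative, not taken from this diff:

from typing import List, Optional


def pick_quant_suffix(
    model_id: Optional[str], quantizations: List[str], quantization: Optional[str]
) -> str:
    # Mirrors the branch added to _get_cache_dir above (standalone sketch).
    if model_id and "{" in model_id and quantization is not None:
        return quantization  # templated id: dedicated dir per quantization
    for q in quantizations:
        if model_id and q in model_id:
            return q  # quantization baked into the repo id
    return ""


# Non-templated id that embeds its quantization:
print(pick_quant_suffix("Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4", ["Int4"], None))  # Int4
# Illustrative templated id, resolved per requested quantization:
print(pick_quant_suffix("org/model-GPTQ-{quantization}", ["Int4", "Int8"], "Int8"))  # Int8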
@@ -549,7 +577,7 @@ def _get_meta_path(
         return os.path.join(cache_dir, "__valid_download")
     else:
         return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         assert quantization is not None
         if model_hub == "huggingface":
             return os.path.join(cache_dir, f"__valid_download_{quantization}")
@@ -588,7 +616,7 @@ def _skip_download(
             logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
             return True
         return False
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         assert quantization is not None
         return os.path.exists(
             _get_meta_path(cache_dir, model_format, model_hub, quantization)
@@ -683,7 +711,7 @@ def cache_from_csghub(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -751,7 +779,7 @@ def cache_from_modelscope(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -820,8 +848,8 @@ def cache_from_huggingface(
     if not IS_NEW_HUGGINGFACE_HUB:
         use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
-        assert isinstance(llm_spec, PytorchLLMSpecV1)
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+        assert isinstance(llm_spec, (PytorchLLMSpecV1, MLXLLMSpecV1))
         download_dir = retry_download(
             huggingface_hub.snapshot_download,
             llm_family.model_name,
@@ -876,6 +904,7 @@ def _check_revision(
     llm_spec: "LLMSpecV1",
     builtin: list,
     meta_path: str,
+    quantization: Optional[str] = None,
 ) -> bool:
     for family in builtin:
         if llm_family.model_name == family.model_name:
@@ -884,59 +913,63 @@ def _check_revision(
             if (
                 spec.model_format == "pytorch"
                 and spec.model_size_in_billions == llm_spec.model_size_in_billions
+                and (quantization is None or quantization in spec.quantizations)
             ):
                 return valid_model_revision(meta_path, spec.model_revision)
     return False


 def get_cache_status(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
+    llm_family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: Optional[str] = None
 ) -> Union[bool, List[bool]]:
     """
-
-
+    Checks if a model's cache status is available based on the model format and quantization.
+    Supports different directories and model formats.
     """
-    cache_dir = _get_cache_dir(llm_family, llm_spec, create_if_not_exist=False)
-    # check revision for pytorch model
-    if llm_spec.model_format == "pytorch":
-        hf_meta_path = _get_meta_path(cache_dir, "pytorch", "huggingface", "none")
-        ms_meta_path = _get_meta_path(cache_dir, "pytorch", "modelscope", "none")
-        revisions = [
-            _check_revision(llm_family, llm_spec, BUILTIN_LLM_FAMILIES, hf_meta_path),
-            _check_revision(
-                llm_family, llm_spec, BUILTIN_MODELSCOPE_LLM_FAMILIES, ms_meta_path
-            ),
-        ]
-        return any(revisions)
-    # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
-        ret = []
-        for q in llm_spec.quantizations:
-            assert q is not None
-            hf_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "huggingface", q
-            )
-            ms_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "modelscope", q
-            )
-            results = [os.path.exists(hf_meta_path), os.path.exists(ms_meta_path)]
-            ret.append(any(results))
-        return ret
-    else:
-        raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-

-def
-
+    def check_file_status(meta_path: str) -> bool:
+        return os.path.exists(meta_path)

+    def check_revision_status(
+        meta_path: str, families: list, quantization: Optional[str] = None
+    ) -> bool:
+        return _check_revision(llm_family, llm_spec, families, meta_path, quantization)

-def
-
-
-
+    def handle_quantization(q: Union[str, None]) -> bool:
+        specific_cache_dir = _get_cache_dir(
+            llm_family, llm_spec, q, create_if_not_exist=False
+        )
+        meta_paths = {
+            "huggingface": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "huggingface", q
+            ),
+            "modelscope": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "modelscope", q
+            ),
+        }
+        if llm_spec.model_format == "pytorch":
+            return check_revision_status(
+                meta_paths["huggingface"], BUILTIN_LLM_FAMILIES, q
+            ) or check_revision_status(
+                meta_paths["modelscope"], BUILTIN_MODELSCOPE_LLM_FAMILIES, q
+            )
+        else:
+            return check_file_status(meta_paths["huggingface"]) or check_file_status(
+                meta_paths["modelscope"]
+            )

-
+    if llm_spec.model_id and "{" in llm_spec.model_id:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if quantization is None
+            else handle_quantization(quantization)
+        )
+    else:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if llm_spec.model_format != "pytorch"
+            else handle_quantization(None)
+        )


 def get_user_defined_llm_families():
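The rewrite changes the return shape: pytorch specs yield a single bool, non-templated quantized formats yield one bool per listed quantization, and templated ids can be checked for a single pinned quantization. A hedged usage sketch; the import path is taken from this file, and the call assumes match_llm's first positional parameter is the model name:

from xinference.model.llm.llm_family import get_cache_status, match_llm

matched = match_llm("qwen2-instruct", model_format="ggufv2", model_size_in_billions=7)
if matched:
    family, spec, _quant = matched
    print(get_cache_status(family, spec))  # list: one bool per quantization

matched_pt = match_llm("qwen2-instruct", model_format="pytorch", model_size_in_billions=7)
if matched_pt:
    family, spec, _quant = matched_pt
    print(get_cache_status(family, spec))  # single bool for pytorch specs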
@@ -982,6 +1015,7 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
+    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.
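download_hub now overrides the environment-based hub detection when matching a model; the dispatch over hubs appears in the next hunk. A hedged sketch (first parameter assumed to be the model name, as in prior releases):

from xinference.model.llm.llm_family import match_llm

match = match_llm(
    "qwen2-instruct",
    model_format="mlx",
    model_size_in_billions="1_5",
    quantization="4-bit",
    download_hub="modelscope",  # or "huggingface" / "csghub"
)
if match:
    llm_family, llm_spec, quantization = match
    print(llm_spec.model_hub, llm_spec.model_id)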
@@ -1005,7 +1039,22 @@
                 spec.model_id = spec.model_id.format(quantization=q)
             return spec

-    if download_from_modelscope():
+    # priority: download_hub > download_from_modelscope() and download_from_csghub()
+    if download_hub == "modelscope":
+        all_families = (
+            BUILTIN_MODELSCOPE_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "csghub":
+        all_families = (
+            BUILTIN_CSGHUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "huggingface":
+        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    elif download_from_modelscope():
         all_families = (
             BUILTIN_MODELSCOPE_LLM_FAMILIES
             + BUILTIN_LLM_FAMILIES