xinference 0.12.3__py3-none-any.whl → 0.13.1__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic.
Files changed (101)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +56 -8
  3. xinference/client/restful/restful_client.py +49 -4
  4. xinference/core/model.py +36 -4
  5. xinference/core/scheduler.py +2 -0
  6. xinference/core/supervisor.py +132 -15
  7. xinference/core/worker.py +239 -53
  8. xinference/deploy/cmdline.py +5 -0
  9. xinference/deploy/utils.py +33 -2
  10. xinference/model/audio/chattts.py +6 -6
  11. xinference/model/audio/core.py +23 -15
  12. xinference/model/core.py +12 -3
  13. xinference/model/embedding/core.py +25 -16
  14. xinference/model/flexible/__init__.py +40 -0
  15. xinference/model/flexible/core.py +228 -0
  16. xinference/model/flexible/launchers/__init__.py +15 -0
  17. xinference/model/flexible/launchers/transformers_launcher.py +63 -0
  18. xinference/model/flexible/utils.py +33 -0
  19. xinference/model/image/core.py +18 -14
  20. xinference/model/image/custom.py +1 -1
  21. xinference/model/llm/__init__.py +5 -2
  22. xinference/model/llm/core.py +3 -2
  23. xinference/model/llm/ggml/llamacpp.py +1 -10
  24. xinference/model/llm/llm_family.json +292 -36
  25. xinference/model/llm/llm_family.py +102 -53
  26. xinference/model/llm/llm_family_modelscope.json +247 -27
  27. xinference/model/llm/mlx/__init__.py +13 -0
  28. xinference/model/llm/mlx/core.py +408 -0
  29. xinference/model/llm/pytorch/chatglm.py +2 -9
  30. xinference/model/llm/pytorch/cogvlm2.py +206 -21
  31. xinference/model/llm/pytorch/core.py +213 -120
  32. xinference/model/llm/pytorch/glm4v.py +171 -15
  33. xinference/model/llm/pytorch/qwen_vl.py +168 -7
  34. xinference/model/llm/pytorch/utils.py +53 -62
  35. xinference/model/llm/utils.py +28 -7
  36. xinference/model/rerank/core.py +29 -25
  37. xinference/thirdparty/deepseek_vl/serve/__init__.py +13 -0
  38. xinference/thirdparty/deepseek_vl/serve/app_deepseek.py +510 -0
  39. xinference/thirdparty/deepseek_vl/serve/app_modules/__init__.py +13 -0
  40. xinference/thirdparty/deepseek_vl/serve/app_modules/gradio_utils.py +94 -0
  41. xinference/thirdparty/deepseek_vl/serve/app_modules/overwrites.py +81 -0
  42. xinference/thirdparty/deepseek_vl/serve/app_modules/presets.py +96 -0
  43. xinference/thirdparty/deepseek_vl/serve/app_modules/utils.py +229 -0
  44. xinference/thirdparty/deepseek_vl/serve/inference.py +170 -0
  45. xinference/types.py +0 -1
  46. xinference/web/ui/build/asset-manifest.json +3 -3
  47. xinference/web/ui/build/index.html +1 -1
  48. xinference/web/ui/build/static/js/main.95c1d652.js +3 -0
  49. xinference/web/ui/build/static/js/main.95c1d652.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/2c63090c842376cdd368c3ded88a333ef40d94785747651343040a6f7872a223.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/70fa8c07463a5fe57c68bf92502910105a8f647371836fe8c3a7408246ca7ba0.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +1 -0
  65. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/METADATA +10 -11
  66. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/RECORD +71 -69
  67. xinference/model/llm/ggml/chatglm.py +0 -457
  68. xinference/thirdparty/ChatTTS/__init__.py +0 -1
  69. xinference/thirdparty/ChatTTS/core.py +0 -200
  70. xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
  71. xinference/thirdparty/ChatTTS/experimental/llm.py +0 -40
  72. xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
  73. xinference/thirdparty/ChatTTS/infer/api.py +0 -125
  74. xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
  75. xinference/thirdparty/ChatTTS/model/dvae.py +0 -155
  76. xinference/thirdparty/ChatTTS/model/gpt.py +0 -265
  77. xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
  78. xinference/thirdparty/ChatTTS/utils/gpu_utils.py +0 -23
  79. xinference/thirdparty/ChatTTS/utils/infer_utils.py +0 -141
  80. xinference/thirdparty/ChatTTS/utils/io_utils.py +0 -14
  81. xinference/web/ui/build/static/js/main.77dd47c3.js +0 -3
  82. xinference/web/ui/build/static/js/main.77dd47c3.js.map +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/0cd591866aa345566e0b63fb51ff2043e163a770af6fdc2f3bad395d046353e2.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/37c1476717199863bbba1530e3513a9368f8f73001b75b4a85c2075956308027.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/46edc1fe657dfedb2e673148332bb442c6eb98f09f2592c389209e376510afa5.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/82db357f3fd5b32215d747ee593f69ff06c95ad6cde37f71a96c8290aaab64c0.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/bc6da27195ec4607bb472bf61f97c928ad4966fa64e4c2247661bedb7400abba.json +0 -1
  93. xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +0 -1
  94. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +0 -1
  95. xinference/web/ui/node_modules/.cache/babel-loader/f118f99c22b713c678c1209c4e1dd43fe86e3f6e801a4c0c35d3bbf41fd05fe6.json +0 -1
  96. xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +0 -1
  97. /xinference/web/ui/build/static/js/{main.77dd47c3.js.LICENSE.txt → main.95c1d652.js.LICENSE.txt} +0 -0
  98. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/LICENSE +0 -0
  99. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/WHEEL +0 -0
  100. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/entry_points.txt +0 -0
  101. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.json

@@ -574,19 +574,6 @@
     ],
     "model_description": "ChatGLM is an open-source General Language Model (GLM) based LLM trained on both Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_id": "Xorbits/chatglm-6B-GGML",
-        "model_file_name_template": "chatglm-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -622,19 +609,6 @@
     ],
     "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_id": "Xorbits/chatglm2-6B-GGML",
-        "model_file_name_template": "chatglm2-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -706,15 +680,6 @@
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0"
-        ],
-        "model_id": "Xorbits/chatglm3-6B-GGML",
-        "model_file_name_template": "chatglm3-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -855,6 +820,32 @@
         ],
         "model_id": "THUDM/glm-4-9b-chat",
         "model_revision": "b84dc74294ccd507a3d78bde8aebf628221af9bd"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat.{quantization}.gguf",
+        "model_id": "legraphista/glm-4-9b-chat-GGUF",
+        "model_revision": "0155a14edf0176863e9a003cdd78ce599e4d62c0"
       }
     ],
     "prompt_style": {
@@ -900,6 +891,32 @@
         ],
         "model_id": "THUDM/glm-4-9b-chat-1m",
        "model_revision": "715ddbe91082f976ff6a4ca06d59e5bbff6c3642"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat-1m.{quantization}.gguf",
+        "model_id": "legraphista/glm-4-9b-chat-1m-GGUF",
+        "model_revision": "782e28bd5eee3c514c07108da15e0b5e06dcf776"
       }
     ],
     "prompt_style": {
@@ -944,7 +961,7 @@
           "none"
         ],
         "model_id": "THUDM/glm-4v-9b",
-        "model_revision": "e8b84fefc07e58a90c8489337675573fda95e289"
+        "model_revision": "6c2e4732db8443f64a48d5af04b74425a7d169c4"
       }
     ],
     "prompt_style": {
@@ -2549,6 +2566,38 @@
         ],
         "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
       },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/Qwen2-72B-Instruct-4bit"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",
@@ -2565,6 +2614,82 @@
         ],
         "model_id": "Qwen/Qwen2-0.5B-Instruct-GGUF",
         "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-1_5b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-7b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-72b-instruct-{quantization}.gguf",
+        "model_file_name_split_template": "qwen2-72b-instruct-{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "q5_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q5_k_m": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q6_k": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "fp16": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ]
+        }
       }
     ],
     "prompt_style": {
@@ -2618,6 +2743,34 @@
           "Int4"
         ],
         "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-57b-a14b-instruct-{quantization}.gguf",
+        "model_file_name_split_template": "qwen2-57b-a14b-instruct-{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "fp16": [
+            "00001-of-00003",
+            "00002-of-00003",
+            "00003-of-00003"
+          ]
+        }
       }
     ],
     "prompt_style": {
@@ -5809,6 +5962,16 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
       ]
     }
   },
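Note: the paired stop_token_ids/stop fields give the runtime both token-level and string-level stop criteria; the ids appear to correspond one-to-one to the listed special tokens in the Qwen tokenizer. A minimal sketch of how such a check could be applied during decoding (hypothetical helper, not the engine's actual loop):

    # Hypothetical stop check: halt when the model emits a stop token id,
    # or when the decoded text ends with one of the stop strings.
    STOP_TOKEN_IDS = {151643, 151644, 151645}
    STOP_STRINGS = ("<|endoftext|>", "<|im_start|>", "<|im_end|>")

    def should_stop(token_id: int, text_so_far: str) -> bool:
        return token_id in STOP_TOKEN_IDS or text_so_far.endswith(STOP_STRINGS)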
@@ -5997,6 +6160,99 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "gemma-2-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-2-9b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-2-27b-it"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-4bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-8bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "None"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-fp16"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-4bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-8bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "None"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-fp16"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "gemma",
+      "roles": [
+        "user",
+        "model"
+      ],
+      "stop": [
+        "<end_of_turn>",
+        "<start_of_turn>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
xinference/model/llm/llm_family.py

@@ -14,7 +14,6 @@

 import logging
 import os
-import platform
 import shutil
 from threading import Lock
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
@@ -107,6 +106,28 @@ class PytorchLLMSpecV1(BaseModel):
         return v


+class MLXLLMSpecV1(BaseModel):
+    model_format: Literal["mlx"]
+    # Must in order that `str` first, then `int`
+    model_size_in_billions: Union[str, int]
+    quantizations: List[str]
+    model_id: Optional[str]
+    model_hub: str = "huggingface"
+    model_uri: Optional[str]
+    model_revision: Optional[str]
+
+    @validator("model_size_in_billions", pre=False)
+    def validate_model_size_with_radix(cls, v: object) -> object:
+        if isinstance(v, str):
+            if (
+                "_" in v
+            ):  # for example, "1_8" just returns "1_8", otherwise int("1_8") returns 18
+                return v
+            else:
+                return int(v)
+        return v
+
+
 class PromptStyleV1(BaseModel):
     style_name: str
     system_prompt: str = ""
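Note: the validator above keeps underscore-radix sizes ("1_8" meaning 1.8B) as strings because Python treats the underscore as a digit separator. A standalone repro of the pitfall and the intended behavior, distilled from the diff:

    # int("1_8") == 18 in Python (underscore is a digit separator), so
    # "1_8" must stay a string; plain numeric strings become ints.
    def validate_model_size_with_radix(v):
        if isinstance(v, str):
            return v if "_" in v else int(v)
        return v

    assert int("1_8") == 18                                # the trap being avoided
    assert validate_model_size_with_radix("1_8") == "1_8"  # radix form preserved
    assert validate_model_size_with_radix("7") == 7
    assert validate_model_size_with_radix(72) == 72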
@@ -226,7 +247,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):


 LLMSpecV1 = Annotated[
-    Union[GgmlLLMSpecV1, PytorchLLMSpecV1],
+    Union[GgmlLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
     Field(discriminator="model_format"),
 ]

@@ -249,6 +270,8 @@ UD_LLM_FAMILIES_LOCK = Lock()

 VLLM_CLASSES: List[Type[LLM]] = []

+MLX_CLASSES: List[Type[LLM]] = []
+
 LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}

@@ -517,15 +540,20 @@ def _get_cache_dir_for_model_mem(
 def _get_cache_dir(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
     create_if_not_exist=True,
 ):
     # If the model id contains quantization, then we should give each
     # quantization a dedicated cache dir.
     quant_suffix = ""
-    for q in llm_spec.quantizations:
-        if llm_spec.model_id and q in llm_spec.model_id:
-            quant_suffix = q
-            break
+    if llm_spec.model_id and "{" in llm_spec.model_id and quantization is not None:
+        quant_suffix = quantization
+    else:
+        for q in llm_spec.quantizations:
+            if llm_spec.model_id and q in llm_spec.model_id:
+                quant_suffix = q
+                break
+
     cache_dir_name = (
         f"{llm_family.model_name}-{llm_spec.model_format}"
         f"-{llm_spec.model_size_in_billions}b"
@@ -549,7 +577,7 @@ def _get_meta_path(
             return os.path.join(cache_dir, "__valid_download")
         else:
             return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         assert quantization is not None
         if model_hub == "huggingface":
             return os.path.join(cache_dir, f"__valid_download_{quantization}")
@@ -588,7 +616,7 @@ def _skip_download(
                     logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
                     return True
             return False
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         assert quantization is not None
         return os.path.exists(
             _get_meta_path(cache_dir, model_format, model_hub, quantization)
@@ -683,7 +711,7 @@ def cache_from_csghub(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -751,7 +779,7 @@ def cache_from_modelscope(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -820,8 +848,8 @@ def cache_from_huggingface(
     if not IS_NEW_HUGGINGFACE_HUB:
         use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
-        assert isinstance(llm_spec, PytorchLLMSpecV1)
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+        assert isinstance(llm_spec, (PytorchLLMSpecV1, MLXLLMSpecV1))
         download_dir = retry_download(
             huggingface_hub.snapshot_download,
             llm_family.model_name,
@@ -876,6 +904,7 @@ def _check_revision(
     llm_spec: "LLMSpecV1",
     builtin: list,
     meta_path: str,
+    quantization: Optional[str] = None,
 ) -> bool:
     for family in builtin:
         if llm_family.model_name == family.model_name:
@@ -884,59 +913,63 @@
                 if (
                     spec.model_format == "pytorch"
                     and spec.model_size_in_billions == llm_spec.model_size_in_billions
+                    and (quantization is None or quantization in spec.quantizations)
                 ):
                     return valid_model_revision(meta_path, spec.model_revision)
     return False


 def get_cache_status(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
+    llm_family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: Optional[str] = None
 ) -> Union[bool, List[bool]]:
     """
-    When calling this function from above, `llm_family` is constructed only from BUILTIN_LLM_FAMILIES,
-    so we should check both huggingface and modelscope cache files.
+    Checks if a model's cache status is available based on the model format and quantization.
+    Supports different directories and model formats.
     """
-    cache_dir = _get_cache_dir(llm_family, llm_spec, create_if_not_exist=False)
-    # check revision for pytorch model
-    if llm_spec.model_format == "pytorch":
-        hf_meta_path = _get_meta_path(cache_dir, "pytorch", "huggingface", "none")
-        ms_meta_path = _get_meta_path(cache_dir, "pytorch", "modelscope", "none")
-        revisions = [
-            _check_revision(llm_family, llm_spec, BUILTIN_LLM_FAMILIES, hf_meta_path),
-            _check_revision(
-                llm_family, llm_spec, BUILTIN_MODELSCOPE_LLM_FAMILIES, ms_meta_path
-            ),
-        ]
-        return any(revisions)
-    # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
-        ret = []
-        for q in llm_spec.quantizations:
-            assert q is not None
-            hf_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "huggingface", q
-            )
-            ms_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "modelscope", q
-            )
-            results = [os.path.exists(hf_meta_path), os.path.exists(ms_meta_path)]
-            ret.append(any(results))
-        return ret
-    else:
-        raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-
-
-def _is_linux():
-    return platform.system() == "Linux"
-
-
-def _has_cuda_device():
-    # `cuda_count` method already contains the logic for the
-    # number of GPUs specified by `CUDA_VISIBLE_DEVICES`.
-    from ...utils import cuda_count
-
-    return cuda_count() > 0
+
+    def check_file_status(meta_path: str) -> bool:
+        return os.path.exists(meta_path)
+
+    def check_revision_status(
+        meta_path: str, families: list, quantization: Optional[str] = None
+    ) -> bool:
+        return _check_revision(llm_family, llm_spec, families, meta_path, quantization)
+
+    def handle_quantization(q: Union[str, None]) -> bool:
+        specific_cache_dir = _get_cache_dir(
+            llm_family, llm_spec, q, create_if_not_exist=False
+        )
+        meta_paths = {
+            "huggingface": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "huggingface", q
+            ),
+            "modelscope": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "modelscope", q
+            ),
+        }
+        if llm_spec.model_format == "pytorch":
+            return check_revision_status(
+                meta_paths["huggingface"], BUILTIN_LLM_FAMILIES, q
+            ) or check_revision_status(
+                meta_paths["modelscope"], BUILTIN_MODELSCOPE_LLM_FAMILIES, q
+            )
+        else:
+            return check_file_status(meta_paths["huggingface"]) or check_file_status(
+                meta_paths["modelscope"]
+            )
+
+    if llm_spec.model_id and "{" in llm_spec.model_id:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if quantization is None
+            else handle_quantization(quantization)
+        )
+    else:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if llm_spec.model_format != "pytorch"
+            else handle_quantization(None)
+        )


 def get_user_defined_llm_families():
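Note: the rewritten get_cache_status keeps its old return shape (a single bool for pytorch specs, one bool per quantization otherwise) while adding per-quantization cache dirs for templated model ids. A small sketch of just that dispatch (hypothetical helper mirroring the branches above, with string placeholders standing in for the real filesystem checks):

    from typing import List, Optional, Union

    def cache_status_shape(
        model_format: str,
        model_id: str,
        quantizations: List[str],
        quantization: Optional[str] = None,
    ) -> Union[str, List[str]]:
        def handle(q: Optional[str]) -> str:
            return f"check cache for quantization={q!r}"

        if "{" in model_id:  # templated id: dedicated dir per quantization
            if quantization is not None:
                return handle(quantization)
            return [handle(q) for q in quantizations]
        # plain id: pytorch shares one dir, other formats check each quantization
        if model_format == "pytorch":
            return handle(None)
        return [handle(q) for q in quantizations]

    print(cache_status_shape("pytorch", "THUDM/glm-4-9b-chat", ["none", "4-bit"]))
    print(cache_status_shape("ggufv2", "Qwen/Qwen2-7B-Instruct-GGUF", ["q4_0", "q8_0"]))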
@@ -982,6 +1015,7 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
+    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.
@@ -1005,7 +1039,22 @@
             spec.model_id = spec.model_id.format(quantization=q)
         return spec

-    if download_from_modelscope():
+    # priority: download_hub > download_from_modelscope() and download_from_csghub()
+    if download_hub == "modelscope":
+        all_families = (
+            BUILTIN_MODELSCOPE_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "csghub":
+        all_families = (
+            BUILTIN_CSGHUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "huggingface":
+        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    elif download_from_modelscope():
        all_families = (
            BUILTIN_MODELSCOPE_LLM_FAMILIES
            + BUILTIN_LLM_FAMILIES
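Note: per the priority comment in the hunk above (which this diff view truncates), an explicit download_hub argument now overrides the download_from_modelscope()/download_from_csghub() environment checks, with the huggingface built-ins as the fallback. A condensed, hypothetical sketch of the resulting search order:

    # Hypothetical condensation of the hub-priority logic above (not the
    # real function): explicit argument first, then environment toggles.
    def family_search_order(download_hub=None, env_modelscope=False, env_csghub=False):
        if download_hub == "modelscope" or (download_hub is None and env_modelscope):
            return ["modelscope builtins", "huggingface builtins", "user-defined"]
        if download_hub == "csghub" or (download_hub is None and env_csghub):
            return ["csghub builtins", "huggingface builtins", "user-defined"]
        return ["huggingface builtins", "user-defined"]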