xinference 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference has been flagged as possibly problematic.

Files changed (97)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +34 -15
  3. xinference/client/oscar/actor_client.py +4 -3
  4. xinference/client/restful/restful_client.py +40 -18
  5. xinference/core/supervisor.py +48 -9
  6. xinference/core/worker.py +13 -8
  7. xinference/deploy/cmdline.py +22 -9
  8. xinference/model/audio/__init__.py +40 -1
  9. xinference/model/audio/core.py +25 -45
  10. xinference/model/audio/custom.py +148 -0
  11. xinference/model/core.py +6 -9
  12. xinference/model/embedding/core.py +1 -2
  13. xinference/model/embedding/model_spec.json +24 -0
  14. xinference/model/embedding/model_spec_modelscope.json +24 -0
  15. xinference/model/image/core.py +12 -4
  16. xinference/model/image/stable_diffusion/core.py +8 -7
  17. xinference/model/llm/__init__.py +0 -6
  18. xinference/model/llm/core.py +9 -14
  19. xinference/model/llm/ggml/llamacpp.py +2 -10
  20. xinference/model/llm/llm_family.json +507 -7
  21. xinference/model/llm/llm_family.py +41 -4
  22. xinference/model/llm/llm_family_modelscope.json +260 -0
  23. xinference/model/llm/pytorch/baichuan.py +4 -3
  24. xinference/model/llm/pytorch/chatglm.py +5 -2
  25. xinference/model/llm/pytorch/core.py +37 -41
  26. xinference/model/llm/pytorch/falcon.py +6 -5
  27. xinference/model/llm/pytorch/internlm2.py +5 -2
  28. xinference/model/llm/pytorch/llama_2.py +6 -5
  29. xinference/model/llm/pytorch/qwen_vl.py +2 -0
  30. xinference/model/llm/pytorch/vicuna.py +4 -3
  31. xinference/model/llm/pytorch/yi_vl.py +4 -2
  32. xinference/model/llm/utils.py +42 -4
  33. xinference/model/llm/vllm/core.py +54 -6
  34. xinference/model/rerank/core.py +26 -12
  35. xinference/model/rerank/model_spec.json +24 -0
  36. xinference/model/rerank/model_spec_modelscope.json +25 -1
  37. xinference/model/utils.py +12 -1
  38. xinference/thirdparty/omnilmm/chat.py +1 -1
  39. xinference/types.py +70 -19
  40. xinference/utils.py +1 -0
  41. xinference/web/ui/build/asset-manifest.json +3 -3
  42. xinference/web/ui/build/index.html +1 -1
  43. xinference/web/ui/build/static/js/main.26fdbfbe.js +3 -0
  44. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/f9290c0738db50065492ceedc6a4af25083fe18399b7c44d942273349ad9e643.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +1 -0
  65. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/METADATA +13 -10
  66. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/RECORD +71 -74
  67. xinference/model/llm/ggml/ctransformers.py +0 -281
  68. xinference/model/llm/ggml/ctransformers_util.py +0 -161
  69. xinference/web/ui/build/static/js/main.98516614.js +0 -3
  70. xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
  72. xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
  74. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
  76. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
  79. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.26fdbfbe.js.LICENSE.txt} +0 -0
  94. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/LICENSE +0 -0
  95. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/WHEEL +0 -0
  96. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/entry_points.txt +0 -0
  97. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.json
@@ -913,6 +913,38 @@
         "model_id": "meta-llama/Llama-2-7b-chat-hf",
         "model_revision": "08751db2aca9bf2f7f80d2e516117a53d7450235"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-Chat-GPTQ"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-Chat-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-Chat-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-Chat-AWQ"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 13,
@@ -924,6 +956,22 @@
         "model_id": "meta-llama/Llama-2-13b-chat-hf",
         "model_revision": "0ba94ac9b9e1d5a0037780667e8b219adde1908c"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-chat-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-chat-AWQ"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 70,
@@ -1045,6 +1093,22 @@
         "model_id": "TheBloke/Llama-2-7B-GGML",
         "model_file_name_template": "llama-2-7b.ggmlv3.{quantization}.bin"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-AWQ"
+      },
       {
         "model_format": "ggmlv3",
         "model_size_in_billions": 13,
@@ -1111,6 +1175,22 @@
         "model_id": "meta-llama/Llama-2-13b-hf",
         "model_revision": "db6b8eb1feabb38985fdf785a89895959e944936"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-AWQ"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 70,
@@ -1121,6 +1201,22 @@
         ],
         "model_id": "meta-llama/Llama-2-70b-hf",
         "model_revision": "cc8aa03a000ff08b4d5c5b39673321a2a396c396"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-AWQ"
       }
     ]
   },
@@ -1509,6 +1605,16 @@
         ],
         "model_id": "Qwen/Qwen1.5-14B-Chat"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 72,
@@ -1564,6 +1670,14 @@
         ],
         "model_id": "Qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat-GPTQ-{quantization}"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": 72,
@@ -1613,6 +1727,14 @@
         ],
         "model_id": "Qwen/Qwen1.5-14B-Chat-AWQ"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat-AWQ"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": 72,
@@ -1701,6 +1823,22 @@
         "model_id": "Qwen/Qwen1.5-14B-Chat-GGUF",
         "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf"
       },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat-GGUF",
+        "model_file_name_template": "qwen1_5-32b-chat-{quantization}.gguf"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 72,
@@ -1740,6 +1878,126 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen1.5-moe-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 65536,
+    "model_name": "codeqwen1.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
+    "model_specs": [
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "Qwen/CodeQwen1.5-7B-Chat-GGUF",
+        "model_file_name_template": "codeqwen-1_5-7b-chat-{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/CodeQwen1.5-7B-Chat"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/CodeQwen1.5-7B-Chat-AWQ"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 8192,
@@ -1780,13 +2038,13 @@
     "model_description": "GPT-2 is a Transformer-based LLM that is trained on WebTest, a 40 GB dataset of Reddit posts with 3+ upvotes.",
     "model_specs": [
       {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 1,
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
         "quantizations": [
           "none"
         ],
-        "model_id": "marella/gpt-2-ggml",
-        "model_file_name_template": "ggml-model.bin"
+        "model_id": "openai-community/gpt2",
+        "model_revision": "607a30d783dfa663caf39e06633721c8d4cfcd7e"
       }
     ]
   },
@@ -2569,6 +2827,22 @@
         "model_id": "mistralai/Mistral-7B-Instruct-v0.1",
         "model_revision": "54766df6d50e4d3d7ccd66758e5341ba105a6d36"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.1-AWQ"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 7,
@@ -2630,6 +2904,22 @@
         "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
         "model_revision": "b70aa86578567ba3301b21c8a27bea4e8f6d6d61"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 7,
@@ -2790,6 +3080,14 @@
         "model_id": "mistralai/Mixtral-8x7B-v0.1",
         "model_revision": "58301445dc1378584211722b7ebf8743ec4e192b"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "46_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mixtral-8x7B-v0.1-GPTQ"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "46_7",
@@ -2839,10 +3137,17 @@
         "model_format": "awq",
         "model_size_in_billions": "46_7",
         "quantizations": [
-          "4-bit"
+          "Int4"
         ],
-        "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ",
-        "model_revision": "9afb6f0a7d7fe9ecebdda1baa4ff4e13e73e97d7"
+        "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "46_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
       },
       {
         "model_format": "ggufv2",
@@ -4515,5 +4820,200 @@
         "</s>"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "seallm_v2",
+    "model_lang": [
+      "en",
+      "zh",
+      "vi",
+      "id",
+      "th",
+      "ms",
+      "km",
+      "lo",
+      "my",
+      "tl"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "We introduce SeaLLM-7B-v2, the state-of-the-art multilingual LLM for Southeast Asian (SEA) languages",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "SeaLLMs/SeaLLM-7B-v2",
+        "model_revision": "f1bd48e0d75365c24a3c5ad006b2d0a0c9dca30f"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q4_0",
+          "Q8_0"
+        ],
+        "model_id": "SeaLLMs/SeaLLM-7B-v2-gguf",
+        "model_file_name_template": "SeaLLM-7B-v2.{quantization}.gguf"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "seallm_v2.5",
+    "model_lang": [
+      "en",
+      "zh",
+      "vi",
+      "id",
+      "th",
+      "ms",
+      "km",
+      "lo",
+      "my",
+      "tl"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "We introduce SeaLLM-7B-v2.5, the state-of-the-art multilingual LLM for Southeast Asian (SEA) languages",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "SeaLLMs/SeaLLM-7B-v2.5",
+        "model_revision": "c54a8eb8e2d58c5a680bfbbe3a7ae71753bb644b"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q4_K_M",
+          "Q8_0"
+        ],
+        "model_id": "SeaLLMs/SeaLLM-7B-v2.5-GGUF",
+        "model_file_name_template": "SeaLLM-7B-v2.5.{quantization}.gguf"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "c4ai-command-r-v01",
+    "model_lang": [
+      "en",
+      "fr",
+      "de",
+      "es",
+      "it",
+      "pt",
+      "ja",
+      "ko",
+      "zh",
+      "ar"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "CohereForAI/c4ai-command-r-v01",
+        "model_revision": "16881ccde1c68bbc7041280e6a66637bc46bfe88"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "andrewcanis/c4ai-command-r-v01-GGUF",
+        "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 104,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "CohereForAI/c4ai-command-r-plus",
+        "model_revision": "ba7f1d954c9d1609013677d87e4142ab95c34e62"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 104,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "alpindale/c4ai-command-r-plus-GPTQ",
+        "model_revision": "35febfc08f723ac0df32480eb4af349a7d08656e"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "c4ai-command-r-v01-4bit",
+    "model_lang": [
+      "en",
+      "fr",
+      "de",
+      "es",
+      "it",
+      "pt",
+      "ja",
+      "ko",
+      "zh",
+      "ar"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "CohereForAI/c4ai-command-r-v01-4bit",
+        "model_revision": "f2e87936a146643c9dd143422dcafb9cb1552611"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 104,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "CohereForAI/c4ai-command-r-plus-4bit",
+        "model_revision": "bb63b5b7005ecedb30b0cfd0d5953b02a5817f7b"
+      }
+    ]
   }
 ]
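
For orientation, a minimal sketch (not part of the diff) of how one of the newly added quantized specs can be exercised through the RESTful client. It assumes a local supervisor is already running on the default endpoint, e.g. started with "xinference-local"; the prompt is arbitrary.

    # Launch the new 7B GPTQ spec of the built-in "llama-2-chat" family,
    # then run a single chat turn against it.
    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    model_uid = client.launch_model(
        model_name="llama-2-chat",
        model_format="gptq",
        model_size_in_billions=7,
        quantization="Int4",
    )
    model = client.get_model(model_uid)
    print(model.chat("What is the capital of France?"))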
xinference/model/llm/llm_family.py
@@ -33,6 +33,7 @@ from ..._compat import (
     validator,
 )
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
+from ...types import LoRA
 from ..utils import (
     download_from_modelscope,
     is_valid_model_uri,
@@ -199,6 +200,21 @@ class CustomLLMFamilyV1(LLMFamilyV1):
                 )
             llm_spec.prompt_style = BUILTIN_LLM_PROMPT_STYLE[prompt_style_name]

+        # check model ability, registering LLM only provides generate and chat
+        # but for vision models, we add back the abilities so that
+        # gradio chat interface can be generated properly
+        if (
+            llm_spec.model_family != "other"
+            and llm_spec.model_family
+            in {
+                family.model_name
+                for family in BUILTIN_LLM_FAMILIES
+                if "vision" in family.model_ability
+            }
+            and "vision" not in llm_spec.model_ability
+        ):
+            llm_spec.model_ability.append("vision")
+
         return llm_spec

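
The comment in the hunk above states the intent: custom registrations declare only "generate" or "chat", so "vision" is restored whenever the declared model_family is a built-in vision family. Below is a self-contained sketch of that check; Family and the stubbed BUILTIN_LLM_FAMILIES are illustrative stand-ins, not xinference's actual classes.

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class Family:
        model_name: str
        model_ability: List[str] = field(default_factory=list)

    # Stand-in registry: "qwen-vl-chat" is a built-in vision family,
    # "llama-2-chat" is chat-only.
    BUILTIN_LLM_FAMILIES = [
        Family("qwen-vl-chat", ["chat", "vision"]),
        Family("llama-2-chat", ["chat"]),
    ]

    def restore_vision_ability(model_family: str, model_ability: List[str]) -> List[str]:
        # Mirror the diff: collect the built-in families that support vision...
        vision_families = {
            f.model_name for f in BUILTIN_LLM_FAMILIES if "vision" in f.model_ability
        }
        # ...and re-add "vision" when the custom model belongs to one of them.
        if (
            model_family != "other"
            and model_family in vision_families
            and "vision" not in model_ability
        ):
            model_ability.append("vision")
        return model_ability

    print(restore_vision_ability("qwen-vl-chat", ["chat"]))  # ['chat', 'vision']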
@@ -782,10 +798,29 @@ def get_user_defined_llm_families():
     return UD_LLM_FAMILIES.copy()


+def match_model_size(
+    model_size: Union[int, str], spec_model_size: Union[int, str]
+) -> bool:
+    if isinstance(model_size, str):
+        model_size = model_size.replace("_", ".")
+    if isinstance(spec_model_size, str):
+        spec_model_size = spec_model_size.replace("_", ".")
+
+    if model_size == spec_model_size:
+        return True
+
+    try:
+        ms = int(model_size)
+        ss = int(spec_model_size)
+        return ms == ss
+    except ValueError:
+        return False
+
+
 def match_llm(
     model_name: str,
     model_format: Optional[str] = None,
-    model_size_in_billions: Optional[int] = None,
+    model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     is_local_deployment: bool = False,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
@@ -829,7 +864,9 @@ def match_llm(
             model_format
             and model_format != spec.model_format
             or model_size_in_billions
-            and model_size_in_billions != spec.model_size_in_billions
+            and not match_model_size(
+                model_size_in_billions, spec.model_size_in_billions
+            )
             or quantization
             and matched_quantization is None
         ):
@@ -939,12 +976,12 @@ def match_llm_cls(
     family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
     quantization: str,
-    peft_model_path: Optional[str] = None,
+    peft_model: Optional[List[LoRA]] = None,
 ) -> Optional[Type[LLM]]:
     """
     Find an LLM implementation for given LLM family and spec.
     """
-    if peft_model_path is not None:
+    if peft_model is not None:
         for cls in PEFT_SUPPORTED_CLASSES:
             if cls.match(family, llm_spec, quantization):
                 return cls
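
To make the size-matching change concrete, a short usage sketch (assuming match_model_size is imported from xinference.model.llm.llm_family, where the hunk above defines it): spec sizes stored as strings with underscores standing in for decimal points, such as "2_7" for the Qwen1.5-MoE specs, now match the dotted user-supplied form, and plain ints still match their string counterparts. This is also why match_llm's model_size_in_billions parameter widened from Optional[int] to Optional[Union[int, str]].

    from xinference.model.llm.llm_family import match_model_size

    print(match_model_size("2_7", "2_7"))  # True: exact string match
    print(match_model_size("2.7", "2_7"))  # True: "_" is normalized to "."
    print(match_model_size(7, "7"))        # True: equal after int coercion
    print(match_model_size(7, "2_7"))      # False: "2.7" cannot be coerced to int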