xinference-0.10.0-py3-none-any.whl → xinference-0.10.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (97)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +34 -15
  3. xinference/client/oscar/actor_client.py +4 -3
  4. xinference/client/restful/restful_client.py +40 -18
  5. xinference/core/supervisor.py +48 -9
  6. xinference/core/worker.py +13 -8
  7. xinference/deploy/cmdline.py +22 -9
  8. xinference/model/audio/__init__.py +40 -1
  9. xinference/model/audio/core.py +25 -45
  10. xinference/model/audio/custom.py +148 -0
  11. xinference/model/core.py +6 -9
  12. xinference/model/embedding/core.py +1 -2
  13. xinference/model/embedding/model_spec.json +24 -0
  14. xinference/model/embedding/model_spec_modelscope.json +24 -0
  15. xinference/model/image/core.py +12 -4
  16. xinference/model/image/stable_diffusion/core.py +8 -7
  17. xinference/model/llm/__init__.py +0 -6
  18. xinference/model/llm/core.py +9 -14
  19. xinference/model/llm/ggml/llamacpp.py +2 -10
  20. xinference/model/llm/llm_family.json +507 -7
  21. xinference/model/llm/llm_family.py +41 -4
  22. xinference/model/llm/llm_family_modelscope.json +260 -0
  23. xinference/model/llm/pytorch/baichuan.py +4 -3
  24. xinference/model/llm/pytorch/chatglm.py +5 -2
  25. xinference/model/llm/pytorch/core.py +37 -41
  26. xinference/model/llm/pytorch/falcon.py +6 -5
  27. xinference/model/llm/pytorch/internlm2.py +5 -2
  28. xinference/model/llm/pytorch/llama_2.py +6 -5
  29. xinference/model/llm/pytorch/qwen_vl.py +2 -0
  30. xinference/model/llm/pytorch/vicuna.py +4 -3
  31. xinference/model/llm/pytorch/yi_vl.py +4 -2
  32. xinference/model/llm/utils.py +42 -4
  33. xinference/model/llm/vllm/core.py +54 -6
  34. xinference/model/rerank/core.py +26 -12
  35. xinference/model/rerank/model_spec.json +24 -0
  36. xinference/model/rerank/model_spec_modelscope.json +25 -1
  37. xinference/model/utils.py +12 -1
  38. xinference/thirdparty/omnilmm/chat.py +1 -1
  39. xinference/types.py +70 -19
  40. xinference/utils.py +1 -0
  41. xinference/web/ui/build/asset-manifest.json +3 -3
  42. xinference/web/ui/build/index.html +1 -1
  43. xinference/web/ui/build/static/js/main.26fdbfbe.js +3 -0
  44. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/f9290c0738db50065492ceedc6a4af25083fe18399b7c44d942273349ad9e643.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +1 -0
  65. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/METADATA +13 -10
  66. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/RECORD +71 -74
  67. xinference/model/llm/ggml/ctransformers.py +0 -281
  68. xinference/model/llm/ggml/ctransformers_util.py +0 -161
  69. xinference/web/ui/build/static/js/main.98516614.js +0 -3
  70. xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
  72. xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
  74. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
  76. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
  79. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.26fdbfbe.js.LICENSE.txt} +0 -0
  94. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/LICENSE +0 -0
  95. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/WHEEL +0 -0
  96. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/entry_points.txt +0 -0
  97. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1825,6 +1825,17 @@
1825
1825
  "model_id": "qwen/Qwen1.5-14B-Chat",
1826
1826
  "model_hub": "modelscope"
1827
1827
  },
1828
+ {
1829
+ "model_format": "pytorch",
1830
+ "model_size_in_billions": 32,
1831
+ "quantizations": [
1832
+ "4-bit",
1833
+ "8-bit",
1834
+ "none"
1835
+ ],
1836
+ "model_id": "qwen/Qwen1.5-32B-Chat",
1837
+ "model_hub": "modelscope"
1838
+ },
1828
1839
  {
1829
1840
  "model_format": "pytorch",
1830
1841
  "model_size_in_billions": 72,
@@ -1886,6 +1897,15 @@
1886
1897
  "model_id": "qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}",
1887
1898
  "model_hub": "modelscope"
1888
1899
  },
1900
+ {
1901
+ "model_format": "gptq",
1902
+ "model_size_in_billions": 32,
1903
+ "quantizations": [
1904
+ "Int4"
1905
+ ],
1906
+ "model_id": "qwen/Qwen1.5-32B-Chat-GPTQ-{quantization}",
1907
+ "model_hub": "modelscope"
1908
+ },
1889
1909
  {
1890
1910
  "model_format": "gptq",
1891
1911
  "model_size_in_billions": 72,
@@ -1941,6 +1961,15 @@
1941
1961
  "model_id": "qwen/Qwen1.5-14B-Chat-AWQ",
1942
1962
  "model_hub": "modelscope"
1943
1963
  },
1964
+ {
1965
+ "model_format": "awq",
1966
+ "model_size_in_billions": 32,
1967
+ "quantizations": [
1968
+ "Int4"
1969
+ ],
1970
+ "model_id": "qwen/Qwen1.5-32B-Chat-AWQ",
1971
+ "model_hub": "modelscope"
1972
+ },
1944
1973
  {
1945
1974
  "model_format": "awq",
1946
1975
  "model_size_in_billions": 72,
@@ -2035,6 +2064,23 @@
2035
2064
  "model_hub": "modelscope",
2036
2065
  "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf"
2037
2066
  },
2067
+ {
2068
+ "model_format": "ggufv2",
2069
+ "model_size_in_billions": 32,
2070
+ "quantizations": [
2071
+ "q2_k",
2072
+ "q3_k_m",
2073
+ "q4_0",
2074
+ "q4_k_m",
2075
+ "q5_0",
2076
+ "q5_k_m",
2077
+ "q6_k",
2078
+ "q8_0"
2079
+ ],
2080
+ "model_id": "qwen/Qwen1.5-32B-Chat-GGUF",
2081
+ "model_hub": "modelscope",
2082
+ "model_file_name_template": "qwen1_5-32b-chat-{quantization}.gguf"
2083
+ },
2038
2084
  {
2039
2085
  "model_format": "ggufv2",
2040
2086
  "model_size_in_billions": 72,
@@ -2075,6 +2121,131 @@
2075
2121
  ]
2076
2122
  }
2077
2123
  },
2124
+ {
2125
+ "version": 1,
2126
+ "context_length": 32768,
2127
+ "model_name": "qwen1.5-moe-chat",
2128
+ "model_lang": [
2129
+ "en",
2130
+ "zh"
2131
+ ],
2132
+ "model_ability": [
2133
+ "chat"
2134
+ ],
2135
+ "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
2136
+ "model_specs": [
2137
+ {
2138
+ "model_format": "pytorch",
2139
+ "model_size_in_billions": "2_7",
2140
+ "quantizations": [
2141
+ "4-bit",
2142
+ "8-bit",
2143
+ "none"
2144
+ ],
2145
+ "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat",
2146
+ "model_hub": "modelscope"
2147
+ },
2148
+ {
2149
+ "model_format": "gptq",
2150
+ "model_size_in_billions": "2_7",
2151
+ "quantizations": [
2152
+ "Int4"
2153
+ ],
2154
+ "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
2155
+ "model_hub": "modelscope"
2156
+ }
2157
+ ],
2158
+ "prompt_style": {
2159
+ "style_name": "QWEN",
2160
+ "system_prompt": "You are a helpful assistant.",
2161
+ "roles": [
2162
+ "user",
2163
+ "assistant"
2164
+ ],
2165
+ "intra_message_sep": "\n",
2166
+ "stop_token_ids": [
2167
+ 151643,
2168
+ 151644,
2169
+ 151645
2170
+ ],
2171
+ "stop": [
2172
+ "<|endoftext|>",
2173
+ "<|im_start|>",
2174
+ "<|im_end|>"
2175
+ ]
2176
+ }
2177
+ },
2178
+ {
2179
+ "version": 1,
2180
+ "context_length": 65536,
2181
+ "model_name": "codeqwen1.5-chat",
2182
+ "model_lang": [
2183
+ "en",
2184
+ "zh"
2185
+ ],
2186
+ "model_ability": [
2187
+ "chat"
2188
+ ],
2189
+ "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
2190
+ "model_specs": [
2191
+ {
2192
+ "model_format": "ggufv2",
2193
+ "model_size_in_billions": 7,
2194
+ "quantizations": [
2195
+ "q2_k",
2196
+ "q3_k_m",
2197
+ "q4_0",
2198
+ "q4_k_m",
2199
+ "q5_0",
2200
+ "q5_k_m",
2201
+ "q6_k",
2202
+ "q8_0"
2203
+ ],
2204
+ "model_id": "qwen/CodeQwen1.5-7B-Chat-GGUF",
2205
+ "model_hub": "modelscope",
2206
+ "model_file_name_template": "codeqwen-1_5-7b-chat-{quantization}.gguf"
2207
+ },
2208
+ {
2209
+ "model_format": "pytorch",
2210
+ "model_size_in_billions": 7,
2211
+ "quantizations": [
2212
+ "4-bit",
2213
+ "8-bit",
2214
+ "none"
2215
+ ],
2216
+ "model_id": "qwen/CodeQwen1.5-7B-Chat",
2217
+ "model_hub": "modelscope"
2218
+ },
2219
+ {
2220
+ "model_format": "awq",
2221
+ "model_size_in_billions": 7,
2222
+ "quantizations": [
2223
+ "Int4"
2224
+ ],
2225
+ "model_id": "qwen/CodeQwen1.5-7B-Chat-AWQ",
2226
+ "model_hub": "modelscope"
2227
+ }
2228
+ ],
2229
+ "prompt_style": {
2230
+ "style_name": "QWEN",
2231
+ "system_prompt": "You are a helpful assistant.",
2232
+ "roles": [
2233
+ "user",
2234
+ "assistant"
2235
+ ],
2236
+ "intra_message_sep": "\n",
2237
+ "stop_token_ids": [
2238
+ 151643,
2239
+ 151644,
2240
+ 151645
2241
+ ],
2242
+ "stop": [
2243
+ "<|endoftext|>",
2244
+ "<|im_start|>",
2245
+ "<|im_end|>"
2246
+ ]
2247
+ }
2248
+ },
2078
2249
  {
2079
2250
  "version": 1,
2080
2251
  "context_length": 4096,
@@ -2945,5 +3116,94 @@
2945
3116
  "</s>"
2946
3117
  ]
2947
3118
  }
3119
+ },
3120
+ {
3121
+ "version": 1,
3122
+ "context_length": 131072,
3123
+ "model_name": "c4ai-command-r-v01",
3124
+ "model_lang": [
3125
+ "en",
3126
+ "fr",
3127
+ "de",
3128
+ "es",
3129
+ "it",
3130
+ "pt",
3131
+ "ja",
3132
+ "ko",
3133
+ "zh",
3134
+ "ar"
3135
+ ],
3136
+ "model_ability": [
3137
+ "generate"
3138
+ ],
3139
+ "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
3140
+ "model_specs": [
3141
+ {
3142
+ "model_format": "pytorch",
3143
+ "model_size_in_billions": 35,
3144
+ "quantizations": [
3145
+ "none"
3146
+ ],
3147
+ "model_hub": "modelscope",
3148
+ "model_id": "AI-ModelScope/c4ai-command-r-v01",
3149
+ "model_revision": "master"
3150
+ },
3151
+ {
3152
+ "model_format": "ggufv2",
3153
+ "model_size_in_billions": 35,
3154
+ "quantizations": [
3155
+ "Q2_K",
3156
+ "Q4_K_M",
3157
+ "Q5_K_M"
3158
+ ],
3159
+ "model_id": "mirror013/C4AI-Command-R-v01-GGUF",
3160
+ "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf",
3161
+ "model_hub": "modelscope",
3162
+ "model_revision": "master"
3163
+ },
3164
+ {
3165
+ "model_format": "pytorch",
3166
+ "model_size_in_billions": 104,
3167
+ "quantizations": [
3168
+ "none"
3169
+ ],
3170
+ "model_hub": "modelscope",
3171
+ "model_id": "AI-ModelScope/c4ai-command-r-plus",
3172
+ "model_revision": "master"
3173
+ }
3174
+ ]
3175
+ },
3176
+ {
3177
+ "version": 1,
3178
+ "context_length": 131072,
3179
+ "model_name": "c4ai-command-r-v01-4bit",
3180
+ "model_lang": [
3181
+ "en",
3182
+ "fr",
3183
+ "de",
3184
+ "es",
3185
+ "it",
3186
+ "pt",
3187
+ "ja",
3188
+ "ko",
3189
+ "zh",
3190
+ "ar"
3191
+ ],
3192
+ "model_ability": [
3193
+ "generate"
3194
+ ],
3195
+ "model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
3196
+ "model_specs": [
3197
+ {
3198
+ "model_format": "pytorch",
3199
+ "model_size_in_billions": 35,
3200
+ "quantizations": [
3201
+ "none"
3202
+ ],
3203
+ "model_hub": "modelscope",
3204
+ "model_id": "mirror013/c4ai-command-r-v01-4bit",
3205
+ "model_revision": "master"
3206
+ }
3207
+ ]
2948
3208
  }
2949
3209
  ]
@@ -12,8 +12,9 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from typing import Optional
15
+ from typing import List, Optional
16
16
 
17
+ from ....types import LoRA
17
18
  from ..llm_family import LLMFamilyV1, LLMSpecV1
18
19
  from .core import PytorchChatModel, PytorchModelConfig
19
20
 
@@ -27,7 +28,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
27
28
  quantization: str,
28
29
  model_path: str,
29
30
  pytorch_model_config: Optional[PytorchModelConfig] = None,
30
- peft_model_path: Optional[str] = None,
31
+ peft_model: Optional[List[LoRA]] = None,
31
32
  ):
32
33
  super().__init__(
33
34
  model_uid,
@@ -36,7 +37,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
36
37
  quantization,
37
38
  model_path,
38
39
  pytorch_model_config=pytorch_model_config,
39
- peft_model_path=peft_model_path,
40
+ peft_model=peft_model,
40
41
  )
41
42
  self._use_fast_tokenizer = False
42
43
 
@@ -24,6 +24,7 @@ from ....types import (
24
24
  CompletionChoice,
25
25
  CompletionChunk,
26
26
  CompletionUsage,
27
+ LoRA,
27
28
  PytorchGenerateConfig,
28
29
  )
29
30
  from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -39,7 +40,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
39
40
  quantization: str,
40
41
  model_path: str,
41
42
  pytorch_model_config: Optional[PytorchModelConfig] = None,
42
- peft_model_path: Optional[str] = None,
43
+ peft_model: Optional[List[LoRA]] = None,
43
44
  ):
44
45
  super().__init__(
45
46
  model_uid,
@@ -48,7 +49,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
48
49
  quantization,
49
50
  model_path,
50
51
  pytorch_model_config=pytorch_model_config,
51
- peft_model_path=peft_model_path,
52
+ peft_model=peft_model,
52
53
  )
53
54
 
54
55
  def _load_model(self, **kwargs):
@@ -135,6 +136,8 @@ class ChatglmPytorchChatModel(PytorchChatModel):
135
136
  chat_history = [h for h in chat_history if not h.get("tool_calls")]
136
137
  if not chat_history:
137
138
  chat_history = []
139
+ if system_prompt:
140
+ chat_history.append({"role": "system", "content": system_prompt})
138
141
  if tools:
139
142
  msg = self._model.chat(
140
143
  self._tokenizer, prompt, [tools] + chat_history, **kwargs
@@ -32,6 +32,7 @@ from ....types import (
32
32
  Embedding,
33
33
  EmbeddingData,
34
34
  EmbeddingUsage,
35
+ LoRA,
35
36
  PytorchGenerateConfig,
36
37
  PytorchModelConfig,
37
38
  )
@@ -42,6 +43,25 @@ from ..utils import ChatModelMixin
42
43
 
43
44
  logger = logging.getLogger(__name__)
44
45
 
46
+ NON_DEFAULT_MODEL_LIST: List[str] = [
47
+ "baichuan-chat",
48
+ "baichuan-2-chat",
49
+ "vicuna-v1.3",
50
+ "falcon",
51
+ "falcon-instruct",
52
+ "chatglm",
53
+ "chatglm2",
54
+ "chatglm2-32k",
55
+ "chatglm2-128k",
56
+ "llama-2",
57
+ "llama-2-chat",
58
+ "internlm2-chat",
59
+ "qwen-vl-chat",
60
+ "OmniLMM",
61
+ "yi-vl-chat",
62
+ "deepseek-vl-chat",
63
+ ]
64
+
45
65
 
46
66
  class PytorchModel(LLM):
47
67
  def __init__(
@@ -52,14 +72,14 @@ class PytorchModel(LLM):
52
72
  quantization: str,
53
73
  model_path: str,
54
74
  pytorch_model_config: Optional[PytorchModelConfig] = None,
55
- peft_model_path: Optional[str] = None,
75
+ peft_model: Optional[List[LoRA]] = None,
56
76
  ):
57
77
  super().__init__(model_uid, model_family, model_spec, quantization, model_path)
58
78
  self._use_fast_tokenizer = True
59
79
  self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
60
80
  pytorch_model_config
61
81
  )
62
- self._peft_model_path = peft_model_path
82
+ self._peft_model = peft_model
63
83
 
64
84
  def _sanitize_model_config(
65
85
  self, pytorch_model_config: Optional[PytorchModelConfig]
@@ -115,7 +135,7 @@ class PytorchModel(LLM):
115
135
  return model, tokenizer
116
136
 
117
137
  def _apply_lora(self):
118
- if self._peft_model_path is not None:
138
+ if self._peft_model is not None:
119
139
  try:
120
140
  from peft import PeftModel
121
141
  except ImportError:
@@ -123,14 +143,15 @@ class PytorchModel(LLM):
123
143
  f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
124
144
  )
125
145
 
126
- # Apply LoRA
127
- self._model = PeftModel.from_pretrained(
128
- self._model,
129
- self._peft_model_path,
130
- )
131
- logger.info(
132
- f"Successfully loaded the PEFT adaptor for model {self.model_uid}."
133
- )
146
+ for peft_model in self._peft_model:
147
+ # Apply LoRA
148
+ self._model = PeftModel.from_pretrained(
149
+ self._model,
150
+ peft_model.local_path,
151
+ )
152
+ logger.info(
153
+ f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
154
+ )
134
155
 
135
156
  def load(self):
136
157
  try:
@@ -233,17 +254,7 @@ class PytorchModel(LLM):
233
254
  if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
234
255
  return False
235
256
  model_family = llm_family.model_family or llm_family.model_name
236
- if model_family in [
237
- "baichuan-chat",
238
- "vicuna-v1.3",
239
- "falcon",
240
- "falcon-instruct",
241
- "chatglm",
242
- "chatglm2",
243
- "chatglm2-32k",
244
- "llama-2",
245
- "llama-2-chat",
246
- ]:
257
+ if model_family in NON_DEFAULT_MODEL_LIST:
247
258
  return False
248
259
  if "generate" not in llm_family.model_ability:
249
260
  return False
@@ -412,7 +423,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
412
423
  quantization: str,
413
424
  model_path: str,
414
425
  pytorch_model_config: Optional[PytorchModelConfig] = None,
415
- peft_model_path: Optional[str] = None,
426
+ peft_model: Optional[List[LoRA]] = None,
416
427
  ):
417
428
  super().__init__(
418
429
  model_uid,
@@ -421,7 +432,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
421
432
  quantization,
422
433
  model_path,
423
434
  pytorch_model_config,
424
- peft_model_path,
435
+ peft_model,
425
436
  )
426
437
 
427
438
  def _sanitize_generate_config(
@@ -452,23 +463,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
452
463
  ) -> bool:
453
464
  if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
454
465
  return False
455
- if llm_family.model_name in [
456
- "baichuan-chat",
457
- "baichuan-2-chat",
458
- "vicuna-v1.3",
459
- "falcon",
460
- "falcon-instruct",
461
- "chatglm",
462
- "chatglm2",
463
- "chatglm2-32k",
464
- "llama-2",
465
- "llama-2-chat",
466
- "internlm2-chat",
467
- "qwen-vl-chat",
468
- "OmniLMM",
469
- "yi-vl-chat",
470
- "deepseek-vl-chat",
471
- ]:
466
+ model_family = llm_family.model_family or llm_family.model_name
467
+ if model_family in NON_DEFAULT_MODEL_LIST:
472
468
  return False
473
469
  if "chat" not in llm_family.model_ability:
474
470
  return False
@@ -12,8 +12,9 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from typing import Optional
15
+ from typing import List, Optional
16
16
 
17
+ from ....types import LoRA
17
18
  from ..llm_family import LLMFamilyV1, LLMSpecV1
18
19
  from .core import PytorchChatModel, PytorchModel, PytorchModelConfig
19
20
 
@@ -27,7 +28,7 @@ class FalconPytorchModel(PytorchModel):
27
28
  quantization: str,
28
29
  model_path: str,
29
30
  pytorch_model_config: Optional[PytorchModelConfig] = None,
30
- peft_model_path: Optional[str] = None,
31
+ peft_model: Optional[List[LoRA]] = None,
31
32
  ):
32
33
  super().__init__(
33
34
  model_uid,
@@ -36,7 +37,7 @@ class FalconPytorchModel(PytorchModel):
36
37
  quantization,
37
38
  model_path,
38
39
  pytorch_model_config=pytorch_model_config,
39
- peft_model_path=peft_model_path,
40
+ peft_model=peft_model,
40
41
  )
41
42
 
42
43
  def _load_model(self, **kwargs):
@@ -86,7 +87,7 @@ class FalconPytorchChatModel(PytorchChatModel):
86
87
  quantization: str,
87
88
  model_path: str,
88
89
  pytorch_model_config: Optional[PytorchModelConfig] = None,
89
- peft_model_path: Optional[str] = None,
90
+ peft_model: Optional[List[LoRA]] = None,
90
91
  ):
91
92
  super().__init__(
92
93
  model_uid,
@@ -95,7 +96,7 @@ class FalconPytorchChatModel(PytorchChatModel):
95
96
  quantization,
96
97
  model_path,
97
98
  pytorch_model_config=pytorch_model_config,
98
- peft_model_path=peft_model_path,
99
+ peft_model=peft_model,
99
100
  )
100
101
 
101
102
  def _load_model(self, **kwargs):
@@ -23,6 +23,7 @@ from ....types import (
23
23
  CompletionChoice,
24
24
  CompletionChunk,
25
25
  CompletionUsage,
26
+ LoRA,
26
27
  PytorchGenerateConfig,
27
28
  )
28
29
  from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -38,7 +39,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
38
39
  quantization: str,
39
40
  model_path: str,
40
41
  pytorch_model_config: Optional[PytorchModelConfig] = None,
41
- peft_model_path: Optional[str] = None,
42
+ peft_model: Optional[List[LoRA]] = None,
42
43
  ):
43
44
  super().__init__(
44
45
  model_uid,
@@ -47,7 +48,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
47
48
  quantization,
48
49
  model_path,
49
50
  pytorch_model_config=pytorch_model_config,
50
- peft_model_path=peft_model_path,
51
+ peft_model=peft_model,
51
52
  )
52
53
 
53
54
  def _load_model(self, **kwargs):
@@ -114,6 +115,8 @@ class Internlm2PytorchChatModel(PytorchChatModel):
114
115
  ]
115
116
  else:
116
117
  input_history = []
118
+ if system_prompt:
119
+ kwargs["meta_instruction"] = system_prompt
117
120
  if stream:
118
121
 
119
122
  def _stream_generator():
@@ -12,8 +12,9 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from typing import Optional
15
+ from typing import List, Optional
16
16
 
17
+ from ....types import LoRA
17
18
  from ..llm_family import LLMFamilyV1, LLMSpecV1
18
19
  from .core import PytorchChatModel, PytorchModel, PytorchModelConfig
19
20
 
@@ -27,7 +28,7 @@ class LlamaPytorchModel(PytorchModel):
27
28
  quantization: str,
28
29
  model_path: str,
29
30
  pytorch_model_config: Optional[PytorchModelConfig] = None,
30
- peft_model_path: Optional[str] = None,
31
+ peft_model: Optional[List[LoRA]] = None,
31
32
  ):
32
33
  super().__init__(
33
34
  model_uid,
@@ -36,7 +37,7 @@ class LlamaPytorchModel(PytorchModel):
36
37
  quantization,
37
38
  model_path,
38
39
  pytorch_model_config=pytorch_model_config,
39
- peft_model_path=peft_model_path,
40
+ peft_model=peft_model,
40
41
  )
41
42
 
42
43
  def _load_model(self, **kwargs):
@@ -69,8 +70,8 @@ class LlamaPytorchChatModel(PytorchChatModel):
69
70
  model_spec: "LLMSpecV1",
70
71
  quantization: str,
71
72
  model_path: str,
72
- peft_model_path: Optional[str] = None,
73
73
  pytorch_model_config: Optional["PytorchModelConfig"] = None,
74
+ peft_model: Optional[List[LoRA]] = None,
74
75
  ):
75
76
  super().__init__(
76
77
  model_uid,
@@ -78,7 +79,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
78
79
  model_spec,
79
80
  quantization,
80
81
  model_path,
81
- peft_model_path=peft_model_path,
82
+ peft_model=peft_model,
82
83
  pytorch_model_config=pytorch_model_config,
83
84
  )
84
85
  self._use_fast_tokenizer = False
@@ -53,6 +53,8 @@ class QwenVLChatModel(PytorchChatModel):
53
53
 
54
54
  device = self._pytorch_model_config.get("device", "auto")
55
55
  device = select_device(device)
56
+ # for multiple GPU, set back to auto to make multiple devices work
57
+ device = "auto" if device == "cuda" else device
56
58
 
57
59
  self._tokenizer = AutoTokenizer.from_pretrained(
58
60
  self.model_path,
@@ -26,8 +26,9 @@
26
26
  # See the License for the specific language governing permissions and
27
27
  # limitations under the License.
28
28
 
29
- from typing import Optional
29
+ from typing import List, Optional
30
30
 
31
+ from ....types import LoRA
31
32
  from .. import LLMFamilyV1, LLMSpecV1
32
33
  from .core import PytorchChatModel, PytorchModelConfig
33
34
 
@@ -41,7 +42,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
41
42
  quantization: str,
42
43
  model_path: str,
43
44
  pytorch_model_config: Optional["PytorchModelConfig"] = None,
44
- peft_model_path: Optional[str] = None,
45
+ peft_model: Optional[List[LoRA]] = None,
45
46
  ):
46
47
  super().__init__(
47
48
  model_uid,
@@ -50,7 +51,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
50
51
  quantization,
51
52
  model_path,
52
53
  pytorch_model_config=pytorch_model_config,
53
- peft_model_path=peft_model_path,
54
+ peft_model=peft_model,
54
55
  )
55
56
  self._use_fast_tokenizer = False
56
57