xinference 0.16.3__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (69)
  1. xinference/_compat.py +22 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +148 -12
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/model.py +45 -15
  7. xinference/core/supervisor.py +8 -2
  8. xinference/core/utils.py +67 -2
  9. xinference/model/audio/__init__.py +12 -0
  10. xinference/model/audio/core.py +21 -4
  11. xinference/model/audio/fish_speech.py +70 -35
  12. xinference/model/audio/model_spec.json +81 -1
  13. xinference/model/audio/whisper_mlx.py +208 -0
  14. xinference/model/embedding/core.py +259 -4
  15. xinference/model/embedding/model_spec.json +1 -1
  16. xinference/model/embedding/model_spec_modelscope.json +1 -1
  17. xinference/model/image/stable_diffusion/core.py +5 -2
  18. xinference/model/llm/__init__.py +2 -0
  19. xinference/model/llm/llm_family.json +485 -6
  20. xinference/model/llm/llm_family_modelscope.json +519 -0
  21. xinference/model/llm/mlx/core.py +45 -3
  22. xinference/model/llm/sglang/core.py +1 -0
  23. xinference/model/llm/transformers/core.py +1 -0
  24. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  25. xinference/model/llm/utils.py +19 -0
  26. xinference/model/llm/vllm/core.py +84 -2
  27. xinference/model/rerank/core.py +11 -4
  28. xinference/thirdparty/fish_speech/fish_speech/conversation.py +254 -0
  29. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  30. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  31. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  32. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  33. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  34. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +76 -11
  35. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  36. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  37. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +32 -1
  38. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  39. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  40. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  41. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  42. xinference/thirdparty/fish_speech/tools/api.py +578 -75
  43. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  44. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  45. xinference/thirdparty/fish_speech/tools/llama/generate.py +393 -9
  46. xinference/thirdparty/fish_speech/tools/msgpack_api.py +90 -29
  47. xinference/thirdparty/fish_speech/tools/post_api.py +37 -15
  48. xinference/thirdparty/fish_speech/tools/schema.py +187 -0
  49. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  50. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  51. xinference/thirdparty/fish_speech/tools/webui.py +138 -75
  52. xinference/types.py +2 -1
  53. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/METADATA +30 -6
  54. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/RECORD +58 -63
  55. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/WHEEL +1 -1
  56. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  57. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  58. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  59. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  60. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  61. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  62. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  63. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  64. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  65. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  66. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  67. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/LICENSE +0 -0
  68. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/entry_points.txt +0 -0
  69. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/top_level.txt +0 -0
@@ -5907,6 +5907,18 @@
         ],
         "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
         "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-0.5B",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": "1_5",
@@ -5919,6 +5931,18 @@
                 "model_revision": "master",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "3",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-3B",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 7,
@@ -5930,6 +5954,30 @@
                 "model_id": "qwen/Qwen2.5-Coder-7B",
                 "model_revision": "master",
                 "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-14B",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-32B",
+                "model_revision": "master",
+                "model_hub": "modelscope"
             }
         ]
     },
@@ -5947,6 +5995,18 @@
         ],
         "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
         "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": "1_5",
@@ -5958,6 +6018,17 @@
                 "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct",
                 "model_revision": "master",
                 "model_hub": "modelscope"
+            }, {
+                "model_format": "pytorch",
+                "model_size_in_billions": "3",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-3B-Instruct",
+                "model_revision": "master",
+                "model_hub": "modelscope"
             },
             {
                 "model_format": "pytorch",
@@ -5971,6 +6042,63 @@
                 "model_revision": "master",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-14B-Instruct",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-32B-Instruct",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-{quantization}",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-{quantization}",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 3,
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-{quantization}",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "gptq",
                 "model_size_in_billions": 7,
@@ -5982,6 +6110,89 @@
                 "model_revision": "master",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-{quantization}",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-{quantization}",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 3,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-3B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-7B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-14B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+
             {
                 "model_format": "ggufv2",
                 "model_size_in_billions": "1_5",
@@ -6056,5 +6267,313 @@
             "<|im_start|>",
             "<|im_end|>"
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QwQ-32B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QwQ-32B-Preview",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit"
+                ],
+                "model_id": "okwinds/QwQ-32B-Preview-MLX-4bit",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "8-bit"
+                ],
+                "model_id": "okwinds/QwQ-32B-Preview-MLX-8bit",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "Q3_K_L",
+                    "Q4_K_M",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
+                "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf"
+            }
+        ],
+        "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+        "stop_token_ids": [
+            151643,
+            151644,
+            151645
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|im_start|>",
+            "<|im_end|>"
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "glm-edge-chat",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "The GLM-Edge series is our attempt to face the end-side real-life scenarios, which consists of two sizes of large-language dialogue models and multimodal comprehension models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, GLM-Edge-V-5B). Among them, the 1.5B / 2B model is mainly for platforms such as mobile phones and cars, and the 4B / 5B model is mainly for platforms such as PCs.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "ZhipuAI/glm-edge-1.5b-chat",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "4",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "ZhipuAI/glm-edge-4b-chat",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "Q4_0",
+                    "Q4_1",
+                    "Q4_K",
+                    "Q4_K_M",
+                    "Q4_K_S",
+                    "Q5_0",
+                    "Q5_1",
+                    "Q5_K",
+                    "Q5_K_M",
+                    "Q5_K_S",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_file_name_template": "ggml-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-1.5b-chat-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "F16"
+                ],
+                "model_file_name_template": "glm-edge-1.5B-chat-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-1.5b-chat-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "4",
+                "quantizations": [
+                    "Q4_0",
+                    "Q4_1",
+                    "Q4_K",
+                    "Q4_K_M",
+                    "Q4_K_S",
+                    "Q5_0",
+                    "Q5_1",
+                    "Q5_K",
+                    "Q5_K_M",
+                    "Q5_K_S",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_file_name_template": "ggml-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-4b-chat-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "4",
+                "quantizations": [
+                    "F16"
+                ],
+                "model_file_name_template": "glm-edge-4B-chat-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-4b-chat-gguf"
+            }
+        ],
+        "chat_template": "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}",
+        "stop_token_ids": [
+            59246,
+            59253,
+            59255
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|user|>",
+            "<|observation|>"
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "glm-edge-v",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "The GLM-Edge series is our attempt to face the end-side real-life scenarios, which consists of two sizes of large-language dialogue models and multimodal comprehension models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, GLM-Edge-V-5B). Among them, the 1.5B / 2B model is mainly for platforms such as mobile phones and cars, and the 4B / 5B model is mainly for platforms such as PCs.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "2",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "ZhipuAI/glm-edge-v-2b",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "ZhipuAI/glm-edge-v-5b",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "2",
+                "quantizations": [
+                    "Q4_0",
+                    "Q4_1",
+                    "Q4_K",
+                    "Q4_K_M",
+                    "Q4_K_S",
+                    "Q5_0",
+                    "Q5_1",
+                    "Q5_K",
+                    "Q5_K_M",
+                    "Q5_K_S",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_file_name_template": "ggml-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "2",
+                "quantizations": [
+                    "F16"
+                ],
+                "model_file_name_template": "glm-edge-v-2B-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "2",
+                "quantizations": [
+                    "f16"
+                ],
+                "model_file_name_template": "mmproj-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "5",
+                "quantizations": [
+                    "Q4_0",
+                    "Q4_1",
+                    "Q4_K",
+                    "Q4_K_M",
+                    "Q4_K_S",
+                    "Q5_0",
+                    "Q5_1",
+                    "Q5_K",
+                    "Q5_K_M",
+                    "Q5_K_S",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_file_name_template": "ggml-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "5",
+                "quantizations": [
+                    "F16"
+                ],
+                "model_file_name_template": "glm-edge-v-5B-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "5",
+                "quantizations": [
+                    "f16"
+                ],
+                "model_file_name_template": "mmproj-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+            }
+        ],
+        "chat_template": "{% for item in messages %}{% if item['role'] != 'system' %}<|{{ item['role'] }}|>\n{% for content in item['content'] %}{% if content['type'] == 'image' %}{% for _ in range(578) %}<|begin_of_image|>{% endfor %}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}",
+        "stop_token_ids": [
+            59246,
+            59253,
+            59255
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|user|>",
+            "<|observation|>"
+        ]
     }
 ]
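
Note on the specs above (not part of the diff): GPTQ and AWQ entries keep a {quantization} placeholder inside "model_id", and GGUF entries add a "model_file_name_template", so a single spec covers several quantized repos or files. The following is a minimal, illustrative Python sketch of how such a spec could be expanded into a concrete ModelScope repo id and file name; the resolve_spec helper is hypothetical and is not xinference's actual loader code.

# Illustrative only: expand a spec dict like the ones added above into a
# concrete repo id and (for GGUF) a file name. Field names mirror the JSON;
# the helper itself is an assumption, not part of xinference.
def resolve_spec(spec: dict, quantization: str) -> dict:
    resolved = {
        # "qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-{quantization}" -> "...-GPTQ-Int4"
        "repo_id": spec["model_id"].format(quantization=quantization),
        "revision": spec.get("model_revision", "master"),
        "hub": spec.get("model_hub", "huggingface"),
    }
    # GGUF specs additionally carry a per-quantization file name template.
    if "model_file_name_template" in spec:
        resolved["file_name"] = spec["model_file_name_template"].format(
            quantization=quantization
        )
    return resolved


gptq_spec = {
    "model_format": "gptq",
    "model_size_in_billions": 32,
    "quantizations": ["Int4", "Int8"],
    "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-{quantization}",
    "model_revision": "master",
    "model_hub": "modelscope",
}
print(resolve_spec(gptq_spec, "Int4"))
# {'repo_id': 'qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', 'revision': 'master', 'hub': 'modelscope'}
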
@@ -17,7 +17,8 @@ import platform
 import sys
 import time
 import uuid
-from typing import Dict, Iterator, List, Optional, TypedDict, Union
+from dataclasses import dataclass, field
+from typing import Any, Dict, Iterator, List, Optional, Tuple, TypedDict, Union
 
 from ....fields import max_tokens_field
 from ....types import (
@@ -53,6 +54,14 @@ class MLXGenerateConfig(TypedDict, total=False):
     stream: bool
     stream_options: Optional[Union[dict, None]]
     tools: Optional[List[Dict]]
+    lora_name: Optional[str]
+
+
+@dataclass
+class PromptCache:
+    cache: List[Any] = field(default_factory=list)
+    model_key: Tuple[str, Optional[str]] = ("", None)
+    tokens: List[int] = field(default_factory=list)
 
 
 class MLXModel(LLM):
@@ -69,6 +78,8 @@ class MLXModel(LLM):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._model_config: MLXModelConfig = self._sanitize_model_config(model_config)
+        self._max_kv_size = None
+        self._prompt_cache = None
         if peft_model is not None:
             raise ValueError("MLX engine has not supported lora yet")
 
@@ -127,6 +138,9 @@
             logger.debug(f"Setting cache limit to {cache_limit_gb} GB")
             mx.metal.set_cache_limit(cache_limit_gb * 1024 * 1024 * 1024)
 
+        self._max_kv_size = kwargs.get("max_kv_size", None)
+        self._prompt_cache = PromptCache()
+
         return load(
             self.model_path,
             tokenizer_config=tokenizer_config,
@@ -156,6 +170,27 @@
             return False
         return True
 
+    def _get_prompt_cache(self, prompt, lora_name: Optional[str] = None):
+        from mlx_lm.models.cache import make_prompt_cache
+
+        assert self._prompt_cache is not None
+        cache_len = len(self._prompt_cache.tokens)
+        model_key = (self.model_path, lora_name)
+        if (
+            self._prompt_cache.model_key != model_key
+            or cache_len >= len(prompt)
+            or self._prompt_cache.tokens != prompt[:cache_len]
+        ):
+            self._prompt_cache.model_key = model_key
+            self._prompt_cache.cache = make_prompt_cache(self._model, self._max_kv_size)
+            self._prompt_cache.tokens = []
+            logger.debug("Making new prompt cache for %s", self.model_uid)
+        else:
+            prompt = prompt[cache_len:]
+            logger.debug("Cache hit for %s", self.model_uid)
+        self._prompt_cache.tokens.extend(prompt)
+        return prompt
+
     def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig):
         import mlx.core as mx
         from mlx_lm.utils import generate_step
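
The new _get_prompt_cache method in xinference/model/llm/mlx/core.py reuses the previous request's KV cache when the new prompt starts with the already-cached tokens (and the model/LoRA pair is unchanged), so only the un-cached suffix is fed to generate_step; otherwise it rebuilds the cache via make_prompt_cache. Below is a dependency-free sketch of that prefix-reuse rule, with plain Python lists standing in for the MLX KV cache; the names here are illustrative, not xinference code.

# Standalone sketch of the prefix-reuse rule used by _get_prompt_cache above.
# Plain lists stand in for mlx_lm's KV cache objects.
from dataclasses import dataclass, field
from typing import List, Optional, Tuple


@dataclass
class _Cache:
    model_key: Tuple[str, Optional[str]] = ("", None)
    tokens: List[int] = field(default_factory=list)


def reuse_prefix(cache: _Cache, prompt: List[int], model_key) -> List[int]:
    cached = len(cache.tokens)
    if (
        cache.model_key != model_key        # different model or LoRA adapter
        or cached >= len(prompt)            # prompt not longer than the cache
        or cache.tokens != prompt[:cached]  # cached tokens are not a prefix
    ):
        cache.model_key = model_key
        cache.tokens = []                   # rebuild: feed the whole prompt
    else:
        prompt = prompt[cached:]            # cache hit: feed only the new suffix
    cache.tokens.extend(prompt)
    return prompt


c = _Cache()
print(reuse_prefix(c, [1, 2, 3, 4], ("m", None)))        # [1, 2, 3, 4] (cold cache)
print(reuse_prefix(c, [1, 2, 3, 4, 5, 6], ("m", None)))  # [5, 6] (prefix reused)

In the real code the generated tokens are also appended to the cache after decoding (see the hunk at line 270 below), so a follow-up turn that extends the same conversation hits the cache again.
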
@@ -167,6 +202,7 @@
         chunk_id = str(uuid.uuid4())
         stop_token_ids = kwargs.get("stop_token_ids", [])
         stream = kwargs.get("stream", False)
+        lora_name = kwargs.get("lora_name")
         stream_options = kwargs.pop("stream_options", None)
         include_usage = (
             stream_options["include_usage"]
@@ -174,12 +210,15 @@
             else False
         )
 
-        prompt_tokens = mx.array(tokenizer.encode(prompt))
+        prompt_token_ids = tokenizer.encode(prompt)
+        prompt_token_ids = self._get_prompt_cache(prompt_token_ids, lora_name)
+        prompt_tokens = mx.array(prompt_token_ids)
         input_echo_len = len(prompt_tokens)
 
         i = 0
         start = time.time()
         output = ""
+        tokens = []
         for (token, _), i in zip(
             generate_step(
                 prompt_tokens,
@@ -188,10 +227,11 @@
                 repetition_penalty=kwargs["repetition_penalty"],
                 repetition_context_size=kwargs["repetition_context_size"],
                 top_p=kwargs["top_p"],
-                logit_bias=kwargs["logit_bias"],
+                prompt_cache=self._prompt_cache.cache,  # type: ignore
             ),
             range(max_tokens),
         ):
+            tokens.append(token)
             if token == tokenizer.eos_token_id or token in stop_token_ids:  # type: ignore
                 break
 
@@ -230,6 +270,8 @@
             f"Average generation speed: {i / (time.time() - start):.2f} tokens/s."
         )
 
+        self._prompt_cache.tokens.extend(tokens)  # type: ignore
+
         if i == max_tokens - 1:
             finish_reason = "length"
         else:
@@ -89,6 +89,7 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "deepseek-v2-chat-0628",
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
+    "QwQ-32B-Preview",
 ]
 
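
With QwQ-32B-Preview registered both in the ModelScope model family above and in SGLANG_SUPPORTED_CHAT_MODELS (xinference/model/llm/sglang/core.py), it can be launched like any other built-in chat model. The following is a hedged sketch against a running xinference endpoint; the endpoint URL and the exact launch_model / chat keyword arguments (notably model_engine) are assumptions and should be checked against this release's client.

# Hedged sketch: launch and query the newly registered QwQ-32B-Preview model.
# Requires a running xinference server; keyword arguments are assumptions.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="QwQ-32B-Preview",
    model_engine="sglang",
    model_format="pytorch",
    model_size_in_billions=32,
    quantization="none",
)
model = client.get_model(model_uid)
print(model.chat(messages=[{"role": "user", "content": "How many r's are in 'strawberry'?"}]))
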