xinference 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

@@ -3411,8 +3411,8 @@
  "8-bit",
  "none"
  ],
- "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
- "model_revision": "9552e7b1d9b2d5bbd87a5aa7221817285dbb6366"
+ "model_id": "mistralai/Codestral-22B-v0.1",
+ "model_revision": "8f5fe23af91885222a1563283c87416745a5e212"
  },
  {
  "model_format": "ggufv2",
@@ -8516,5 +8516,312 @@
  "<|im_start|>",
  "<|im_end|>"
  ]
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "QwQ-32B-Preview",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/QwQ-32B-Preview"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "KirillR/QwQ-32B-Preview-AWQ"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "Q3_K_L",
+ "Q4_K_M",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_id": "lmstudio-community/QwQ-32B-Preview-GGUF",
+ "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf"
+ },
+ {
+ "model_format": "mlx",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "4-bit"
+ ],
+ "model_id": "mlx-community/Qwen_QwQ-32B-Preview_MLX-4bit"
+ },
+ {
+ "model_format": "mlx",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "8-bit"
+ ],
+ "model_id": "mlx-community/Qwen_QwQ-32B-Preview_MLX-8bit"
+ },
+ {
+ "model_format": "mlx",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "mlx-community/QwQ-32B-Preview-bf16"
+ }
+ ],
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "stop_token_ids": [
+ 151643,
+ 151644,
+ 151645
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 8192,
+ "model_name": "glm-edge-chat",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "The GLM-Edge series is our attempt to face the end-side real-life scenarios, which consists of two sizes of large-language dialogue models and multimodal comprehension models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, GLM-Edge-V-5B). Among them, the 1.5B / 2B model is mainly for platforms such as mobile phones and cars, and the 4B / 5B model is mainly for platforms such as PCs.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "THUDM/glm-edge-1.5b-chat"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "4",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "THUDM/glm-edge-4b-chat"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "Q4_0",
+ "Q4_1",
+ "Q4_K",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_file_name_template": "ggml-model-{quantization}.gguf",
+ "model_id": "THUDM/glm-edge-1.5b-chat-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "F16"
+ ],
+ "model_file_name_template": "glm-edge-1.5B-chat-{quantization}.gguf",
+ "model_id": "THUDM/glm-edge-1.5b-chat-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "4",
+ "quantizations": [
+ "Q4_0",
+ "Q4_1",
+ "Q4_K",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_file_name_template": "ggml-model-{quantization}.gguf",
+ "model_id": "THUDM/glm-edge-4b-chat-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "4",
+ "quantizations": [
+ "F16"
+ ],
+ "model_file_name_template": "glm-edge-4B-chat-{quantization}.gguf",
+ "model_id": "THUDM/glm-edge-4b-chat-gguf"
+ }
+ ],
+ "chat_template": "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}",
+ "stop_token_ids": [
+ 59246,
+ 59253,
+ 59255
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|user|>",
+ "<|observation|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 8192,
+ "model_name": "glm-edge-v",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "vision"
+ ],
+ "model_description": "The GLM-Edge series is our attempt to face the end-side real-life scenarios, which consists of two sizes of large-language dialogue models and multimodal comprehension models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, GLM-Edge-V-5B). Among them, the 1.5B / 2B model is mainly for platforms such as mobile phones and cars, and the 4B / 5B model is mainly for platforms such as PCs.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "2",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "THUDM/glm-edge-v-2b"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "THUDM/glm-edge-v-5b"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "2",
+ "quantizations": [
+ "Q4_0",
+ "Q4_1",
+ "Q4_K",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_file_name_template": "ggml-model-{quantization}.gguf",
+ "model_id": "THUDM/glm-edge-v-2b-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "2",
+ "quantizations": [
+ "F16"
+ ],
+ "model_file_name_template": "glm-edge-v-2B-{quantization}.gguf",
+ "model_id": "THUDM/glm-edge-v-2b-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "2",
+ "quantizations": [
+ "f16"
+ ],
+ "model_file_name_template": "mmproj-model-{quantization}.gguf",
+ "model_id": "THUDM/glm-edge-v-2b-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "5",
+ "quantizations": [
+ "Q4_0",
+ "Q4_1",
+ "Q4_K",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_file_name_template": "ggml-model-{quantization}.gguf",
+ "model_id": "THUDM/glm-edge-v-5b-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "5",
+ "quantizations": [
+ "F16"
+ ],
+ "model_file_name_template": "glm-edge-v-5B-{quantization}.gguf",
+ "model_id": "THUDM/glm-edge-v-5b-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "5",
+ "quantizations": [
+ "f16"
+ ],
+ "model_file_name_template": "mmproj-model-{quantization}.gguf",
+ "model_id": "THUDM/glm-edge-v-5b-gguf"
+ }
+ ],
+ "chat_template": "{% for item in messages %}{% if item['role'] != 'system' %}<|{{ item['role'] }}|>\n{% for content in item['content'] %}{% if content['type'] == 'image' %}{% for _ in range(578) %}<|begin_of_image|>{% endfor %}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}",
+ "stop_token_ids": [
+ 59246,
+ 59253,
+ 59255
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|user|>",
+ "<|observation|>"
+ ]
  }
  ]
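
This hunk registers three new built-in model families — QwQ-32B-Preview, glm-edge-chat, and glm-edge-v — in the Hugging Face-side JSON registry; the next hunk mirrors them for ModelScope. Once registered, an entry can be launched by name. A minimal sketch, assuming a local xinference supervisor at http://localhost:9997 and that your version's launch parameters match the registry fields above (parameter requirements such as model_engine vary across xinference versions):

# Sketch: launching the newly registered QwQ-32B-Preview entry by name.
# Assumes a running local xinference supervisor; check your version's docs
# for which launch parameters are required.
from xinference.client import Client

client = Client("http://localhost:9997")
model_uid = client.launch_model(
    model_name="QwQ-32B-Preview",  # "model_name" from the new entry
    model_engine="llama.cpp",      # engine matching the ggufv2 spec
    model_format="ggufv2",         # one of the registered "model_specs"
    quantization="Q4_K_M",         # one of that spec's "quantizations"
)
print(model_uid)  # handle for subsequent chat/generate calls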
@@ -6267,5 +6267,313 @@
  "<|im_start|>",
  "<|im_end|>"
  ]
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "QwQ-32B-Preview",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/QwQ-32B-Preview",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "mlx",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "4-bit"
+ ],
+ "model_id": "okwinds/QwQ-32B-Preview-MLX-4bit",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "mlx",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "8-bit"
+ ],
+ "model_id": "okwinds/QwQ-32B-Preview-MLX-8bit",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "Q3_K_L",
+ "Q4_K_M",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
+ "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf"
+ }
+ ],
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "stop_token_ids": [
+ 151643,
+ 151644,
+ 151645
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 8192,
+ "model_name": "glm-edge-chat",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "The GLM-Edge series is our attempt to face the end-side real-life scenarios, which consists of two sizes of large-language dialogue models and multimodal comprehension models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, GLM-Edge-V-5B). Among them, the 1.5B / 2B model is mainly for platforms such as mobile phones and cars, and the 4B / 5B model is mainly for platforms such as PCs.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "ZhipuAI/glm-edge-1.5b-chat",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "4",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "ZhipuAI/glm-edge-4b-chat",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "Q4_0",
+ "Q4_1",
+ "Q4_K",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_file_name_template": "ggml-model-{quantization}.gguf",
+ "model_hub": "modelscope",
+ "model_id": "ZhipuAI/glm-edge-1.5b-chat-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "F16"
+ ],
+ "model_file_name_template": "glm-edge-1.5B-chat-{quantization}.gguf",
+ "model_hub": "modelscope",
+ "model_id": "ZhipuAI/glm-edge-1.5b-chat-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "4",
+ "quantizations": [
+ "Q4_0",
+ "Q4_1",
+ "Q4_K",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_file_name_template": "ggml-model-{quantization}.gguf",
+ "model_hub": "modelscope",
+ "model_id": "ZhipuAI/glm-edge-4b-chat-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "4",
+ "quantizations": [
+ "F16"
+ ],
+ "model_file_name_template": "glm-edge-4B-chat-{quantization}.gguf",
+ "model_hub": "modelscope",
+ "model_id": "ZhipuAI/glm-edge-4b-chat-gguf"
+ }
+ ],
+ "chat_template": "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}",
+ "stop_token_ids": [
+ 59246,
+ 59253,
+ 59255
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|user|>",
+ "<|observation|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 8192,
+ "model_name": "glm-edge-v",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "vision"
+ ],
+ "model_description": "The GLM-Edge series is our attempt to face the end-side real-life scenarios, which consists of two sizes of large-language dialogue models and multimodal comprehension models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, GLM-Edge-V-5B). Among them, the 1.5B / 2B model is mainly for platforms such as mobile phones and cars, and the 4B / 5B model is mainly for platforms such as PCs.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "2",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "ZhipuAI/glm-edge-v-2b",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "ZhipuAI/glm-edge-v-5b",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "2",
+ "quantizations": [
+ "Q4_0",
+ "Q4_1",
+ "Q4_K",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_file_name_template": "ggml-model-{quantization}.gguf",
+ "model_hub": "modelscope",
+ "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "2",
+ "quantizations": [
+ "F16"
+ ],
+ "model_file_name_template": "glm-edge-v-2B-{quantization}.gguf",
+ "model_hub": "modelscope",
+ "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "2",
+ "quantizations": [
+ "f16"
+ ],
+ "model_file_name_template": "mmproj-model-{quantization}.gguf",
+ "model_hub": "modelscope",
+ "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "5",
+ "quantizations": [
+ "Q4_0",
+ "Q4_1",
+ "Q4_K",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_file_name_template": "ggml-model-{quantization}.gguf",
+ "model_hub": "modelscope",
+ "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "5",
+ "quantizations": [
+ "F16"
+ ],
+ "model_file_name_template": "glm-edge-v-5B-{quantization}.gguf",
+ "model_hub": "modelscope",
+ "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "5",
+ "quantizations": [
+ "f16"
+ ],
+ "model_file_name_template": "mmproj-model-{quantization}.gguf",
+ "model_hub": "modelscope",
+ "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+ }
+ ],
+ "chat_template": "{% for item in messages %}{% if item['role'] != 'system' %}<|{{ item['role'] }}|>\n{% for content in item['content'] %}{% if content['type'] == 'image' %}{% for _ in range(578) %}<|begin_of_image|>{% endfor %}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}",
+ "stop_token_ids": [
+ 59246,
+ 59253,
+ 59255
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|user|>",
+ "<|observation|>"
+ ]
  }
  ]
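
Each entry's "chat_template" is a plain Jinja2 template that xinference renders over the conversation's messages to build the final prompt string. A standalone sketch of what the glm-edge-chat template above produces, using plain jinja2 (simplified relative to xinference's actual rendering pipeline):

# Sketch: rendering the glm-edge-chat template outside xinference to inspect
# the prompt it builds. Requires `pip install jinja2`.
from jinja2 import Template

chat_template = (
    "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n"
    "{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n"
    "{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n"
    "{{ item['content'] }}{% endif %}{% endfor %}"
    "{% if add_generation_prompt %}<|assistant|>\n{% endif %}"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = Template(chat_template).render(messages=messages, add_generation_prompt=True)
print(prompt)
# <|system|>
# You are a helpful assistant.<|user|>
# Hello!<|assistant|>

The "stop" strings and "stop_token_ids" in the same entries tell the runtime where generation must halt for these templates (e.g. when the model starts emitting the next <|user|> turn).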
@@ -227,7 +227,6 @@ class MLXModel(LLM):
  repetition_penalty=kwargs["repetition_penalty"],
  repetition_context_size=kwargs["repetition_context_size"],
  top_p=kwargs["top_p"],
- logit_bias=kwargs["logit_bias"],
  prompt_cache=self._prompt_cache.cache, # type: ignore
  ),
  range(max_tokens),
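
Dropping logit_bias here suggests the underlying mlx-lm generation call no longer accepts that keyword, so forwarding it would presumably fail with a TypeError. A generic defensive pattern against this kind of upstream signature drift (an illustrative sketch, not xinference's actual fix — the actual fix, as the hunk shows, is simply removing the argument):

# Sketch: drop kwargs the target callable does not accept, so an upstream
# signature change (like logit_bias disappearing) degrades gracefully
# instead of raising TypeError. Generic illustration, not xinference code.
import inspect
from typing import Any, Callable, Dict

def accepted_kwargs(func: Callable, kwargs: Dict[str, Any]) -> Dict[str, Any]:
    params = inspect.signature(func).parameters
    # If the callee takes **kwargs, everything is accepted as-is.
    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()):
        return dict(kwargs)
    return {k: v for k, v in kwargs.items() if k in params}

def generate_step(temp: float = 0.0, top_p: float = 1.0):  # stand-in for the mlx-lm callee
    return temp, top_p

print(accepted_kwargs(generate_step, {"top_p": 0.9, "logit_bias": {0: -100.0}}))
# {'top_p': 0.9}  -- the stale kwarg is silently dropped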
@@ -89,6 +89,7 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
  "deepseek-v2-chat-0628",
  "qwen2.5-instruct",
  "qwen2.5-coder-instruct",
+ "QwQ-32B-Preview",
  ]


@@ -68,6 +68,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
  "deepseek-v2-chat",
  "deepseek-v2.5",
  "deepseek-v2-chat-0628",
+ "glm-edge-v",
  ]

