xinference 0.10.3__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (101)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +1 -1
  3. xinference/api/restful_api.py +53 -61
  4. xinference/client/restful/restful_client.py +52 -57
  5. xinference/conftest.py +1 -1
  6. xinference/core/cache_tracker.py +1 -1
  7. xinference/core/chat_interface.py +10 -4
  8. xinference/core/event.py +1 -1
  9. xinference/core/model.py +17 -6
  10. xinference/core/status_guard.py +1 -1
  11. xinference/core/supervisor.py +58 -72
  12. xinference/core/worker.py +68 -101
  13. xinference/deploy/cmdline.py +166 -1
  14. xinference/deploy/test/test_cmdline.py +2 -0
  15. xinference/deploy/utils.py +1 -1
  16. xinference/device_utils.py +29 -3
  17. xinference/fields.py +7 -1
  18. xinference/model/audio/whisper.py +88 -12
  19. xinference/model/core.py +2 -2
  20. xinference/model/image/__init__.py +29 -0
  21. xinference/model/image/core.py +6 -0
  22. xinference/model/image/custom.py +109 -0
  23. xinference/model/llm/__init__.py +92 -32
  24. xinference/model/llm/core.py +57 -102
  25. xinference/model/llm/ggml/chatglm.py +98 -13
  26. xinference/model/llm/ggml/llamacpp.py +49 -2
  27. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
  28. xinference/model/llm/llm_family.json +438 -7
  29. xinference/model/llm/llm_family.py +45 -41
  30. xinference/model/llm/llm_family_modelscope.json +258 -5
  31. xinference/model/llm/pytorch/chatglm.py +48 -0
  32. xinference/model/llm/pytorch/core.py +23 -6
  33. xinference/model/llm/pytorch/deepseek_vl.py +115 -33
  34. xinference/model/llm/pytorch/internlm2.py +32 -1
  35. xinference/model/llm/pytorch/qwen_vl.py +94 -12
  36. xinference/model/llm/pytorch/utils.py +38 -1
  37. xinference/model/llm/pytorch/yi_vl.py +96 -51
  38. xinference/model/llm/sglang/core.py +31 -9
  39. xinference/model/llm/utils.py +54 -20
  40. xinference/model/llm/vllm/core.py +101 -7
  41. xinference/thirdparty/omnilmm/chat.py +2 -1
  42. xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
  43. xinference/types.py +11 -0
  44. xinference/web/ui/build/asset-manifest.json +6 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.54bca460.css +2 -0
  47. xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.551aa479.js +3 -0
  49. xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +7 -0
  50. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
  68. xinference/web/ui/node_modules/.package-lock.json +33 -0
  69. xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
  70. xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
  71. xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
  72. xinference/web/ui/node_modules/clipboard/bower.json +18 -0
  73. xinference/web/ui/node_modules/clipboard/composer.json +25 -0
  74. xinference/web/ui/node_modules/clipboard/package.json +63 -0
  75. xinference/web/ui/node_modules/delegate/package.json +31 -0
  76. xinference/web/ui/node_modules/good-listener/bower.json +11 -0
  77. xinference/web/ui/node_modules/good-listener/package.json +35 -0
  78. xinference/web/ui/node_modules/select/bower.json +13 -0
  79. xinference/web/ui/node_modules/select/package.json +29 -0
  80. xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
  81. xinference/web/ui/package-lock.json +34 -0
  82. xinference/web/ui/package.json +1 -0
  83. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/METADATA +13 -12
  84. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/RECORD +88 -67
  85. xinference/client/oscar/__init__.py +0 -13
  86. xinference/client/oscar/actor_client.py +0 -611
  87. xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
  88. xinference/model/llm/pytorch/spec_model.py +0 -186
  89. xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
  90. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  93. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
  94. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
  95. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
  96. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
  97. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
  98. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
  99. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
  100. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
  101. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json

@@ -413,7 +413,7 @@
         ],
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/chatglm3-6b",
-        "model_revision": "v1.0.0"
+        "model_revision": "v1.0.2"
       }
     ],
     "prompt_style": {
@@ -1289,7 +1289,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 262144,
     "model_name": "Yi-200k",
     "model_lang": [
       "en",
@@ -1328,7 +1328,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 4096,
     "model_name": "Yi-chat",
     "model_lang": [
       "en",
@@ -1349,6 +1349,18 @@
         "model_id": "01ai/Yi-34B-Chat-{quantization}",
         "model_revision": "master"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-6B-Chat",
+        "model_revision": "master"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 34,
@@ -1385,6 +1397,130 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Yi-1.5",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-6B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-9B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-34B",
+        "model_revision": "master"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Yi-1.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-6B-Chat",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-9B-Chat",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-34B-Chat",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATML",
+      "system_prompt": "",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "inter_message_sep": "",
+      "stop_token_ids": [
+        2,
+        6,
+        7,
+        8
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>",
+        "<|im_sep|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -1937,6 +2073,17 @@
         "model_id": "qwen/Qwen1.5-72B-Chat",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": "0_5",
@@ -2006,6 +2153,15 @@
         "model_id": "qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-GPTQ-Int4",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": "0_5",
@@ -2069,6 +2225,15 @@
         "model_id": "qwen/Qwen1.5-72B-Chat-AWQ",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-AWQ",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",
@@ -2267,7 +2432,7 @@
   },
   {
     "version": 1,
-    "context_length": 32768,
+    "context_length": 65536,
     "model_name": "codeqwen1.5-chat",
     "model_lang": [
       "en",
@@ -2726,7 +2891,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 4096,
     "model_name": "yi-vl-chat",
     "model_lang": [
       "en",
@@ -3295,5 +3460,93 @@
         "model_revision": "master"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "phi-3-mini-128k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-128K-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-128k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "phi-3-mini-4k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-4k-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-4k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
   }
 ]
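
Because the phi-3-mini entries above, like the Yi-1.5 and Qwen1.5-110B-Chat ones, are added to the built-in ModelScope registry, they should show up when enumerating LLM registrations. A rough sketch, assuming a running endpoint and that each registration entry exposes a model_name field:

    from xinference.client import RESTfulClient

    # Placeholder endpoint; adjust to wherever the supervisor is running.
    client = RESTfulClient("http://127.0.0.1:9997")

    registrations = client.list_model_registrations(model_type="LLM")
    names = {entry["model_name"] for entry in registrations}

    # Spot-check a few of the families added in this release.
    for expected in ("Yi-1.5", "Yi-1.5-chat", "phi-3-mini-128k-instruct", "phi-3-mini-4k-instruct"):
        print(expected, "registered" if expected in names else "missing")
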
xinference/model/llm/pytorch/chatglm.py

@@ -147,14 +147,26 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             )
         else:
             stream = generate_config.get("stream", False)
+            stream_options = generate_config.pop("stream_options", None)
+            include_usage = (
+                stream_options["include_usage"]
+                if isinstance(stream_options, dict)
+                else False
+            )
             if stream:

                 def _stream_generator():
                     last_chunk_text_length = 0
                     chunk_id = "chat-" + str(uuid.uuid1())
+                    prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+                    inputs = self._tokenizer([prompt], return_tensors="pt")
+                    inputs = inputs.to(self._model.device)
+                    prompt_tokens = len(inputs["input_ids"][0])
                     for chunk_text, _ in self._model.stream_chat(
                         self._tokenizer, prompt, chat_history, **kwargs
                     ):
+                        completion_tokens = completion_tokens + 1
+                        total_tokens = prompt_tokens + completion_tokens
                         chunk_text = chunk_text[last_chunk_text_length:]
                         last_chunk_text_length += len(chunk_text)
                         completion_choice = CompletionChoice(
@@ -166,7 +178,43 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                             created=int(time.time()),
                             model=self.model_uid,
                             choices=[completion_choice],
+                            usage=CompletionUsage(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=total_tokens,
+                            ),
+                        )
+                    completion_choice = CompletionChoice(
+                        text="", index=0, logprobs=None, finish_reason="stop"
+                    )
+                    chunk = CompletionChunk(
+                        id=chunk_id,
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[completion_choice],
+                    )
+                    completion_usage = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    chunk["usage"] = completion_usage
+                    yield chunk
+                    if include_usage:
+                        chunk = CompletionChunk(
+                            id=chunk_id,
+                            object="text_completion",
+                            created=int(time.time()),
+                            model=self.model_uid,
+                            choices=[],
+                        )
+                        chunk["usage"] = CompletionUsage(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=total_tokens,
                         )
+                        yield chunk

                 return self._to_chat_completion_chunks(_stream_generator())
             else:
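
The chatglm change above wires OpenAI-style stream_options through generate_config: every streamed chunk now carries a running usage field, and when include_usage is set, a final chunk with empty choices and the total token counts is emitted at the end of the stream. A sketch of how a client might consume this; the endpoint and model UID are placeholders:

    from xinference.client import RESTfulClient

    # Assumes a ChatGLM3 model has already been launched; the UID is a placeholder.
    client = RESTfulClient("http://127.0.0.1:9997")
    model = client.get_model("chatglm3")

    for chunk in model.chat(
        "Tell me a one-sentence joke.",
        generate_config={
            "stream": True,
            "stream_options": {"include_usage": True},
        },
    ):
        if chunk["choices"]:
            print(chunk["choices"][0].get("delta", {}).get("content", ""), end="", flush=True)
        else:
            # Final usage-only chunk added by the code above.
            print("\nusage:", chunk["usage"])
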
xinference/model/llm/pytorch/core.py

@@ -143,12 +143,17 @@ class PytorchModel(LLM):
                 f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
             )

-        for peft_model in self._peft_model:
-            # Apply LoRA
-            self._model = PeftModel.from_pretrained(
-                self._model,
-                peft_model.local_path,
-            )
+        for i, peft_model in enumerate(self._peft_model):
+            if i == 0:
+                self._model = PeftModel.from_pretrained(
+                    self._model,
+                    peft_model.local_path,
+                    adapter_name=peft_model.lora_name,
+                )
+            else:
+                self._model.load_adapter(
+                    peft_model.local_path, adapter_name=peft_model.lora_name
+                )
             logger.info(
                 f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
             )
@@ -302,6 +307,18 @@ class PytorchModel(LLM):
         assert self._model is not None
         assert self._tokenizer is not None

+        lora_model = generate_config.pop("lora_name")
+
+        if lora_model is not None and self._peft_model is not None:
+            for lora in self._peft_model:
+                if lora_model == lora.lora_name:
+                    self._model.set_adapter(lora_model)
+                    logger.info(f"Set lora model to {lora_model}")
+                    break
+            else:
+                self._model.disable_adapter()
+                logger.info(f"No lora model {lora_model} found, skip setting")
+
         stream = generate_config.get("stream", False)
         if not stream:
             if "falcon" in model_family_name:
xinference/model/llm/pytorch/deepseek_vl.py

@@ -27,9 +27,11 @@ import torch
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -67,12 +69,12 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._type = torch.float16 if self._device == "mps" else torch.bfloat16

         # specify the path to the model
-        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(  # type: ignore
             self.model_path
         )
         self._tokenizer = self._vl_chat_processor.tokenizer

-        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
             self.model_path, trust_remote_code=True, device_map=self._device
         )
         self._model = vl_gpt.to(self._type).eval()
@@ -149,10 +151,16 @@ class DeepSeekVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+        if not generate_config:
+            generate_config = {}
+
+        stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
         prompt, images = self._message_content_to_deepseek(prompt)
         prompt_messages: List[Dict[str, Any]] = [
             {
@@ -184,6 +192,7 @@ class DeepSeekVLChatModel(PytorchChatModel):

         deepseek_history.extend(prompt_messages)

+        from ....thirdparty.deepseek_vl.serve.inference import generate
         from ....thirdparty.deepseek_vl.utils.io import load_pil_images

         # load images and prepare for inputs
@@ -192,41 +201,114 @@ class DeepSeekVLChatModel(PytorchChatModel):
             conversations=deepseek_history, images=pil_images, force_batchify=True
         ).to(self._model.device, self._model.dtype)

-        # run image encoder to get the image embeddings
-        inputs_embeds = self._model.prepare_inputs_embeds(**prepare_inputs)
-
-        # run the model to get the response
-        outputs = self._model.language_model.generate(
-            inputs_embeds=inputs_embeds,
-            attention_mask=prepare_inputs.attention_mask,
-            pad_token_id=self._tokenizer.eos_token_id,
-            bos_token_id=self._tokenizer.bos_token_id,
-            eos_token_id=self._tokenizer.eos_token_id,
-            max_new_tokens=512,
-            do_sample=True,
-            top_p=0.95,
-            temperature=0.2,
-            repetition_penalty=1.1,
-            use_cache=True,
-        )
+        temperature = generate_config.get("temperature", 0.2)
+        top_p = generate_config.get("top_p", 0.95)
+        max_new_tokens = generate_config.get("max_tokens", 512)
+        repetition_penalty = generate_config.get("repetition_penalty", 1.1)
+
+        conversation = self._vl_chat_processor.new_chat_template()
+        stop_str = conversation.sep2
+        stop_words = [stop_str]

-        answer = self._tokenizer.decode(
-            outputs[0].cpu().tolist(), skip_special_tokens=True
+        streamer = generate(
+            vl_gpt=self._model,
+            tokenizer=self._tokenizer,
+            prepare_inputs=prepare_inputs,
+            max_gen_len=max_new_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            top_p=top_p,
+            stop_words=stop_words,
         )

-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+        if stream:
+            it = self._generate_stream(streamer, stop_str, include_usage, prompt)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
+        generated_text = ""
+        for new_text in streamer:
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            generated_text += new_text
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(
+        self, streamer, stop_str, include_usage, prompt
+    ) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        input_ids = self._tokenizer(prompt).input_ids
+        prompt_tokens = len(input_ids)
+        for i, new_text in enumerate(streamer):
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            completion_choice = CompletionChoice(
+                text=new_text, index=0, logprobs=None, finish_reason=None
+            )
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_tokens = i
+            total_tokens = prompt_tokens + completion_tokens
+            completion_usage = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            chunk["usage"] = completion_usage
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
+        if include_usage:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[],
+            )
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            yield chunk
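
With the rewrite above, deepseek-vl chat no longer rejects stream=True; it drives the third-party generate streamer and yields completion chunks the same way the text-only models do. A sketch of streaming a vision-language chat through the RESTful client; the endpoint, model UID, and image URL are placeholders:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    model = client.get_model("deepseek-vl-chat")

    # OpenAI-style multimodal content; _message_content_to_deepseek splits it
    # into text and images on the server side.
    content = [
        {"type": "text", "text": "Describe this image in one sentence."},
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    ]

    for chunk in model.chat(prompt=content, generate_config={"stream": True, "max_tokens": 256}):
        if chunk["choices"]:
            print(chunk["choices"][0].get("delta", {}).get("content", ""), end="", flush=True)
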