xinference 0.10.3__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +1 -1
- xinference/api/restful_api.py +53 -61
- xinference/client/restful/restful_client.py +52 -57
- xinference/conftest.py +1 -1
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +10 -4
- xinference/core/event.py +1 -1
- xinference/core/model.py +17 -6
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +58 -72
- xinference/core/worker.py +68 -101
- xinference/deploy/cmdline.py +166 -1
- xinference/deploy/test/test_cmdline.py +2 -0
- xinference/deploy/utils.py +1 -1
- xinference/device_utils.py +29 -3
- xinference/fields.py +7 -1
- xinference/model/audio/whisper.py +88 -12
- xinference/model/core.py +2 -2
- xinference/model/image/__init__.py +29 -0
- xinference/model/image/core.py +6 -0
- xinference/model/image/custom.py +109 -0
- xinference/model/llm/__init__.py +92 -32
- xinference/model/llm/core.py +57 -102
- xinference/model/llm/ggml/chatglm.py +98 -13
- xinference/model/llm/ggml/llamacpp.py +49 -2
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
- xinference/model/llm/llm_family.json +438 -7
- xinference/model/llm/llm_family.py +45 -41
- xinference/model/llm/llm_family_modelscope.json +258 -5
- xinference/model/llm/pytorch/chatglm.py +48 -0
- xinference/model/llm/pytorch/core.py +23 -6
- xinference/model/llm/pytorch/deepseek_vl.py +115 -33
- xinference/model/llm/pytorch/internlm2.py +32 -1
- xinference/model/llm/pytorch/qwen_vl.py +94 -12
- xinference/model/llm/pytorch/utils.py +38 -1
- xinference/model/llm/pytorch/yi_vl.py +96 -51
- xinference/model/llm/sglang/core.py +31 -9
- xinference/model/llm/utils.py +54 -20
- xinference/model/llm/vllm/core.py +101 -7
- xinference/thirdparty/omnilmm/chat.py +2 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
- xinference/types.py +11 -0
- xinference/web/ui/build/asset-manifest.json +6 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.54bca460.css +2 -0
- xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
- xinference/web/ui/build/static/js/main.551aa479.js +3 -0
- xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +33 -0
- xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
- xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
- xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
- xinference/web/ui/node_modules/clipboard/bower.json +18 -0
- xinference/web/ui/node_modules/clipboard/composer.json +25 -0
- xinference/web/ui/node_modules/clipboard/package.json +63 -0
- xinference/web/ui/node_modules/delegate/package.json +31 -0
- xinference/web/ui/node_modules/good-listener/bower.json +11 -0
- xinference/web/ui/node_modules/good-listener/package.json +35 -0
- xinference/web/ui/node_modules/select/bower.json +13 -0
- xinference/web/ui/node_modules/select/package.json +29 -0
- xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
- xinference/web/ui/package-lock.json +34 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/METADATA +13 -12
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/RECORD +88 -67
- xinference/client/oscar/__init__.py +0 -13
- xinference/client/oscar/actor_client.py +0 -611
- xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
- xinference/model/llm/pytorch/spec_model.py +0 -186
- xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json

@@ -413,7 +413,7 @@
         ],
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/chatglm3-6b",
-        "model_revision": "v1.0.
+        "model_revision": "v1.0.2"
       }
     ],
     "prompt_style": {
@@ -1289,7 +1289,7 @@
   },
   {
     "version": 1,
-    "context_length":
+    "context_length": 262144,
     "model_name": "Yi-200k",
     "model_lang": [
       "en",
@@ -1328,7 +1328,7 @@
   },
   {
     "version": 1,
-    "context_length":
+    "context_length": 4096,
     "model_name": "Yi-chat",
     "model_lang": [
       "en",
@@ -1349,6 +1349,18 @@
         "model_id": "01ai/Yi-34B-Chat-{quantization}",
         "model_revision": "master"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-6B-Chat",
+        "model_revision": "master"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 34,
@@ -1385,6 +1397,130 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Yi-1.5",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-6B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-9B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-34B",
+        "model_revision": "master"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Yi-1.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-6B-Chat",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-9B-Chat",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-34B-Chat",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATML",
+      "system_prompt": "",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "inter_message_sep": "",
+      "stop_token_ids": [
+        2,
+        6,
+        7,
+        8
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>",
+        "<|im_sep|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -1937,6 +2073,17 @@
        "model_id": "qwen/Qwen1.5-72B-Chat",
        "model_hub": "modelscope"
      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat",
+        "model_hub": "modelscope"
+      },
      {
        "model_format": "gptq",
        "model_size_in_billions": "0_5",
@@ -2006,6 +2153,15 @@
        "model_id": "qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}",
        "model_hub": "modelscope"
      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-GPTQ-Int4",
+        "model_hub": "modelscope"
+      },
      {
        "model_format": "awq",
        "model_size_in_billions": "0_5",
@@ -2069,6 +2225,15 @@
        "model_id": "qwen/Qwen1.5-72B-Chat-AWQ",
        "model_hub": "modelscope"
      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-AWQ",
+        "model_hub": "modelscope"
+      },
      {
        "model_format": "ggufv2",
        "model_size_in_billions": "0_5",
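Once these specs ship with the package, the new Qwen1.5-110B-Chat entries can be started through the regular RESTful client. A minimal sketch, assuming a local Xinference endpoint at 127.0.0.1:9997 and ModelScope as the model source; the endpoint, UID handling, available disk/GPU for the 110B weights, and any engine-selection arguments your version requires are assumptions, not part of this diff:

    import os
    from xinference.client import Client

    # Assumption: pull model weights from ModelScope so the specs above apply.
    os.environ["XINFERENCE_MODEL_SRC"] = "modelscope"

    client = Client("http://127.0.0.1:9997")  # placeholder endpoint
    model_uid = client.launch_model(
        model_name="qwen1.5-chat",
        model_format="gptq",
        model_size_in_billions=110,
        quantization="Int4",
    )
    model = client.get_model(model_uid)
    print(model.chat("Give me a one-line summary of Qwen1.5."))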
@@ -2267,7 +2432,7 @@
   },
   {
     "version": 1,
-    "context_length":
+    "context_length": 65536,
     "model_name": "codeqwen1.5-chat",
     "model_lang": [
       "en",
@@ -2726,7 +2891,7 @@
   },
   {
     "version": 1,
-    "context_length":
+    "context_length": 4096,
     "model_name": "yi-vl-chat",
     "model_lang": [
       "en",
@@ -3295,5 +3460,93 @@
        "model_revision": "master"
      }
    ]
+  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "phi-3-mini-128k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-128K-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-128k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "phi-3-mini-4k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-4k-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-4k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
  }
 ]
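These built-in entries follow the same JSON schema that runtime custom-model registration uses, so a similar definition can be registered against a running server. A hedged sketch via the RESTful client; the endpoint, model name, local weight path, and spec values below are illustrative and not taken from this diff:

    import json
    from xinference.client import Client

    # Hypothetical custom entry mirroring the schema of the additions above,
    # but pointing at local weights via model_uri instead of a hub model_id.
    custom_llm = {
        "version": 1,
        "context_length": 4096,
        "model_name": "my-phi-3-variant",  # hypothetical name
        "model_lang": ["en"],
        "model_ability": ["chat"],
        "model_description": "A locally fine-tuned Phi-3 style model.",
        "model_specs": [
            {
                "model_format": "pytorch",
                "model_size_in_billions": 4,
                "quantizations": ["4-bit", "8-bit", "none"],
                "model_uri": "file:///path/to/model",  # placeholder path
            }
        ],
        "prompt_style": {
            "style_name": "PHI3",
            "system_prompt": "You are a helpful AI assistant.",
            "roles": ["user", "assistant"],
            "intra_message_sep": "\n",
            "inter_message_sep": "<|end|>\n",
            "stop_token_ids": [32000, 32007],
            "stop": ["<|endoftext|>", "<|end|>"],
        },
    }

    client = Client("http://127.0.0.1:9997")  # placeholder endpoint
    client.register_model(model_type="LLM", model=json.dumps(custom_llm), persist=False)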
xinference/model/llm/pytorch/chatglm.py

@@ -147,14 +147,26 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             )
         else:
             stream = generate_config.get("stream", False)
+            stream_options = generate_config.pop("stream_options", None)
+            include_usage = (
+                stream_options["include_usage"]
+                if isinstance(stream_options, dict)
+                else False
+            )
             if stream:

                 def _stream_generator():
                     last_chunk_text_length = 0
                     chunk_id = "chat-" + str(uuid.uuid1())
+                    prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+                    inputs = self._tokenizer([prompt], return_tensors="pt")
+                    inputs = inputs.to(self._model.device)
+                    prompt_tokens = len(inputs["input_ids"][0])
                     for chunk_text, _ in self._model.stream_chat(
                         self._tokenizer, prompt, chat_history, **kwargs
                     ):
+                        completion_tokens = completion_tokens + 1
+                        total_tokens = prompt_tokens + completion_tokens
                         chunk_text = chunk_text[last_chunk_text_length:]
                         last_chunk_text_length += len(chunk_text)
                         completion_choice = CompletionChoice(
@@ -166,7 +178,43 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                             created=int(time.time()),
                             model=self.model_uid,
                             choices=[completion_choice],
+                            usage=CompletionUsage(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=total_tokens,
+                            ),
+                        )
+                    completion_choice = CompletionChoice(
+                        text="", index=0, logprobs=None, finish_reason="stop"
+                    )
+                    chunk = CompletionChunk(
+                        id=chunk_id,
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[completion_choice],
+                    )
+                    completion_usage = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    chunk["usage"] = completion_usage
+                    yield chunk
+                    if include_usage:
+                        chunk = CompletionChunk(
+                            id=chunk_id,
+                            object="text_completion",
+                            created=int(time.time()),
+                            model=self.model_uid,
+                            choices=[],
+                        )
+                        chunk["usage"] = CompletionUsage(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=total_tokens,
                         )
+                        yield chunk

                 return self._to_chat_completion_chunks(_stream_generator())
             else:
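The net effect of the two hunks above is that the ChatGLM streaming path now attaches token usage to every chunk and, when stream_options.include_usage is requested, emits a final usage-only chunk with an empty choices list. A minimal sketch of requesting that through the RESTful client; the endpoint and model UID are placeholders:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")     # placeholder endpoint
    model = client.get_model("my-chatglm3-uid")  # placeholder model UID

    for chunk in model.chat(
        "Hello!",
        generate_config={
            "stream": True,
            "stream_options": {"include_usage": True},
        },
    ):
        if chunk["choices"]:
            # Regular streamed delta with per-chunk usage attached.
            print(chunk["choices"][0]["delta"].get("content", ""), end="")
        else:
            # Final usage-only chunk produced when include_usage is set.
            print("\nusage:", chunk["usage"])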
xinference/model/llm/pytorch/core.py

@@ -143,12 +143,17 @@ class PytorchModel(LLM):
                 f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
             )

-            for peft_model in self._peft_model:
-                ...
+            for i, peft_model in enumerate(self._peft_model):
+                if i == 0:
+                    self._model = PeftModel.from_pretrained(
+                        self._model,
+                        peft_model.local_path,
+                        adapter_name=peft_model.lora_name,
+                    )
+                else:
+                    self._model.load_adapter(
+                        peft_model.local_path, adapter_name=peft_model.lora_name
+                    )
                 logger.info(
                     f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
                 )
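For reference, the same multi-adapter pattern can be reproduced with peft directly: the first adapter wraps the base model in a PeftModel, and subsequent adapters are attached under their own names. A hedged standalone sketch that mirrors the loop above rather than quoting it; model and adapter paths are placeholders:

    from peft import PeftModel
    from transformers import AutoModelForCausalLM

    base = AutoModelForCausalLM.from_pretrained("/path/to/base-model")  # placeholder path
    adapters = [
        ("alpaca", "/path/to/alpaca-lora"),  # (adapter_name, local_path) placeholders
        ("sql", "/path/to/sql-lora"),
    ]

    model = base
    for i, (name, path) in enumerate(adapters):
        if i == 0:
            # The first adapter turns the base model into a PeftModel.
            model = PeftModel.from_pretrained(base, path, adapter_name=name)
        else:
            # Further adapters attach to the existing PeftModel under their own names.
            model.load_adapter(path, adapter_name=name)

    # One adapter at a time can then be activated before generation.
    model.set_adapter("sql")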
@@ -302,6 +307,18 @@ class PytorchModel(LLM):
         assert self._model is not None
         assert self._tokenizer is not None

+        lora_model = generate_config.pop("lora_name")
+
+        if lora_model is not None and self._peft_model is not None:
+            for lora in self._peft_model:
+                if lora_model == lora.lora_name:
+                    self._model.set_adapter(lora_model)
+                    logger.info(f"Set lora model to {lora_model}")
+                    break
+            else:
+                self._model.disable_adapter()
+                logger.info(f"No lora model {lora_model} found, skip setting")
+
         stream = generate_config.get("stream", False)
         if not stream:
             if "falcon" in model_family_name:
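With that in place, a request can name the adapter it wants through generate_config. A short sketch via the RESTful client; the endpoint, UID and adapter name are placeholders, and whether a default lora_name is filled in server-side when the caller omits it is an assumption here:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")  # placeholder endpoint
    model = client.get_model("my-llm-uid")    # placeholder UID

    completion = model.generate(
        "Translate to SQL: list all users",
        generate_config={"lora_name": "sql", "max_tokens": 128},
    )
    print(completion["choices"][0]["text"])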
xinference/model/llm/pytorch/deepseek_vl.py

@@ -27,9 +27,11 @@ import torch
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -67,12 +69,12 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._type = torch.float16 if self._device == "mps" else torch.bfloat16

         # specify the path to the model
-        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(  # type: ignore
             self.model_path
         )
         self._tokenizer = self._vl_chat_processor.tokenizer

-        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
             self.model_path, trust_remote_code=True, device_map=self._device
         )
         self._model = vl_gpt.to(self._type).eval()
@@ -149,10 +151,16 @@ class DeepSeekVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if ...
+        if not generate_config:
+            generate_config = {}
+
+        stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
         prompt, images = self._message_content_to_deepseek(prompt)
         prompt_messages: List[Dict[str, Any]] = [
             {
@@ -184,6 +192,7 @@ class DeepSeekVLChatModel(PytorchChatModel):

         deepseek_history.extend(prompt_messages)

+        from ....thirdparty.deepseek_vl.serve.inference import generate
         from ....thirdparty.deepseek_vl.utils.io import load_pil_images

         # load images and prepare for inputs
@@ -192,41 +201,114 @@ class DeepSeekVLChatModel(PytorchChatModel):
             conversations=deepseek_history, images=pil_images, force_batchify=True
         ).to(self._model.device, self._model.dtype)

-        ...
-            bos_token_id=self._tokenizer.bos_token_id,
-            eos_token_id=self._tokenizer.eos_token_id,
-            max_new_tokens=512,
-            do_sample=True,
-            top_p=0.95,
-            temperature=0.2,
-            repetition_penalty=1.1,
-            use_cache=True,
-        )
+        temperature = generate_config.get("temperature", 0.2)
+        top_p = generate_config.get("top_p", 0.95)
+        max_new_tokens = generate_config.get("max_tokens", 512)
+        repetition_penalty = generate_config.get("repetition_penalty", 1.1)
+
+        conversation = self._vl_chat_processor.new_chat_template()
+        stop_str = conversation.sep2
+        stop_words = [stop_str]

-        ...
+        streamer = generate(
+            vl_gpt=self._model,
+            tokenizer=self._tokenizer,
+            prepare_inputs=prepare_inputs,
+            max_gen_len=max_new_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            top_p=top_p,
+            stop_words=stop_words,
         )

-        ...
+        if stream:
+            it = self._generate_stream(streamer, stop_str, include_usage, prompt)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
+        generated_text = ""
+        for new_text in streamer:
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            generated_text += new_text
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ...
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(
+        self, streamer, stop_str, include_usage, prompt
+    ) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        input_ids = self._tokenizer(prompt).input_ids
+        prompt_tokens = len(input_ids)
+        for i, new_text in enumerate(streamer):
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            completion_choice = CompletionChoice(
+                text=new_text, index=0, logprobs=None, finish_reason=None
+            )
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_tokens = i
+            total_tokens = prompt_tokens + completion_tokens
+            completion_usage = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            chunk["usage"] = completion_usage
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
+        if include_usage:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[],
+            )
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            yield chunk