xinference-0.10.3-py3-none-any.whl → xinference-0.11.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference was flagged as possibly problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +1 -1
- xinference/api/restful_api.py +53 -61
- xinference/client/restful/restful_client.py +52 -57
- xinference/conftest.py +1 -1
- xinference/core/cache_tracker.py +1 -1
- xinference/core/event.py +1 -1
- xinference/core/model.py +15 -4
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +58 -72
- xinference/core/worker.py +68 -101
- xinference/deploy/cmdline.py +166 -1
- xinference/deploy/test/test_cmdline.py +2 -0
- xinference/deploy/utils.py +1 -1
- xinference/device_utils.py +29 -3
- xinference/fields.py +5 -1
- xinference/model/audio/whisper.py +88 -12
- xinference/model/core.py +2 -2
- xinference/model/image/__init__.py +29 -0
- xinference/model/image/core.py +6 -0
- xinference/model/image/custom.py +109 -0
- xinference/model/llm/__init__.py +92 -32
- xinference/model/llm/core.py +57 -102
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
- xinference/model/llm/llm_family.json +306 -4
- xinference/model/llm/llm_family.py +45 -41
- xinference/model/llm/llm_family_modelscope.json +119 -2
- xinference/model/llm/pytorch/deepseek_vl.py +89 -33
- xinference/model/llm/pytorch/qwen_vl.py +67 -12
- xinference/model/llm/pytorch/yi_vl.py +62 -45
- xinference/model/llm/utils.py +29 -15
- xinference/model/llm/vllm/core.py +19 -4
- xinference/thirdparty/omnilmm/chat.py +2 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +6 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.54bca460.css +2 -0
- xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js +3 -0
- xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.8e44da4b.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +33 -0
- xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
- xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
- xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
- xinference/web/ui/node_modules/clipboard/bower.json +18 -0
- xinference/web/ui/node_modules/clipboard/composer.json +25 -0
- xinference/web/ui/node_modules/clipboard/package.json +63 -0
- xinference/web/ui/node_modules/delegate/package.json +31 -0
- xinference/web/ui/node_modules/good-listener/bower.json +11 -0
- xinference/web/ui/node_modules/good-listener/package.json +35 -0
- xinference/web/ui/node_modules/select/bower.json +13 -0
- xinference/web/ui/node_modules/select/package.json +29 -0
- xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
- xinference/web/ui/package-lock.json +34 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/METADATA +11 -11
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/RECORD +78 -57
- xinference/client/oscar/__init__.py +0 -13
- xinference/client/oscar/actor_client.py +0 -611
- xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
- xinference/model/llm/pytorch/spec_model.py +0 -186
- xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/LICENSE +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/WHEEL +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/top_level.txt +0 -0

xinference/model/llm/llm_family.json

@@ -461,6 +461,106 @@
       }
     ]
   },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "phi-3-mini-128k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-128K-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "microsoft/Phi-3-mini-128k-instruct",
+        "model_revision": "ebee18c488086b396dde649f2aa6548b9b8d2404"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids":[
+        32000,
+        32001,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|assistant|>",
+        "<|end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "phi-3-mini-4k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-4k-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "fp16",
+          "q4"
+        ],
+        "model_id": "microsoft/Phi-3-mini-4k-instruct-gguf",
+        "model_file_name_template": "Phi-3-mini-4k-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "microsoft/Phi-3-mini-4k-instruct",
+        "model_revision": "b86bcaf57ea4dfdec5dbe12a377028b2fab0d480"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids":[
+        32000,
+        32001,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|assistant|>",
+        "<|end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
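
Both new Phi-3 entries reuse the PHI3 prompt style. As a minimal sketch (not xinference's actual formatter), here is how the prompt_style fields above could be applied to a chat history; the <|system|>/<|user|>/<|assistant|> tag layout is an assumption inferred from the separators and stop strings in the entry:

```python
# Minimal sketch, assuming a <|role|>-tag layout; the separator values
# are taken from the PHI3 prompt_style entry above.
def render_phi3(messages, system_prompt="You are a helpful AI assistant.",
                intra_sep="\n", inter_sep="<|end|>\n"):
    parts = [f"<|system|>{intra_sep}{system_prompt}{inter_sep}"]
    for msg in messages:  # each msg: {"role": "user" | "assistant", "content": str}
        parts.append(f"<|{msg['role']}|>{intra_sep}{msg['content']}{inter_sep}")
    parts.append(f"<|assistant|>{intra_sep}")  # cue the model to respond
    return "".join(parts)

print(render_phi3([{"role": "user", "content": "What is 2 + 2?"}]))
```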

@@ -624,7 +724,7 @@
           "none"
         ],
         "model_id": "THUDM/chatglm3-6b",
-        "model_revision": "
+        "model_revision": "103caa40027ebfd8450289ca2f278eac4ff26405"
       }
     ],
     "prompt_style": {

@@ -1330,7 +1430,7 @@
           "Q4_K_M"
         ],
         "model_id": "lmstudio-community/Meta-Llama-3-70B-Instruct-GGUF",
-        "model_file_name_template": "Meta-Llama-3-
+        "model_file_name_template": "Meta-Llama-3-70B-Instruct-{quantization}.gguf"
       },
       {
         "model_format": "pytorch",

@@ -1767,6 +1867,16 @@
         ],
         "model_id": "Qwen/Qwen1.5-72B-Chat"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen1.5-110B-Chat"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": "0_5",

@@ -1829,6 +1939,14 @@
         ],
         "model_id": "Qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-110B-Chat-GPTQ-Int4"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": "0_5",

@@ -1885,6 +2003,14 @@
         ],
         "model_id": "Qwen/Qwen1.5-72B-Chat-AWQ"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-110B-Chat-AWQ"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",

@@ -2074,7 +2200,7 @@
   },
   {
     "version": 1,
-    "context_length": 
+    "context_length": 65536,
     "model_name": "codeqwen1.5-chat",
     "model_lang": [
       "en",

@@ -3319,6 +3445,142 @@
       "inter_message_sep": ""
     }
   },
+  {
+    "version": 1,
+    "context_length": 65536,
+    "model_name": "mixtral-8x22B-instruct-v0.1",
+    "model_lang": [
+      "en",
+      "fr",
+      "it",
+      "de",
+      "es"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Mixtral-8x22B-Instruct-v0.1 Large Language Model (LLM) is an instruct fine-tuned version of the Mixtral-8x22B-v0.1, specializing in chatting.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "141",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "mistralai/Mixtral-8x22B-Instruct-v0.1",
+        "model_revision": "ebb919ac9e9f7f9a900644621bae7963bc593f4f"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "141",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "141",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "jarrelscy/Mixtral-8x22B-Instruct-v0.1-GPTQ-4bit"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "141",
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6",
+          "Q8_0",
+          "fp16"
+        ],
+        "model_id": "MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-GGUF",
+        "model_file_name_template": "Mixtral-8x22B-Instruct-{quantization}.gguf",
+        "model_file_name_split_template": "Mixtral-8x22B-Instruct-v0.1.{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "Q2_K": [
+            "00001-of-00003",
+            "00002-of-00003",
+            "00003-of-00003"
+          ],
+          "Q3_K_L": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "Q3_K_M": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "Q3_K_S": [
+            "00001-of-00003",
+            "00002-of-00003",
+            "00003-of-00003"
+          ],
+          "Q4_K_M": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "Q4_K_S": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "Q5_K_M": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ],
+          "Q5_K_S": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ],
+          "Q6": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ],
+          "Q8_0": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ],
+          "fp16": [
+            "00001-of-00007",
+            "00002-of-00007",
+            "00003-of-00007",
+            "00004-of-00007",
+            "00005-of-00007",
+            "00006-of-00007",
+            "00007-of-00007"
+          ]
+        }
+      }
+    ],
+    "prompt_style": {
+      "style_name": "MIXTRAL_V01",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "",
+      "inter_message_sep": ""
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
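
The ggufv2 spec above combines model_file_name_split_template with quantization_parts for multi-file GGUF downloads. A short sketch of how the template expands into concrete shard names, using values copied from the entry:

```python
# Expanding the split-file template from the Mixtral-8x22B entry above
# into the shard filenames a download would fetch for one quantization.
split_template = "Mixtral-8x22B-Instruct-v0.1.{quantization}-{part}.gguf"
quantization_parts = {
    "Q2_K": ["00001-of-00003", "00002-of-00003", "00003-of-00003"],
}

for part in quantization_parts["Q2_K"]:
    print(split_template.format(quantization="Q2_K", part=part))
# Mixtral-8x22B-Instruct-v0.1.Q2_K-00001-of-00003.gguf
# Mixtral-8x22B-Instruct-v0.1.Q2_K-00002-of-00003.gguf
# Mixtral-8x22B-Instruct-v0.1.Q2_K-00003-of-00003.gguf
```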

@@ -5095,7 +5357,7 @@
           "Q8_0"
         ],
         "model_id": "andrewcanis/c4ai-command-r-v01-GGUF",
-        "model_file_name_template": "c4ai-command-r-v01
+        "model_file_name_template": "c4ai-command-r-v01-{quantization}.gguf"
       },
       {
         "model_format": "pytorch",

@@ -5157,5 +5419,45 @@
         "model_revision": "bb63b5b7005ecedb30b0cfd0d5953b02a5817f7b"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Starling-LM",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "We introduce Starling-7B, an open large language model (LLM) trained by Reinforcement Learning from AI Feedback (RLAIF). The model harnesses the power of our new GPT-4 labeled ranking dataset",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "berkeley-nest/Starling-LM-7B-alpha",
+        "model_revision": "1dddf3b95bc1391f6307299eb1c162c194bde9bd"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "ADD_COLON_SINGLE",
+      "system_prompt": "",
+      "roles": [
+        "GPT4 Correct User",
+        "GPT4 Correct Assistant"
+      ],
+      "intra_message_sep": "<|end_of_turn|>",
+      "inter_message_sep": "",
+      "stop_token_ids": [
+        2,
+        32000
+      ]
+    }
   }
 ]
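
That closes out the llm_family.json changes. Since 0.11.0 also makes the inference engine an explicit launch-time choice (see the llm_family.py hunks below), launching the new Starling-LM entry might look like the following; the endpoint and the "Transformers" engine name are assumptions for illustration:

```python
from xinference.client import RESTfulClient

# Hypothetical launch of the new Starling-LM entry against a local
# xinference server; the endpoint and engine name are assumed.
client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="Starling-LM",
    model_engine="Transformers",
    model_format="pytorch",
    model_size_in_billions=7,
    quantization="none",
)
print(model_uid)
```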

xinference/model/llm/llm_family.py

@@ -33,7 +33,6 @@ from ..._compat import (
     validator,
 )
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
-from ...types import LoRA
 from ..utils import (
     download_from_modelscope,
     is_valid_model_uri,

@@ -167,7 +166,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):
         )
         if (
             llm_spec.model_family != "other"
-            and "
+            and "tools" in llm_spec.model_ability
             and llm_spec.model_family not in BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES
         ):
             raise ValueError(

@@ -227,16 +226,23 @@ LLMFamilyV1.update_forward_refs()
 CustomLLMFamilyV1.update_forward_refs()
 
 
-
-PEFT_SUPPORTED_CLASSES: List[Type[LLM]] = []
+LLAMA_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
+SGLANG_CLASSES: List[Type[LLM]] = []
+TRANSFORMERS_CLASSES: List[Type[LLM]] = []
+
 UD_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
 UD_LLM_FAMILIES_LOCK = Lock()
 
+VLLM_CLASSES: List[Type[LLM]] = []
+
+LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
+SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
+
 LLM_LAUNCH_VERSIONS: Dict[str, List[str]] = {}
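
The single PEFT_SUPPORTED_CLASSES list is replaced by per-engine class lists plus two registries keyed by model name. A sketch of the shape one LLM_ENGINES entry might take, inferred from the Dict[str, Dict[str, List[Dict[str, Any]]]] annotation and from the keys that check_engine_by_spec_parameters (below) matches on; the engine name, values, and class are hypothetical:

```python
from typing import Any, Dict, List


class LLM:  # stand-in for xinference's LLM base class
    pass


class PytorchChatModel(LLM):  # hypothetical implementation class
    pass


# model name -> engine name -> list of servable spec combinations
LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {
    "phi-3-mini-4k-instruct": {
        "Transformers": [
            {
                "model_name": "phi-3-mini-4k-instruct",
                "model_format": "pytorch",
                "model_size_in_billions": 4,
                "quantizations": ["4-bit", "8-bit", "none"],
                "llm_class": PytorchChatModel,
            }
        ]
    }
}
```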

@@ -822,7 +828,6 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
-    is_local_deployment: bool = False,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.

@@ -880,30 +885,15 @@ def match_llm(
                     matched_quantization,
                 )
             else:
-
-
-
-                # by default, choose the most coarse-grained quantization.
-                # TODO: too hacky.
-                quantizations = spec.quantizations
-                quantizations.sort()
-                for q in quantizations:
-                    if (
-                        is_local_deployment
-                        and not (_is_linux() and _has_cuda_device())
-                        and q == "4-bit"
-                    ):
-                        logger.warning(
-                            "Skipping %s for non-linux or non-cuda local deployment .",
-                            q,
-                        )
-                        continue
-                    return family, _apply_format_to_model_id(spec, q), q
+                # TODO: If user does not specify quantization, just use the first one
+                _q = "none" if spec.model_format == "pytorch" else spec.quantizations[0]
+                return family, _apply_format_to_model_id(spec, _q), _q
     return None
 
 
 def register_llm(llm_family: LLMFamilyV1, persist: bool):
     from ..utils import is_valid_model_name
+    from . import generate_engine_config_by_model_family
 
     if not is_valid_model_name(llm_family.model_name):
         raise ValueError(f"Invalid model name {llm_family.model_name}.")
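
The replacement branch drops the old platform-sensitive 4-bit skipping in favor of a deterministic default. The rule, restated outside match_llm for clarity:

```python
# The new fallback from match_llm, restated: with no quantization
# requested, pytorch specs default to "none"; every other format takes
# the first quantization listed in its spec.
def default_quantization(model_format: str, quantizations: list) -> str:
    return "none" if model_format == "pytorch" else quantizations[0]

assert default_quantization("pytorch", ["4-bit", "8-bit", "none"]) == "none"
assert default_quantization("ggufv2", ["fp16", "q4"]) == "fp16"
```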

@@ -916,6 +906,7 @@ def register_llm(llm_family: LLMFamilyV1, persist: bool):
         )
 
     UD_LLM_FAMILIES.append(llm_family)
+    generate_engine_config_by_model_family(llm_family)
 
     if persist:
         # We only validate model URL when persist is True.

@@ -941,6 +932,7 @@ def unregister_llm(model_name: str, raise_error: bool = True):
             break
     if llm_family:
         UD_LLM_FAMILIES.remove(llm_family)
+        del LLM_ENGINES[model_name]
 
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"

@@ -972,21 +964,33 @@ def unregister_llm(model_name: str, raise_error: bool = True):
         logger.warning(f"Custom model {model_name} not found")
 
 
-def 
-
-
+def check_engine_by_spec_parameters(
+    model_engine: str,
+    model_name: str,
+    model_format: str,
+    model_size_in_billions: Union[str, int],
     quantization: str,
-
-) -> 
-
-
-
-
-
-
-
-
-
-
+) -> Type[LLM]:
+    def get_model_engine_from_spell(engine_str: str) -> str:
+        for engine in LLM_ENGINES[model_name].keys():
+            if engine.lower() == engine_str.lower():
+                return engine
+        return engine_str
+
+    if model_name not in LLM_ENGINES:
+        raise ValueError(f"Model {model_name} not found.")
+    model_engine = get_model_engine_from_spell(model_engine)
+    if model_engine not in LLM_ENGINES[model_name]:
+        raise ValueError(f"Model {model_name} cannot be run on engine {model_engine}.")
+    match_params = LLM_ENGINES[model_name][model_engine]
+    for param in match_params:
+        if (
+            model_name == param["model_name"]
+            and model_format == param["model_format"]
+            and model_size_in_billions == param["model_size_in_billions"]
+            and quantization in param["quantizations"]
+        ):
+            return param["llm_class"]
+    raise ValueError(
+        f"Model {model_name} cannot be run on engine {model_engine}, with format {model_format}, size {model_size_in_billions} and quantization {quantization}."
+    )
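
A hypothetical call against the new validator, assuming a process where the engine registries have been populated; get_model_engine_from_spell makes the engine lookup case-insensitive, so "transformers" would match a registered "Transformers" key:

```python
# Hypothetical usage; raises ValueError if the model/engine/spec
# combination is not registered in LLM_ENGINES.
from xinference.model.llm.llm_family import check_engine_by_spec_parameters

llm_class = check_engine_by_spec_parameters(
    model_engine="transformers",  # normalized by get_model_engine_from_spell
    model_name="phi-3-mini-4k-instruct",
    model_format="pytorch",
    model_size_in_billions=4,
    quantization="none",
)
print(llm_class)
```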

xinference/model/llm/llm_family_modelscope.json

@@ -413,7 +413,7 @@
         ],
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/chatglm3-6b",
-        "model_revision": "v1.0.
+        "model_revision": "v1.0.2"
       }
     ],
     "prompt_style": {

@@ -1937,6 +1937,17 @@
         "model_id": "qwen/Qwen1.5-72B-Chat",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": "0_5",

@@ -2006,6 +2017,15 @@
         "model_id": "qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-GPTQ-Int4",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": "0_5",

@@ -2069,6 +2089,15 @@
         "model_id": "qwen/Qwen1.5-72B-Chat-AWQ",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-AWQ",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",

@@ -2267,7 +2296,7 @@
   },
   {
     "version": 1,
-    "context_length": 
+    "context_length": 65536,
     "model_name": "codeqwen1.5-chat",
     "model_lang": [
       "en",

@@ -3295,5 +3324,93 @@
       "model_revision": "master"
     }
   ]
+  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "phi-3-mini-128k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-128K-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-128k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids":[
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "phi-3-mini-4k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-4k-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-4k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids":[
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
   }
 ]
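
These ModelScope mirrors register under the same model names as their Hugging Face counterparts. A hedged sketch of pulling them from ModelScope by setting XINFERENCE_MODEL_SRC before the server starts (the environment variable name follows the xinference docs; host, port, and command layout are illustrative):

```python
import os
import subprocess

# Start a local xinference server that resolves model downloads from
# ModelScope instead of Hugging Face; the env var must be set before
# the server process is created.
env = dict(os.environ, XINFERENCE_MODEL_SRC="modelscope")
subprocess.run(["xinference-local", "--host", "0.0.0.0", "--port", "9997"], env=env)
```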