xinference 0.10.3__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (101)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +1 -1
  3. xinference/api/restful_api.py +53 -61
  4. xinference/client/restful/restful_client.py +52 -57
  5. xinference/conftest.py +1 -1
  6. xinference/core/cache_tracker.py +1 -1
  7. xinference/core/chat_interface.py +10 -4
  8. xinference/core/event.py +1 -1
  9. xinference/core/model.py +17 -6
  10. xinference/core/status_guard.py +1 -1
  11. xinference/core/supervisor.py +58 -72
  12. xinference/core/worker.py +68 -101
  13. xinference/deploy/cmdline.py +166 -1
  14. xinference/deploy/test/test_cmdline.py +2 -0
  15. xinference/deploy/utils.py +1 -1
  16. xinference/device_utils.py +29 -3
  17. xinference/fields.py +7 -1
  18. xinference/model/audio/whisper.py +88 -12
  19. xinference/model/core.py +2 -2
  20. xinference/model/image/__init__.py +29 -0
  21. xinference/model/image/core.py +6 -0
  22. xinference/model/image/custom.py +109 -0
  23. xinference/model/llm/__init__.py +92 -32
  24. xinference/model/llm/core.py +57 -102
  25. xinference/model/llm/ggml/chatglm.py +98 -13
  26. xinference/model/llm/ggml/llamacpp.py +49 -2
  27. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
  28. xinference/model/llm/llm_family.json +438 -7
  29. xinference/model/llm/llm_family.py +45 -41
  30. xinference/model/llm/llm_family_modelscope.json +258 -5
  31. xinference/model/llm/pytorch/chatglm.py +48 -0
  32. xinference/model/llm/pytorch/core.py +23 -6
  33. xinference/model/llm/pytorch/deepseek_vl.py +115 -33
  34. xinference/model/llm/pytorch/internlm2.py +32 -1
  35. xinference/model/llm/pytorch/qwen_vl.py +94 -12
  36. xinference/model/llm/pytorch/utils.py +38 -1
  37. xinference/model/llm/pytorch/yi_vl.py +96 -51
  38. xinference/model/llm/sglang/core.py +31 -9
  39. xinference/model/llm/utils.py +54 -20
  40. xinference/model/llm/vllm/core.py +101 -7
  41. xinference/thirdparty/omnilmm/chat.py +2 -1
  42. xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
  43. xinference/types.py +11 -0
  44. xinference/web/ui/build/asset-manifest.json +6 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.54bca460.css +2 -0
  47. xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.551aa479.js +3 -0
  49. xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +7 -0
  50. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
  68. xinference/web/ui/node_modules/.package-lock.json +33 -0
  69. xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
  70. xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
  71. xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
  72. xinference/web/ui/node_modules/clipboard/bower.json +18 -0
  73. xinference/web/ui/node_modules/clipboard/composer.json +25 -0
  74. xinference/web/ui/node_modules/clipboard/package.json +63 -0
  75. xinference/web/ui/node_modules/delegate/package.json +31 -0
  76. xinference/web/ui/node_modules/good-listener/bower.json +11 -0
  77. xinference/web/ui/node_modules/good-listener/package.json +35 -0
  78. xinference/web/ui/node_modules/select/bower.json +13 -0
  79. xinference/web/ui/node_modules/select/package.json +29 -0
  80. xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
  81. xinference/web/ui/package-lock.json +34 -0
  82. xinference/web/ui/package.json +1 -0
  83. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/METADATA +13 -12
  84. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/RECORD +88 -67
  85. xinference/client/oscar/__init__.py +0 -13
  86. xinference/client/oscar/actor_client.py +0 -611
  87. xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
  88. xinference/model/llm/pytorch/spec_model.py +0 -186
  89. xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
  90. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  93. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
  94. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
  95. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
  96. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
  97. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
  98. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
  99. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
  100. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
  101. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.json

@@ -461,6 +461,106 @@
       }
     ]
   },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "phi-3-mini-128k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-128K-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "microsoft/Phi-3-mini-128k-instruct",
+        "model_revision": "ebee18c488086b396dde649f2aa6548b9b8d2404"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32001,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|assistant|>",
+        "<|end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "phi-3-mini-4k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-4k-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "fp16",
+          "q4"
+        ],
+        "model_id": "microsoft/Phi-3-mini-4k-instruct-gguf",
+        "model_file_name_template": "Phi-3-mini-4k-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "microsoft/Phi-3-mini-4k-instruct",
+        "model_revision": "b86bcaf57ea4dfdec5dbe12a377028b2fab0d480"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32001,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|assistant|>",
+        "<|end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -624,7 +724,7 @@
           "none"
         ],
         "model_id": "THUDM/chatglm3-6b",
-        "model_revision": "b098244a71fbe69ce149682d9072a7629f7e908c"
+        "model_revision": "103caa40027ebfd8450289ca2f278eac4ff26405"
       }
     ],
     "prompt_style": {
@@ -1330,7 +1430,7 @@
           "Q4_K_M"
         ],
         "model_id": "lmstudio-community/Meta-Llama-3-70B-Instruct-GGUF",
-        "model_file_name_template": "Meta-Llama-3-8B-Instruct-{quantization}.gguf"
+        "model_file_name_template": "Meta-Llama-3-70B-Instruct-{quantization}.gguf"
       },
       {
         "model_format": "pytorch",
@@ -1767,6 +1867,16 @@
         ],
         "model_id": "Qwen/Qwen1.5-72B-Chat"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen1.5-110B-Chat"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": "0_5",
@@ -1829,6 +1939,14 @@
         ],
         "model_id": "Qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-110B-Chat-GPTQ-Int4"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": "0_5",
@@ -1885,6 +2003,14 @@
         ],
         "model_id": "Qwen/Qwen1.5-72B-Chat-AWQ"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-110B-Chat-AWQ"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",
@@ -2074,7 +2200,7 @@
   },
   {
     "version": 1,
-    "context_length": 32768,
+    "context_length": 65536,
     "model_name": "codeqwen1.5-chat",
     "model_lang": [
       "en",
@@ -3319,6 +3445,142 @@
       "inter_message_sep": ""
     }
   },
+  {
+    "version": 1,
+    "context_length": 65536,
+    "model_name": "mixtral-8x22B-instruct-v0.1",
+    "model_lang": [
+      "en",
+      "fr",
+      "it",
+      "de",
+      "es"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Mixtral-8x22B-Instruct-v0.1 Large Language Model (LLM) is an instruct fine-tuned version of the Mixtral-8x22B-v0.1, specializing in chatting.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "141",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "mistralai/Mixtral-8x22B-Instruct-v0.1",
+        "model_revision": "ebb919ac9e9f7f9a900644621bae7963bc593f4f"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "141",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "141",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "jarrelscy/Mixtral-8x22B-Instruct-v0.1-GPTQ-4bit"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "141",
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6",
+          "Q8_0",
+          "fp16"
+        ],
+        "model_id": "MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-GGUF",
+        "model_file_name_template": "Mixtral-8x22B-Instruct-{quantization}.gguf",
+        "model_file_name_split_template": "Mixtral-8x22B-Instruct-v0.1.{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "Q2_K": [
+            "00001-of-00003",
+            "00002-of-00003",
+            "00003-of-00003"
+          ],
+          "Q3_K_L": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "Q3_K_M": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "Q3_K_S": [
+            "00001-of-00003",
+            "00002-of-00003",
+            "00003-of-00003"
+          ],
+          "Q4_K_M": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "Q4_K_S": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "Q5_K_M": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ],
+          "Q5_K_S": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ],
+          "Q6": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ],
+          "Q8_0": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ],
+          "fp16": [
+            "00001-of-00007",
+            "00002-of-00007",
+            "00003-of-00007",
+            "00004-of-00007",
+            "00005-of-00007",
+            "00006-of-00007",
+            "00007-of-00007"
+          ]
+        }
+      }
+    ],
+    "prompt_style": {
+      "style_name": "MIXTRAL_V01",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "",
+      "inter_message_sep": ""
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
@@ -3389,7 +3651,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 262144,
     "model_name": "Yi-200k",
     "model_lang": [
       "en",
@@ -3426,7 +3688,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 4096,
     "model_name": "Yi-chat",
     "model_lang": [
       "en",
@@ -3445,6 +3707,17 @@
         ],
         "model_id": "01-ai/Yi-34B-Chat-{quantization}"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "01-ai/Yi-6B-Chat",
+        "model_revision": "1c20c960895e4c3877cf478bc2df074221b81d7b"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 34,
@@ -3500,6 +3773,124 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Yi-1.5",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "01-ai/Yi-1.5-6B",
+        "model_revision": "741a657c42d2081f777ce4c6c5572090f8b8c886"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "01-ai/Yi-1.5-9B",
+        "model_revision": "9a6839c5b9db3dbb245fb98a072bfabc242621f2"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "01-ai/Yi-1.5-34B",
+        "model_revision": "4f83007957ec3eec76d87df19ad061eb0f57b5c5"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Yi-1.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "01-ai/Yi-1.5-6B-Chat",
+        "model_revision": "d68dab90947a3c869e28c9cb2806996af99a6080"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "01-ai/Yi-1.5-9B-Chat",
+        "model_revision": "1dc6e2b8dcfc12b95bede8dec67e6b6332ac64c6"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "01-ai/Yi-1.5-34B-Chat",
+        "model_revision": "fa695ee438bfcd0ec2b378fa1c7e0dea1b40393e"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATML",
+      "system_prompt": "",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "inter_message_sep": "",
+      "stop_token_ids": [
+        2,
+        6,
+        7,
+        8
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>",
+        "<|im_sep|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -4422,7 +4813,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 4096,
    "model_name": "yi-vl-chat",
     "model_lang": [
       "en",
@@ -5095,7 +5486,7 @@
           "Q8_0"
         ],
         "model_id": "andrewcanis/c4ai-command-r-v01-GGUF",
-        "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf"
+        "model_file_name_template": "c4ai-command-r-v01-{quantization}.gguf"
       },
       {
         "model_format": "pytorch",
@@ -5157,5 +5548,45 @@
         "model_revision": "bb63b5b7005ecedb30b0cfd0d5953b02a5817f7b"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Starling-LM",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "We introduce Starling-7B, an open large language model (LLM) trained by Reinforcement Learning from AI Feedback (RLAIF). The model harnesses the power of our new GPT-4 labeled ranking dataset",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "berkeley-nest/Starling-LM-7B-alpha",
+        "model_revision": "1dddf3b95bc1391f6307299eb1c162c194bde9bd"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "ADD_COLON_SINGLE",
+      "system_prompt": "",
+      "roles": [
+        "GPT4 Correct User",
+        "GPT4 Correct Assistant"
+      ],
+      "intra_message_sep": "<|end_of_turn|>",
+      "inter_message_sep": "",
+      "stop_token_ids": [
+        2,
+        32000
+      ]
+    }
   }
 ]
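
The JSON entries above are declarative: xinference materializes them into launchable models. As a rough illustration (not taken from this diff), launching one of the newly added phi-3 entries through the RESTful client might look like the sketch below; the model_engine argument reflects the engine-selection machinery new in 0.11, and the exact keyword set is an assumption.

# Hedged sketch: launch one of the phi-3 entries added above.
# Assumes a server at localhost:9997; `model_engine` and the exact
# keyword names are assumptions, not confirmed by this diff.
from xinference.client import Client

client = Client("http://localhost:9997")
model_uid = client.launch_model(
    model_name="phi-3-mini-4k-instruct",
    model_engine="llama.cpp",
    model_format="ggufv2",
    model_size_in_billions=4,
    quantization="q4",
)
model = client.get_model(model_uid)
print(model.chat("Hello!"))

The mixtral-8x22B-instruct-v0.1 entry also introduces model_file_name_split_template and quantization_parts for GGUF weights shipped in multiple parts. A minimal sketch of how those two fields plausibly combine into concrete file names (pure string formatting, not the library's own code):

# Derive the per-part file names for a multi-part GGUF quantization.
split_template = "Mixtral-8x22B-Instruct-v0.1.{quantization}-{part}.gguf"
parts = {"Q4_K_M": ["00001-of-00002", "00002-of-00002"]}

files = [
    split_template.format(quantization="Q4_K_M", part=p)
    for p in parts["Q4_K_M"]
]
# ['Mixtral-8x22B-Instruct-v0.1.Q4_K_M-00001-of-00002.gguf',
#  'Mixtral-8x22B-Instruct-v0.1.Q4_K_M-00002-of-00002.gguf']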
xinference/model/llm/llm_family.py

@@ -33,7 +33,6 @@ from ..._compat import (
     validator,
 )
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
-from ...types import LoRA
 from ..utils import (
     download_from_modelscope,
     is_valid_model_uri,
@@ -167,7 +166,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):
         )
         if (
             llm_spec.model_family != "other"
-            and "tool_call" in llm_spec.model_ability
+            and "tools" in llm_spec.model_ability
             and llm_spec.model_family not in BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES
         ):
             raise ValueError(
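
The renamed ability flag matters for custom model registration. Below is a hypothetical registration payload this validator would accept; the field values are made up, and "qwen-chat" is assumed to be a member of BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.

# Hypothetical custom-model payload: declaring "tools" (formerly
# "tool_call") is only valid when model_family is a tool-call family.
custom_llm = {
    "version": 1,
    "model_name": "my-tool-llm",
    "context_length": 8192,
    "model_lang": ["en"],
    "model_ability": ["chat", "tools"],
    "model_family": "qwen-chat",
}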
@@ -227,16 +226,23 @@ LLMFamilyV1.update_forward_refs()
 CustomLLMFamilyV1.update_forward_refs()
 
 
-LLM_CLASSES: List[Type[LLM]] = []
-PEFT_SUPPORTED_CLASSES: List[Type[LLM]] = []
+LLAMA_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
+SGLANG_CLASSES: List[Type[LLM]] = []
+TRANSFORMERS_CLASSES: List[Type[LLM]] = []
+
 UD_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
 UD_LLM_FAMILIES_LOCK = Lock()
 
+VLLM_CLASSES: List[Type[LLM]] = []
+
+LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
+SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
+
 LLM_LAUNCH_VERSIONS: Dict[str, List[str]] = {}
 
 
@@ -822,7 +828,6 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
-    is_local_deployment: bool = False,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.
@@ -880,30 +885,15 @@ def match_llm(
                 matched_quantization,
             )
         else:
-            if spec.model_format == "pytorch":
-                return family, _apply_format_to_model_id(spec, "none"), "none"
-            else:
-                # by default, choose the most coarse-grained quantization.
-                # TODO: too hacky.
-                quantizations = spec.quantizations
-                quantizations.sort()
-                for q in quantizations:
-                    if (
-                        is_local_deployment
-                        and not (_is_linux() and _has_cuda_device())
-                        and q == "4-bit"
-                    ):
-                        logger.warning(
-                            "Skipping %s for non-linux or non-cuda local deployment .",
-                            q,
-                        )
-                        continue
-                    return family, _apply_format_to_model_id(spec, q), q
+            # TODO: If user does not specify quantization, just use the first one
+            _q = "none" if spec.model_format == "pytorch" else spec.quantizations[0]
+            return family, _apply_format_to_model_id(spec, _q), _q
     return None
 
 
 def register_llm(llm_family: LLMFamilyV1, persist: bool):
     from ..utils import is_valid_model_name
+    from . import generate_engine_config_by_model_family
 
     if not is_valid_model_name(llm_family.model_name):
         raise ValueError(f"Invalid model name {llm_family.model_name}.")
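
When no quantization is requested, the rewritten branch above no longer sorts the quantization list or special-cases non-CUDA local deployments; it returns "none" for pytorch specs and the first listed quantization for everything else. A hedged call-site sketch (assuming model_name is the leading parameter of match_llm):

# With quantization omitted, the simplified logic picks "none" for pytorch
# and spec.quantizations[0] otherwise (e.g. "fp16" for the phi-3 ggufv2
# spec above, whose quantizations list starts with "fp16").
from xinference.model.llm.llm_family import match_llm

matched = match_llm(model_name="phi-3-mini-4k-instruct", model_format="ggufv2")
if matched is not None:
    family, spec, quantization = matched
    print(spec.model_id, quantization)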
@@ -916,6 +906,7 @@ def register_llm(llm_family: LLMFamilyV1, persist: bool):
         )
 
     UD_LLM_FAMILIES.append(llm_family)
+    generate_engine_config_by_model_family(llm_family)
 
     if persist:
         # We only validate model URL when persist is True.
@@ -941,6 +932,7 @@ def unregister_llm(model_name: str, raise_error: bool = True):
             break
     if llm_family:
         UD_LLM_FAMILIES.remove(llm_family)
+        del LLM_ENGINES[model_name]
 
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"
@@ -972,21 +964,33 @@ def unregister_llm(model_name: str, raise_error: bool = True):
         logger.warning(f"Custom model {model_name} not found")
 
 
-def match_llm_cls(
-    family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
+def check_engine_by_spec_parameters(
+    model_engine: str,
+    model_name: str,
+    model_format: str,
+    model_size_in_billions: Union[str, int],
     quantization: str,
-    peft_model: Optional[List[LoRA]] = None,
-) -> Optional[Type[LLM]]:
-    """
-    Find an LLM implementation for given LLM family and spec.
-    """
-    if peft_model is not None:
-        for cls in PEFT_SUPPORTED_CLASSES:
-            if cls.match(family, llm_spec, quantization):
-                return cls
-    else:
-        for cls in LLM_CLASSES:
-            if cls.match(family, llm_spec, quantization):
-                return cls
-    return None
+) -> Type[LLM]:
+    def get_model_engine_from_spell(engine_str: str) -> str:
+        for engine in LLM_ENGINES[model_name].keys():
+            if engine.lower() == engine_str.lower():
+                return engine
+        return engine_str
+
+    if model_name not in LLM_ENGINES:
+        raise ValueError(f"Model {model_name} not found.")
+    model_engine = get_model_engine_from_spell(model_engine)
+    if model_engine not in LLM_ENGINES[model_name]:
+        raise ValueError(f"Model {model_name} cannot be run on engine {model_engine}.")
+    match_params = LLM_ENGINES[model_name][model_engine]
+    for param in match_params:
+        if (
+            model_name == param["model_name"]
+            and model_format == param["model_format"]
+            and model_size_in_billions == param["model_size_in_billions"]
+            and quantization in param["quantizations"]
+        ):
+            return param["llm_class"]
+    raise ValueError(
+        f"Model {model_name} cannot be run on engine {model_engine}, with format {model_format}, size {model_size_in_billions} and quantization {quantization}."
+    )
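
check_engine_by_spec_parameters replaces the old class matching with a lookup into LLM_ENGINES. Inferred from the type annotation Dict[str, Dict[str, List[Dict[str, Any]]]] and the dictionary keys read above, the registry plausibly has the following shape and usage; the concrete values and the stand-in class are illustrative, not from the diff.

# Assumed shape of LLM_ENGINES: model name -> engine name -> list of
# matchable parameter sets, each carrying the implementation class.
class FakeLlamaCppModel:  # stand-in for a real implementation class
    pass

LLM_ENGINES = {
    "phi-3-mini-4k-instruct": {
        "llama.cpp": [
            {
                "model_name": "phi-3-mini-4k-instruct",
                "model_format": "ggufv2",
                "model_size_in_billions": 4,
                "quantizations": ["fp16", "q4"],
                "llm_class": FakeLlamaCppModel,
            }
        ]
    }
}

# Engine names are matched case-insensitively via get_model_engine_from_spell,
# so "Llama.CPP" resolves to the registered "llama.cpp" entry; any mismatch in
# format, size, or quantization raises ValueError instead of returning None.
llm_class = check_engine_by_spec_parameters(
    model_engine="Llama.CPP",
    model_name="phi-3-mini-4k-instruct",
    model_format="ggufv2",
    model_size_in_billions=4,
    quantization="q4",
)
assert llm_class is FakeLlamaCppModel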