xinference 0.10.3__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +1 -1
- xinference/api/restful_api.py +53 -61
- xinference/client/restful/restful_client.py +52 -57
- xinference/conftest.py +1 -1
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +10 -4
- xinference/core/event.py +1 -1
- xinference/core/model.py +17 -6
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +58 -72
- xinference/core/worker.py +68 -101
- xinference/deploy/cmdline.py +166 -1
- xinference/deploy/test/test_cmdline.py +2 -0
- xinference/deploy/utils.py +1 -1
- xinference/device_utils.py +29 -3
- xinference/fields.py +7 -1
- xinference/model/audio/whisper.py +88 -12
- xinference/model/core.py +2 -2
- xinference/model/image/__init__.py +29 -0
- xinference/model/image/core.py +6 -0
- xinference/model/image/custom.py +109 -0
- xinference/model/llm/__init__.py +92 -32
- xinference/model/llm/core.py +57 -102
- xinference/model/llm/ggml/chatglm.py +98 -13
- xinference/model/llm/ggml/llamacpp.py +49 -2
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
- xinference/model/llm/llm_family.json +438 -7
- xinference/model/llm/llm_family.py +45 -41
- xinference/model/llm/llm_family_modelscope.json +258 -5
- xinference/model/llm/pytorch/chatglm.py +48 -0
- xinference/model/llm/pytorch/core.py +23 -6
- xinference/model/llm/pytorch/deepseek_vl.py +115 -33
- xinference/model/llm/pytorch/internlm2.py +32 -1
- xinference/model/llm/pytorch/qwen_vl.py +94 -12
- xinference/model/llm/pytorch/utils.py +38 -1
- xinference/model/llm/pytorch/yi_vl.py +96 -51
- xinference/model/llm/sglang/core.py +31 -9
- xinference/model/llm/utils.py +54 -20
- xinference/model/llm/vllm/core.py +101 -7
- xinference/thirdparty/omnilmm/chat.py +2 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
- xinference/types.py +11 -0
- xinference/web/ui/build/asset-manifest.json +6 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.54bca460.css +2 -0
- xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
- xinference/web/ui/build/static/js/main.551aa479.js +3 -0
- xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +33 -0
- xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
- xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
- xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
- xinference/web/ui/node_modules/clipboard/bower.json +18 -0
- xinference/web/ui/node_modules/clipboard/composer.json +25 -0
- xinference/web/ui/node_modules/clipboard/package.json +63 -0
- xinference/web/ui/node_modules/delegate/package.json +31 -0
- xinference/web/ui/node_modules/good-listener/bower.json +11 -0
- xinference/web/ui/node_modules/good-listener/package.json +35 -0
- xinference/web/ui/node_modules/select/bower.json +13 -0
- xinference/web/ui/node_modules/select/package.json +29 -0
- xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
- xinference/web/ui/package-lock.json +34 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/METADATA +13 -12
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/RECORD +88 -67
- xinference/client/oscar/__init__.py +0 -13
- xinference/client/oscar/actor_client.py +0 -611
- xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
- xinference/model/llm/pytorch/spec_model.py +0 -186
- xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2024-
+ "date": "2024-05-17T14:10:09+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.
+ "full-revisionid": "55a0200079eacf4fd6ee10c5868f0eaba244db29",
+ "version": "0.11.1"
 }
 ''' # END VERSION_JSON

xinference/api/oauth2/auth_service.py
CHANGED

@@ -48,7 +48,7 @@ class AuthService:

     def init_auth_config(self):
         if self._auth_config_file:
-            config: AuthStartupConfig = parse_file_as(
+            config: AuthStartupConfig = parse_file_as(  # type: ignore
                 path=self._auth_config_file, type_=AuthStartupConfig
             )
             all_api_keys = set()
xinference/api/restful_api.py
CHANGED
@@ -275,6 +275,16 @@ class RESTfulAPI:
         self._router.add_api_route(
             "/v1/cluster/auth", self.is_cluster_authenticated, methods=["GET"]
         )
+        self._router.add_api_route(
+            "/v1/engines/{model_name}",
+            self.query_engines_by_model_name,
+            methods=["GET"],
+            dependencies=(
+                [Security(self._auth_service, scopes=["models:list"])]
+                if self.is_authenticated()
+                else None
+            ),
+        )
         # running instances
         self._router.add_api_route(
             "/v1/models/instances",

@@ -347,16 +357,6 @@ class RESTfulAPI:
                 else None
             ),
         )
-        self._router.add_api_route(
-            "/experimental/speculative_llms",
-            self.launch_speculative_llm,
-            methods=["POST"],
-            dependencies=(
-                [Security(self._auth_service, scopes=["models:start"])]
-                if self.is_authenticated()
-                else None
-            ),
-        )
         self._router.add_api_route(
             "/v1/models/{model_uid}",
             self.terminate_model,

@@ -639,57 +639,17 @@ class RESTfulAPI:
             logger.error(e, exc_info=True)
             raise HTTPException(status_code=500, detail=str(e))

-    async def launch_speculative_llm(self, request: Request) -> JSONResponse:
-        payload = await request.json()
-        model_uid = payload.get("model_uid")
-        model_name = payload.get("model_name")
-        model_size_in_billions = payload.get("model_size_in_billions")
-        quantization = payload.get("quantization")
-        draft_model_name = payload.get("draft_model_name")
-        draft_model_size_in_billions = payload.get("draft_model_size_in_billions")
-        draft_quantization = payload.get("draft_quantization")
-        n_gpu = payload.get("n_gpu", "auto")
-
-        if not model_name:
-            raise HTTPException(
-                status_code=400,
-                detail="Invalid input. Please specify the model name",
-            )
-
-        try:
-            model_uid = await (await self._get_supervisor_ref()).launch_speculative_llm(
-                model_uid=model_uid,
-                model_name=model_name,
-                model_size_in_billions=model_size_in_billions,
-                quantization=quantization,
-                draft_model_name=draft_model_name,
-                draft_model_size_in_billions=draft_model_size_in_billions,
-                draft_quantization=draft_quantization,
-                n_gpu=n_gpu,
-            )
-
-        except ValueError as ve:
-            logger.error(str(ve), exc_info=True)
-            raise HTTPException(status_code=400, detail=str(ve))
-        except RuntimeError as re:
-            logger.error(str(re), exc_info=True)
-            raise HTTPException(status_code=503, detail=str(re))
-        except Exception as e:
-            logger.error(str(e), exc_info=True)
-            raise HTTPException(status_code=500, detail=str(e))
-
-        return JSONResponse(content={"model_uid": model_uid})
-
     async def launch_model(
         self, request: Request, wait_ready: bool = Query(True)
     ) -> JSONResponse:
         payload = await request.json()
         model_uid = payload.get("model_uid")
         model_name = payload.get("model_name")
+        model_engine = payload.get("model_engine")
         model_size_in_billions = payload.get("model_size_in_billions")
         model_format = payload.get("model_format")
         quantization = payload.get("quantization")
-        model_type = payload.get("model_type")
+        model_type = payload.get("model_type", "LLM")
         replica = payload.get("replica", 1)
         n_gpu = payload.get("n_gpu", "auto")
         request_limits = payload.get("request_limits", None)

@@ -700,6 +660,7 @@ class RESTfulAPI:
         exclude_keys = {
             "model_uid",
             "model_name",
+            "model_engine",
             "model_size_in_billions",
             "model_format",
             "quantization",

@@ -719,7 +680,12 @@ class RESTfulAPI:
         if not model_name:
             raise HTTPException(
                 status_code=400,
-                detail="Invalid input. Please specify the
+                detail="Invalid input. Please specify the `model_name` field.",
+            )
+        if not model_engine and model_type == "LLM":
+            raise HTTPException(
+                status_code=400,
+                detail="Invalid input. Please specify the `model_engine` field.",
             )

         if peft_model_config is not None:

@@ -731,6 +697,7 @@ class RESTfulAPI:
             model_uid = await (await self._get_supervisor_ref()).launch_builtin_model(
                 model_uid=model_uid,
                 model_name=model_name,
+                model_engine=model_engine,
                 model_size_in_billions=model_size_in_billions,
                 model_format=model_format,
                 quantization=quantization,

@@ -776,6 +743,7 @@ class RESTfulAPI:
     ) -> JSONResponse:
         payload = await request.json()
         model_uid = payload.get("model_uid")
+        model_engine = payload.get("model_engine")
         model_type = payload.get("model_type")
         model_version = payload.get("model_version")
         replica = payload.get("replica", 1)

@@ -786,6 +754,7 @@ class RESTfulAPI:
                 await self._get_supervisor_ref()
             ).launch_model_by_version(
                 model_uid=model_uid,
+                model_engine=model_engine,
                 model_type=model_type,
                 model_version=model_version,
                 replica=replica,

@@ -1085,6 +1054,7 @@ class RESTfulAPI:

     async def create_transcriptions(
         self,
+        request: Request,
         model: str = Form(...),
         file: UploadFile = File(media_type="application/octet-stream"),
         language: Optional[str] = Form(None),

@@ -1093,6 +1063,10 @@ class RESTfulAPI:
         temperature: Optional[float] = Form(0),
         kwargs: Optional[str] = Form(None),
     ) -> Response:
+        form = await request.form()
+        timestamp_granularities = form.get("timestamp_granularities[]")
+        if timestamp_granularities:
+            timestamp_granularities = [timestamp_granularities]
         model_uid = model
         try:
             model_ref = await (await self._get_supervisor_ref()).get_model(model_uid)

@@ -1116,6 +1090,7 @@ class RESTfulAPI:
                 prompt=prompt,
                 response_format=response_format,
                 temperature=temperature,
+                timestamp_granularities=timestamp_granularities,
                 **parsed_kwargs,
             )
             return Response(content=transcription, media_type="application/json")

@@ -1130,13 +1105,19 @@ class RESTfulAPI:

     async def create_translations(
         self,
+        request: Request,
         model: str = Form(...),
         file: UploadFile = File(media_type="application/octet-stream"),
+        language: Optional[str] = Form(None),
         prompt: Optional[str] = Form(None),
         response_format: Optional[str] = Form("json"),
         temperature: Optional[float] = Form(0),
         kwargs: Optional[str] = Form(None),
     ) -> Response:
+        form = await request.form()
+        timestamp_granularities = form.get("timestamp_granularities[]")
+        if timestamp_granularities:
+            timestamp_granularities = [timestamp_granularities]
         model_uid = model
         try:
             model_ref = await (await self._get_supervisor_ref()).get_model(model_uid)

@@ -1156,9 +1137,11 @@ class RESTfulAPI:
             parsed_kwargs = {}
             translation = await model_ref.translations(
                 audio=await file.read(),
+                language=language,
                 prompt=prompt,
                 response_format=response_format,
                 temperature=temperature,
+                timestamp_granularities=timestamp_granularities,
                 **parsed_kwargs,
             )
             return Response(content=translation, media_type="application/json")

@@ -1274,11 +1257,7 @@ class RESTfulAPI:

         messages = body.messages and list(body.messages) or None

-        if (
-            not messages
-            or messages[-1].get("role") not in ["user", "system", "tool"]
-            or not messages[-1].get("content")
-        ):
+        if not messages or messages[-1].get("role") not in ["user", "system", "tool"]:
             raise HTTPException(
                 status_code=400, detail="Invalid input. Please specify the prompt."
             )

@@ -1298,15 +1277,15 @@ class RESTfulAPI:
                 {"role": "system", "content": ". ".join(system_messages_contents)}
             )

-        assert non_system_messages
-
         has_tool_message = messages[-1].get("role") == "tool"
         if has_tool_message:
             prompt = SPECIAL_TOOL_PROMPT
             system_prompt = system_messages[0]["content"] if system_messages else None
             chat_history = non_system_messages  # exclude the prompt
         else:
-            prompt =
+            prompt = None
+            if non_system_messages:
+                prompt = non_system_messages[-1]["content"]
             system_prompt = system_messages[0]["content"] if system_messages else None
             chat_history = non_system_messages[:-1]  # exclude the prompt

@@ -1418,6 +1397,19 @@ class RESTfulAPI:
             self.handle_request_limit_error(e)
             raise HTTPException(status_code=500, detail=str(e))

+    async def query_engines_by_model_name(self, model_name: str) -> JSONResponse:
+        try:
+            content = await (
+                await self._get_supervisor_ref()
+            ).query_engines_by_model_name(model_name)
+            return JSONResponse(content=content)
+        except ValueError as re:
+            logger.error(re, exc_info=True)
+            raise HTTPException(status_code=400, detail=str(re))
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def register_model(self, model_type: str, request: Request) -> JSONResponse:
         body = RegisterModelRequest.parse_obj(await request.json())
         model = body.model
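Taken together, the restful_api.py changes above drop the experimental speculative-decoding route, add a GET `/v1/engines/{model_name}` endpoint, and make `model_engine` mandatory when launching an LLM. A minimal sketch of exercising the new routes over plain HTTP; the server address, the model name, and the POST `/v1/models` launch path are illustrative assumptions and not taken from this diff:

```python
# Sketch only: hitting the 0.11.x routes changed above.
# Assumptions (not part of this diff): server at 127.0.0.1:9997,
# model name "qwen1.5-chat", and POST /v1/models as the launch path.
import requests

base = "http://127.0.0.1:9997"
model_name = "qwen1.5-chat"  # illustrative

# New GET /v1/engines/{model_name}: engines supported for a registered model.
engines = requests.get(f"{base}/v1/engines/{model_name}").json()
engine = next(iter(engines))  # pick any engine reported for this model

# Launching an LLM now requires `model_engine`; omitting it returns HTTP 400
# ("Invalid input. Please specify the `model_engine` field.").
resp = requests.post(
    f"{base}/v1/models",
    json={"model_name": model_name, "model_engine": engine},
)
resp.raise_for_status()
print(resp.json())
```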
xinference/client/restful/restful_client.py
CHANGED

@@ -13,7 +13,6 @@
 # limitations under the License.
 import json
 import typing
-import warnings
 from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union

 import requests

@@ -566,6 +565,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         prompt: Optional[str] = None,
         response_format: Optional[str] = "json",
         temperature: Optional[float] = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         """
         Transcribes audio into the input language.

@@ -589,6 +589,11 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             while lower values like 0.2 will make it more focused and deterministic.
             If set to 0, the model will use log probability to automatically increase the temperature
             until certain thresholds are hit.
+        timestamp_granularities: Optional[List[str]], default is None.
+            The timestamp granularities to populate for this transcription. response_format must be set verbose_json
+            to use timestamp granularities. Either or both of these options are supported: word, or segment.
+            Note: There is no additional latency for segment timestamps, but generating word timestamps incurs
+            additional latency.

         Returns
         -------

@@ -601,12 +606,13 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             "prompt": prompt,
             "response_format": response_format,
             "temperature": temperature,
+            "timestamp_granularities[]": timestamp_granularities,
         }
         files: List[Any] = []
-        for key, value in params.items():
-            files.append((key, (None, value)))
         files.append(("file", ("file", audio, "application/octet-stream")))
-        response = requests.post(
+        response = requests.post(
+            url, data=params, files=files, headers=self.auth_headers
+        )
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to transcribe the audio, detail: {_get_error_string(response)}"

@@ -618,9 +624,11 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
     def translations(
         self,
         audio: bytes,
+        language: Optional[str] = None,
         prompt: Optional[str] = None,
         response_format: Optional[str] = "json",
         temperature: Optional[float] = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         """
         Translates audio into English.

@@ -631,6 +639,9 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         audio: bytes
             The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg,
             mpga, m4a, ogg, wav, or webm.
+        language: Optional[str]
+            The language of the input audio. Supplying the input language in ISO-639-1
+            (https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes) format will improve accuracy and latency.
         prompt: Optional[str]
             An optional text to guide the model's style or continue a previous audio segment.
             The prompt should match the audio language.

@@ -641,6 +652,11 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             while lower values like 0.2 will make it more focused and deterministic.
             If set to 0, the model will use log probability to automatically increase the temperature
             until certain thresholds are hit.
+        timestamp_granularities: Optional[List[str]], default is None.
+            The timestamp granularities to populate for this transcription. response_format must be set verbose_json
+            to use timestamp granularities. Either or both of these options are supported: word, or segment.
+            Note: There is no additional latency for segment timestamps, but generating word timestamps incurs
+            additional latency.

         Returns
         -------

@@ -649,15 +665,17 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         url = f"{self._base_url}/v1/audio/translations"
         params = {
             "model": self._model_uid,
+            "language": language,
             "prompt": prompt,
             "response_format": response_format,
             "temperature": temperature,
+            "timestamp_granularities[]": timestamp_granularities,
         }
         files: List[Any] = []
-        for key, value in params.items():
-            files.append((key, (None, value)))
         files.append(("file", ("file", audio, "application/octet-stream")))
-        response = requests.post(
+        response = requests.post(
+            url, data=params, files=files, headers=self.auth_headers
+        )
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to translate the audio, detail: {_get_error_string(response)}"

@@ -754,60 +772,11 @@ class Client:
         model_list = response_data["data"]
         return {item["id"]: item for item in model_list}

-    def launch_speculative_llm(
-        self,
-        model_name: str,
-        model_size_in_billions: Optional[Union[int, str, float]],
-        quantization: Optional[str],
-        draft_model_name: str,
-        draft_model_size_in_billions: Optional[int],
-        draft_quantization: Optional[str],
-        n_gpu: Optional[Union[int, str]] = "auto",
-    ):
-        """
-        Launch the LLM along with a draft model based on the parameters on the server via RESTful APIs. This is an
-        experimental feature and the API may change in the future.
-
-        Returns
-        -------
-        str
-            The unique model_uid for the launched model.
-
-        """
-        warnings.warn(
-            "`launch_speculative_llm` is an experimental feature and the API may change in the future."
-        )
-
-        # convert float to int or string since the RESTful API does not accept float.
-        if isinstance(model_size_in_billions, float):
-            model_size_in_billions = convert_float_to_int_or_str(model_size_in_billions)
-
-        payload = {
-            "model_uid": None,
-            "model_name": model_name,
-            "model_size_in_billions": model_size_in_billions,
-            "quantization": quantization,
-            "draft_model_name": draft_model_name,
-            "draft_model_size_in_billions": draft_model_size_in_billions,
-            "draft_quantization": draft_quantization,
-            "n_gpu": n_gpu,
-        }
-
-        url = f"{self.base_url}/experimental/speculative_llms"
-        response = requests.post(url, json=payload, headers=self._headers)
-        if response.status_code != 200:
-            raise RuntimeError(
-                f"Failed to launch model, detail: {_get_error_string(response)}"
-            )
-
-        response_data = response.json()
-        model_uid = response_data["model_uid"]
-        return model_uid
-
     def launch_model(
         self,
         model_name: str,
         model_type: str = "LLM",
+        model_engine: Optional[str] = None,
         model_uid: Optional[str] = None,
         model_size_in_billions: Optional[Union[int, str, float]] = None,
         model_format: Optional[str] = None,

@@ -829,6 +798,8 @@ class Client:
             The name of model.
         model_type: str
             type of model.
+        model_engine: Optional[str]
+            Specify the inference engine of the model when launching LLM.
         model_uid: str
             UID of model, auto generate a UUID if is None.
         model_size_in_billions: Optional[Union[int, str, float]]

@@ -872,6 +843,7 @@ class Client:
         payload = {
             "model_uid": model_uid,
             "model_name": model_name,
+            "model_engine": model_engine,
             "peft_model_config": peft_model_config,
             "model_type": model_type,
             "model_size_in_billions": model_size_in_billions,

@@ -1157,3 +1129,26 @@ class Client:

         response_data = response.json()
         return response_data
+
+    def query_engine_by_model_name(self, model_name: str):
+        """
+        Get the engine parameters with the model name registered on the server.
+
+        Parameters
+        ----------
+        model_name: str
+            The name of the model.
+        Returns
+        -------
+        Dict[str, List[Dict[str, Any]]]
+            The supported engine parameters of registered models on the server.
+        """
+        url = f"{self.base_url}/v1/engines/{model_name}"
+        response = requests.get(url, headers=self._headers)
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to query engine parameters by model name, detail: {_get_error_string(response)}"
+            )
+
+        response_data = response.json()
+        return response_data
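Correspondingly, the Python client above drops `launch_speculative_llm`, adds `model_engine` to `launch_model`, gains `query_engine_by_model_name`, and extends the audio handle with `language` and `timestamp_granularities`. A rough usage sketch based on the signatures in this diff; the endpoint, model name, and audio-model UID are illustrative assumptions:

```python
# Sketch of the updated client surface shown in this diff.
# Assumptions: server at 127.0.0.1:9997, an LLM named "qwen1.5-chat",
# and a previously launched whisper-style audio model (UID is a placeholder).
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")

# New: inspect which engines (and formats/quantizations) a model supports.
engine_params = client.query_engine_by_model_name("qwen1.5-chat")

# `model_engine` must now be supplied when launching an LLM.
model_uid = client.launch_model(
    model_name="qwen1.5-chat",
    model_engine=next(iter(engine_params)),
)

# Audio handles gained `timestamp_granularities` (and `language` on translations).
audio_model = client.get_model("<audio-model-uid>")  # placeholder UID
with open("speech.wav", "rb") as f:
    result = audio_model.transcriptions(
        f.read(),
        response_format="verbose_json",
        timestamp_granularities=["segment"],
    )
```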
xinference/conftest.py
CHANGED
@@ -237,7 +237,7 @@ def setup_with_file_logging():
         logging_conf=TEST_FILE_LOGGING_CONF,
     )
     endpoint = f"http://localhost:{port}"
-    if not api_health_check(endpoint, max_attempts=
+    if not api_health_check(endpoint, max_attempts=10, sleep_interval=5):
         raise RuntimeError("Endpoint is not available after multiple attempts")

     try:
xinference/core/cache_tracker.py
CHANGED
@@ -22,7 +22,7 @@ logger = getLogger(__name__)
 class CacheTrackerActor(xo.Actor):
     def __init__(self):
         super().__init__()
-        self._model_name_to_version_info: Dict[str, List[Dict]] = {}
+        self._model_name_to_version_info: Dict[str, List[Dict]] = {}  # type: ignore

     @classmethod
     def uid(cls) -> str:

xinference/core/chat_interface.py
CHANGED

@@ -109,6 +109,7 @@ class GradioInterface:
         history: List[List[str]],
         max_tokens: int,
         temperature: float,
+        lora_name: str,
     ) -> Generator:
         from ..client import RESTfulClient

@@ -127,6 +128,7 @@ class GradioInterface:
                 "max_tokens": int(max_tokens),
                 "temperature": temperature,
                 "stream": True,
+                "lora_name": lora_name,
             },
         ):
             assert isinstance(chunk, dict)

@@ -152,6 +154,7 @@ class GradioInterface:
                 gr.Slider(
                     minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
                 ),
+                gr.Text(label="LoRA Name"),
             ],
             title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
             css="""

@@ -331,7 +334,7 @@ class GradioInterface:
                 history: hist,
             }

-        def complete(text, hist, max_tokens, temperature) -> Generator:
+        def complete(text, hist, max_tokens, temperature, lora_name) -> Generator:
             from ..client import RESTfulClient

             client = RESTfulClient(self.endpoint)

@@ -349,6 +352,7 @@ class GradioInterface:
                     "max_tokens": max_tokens,
                     "temperature": temperature,
                     "stream": True,
+                    "lora_name": lora_name,
                 },
             ):
                 assert isinstance(chunk, dict)

@@ -368,7 +372,7 @@ class GradioInterface:
                 history: hist,
             }

-        def retry(text, hist, max_tokens, temperature) -> Generator:
+        def retry(text, hist, max_tokens, temperature, lora_name) -> Generator:
             from ..client import RESTfulClient

             client = RESTfulClient(self.endpoint)

@@ -387,6 +391,7 @@ class GradioInterface:
                     "max_tokens": max_tokens,
                     "temperature": temperature,
                     "stream": True,
+                    "lora_name": lora_name,
                 },
             ):
                 assert isinstance(chunk, dict)

@@ -470,10 +475,11 @@ class GradioInterface:
                 temperature = gr.Slider(
                     minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
                 )
+                lora_name = gr.Text(label="LoRA Name")

             btn_generate.click(
                 fn=complete,
-                inputs=[textbox, history, length, temperature],
+                inputs=[textbox, history, length, temperature, lora_name],
                 outputs=[textbox, history],
             )

@@ -485,7 +491,7 @@ class GradioInterface:

             btn_retry.click(
                 fn=retry,
-                inputs=[textbox, history, length, temperature],
+                inputs=[textbox, history, length, temperature, lora_name],
                 outputs=[textbox, history],
             )
xinference/core/event.py
CHANGED
@@ -37,7 +37,7 @@ class Event(TypedDict):
 class EventCollectorActor(xo.StatelessActor):
     def __init__(self):
         super().__init__()
-        self._model_uid_to_events: Dict[str, queue.Queue] = defaultdict(
+        self._model_uid_to_events: Dict[str, queue.Queue] = defaultdict(  # type: ignore
             lambda: queue.Queue(maxsize=MAX_EVENT_COUNT_PER_MODEL)
         )
xinference/core/model.py
CHANGED
@@ -25,6 +25,7 @@ from typing import (
     AsyncGenerator,
     Callable,
     Dict,
+    Generator,
     Iterator,
     List,
     Optional,

@@ -153,7 +154,6 @@ class ModelActor(xo.StatelessActor):
     ):
         super().__init__()
         from ..model.llm.pytorch.core import PytorchModel
-        from ..model.llm.pytorch.spec_model import SpeculativeModel
         from ..model.llm.vllm.core import VLLMModel

         self._worker_address = worker_address

@@ -167,7 +167,7 @@
         self._current_generator = lambda: None
         self._lock = (
             None
-            if isinstance(self._model, (PytorchModel,
+            if isinstance(self._model, (PytorchModel, VLLMModel))
             else asyncio.locks.Lock()
         )
         self._worker_ref = None

@@ -257,7 +257,7 @@
             for v in gen:
                 if time_to_first_token is None:
                     time_to_first_token = (time.time() - start_time) * 1000
-                final_usage = v.
+                final_usage = v.get("usage", None)
                 v = dict(data=json.dumps(v))
                 yield sse_starlette.sse.ensure_bytes(v, None)
         except OutOfMemoryError:

@@ -289,7 +289,7 @@
             async for v in gen:
                 if time_to_first_token is None:
                     time_to_first_token = (time.time() - start_time) * 1000
-                final_usage = v.
+                final_usage = v.get("usage", None)
                 v = await asyncio.to_thread(json.dumps, v)
                 v = dict(data=v)  # noqa: F821
                 yield await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)

@@ -379,8 +379,13 @@
             raise AttributeError(f"Model {self._model.model_spec} is not for chat.")
         finally:
             # For the non stream result.
-
-
+            record = None
+            if isinstance(response, Generator) or isinstance(response, AsyncGenerator):
+                record = response
+            elif isinstance(response, bytes):
+                record = json.loads(response)
+            if record and isinstance(record, dict):
+                usage = record["usage"]
                 # Some backends may not have a valid usage, we just skip them.
                 completion_tokens = usage["completion_tokens"]
                 prompt_tokens = usage["prompt_tokens"]

@@ -436,6 +441,7 @@
         prompt: Optional[str] = None,
         response_format: str = "json",
         temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         if hasattr(self._model, "transcriptions"):
             return await self._call_wrapper(

@@ -445,6 +451,7 @@
                 prompt,
                 response_format,
                 temperature,
+                timestamp_granularities,
             )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating transcriptions."

@@ -455,17 +462,21 @@
     async def translations(
         self,
         audio: bytes,
+        language: Optional[str] = None,
         prompt: Optional[str] = None,
         response_format: str = "json",
         temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         if hasattr(self._model, "translations"):
             return await self._call_wrapper(
                 self._model.translations,
                 audio,
+                language,
                 prompt,
                 response_format,
                 temperature,
+                timestamp_granularities,
             )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating translations."