xinference 0.10.3__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +1 -1
  3. xinference/api/restful_api.py +53 -61
  4. xinference/client/restful/restful_client.py +52 -57
  5. xinference/conftest.py +1 -1
  6. xinference/core/cache_tracker.py +1 -1
  7. xinference/core/chat_interface.py +10 -4
  8. xinference/core/event.py +1 -1
  9. xinference/core/model.py +17 -6
  10. xinference/core/status_guard.py +1 -1
  11. xinference/core/supervisor.py +58 -72
  12. xinference/core/worker.py +68 -101
  13. xinference/deploy/cmdline.py +166 -1
  14. xinference/deploy/test/test_cmdline.py +2 -0
  15. xinference/deploy/utils.py +1 -1
  16. xinference/device_utils.py +29 -3
  17. xinference/fields.py +7 -1
  18. xinference/model/audio/whisper.py +88 -12
  19. xinference/model/core.py +2 -2
  20. xinference/model/image/__init__.py +29 -0
  21. xinference/model/image/core.py +6 -0
  22. xinference/model/image/custom.py +109 -0
  23. xinference/model/llm/__init__.py +92 -32
  24. xinference/model/llm/core.py +57 -102
  25. xinference/model/llm/ggml/chatglm.py +98 -13
  26. xinference/model/llm/ggml/llamacpp.py +49 -2
  27. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
  28. xinference/model/llm/llm_family.json +438 -7
  29. xinference/model/llm/llm_family.py +45 -41
  30. xinference/model/llm/llm_family_modelscope.json +258 -5
  31. xinference/model/llm/pytorch/chatglm.py +48 -0
  32. xinference/model/llm/pytorch/core.py +23 -6
  33. xinference/model/llm/pytorch/deepseek_vl.py +115 -33
  34. xinference/model/llm/pytorch/internlm2.py +32 -1
  35. xinference/model/llm/pytorch/qwen_vl.py +94 -12
  36. xinference/model/llm/pytorch/utils.py +38 -1
  37. xinference/model/llm/pytorch/yi_vl.py +96 -51
  38. xinference/model/llm/sglang/core.py +31 -9
  39. xinference/model/llm/utils.py +54 -20
  40. xinference/model/llm/vllm/core.py +101 -7
  41. xinference/thirdparty/omnilmm/chat.py +2 -1
  42. xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
  43. xinference/types.py +11 -0
  44. xinference/web/ui/build/asset-manifest.json +6 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.54bca460.css +2 -0
  47. xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.551aa479.js +3 -0
  49. xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +7 -0
  50. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
  68. xinference/web/ui/node_modules/.package-lock.json +33 -0
  69. xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
  70. xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
  71. xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
  72. xinference/web/ui/node_modules/clipboard/bower.json +18 -0
  73. xinference/web/ui/node_modules/clipboard/composer.json +25 -0
  74. xinference/web/ui/node_modules/clipboard/package.json +63 -0
  75. xinference/web/ui/node_modules/delegate/package.json +31 -0
  76. xinference/web/ui/node_modules/good-listener/bower.json +11 -0
  77. xinference/web/ui/node_modules/good-listener/package.json +35 -0
  78. xinference/web/ui/node_modules/select/bower.json +13 -0
  79. xinference/web/ui/node_modules/select/package.json +29 -0
  80. xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
  81. xinference/web/ui/package-lock.json +34 -0
  82. xinference/web/ui/package.json +1 -0
  83. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/METADATA +13 -12
  84. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/RECORD +88 -67
  85. xinference/client/oscar/__init__.py +0 -13
  86. xinference/client/oscar/actor_client.py +0 -611
  87. xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
  88. xinference/model/llm/pytorch/spec_model.py +0 -186
  89. xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
  90. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  93. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
  94. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
  95. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
  96. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
  97. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
  98. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
  99. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
  100. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
  101. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2024-04-24T10:45:37+0800",
+ "date": "2024-05-17T14:10:09+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "2ba72b0ed55c2dbff12491485ffacee7996d3490",
- "version": "0.10.3"
+ "full-revisionid": "55a0200079eacf4fd6ee10c5868f0eaba244db29",
+ "version": "0.11.1"
 }
 ''' # END VERSION_JSON

xinference/api/oauth2/auth_service.py CHANGED
@@ -48,7 +48,7 @@ class AuthService:

     def init_auth_config(self):
         if self._auth_config_file:
-            config: AuthStartupConfig = parse_file_as(
+            config: AuthStartupConfig = parse_file_as(  # type: ignore
                 path=self._auth_config_file, type_=AuthStartupConfig
             )
             all_api_keys = set()
xinference/api/restful_api.py CHANGED
@@ -275,6 +275,16 @@ class RESTfulAPI:
         self._router.add_api_route(
             "/v1/cluster/auth", self.is_cluster_authenticated, methods=["GET"]
         )
+        self._router.add_api_route(
+            "/v1/engines/{model_name}",
+            self.query_engines_by_model_name,
+            methods=["GET"],
+            dependencies=(
+                [Security(self._auth_service, scopes=["models:list"])]
+                if self.is_authenticated()
+                else None
+            ),
+        )
         # running instances
         self._router.add_api_route(
             "/v1/models/instances",
@@ -347,16 +357,6 @@ class RESTfulAPI:
                 else None
             ),
         )
-        self._router.add_api_route(
-            "/experimental/speculative_llms",
-            self.launch_speculative_llm,
-            methods=["POST"],
-            dependencies=(
-                [Security(self._auth_service, scopes=["models:start"])]
-                if self.is_authenticated()
-                else None
-            ),
-        )
         self._router.add_api_route(
             "/v1/models/{model_uid}",
             self.terminate_model,
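
The new route makes engine discovery available over plain HTTP. A minimal sketch of calling it with requests, assuming a local supervisor on Xinference's default port and an example model name; the response shape shown is an assumption based on the client docstring added later in this diff:

    import requests

    base_url = "http://127.0.0.1:9997"  # assumed local deployment
    # "llama-2-chat" is an example model name, not taken from this diff.
    resp = requests.get(f"{base_url}/v1/engines/llama-2-chat")
    resp.raise_for_status()
    engines = resp.json()  # assumed shape: {"vllm": [...], "llama.cpp": [...], ...}
    for engine, params in engines.items():
        print(engine, len(params), "parameter combinations")
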
@@ -639,57 +639,17 @@ class RESTfulAPI:
             logger.error(e, exc_info=True)
             raise HTTPException(status_code=500, detail=str(e))

-    async def launch_speculative_llm(self, request: Request) -> JSONResponse:
-        payload = await request.json()
-        model_uid = payload.get("model_uid")
-        model_name = payload.get("model_name")
-        model_size_in_billions = payload.get("model_size_in_billions")
-        quantization = payload.get("quantization")
-        draft_model_name = payload.get("draft_model_name")
-        draft_model_size_in_billions = payload.get("draft_model_size_in_billions")
-        draft_quantization = payload.get("draft_quantization")
-        n_gpu = payload.get("n_gpu", "auto")
-
-        if not model_name:
-            raise HTTPException(
-                status_code=400,
-                detail="Invalid input. Please specify the model name",
-            )
-
-        try:
-            model_uid = await (await self._get_supervisor_ref()).launch_speculative_llm(
-                model_uid=model_uid,
-                model_name=model_name,
-                model_size_in_billions=model_size_in_billions,
-                quantization=quantization,
-                draft_model_name=draft_model_name,
-                draft_model_size_in_billions=draft_model_size_in_billions,
-                draft_quantization=draft_quantization,
-                n_gpu=n_gpu,
-            )
-
-        except ValueError as ve:
-            logger.error(str(ve), exc_info=True)
-            raise HTTPException(status_code=400, detail=str(ve))
-        except RuntimeError as re:
-            logger.error(str(re), exc_info=True)
-            raise HTTPException(status_code=503, detail=str(re))
-        except Exception as e:
-            logger.error(str(e), exc_info=True)
-            raise HTTPException(status_code=500, detail=str(e))
-
-        return JSONResponse(content={"model_uid": model_uid})
-
     async def launch_model(
         self, request: Request, wait_ready: bool = Query(True)
     ) -> JSONResponse:
         payload = await request.json()
         model_uid = payload.get("model_uid")
         model_name = payload.get("model_name")
+        model_engine = payload.get("model_engine")
         model_size_in_billions = payload.get("model_size_in_billions")
         model_format = payload.get("model_format")
         quantization = payload.get("quantization")
-        model_type = payload.get("model_type")
+        model_type = payload.get("model_type", "LLM")
         replica = payload.get("replica", 1)
         n_gpu = payload.get("n_gpu", "auto")
         request_limits = payload.get("request_limits", None)
@@ -700,6 +660,7 @@ class RESTfulAPI:
         exclude_keys = {
             "model_uid",
             "model_name",
+            "model_engine",
             "model_size_in_billions",
             "model_format",
             "quantization",
@@ -719,7 +680,12 @@ class RESTfulAPI:
         if not model_name:
             raise HTTPException(
                 status_code=400,
-                detail="Invalid input. Please specify the model name",
+                detail="Invalid input. Please specify the `model_name` field.",
+            )
+        if not model_engine and model_type == "LLM":
+            raise HTTPException(
+                status_code=400,
+                detail="Invalid input. Please specify the `model_engine` field.",
             )

         if peft_model_config is not None:
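
With model_engine now mandatory for LLMs, a launch request that previously carried only model_name must also name an engine. A hedged sketch of the new payload against POST /v1/models, assuming a local server and example model/format values:

    import requests

    base_url = "http://127.0.0.1:9997"  # assumed local deployment
    payload = {
        "model_uid": None,
        "model_name": "llama-2-chat",   # example model name
        "model_engine": "llama.cpp",    # now required when model_type is "LLM"
        "model_format": "ggufv2",
        "model_size_in_billions": 7,
        "quantization": "Q4_K_M",
    }
    resp = requests.post(f"{base_url}/v1/models", json=payload)
    resp.raise_for_status()
    print(resp.json()["model_uid"])
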
@@ -731,6 +697,7 @@ class RESTfulAPI:
             model_uid = await (await self._get_supervisor_ref()).launch_builtin_model(
                 model_uid=model_uid,
                 model_name=model_name,
+                model_engine=model_engine,
                 model_size_in_billions=model_size_in_billions,
                 model_format=model_format,
                 quantization=quantization,
@@ -776,6 +743,7 @@ class RESTfulAPI:
     ) -> JSONResponse:
         payload = await request.json()
         model_uid = payload.get("model_uid")
+        model_engine = payload.get("model_engine")
         model_type = payload.get("model_type")
         model_version = payload.get("model_version")
         replica = payload.get("replica", 1)
@@ -786,6 +754,7 @@ class RESTfulAPI:
             await self._get_supervisor_ref()
         ).launch_model_by_version(
             model_uid=model_uid,
+            model_engine=model_engine,
             model_type=model_type,
             model_version=model_version,
             replica=replica,
@@ -1085,6 +1054,7 @@ class RESTfulAPI:

     async def create_transcriptions(
         self,
+        request: Request,
         model: str = Form(...),
         file: UploadFile = File(media_type="application/octet-stream"),
         language: Optional[str] = Form(None),
@@ -1093,6 +1063,10 @@ class RESTfulAPI:
         temperature: Optional[float] = Form(0),
         kwargs: Optional[str] = Form(None),
     ) -> Response:
+        form = await request.form()
+        timestamp_granularities = form.get("timestamp_granularities[]")
+        if timestamp_granularities:
+            timestamp_granularities = [timestamp_granularities]
         model_uid = model
         try:
             model_ref = await (await self._get_supervisor_ref()).get_model(model_uid)
@@ -1116,6 +1090,7 @@ class RESTfulAPI:
                 prompt=prompt,
                 response_format=response_format,
                 temperature=temperature,
+                timestamp_granularities=timestamp_granularities,
                 **parsed_kwargs,
             )
             return Response(content=transcription, media_type="application/json")
@@ -1130,13 +1105,19 @@ class RESTfulAPI:

     async def create_translations(
         self,
+        request: Request,
         model: str = Form(...),
         file: UploadFile = File(media_type="application/octet-stream"),
+        language: Optional[str] = Form(None),
         prompt: Optional[str] = Form(None),
         response_format: Optional[str] = Form("json"),
         temperature: Optional[float] = Form(0),
         kwargs: Optional[str] = Form(None),
     ) -> Response:
+        form = await request.form()
+        timestamp_granularities = form.get("timestamp_granularities[]")
+        if timestamp_granularities:
+            timestamp_granularities = [timestamp_granularities]
         model_uid = model
         try:
             model_ref = await (await self._get_supervisor_ref()).get_model(model_uid)
@@ -1156,9 +1137,11 @@ class RESTfulAPI:
             parsed_kwargs = {}
             translation = await model_ref.translations(
                 audio=await file.read(),
+                language=language,
                 prompt=prompt,
                 response_format=response_format,
                 temperature=temperature,
+                timestamp_granularities=timestamp_granularities,
                 **parsed_kwargs,
             )
             return Response(content=translation, media_type="application/json")
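
Because timestamp_granularities[] is read from the raw form rather than a typed parameter, callers pass it as an ordinary multipart field alongside the file, mirroring OpenAI's field naming. A sketch, assuming a running Whisper-family model with the example UID whisper-1:

    import requests

    base_url = "http://127.0.0.1:9997"  # assumed local deployment
    with open("speech.wav", "rb") as f:
        resp = requests.post(
            f"{base_url}/v1/audio/transcriptions",
            data={
                "model": "whisper-1",                    # example model UID
                "response_format": "verbose_json",       # required for timestamps
                "timestamp_granularities[]": "segment",  # "word" is also accepted
            },
            files={"file": ("speech.wav", f, "application/octet-stream")},
        )
    resp.raise_for_status()
    print(resp.json())
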
@@ -1274,11 +1257,7 @@ class RESTfulAPI:

         messages = body.messages and list(body.messages) or None

-        if (
-            not messages
-            or messages[-1].get("role") not in ["user", "system", "tool"]
-            or not messages[-1].get("content")
-        ):
+        if not messages or messages[-1].get("role") not in ["user", "system", "tool"]:
             raise HTTPException(
                 status_code=400, detail="Invalid input. Please specify the prompt."
             )
@@ -1298,15 +1277,15 @@ class RESTfulAPI:
                 {"role": "system", "content": ". ".join(system_messages_contents)}
             )

-        assert non_system_messages
-
         has_tool_message = messages[-1].get("role") == "tool"
         if has_tool_message:
             prompt = SPECIAL_TOOL_PROMPT
             system_prompt = system_messages[0]["content"] if system_messages else None
             chat_history = non_system_messages  # exclude the prompt
         else:
-            prompt = non_system_messages[-1]["content"]
+            prompt = None
+            if non_system_messages:
+                prompt = non_system_messages[-1]["content"]
             system_prompt = system_messages[0]["content"] if system_messages else None
             chat_history = non_system_messages[:-1]  # exclude the prompt

@@ -1418,6 +1397,19 @@ class RESTfulAPI:
             self.handle_request_limit_error(e)
             raise HTTPException(status_code=500, detail=str(e))

+    async def query_engines_by_model_name(self, model_name: str) -> JSONResponse:
+        try:
+            content = await (
+                await self._get_supervisor_ref()
+            ).query_engines_by_model_name(model_name)
+            return JSONResponse(content=content)
+        except ValueError as re:
+            logger.error(re, exc_info=True)
+            raise HTTPException(status_code=400, detail=str(re))
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def register_model(self, model_type: str, request: Request) -> JSONResponse:
         body = RegisterModelRequest.parse_obj(await request.json())
         model = body.model
xinference/client/restful/restful_client.py CHANGED
@@ -13,7 +13,6 @@
 # limitations under the License.
 import json
 import typing
-import warnings
 from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union

 import requests
@@ -566,6 +565,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         prompt: Optional[str] = None,
         response_format: Optional[str] = "json",
         temperature: Optional[float] = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         """
         Transcribes audio into the input language.
@@ -589,6 +589,11 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             while lower values like 0.2 will make it more focused and deterministic.
             If set to 0, the model will use log probability to automatically increase the temperature
             until certain thresholds are hit.
+        timestamp_granularities: Optional[List[str]], default is None.
+            The timestamp granularities to populate for this transcription. response_format must be set verbose_json
+            to use timestamp granularities. Either or both of these options are supported: word, or segment.
+            Note: There is no additional latency for segment timestamps, but generating word timestamps incurs
+            additional latency.

         Returns
         -------
@@ -601,12 +606,13 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             "prompt": prompt,
             "response_format": response_format,
             "temperature": temperature,
+            "timestamp_granularities[]": timestamp_granularities,
         }
         files: List[Any] = []
-        for key, value in params.items():
-            files.append((key, (None, value)))
         files.append(("file", ("file", audio, "application/octet-stream")))
-        response = requests.post(url, files=files, headers=self.auth_headers)
+        response = requests.post(
+            url, data=params, files=files, headers=self.auth_headers
+        )
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to transcribe the audio, detail: {_get_error_string(response)}"
@@ -618,9 +624,11 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
     def translations(
         self,
         audio: bytes,
+        language: Optional[str] = None,
         prompt: Optional[str] = None,
         response_format: Optional[str] = "json",
         temperature: Optional[float] = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         """
         Translates audio into English.
@@ -631,6 +639,9 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         audio: bytes
             The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg,
             mpga, m4a, ogg, wav, or webm.
+        language: Optional[str]
+            The language of the input audio. Supplying the input language in ISO-639-1
+            (https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes) format will improve accuracy and latency.
         prompt: Optional[str]
             An optional text to guide the model's style or continue a previous audio segment.
             The prompt should match the audio language.
@@ -641,6 +652,11 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             while lower values like 0.2 will make it more focused and deterministic.
             If set to 0, the model will use log probability to automatically increase the temperature
             until certain thresholds are hit.
+        timestamp_granularities: Optional[List[str]], default is None.
+            The timestamp granularities to populate for this transcription. response_format must be set verbose_json
+            to use timestamp granularities. Either or both of these options are supported: word, or segment.
+            Note: There is no additional latency for segment timestamps, but generating word timestamps incurs
+            additional latency.

         Returns
         -------
@@ -649,15 +665,17 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         url = f"{self._base_url}/v1/audio/translations"
         params = {
             "model": self._model_uid,
+            "language": language,
             "prompt": prompt,
             "response_format": response_format,
             "temperature": temperature,
+            "timestamp_granularities[]": timestamp_granularities,
         }
         files: List[Any] = []
-        for key, value in params.items():
-            files.append((key, (None, value)))
         files.append(("file", ("file", audio, "application/octet-stream")))
-        response = requests.post(url, files=files, headers=self.auth_headers)
+        response = requests.post(
+            url, data=params, files=files, headers=self.auth_headers
+        )
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to translate the audio, detail: {_get_error_string(response)}"
@@ -754,60 +772,11 @@ class Client:
         model_list = response_data["data"]
         return {item["id"]: item for item in model_list}

-    def launch_speculative_llm(
-        self,
-        model_name: str,
-        model_size_in_billions: Optional[Union[int, str, float]],
-        quantization: Optional[str],
-        draft_model_name: str,
-        draft_model_size_in_billions: Optional[int],
-        draft_quantization: Optional[str],
-        n_gpu: Optional[Union[int, str]] = "auto",
-    ):
-        """
-        Launch the LLM along with a draft model based on the parameters on the server via RESTful APIs. This is an
-        experimental feature and the API may change in the future.
-
-        Returns
-        -------
-        str
-            The unique model_uid for the launched model.
-
-        """
-        warnings.warn(
-            "`launch_speculative_llm` is an experimental feature and the API may change in the future."
-        )
-
-        # convert float to int or string since the RESTful API does not accept float.
-        if isinstance(model_size_in_billions, float):
-            model_size_in_billions = convert_float_to_int_or_str(model_size_in_billions)
-
-        payload = {
-            "model_uid": None,
-            "model_name": model_name,
-            "model_size_in_billions": model_size_in_billions,
-            "quantization": quantization,
-            "draft_model_name": draft_model_name,
-            "draft_model_size_in_billions": draft_model_size_in_billions,
-            "draft_quantization": draft_quantization,
-            "n_gpu": n_gpu,
-        }
-
-        url = f"{self.base_url}/experimental/speculative_llms"
-        response = requests.post(url, json=payload, headers=self._headers)
-        if response.status_code != 200:
-            raise RuntimeError(
-                f"Failed to launch model, detail: {_get_error_string(response)}"
-            )
-
-        response_data = response.json()
-        model_uid = response_data["model_uid"]
-        return model_uid
-
     def launch_model(
         self,
         model_name: str,
         model_type: str = "LLM",
+        model_engine: Optional[str] = None,
         model_uid: Optional[str] = None,
         model_size_in_billions: Optional[Union[int, str, float]] = None,
         model_format: Optional[str] = None,
@@ -829,6 +798,8 @@ class Client:
             The name of model.
         model_type: str
             type of model.
+        model_engine: Optional[str]
+            Specify the inference engine of the model when launching LLM.
         model_uid: str
             UID of model, auto generate a UUID if is None.
         model_size_in_billions: Optional[Union[int, str, float]]
@@ -872,6 +843,7 @@ class Client:
         payload = {
             "model_uid": model_uid,
             "model_name": model_name,
+            "model_engine": model_engine,
             "peft_model_config": peft_model_config,
             "model_type": model_type,
             "model_size_in_billions": model_size_in_billions,
@@ -1157,3 +1129,26 @@ class Client:

         response_data = response.json()
         return response_data
+
+    def query_engine_by_model_name(self, model_name: str):
+        """
+        Get the engine parameters with the model name registered on the server.
+
+        Parameters
+        ----------
+        model_name: str
+            The name of the model.
+        Returns
+        -------
+        Dict[str, List[Dict[str, Any]]]
+            The supported engine parameters of registered models on the server.
+        """
+        url = f"{self.base_url}/v1/engines/{model_name}"
+        response = requests.get(url, headers=self._headers)
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to query engine parameters by model name, detail: {_get_error_string(response)}"
+            )
+
+        response_data = response.json()
+        return response_data
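
Taken together, the two client additions support an engine-aware launch flow: discover which engines can serve a model, then pass one of them when launching. A sketch, assuming a local server and an example model name:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint

    # New in this release: list the engines able to serve a model.
    engines = client.query_engine_by_model_name("llama-2-chat")  # example name
    print(list(engines))  # e.g. ["vllm", "llama.cpp", "Transformers"]

    # model_engine is now required when launching an LLM.
    model_uid = client.launch_model(
        model_name="llama-2-chat",
        model_engine=next(iter(engines)),
        model_size_in_billions=7,
    )
    print(model_uid)
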
xinference/conftest.py CHANGED
@@ -237,7 +237,7 @@ def setup_with_file_logging():
         logging_conf=TEST_FILE_LOGGING_CONF,
     )
     endpoint = f"http://localhost:{port}"
-    if not api_health_check(endpoint, max_attempts=3, sleep_interval=5):
+    if not api_health_check(endpoint, max_attempts=10, sleep_interval=5):
         raise RuntimeError("Endpoint is not available after multiple attempts")

     try:
xinference/core/cache_tracker.py CHANGED
@@ -22,7 +22,7 @@ logger = getLogger(__name__)
 class CacheTrackerActor(xo.Actor):
     def __init__(self):
         super().__init__()
-        self._model_name_to_version_info: Dict[str, List[Dict]] = {}
+        self._model_name_to_version_info: Dict[str, List[Dict]] = {}  # type: ignore

     @classmethod
     def uid(cls) -> str:
xinference/core/chat_interface.py CHANGED
@@ -109,6 +109,7 @@ class GradioInterface:
         history: List[List[str]],
         max_tokens: int,
         temperature: float,
+        lora_name: str,
     ) -> Generator:
         from ..client import RESTfulClient

@@ -127,6 +128,7 @@ class GradioInterface:
                 "max_tokens": int(max_tokens),
                 "temperature": temperature,
                 "stream": True,
+                "lora_name": lora_name,
             },
         ):
             assert isinstance(chunk, dict)
@@ -152,6 +154,7 @@ class GradioInterface:
                 gr.Slider(
                     minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
                 ),
+                gr.Text(label="LoRA Name"),
             ],
             title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
             css="""
@@ -331,7 +334,7 @@ class GradioInterface:
                 history: hist,
             }

-        def complete(text, hist, max_tokens, temperature) -> Generator:
+        def complete(text, hist, max_tokens, temperature, lora_name) -> Generator:
             from ..client import RESTfulClient

             client = RESTfulClient(self.endpoint)
@@ -349,6 +352,7 @@ class GradioInterface:
                     "max_tokens": max_tokens,
                     "temperature": temperature,
                     "stream": True,
+                    "lora_name": lora_name,
                 },
             ):
                 assert isinstance(chunk, dict)
@@ -368,7 +372,7 @@ class GradioInterface:
                 history: hist,
             }

-        def retry(text, hist, max_tokens, temperature) -> Generator:
+        def retry(text, hist, max_tokens, temperature, lora_name) -> Generator:
             from ..client import RESTfulClient

             client = RESTfulClient(self.endpoint)
@@ -387,6 +391,7 @@ class GradioInterface:
                     "max_tokens": max_tokens,
                     "temperature": temperature,
                     "stream": True,
+                    "lora_name": lora_name,
                 },
             ):
                 assert isinstance(chunk, dict)
@@ -470,10 +475,11 @@ class GradioInterface:
             temperature = gr.Slider(
                 minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
             )
+            lora_name = gr.Text(label="LoRA Name")

             btn_generate.click(
                 fn=complete,
-                inputs=[textbox, history, length, temperature],
+                inputs=[textbox, history, length, temperature, lora_name],
                 outputs=[textbox, history],
             )

@@ -485,7 +491,7 @@ class GradioInterface:

             btn_retry.click(
                 fn=retry,
-                inputs=[textbox, history, length, temperature],
+                inputs=[textbox, history, length, temperature, lora_name],
                 outputs=[textbox, history],
             )

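The same lora_name key the web UI now forwards can be set directly in a generate config from the Python client. A hedged sketch, assuming a chat model launched with a LoRA adapter registered under a hypothetical name:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
    model = client.get_model("my-chat-model")        # example chat model UID

    response = model.chat(
        prompt="Summarize LoRA in one sentence.",
        generate_config={
            "max_tokens": 256,
            "temperature": 0.7,
            "lora_name": "my-lora",  # hypothetical adapter name
        },
    )
    print(response["choices"][0]["message"]["content"])
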
xinference/core/event.py CHANGED
@@ -37,7 +37,7 @@ class Event(TypedDict):
 class EventCollectorActor(xo.StatelessActor):
     def __init__(self):
         super().__init__()
-        self._model_uid_to_events: Dict[str, queue.Queue] = defaultdict(
+        self._model_uid_to_events: Dict[str, queue.Queue] = defaultdict(  # type: ignore
             lambda: queue.Queue(maxsize=MAX_EVENT_COUNT_PER_MODEL)
         )

xinference/core/model.py CHANGED
@@ -25,6 +25,7 @@ from typing import (
     AsyncGenerator,
     Callable,
     Dict,
+    Generator,
     Iterator,
     List,
     Optional,
@@ -153,7 +154,6 @@ class ModelActor(xo.StatelessActor):
     ):
         super().__init__()
         from ..model.llm.pytorch.core import PytorchModel
-        from ..model.llm.pytorch.spec_model import SpeculativeModel
         from ..model.llm.vllm.core import VLLMModel

         self._worker_address = worker_address
@@ -167,7 +167,7 @@ class ModelActor(xo.StatelessActor):
         self._current_generator = lambda: None
         self._lock = (
             None
-            if isinstance(self._model, (PytorchModel, SpeculativeModel, VLLMModel))
+            if isinstance(self._model, (PytorchModel, VLLMModel))
             else asyncio.locks.Lock()
         )
         self._worker_ref = None
@@ -257,7 +257,7 @@ class ModelActor(xo.StatelessActor):
             for v in gen:
                 if time_to_first_token is None:
                     time_to_first_token = (time.time() - start_time) * 1000
-                final_usage = v.pop("usage", None)
+                final_usage = v.get("usage", None)
                 v = dict(data=json.dumps(v))
                 yield sse_starlette.sse.ensure_bytes(v, None)
         except OutOfMemoryError:
@@ -289,7 +289,7 @@ class ModelActor(xo.StatelessActor):
             async for v in gen:
                 if time_to_first_token is None:
                     time_to_first_token = (time.time() - start_time) * 1000
-                final_usage = v.pop("usage", None)
+                final_usage = v.get("usage", None)
                 v = await asyncio.to_thread(json.dumps, v)
                 v = dict(data=v)  # noqa: F821
                 yield await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)
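
Switching pop to get means each streamed chunk keeps its usage field instead of having it stripped before serialization, so streaming consumers can read running token counts. A hedged sketch of reading it from a streaming chat, assuming an example model UID and that the backend emits usage on its chunks:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
    model = client.get_model("my-chat-model")        # example chat model UID

    for chunk in model.chat(
        prompt="Hello!",
        generate_config={"stream": True, "max_tokens": 64},
    ):
        usage = chunk.get("usage")  # retained on chunks that carry it
        if usage:
            print(usage["prompt_tokens"], usage["completion_tokens"])
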
@@ -379,8 +379,13 @@ class ModelActor(xo.StatelessActor):
             raise AttributeError(f"Model {self._model.model_spec} is not for chat.")
         finally:
             # For the non stream result.
-            if response is not None and isinstance(response, dict):
-                usage = response["usage"]
+            record = None
+            if isinstance(response, Generator) or isinstance(response, AsyncGenerator):
+                record = response
+            elif isinstance(response, bytes):
+                record = json.loads(response)
+            if record and isinstance(record, dict):
+                usage = record["usage"]
                 # Some backends may not have a valid usage, we just skip them.
                 completion_tokens = usage["completion_tokens"]
                 prompt_tokens = usage["prompt_tokens"]
@@ -436,6 +441,7 @@ class ModelActor(xo.StatelessActor):
         prompt: Optional[str] = None,
         response_format: str = "json",
         temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         if hasattr(self._model, "transcriptions"):
             return await self._call_wrapper(
@@ -445,6 +451,7 @@ class ModelActor(xo.StatelessActor):
                 prompt,
                 response_format,
                 temperature,
+                timestamp_granularities,
             )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating transcriptions."
@@ -455,17 +462,21 @@ class ModelActor(xo.StatelessActor):
     async def translations(
         self,
         audio: bytes,
+        language: Optional[str] = None,
         prompt: Optional[str] = None,
         response_format: str = "json",
         temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         if hasattr(self._model, "translations"):
             return await self._call_wrapper(
                 self._model.translations,
                 audio,
+                language,
                 prompt,
                 response_format,
                 temperature,
+                timestamp_granularities,
             )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating translations."