xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +79 -2
- xinference/client/restful/restful_client.py +65 -3
- xinference/conftest.py +0 -7
- xinference/core/media_interface.py +132 -8
- xinference/core/model.py +44 -6
- xinference/core/scheduler.py +1 -10
- xinference/core/supervisor.py +8 -17
- xinference/core/worker.py +5 -27
- xinference/deploy/cmdline.py +6 -2
- xinference/model/audio/chattts.py +24 -39
- xinference/model/audio/cosyvoice.py +18 -30
- xinference/model/audio/funasr.py +42 -0
- xinference/model/audio/model_spec.json +71 -1
- xinference/model/audio/model_spec_modelscope.json +76 -2
- xinference/model/audio/utils.py +75 -0
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +74 -18
- xinference/model/embedding/core.py +98 -589
- xinference/model/embedding/embed_family.py +133 -0
- xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
- xinference/model/embedding/flag/core.py +282 -0
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/embedding/sentence_transformers/__init__.py +13 -0
- xinference/model/embedding/sentence_transformers/core.py +399 -0
- xinference/model/embedding/vllm/core.py +95 -0
- xinference/model/image/model_spec.json +30 -3
- xinference/model/image/model_spec_modelscope.json +41 -2
- xinference/model/image/stable_diffusion/core.py +144 -53
- xinference/model/llm/__init__.py +6 -54
- xinference/model/llm/core.py +19 -5
- xinference/model/llm/llama_cpp/core.py +59 -3
- xinference/model/llm/llama_cpp/memory.py +457 -0
- xinference/model/llm/llm_family.json +247 -402
- xinference/model/llm/llm_family.py +88 -16
- xinference/model/llm/llm_family_modelscope.json +260 -421
- xinference/model/llm/llm_family_openmind_hub.json +0 -34
- xinference/model/llm/sglang/core.py +8 -0
- xinference/model/llm/transformers/__init__.py +27 -6
- xinference/model/llm/transformers/chatglm.py +4 -2
- xinference/model/llm/transformers/core.py +49 -28
- xinference/model/llm/transformers/deepseek_v2.py +6 -49
- xinference/model/llm/transformers/gemma3.py +119 -164
- xinference/model/llm/transformers/multimodal/__init__.py +13 -0
- xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
- xinference/model/llm/transformers/multimodal/core.py +205 -0
- xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
- xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
- xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
- xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
- xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
- xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
- xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
- xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
- xinference/model/llm/transformers/opt.py +4 -2
- xinference/model/llm/transformers/utils.py +6 -37
- xinference/model/llm/utils.py +11 -0
- xinference/model/llm/vllm/core.py +7 -0
- xinference/model/rerank/core.py +91 -3
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +24 -0
- xinference/model/rerank/utils.py +20 -2
- xinference/model/utils.py +38 -1
- xinference/model/video/diffusers.py +65 -3
- xinference/model/video/model_spec.json +31 -4
- xinference/model/video/model_spec_modelscope.json +32 -4
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.013f296b.css +2 -0
- xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
- xinference/web/ui/src/locales/en.json +21 -8
- xinference/web/ui/src/locales/ja.json +224 -0
- xinference/web/ui/src/locales/ko.json +224 -0
- xinference/web/ui/src/locales/zh.json +21 -8
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
- xinference/model/llm/transformers/cogvlm2.py +0 -442
- xinference/model/llm/transformers/cogvlm2_video.py +0 -333
- xinference/model/llm/transformers/deepseek_vl.py +0 -280
- xinference/model/llm/transformers/glm_edge_v.py +0 -213
- xinference/model/llm/transformers/intern_vl.py +0 -526
- xinference/model/llm/transformers/internlm2.py +0 -94
- xinference/model/llm/transformers/minicpmv25.py +0 -193
- xinference/model/llm/transformers/omnilmm.py +0 -132
- xinference/model/llm/transformers/qwen2_audio.py +0 -179
- xinference/model/llm/transformers/qwen_vl.py +0 -360
- xinference/thirdparty/omnilmm/LICENSE +0 -201
- xinference/thirdparty/omnilmm/chat.py +0 -218
- xinference/thirdparty/omnilmm/constants.py +0 -4
- xinference/thirdparty/omnilmm/conversation.py +0 -332
- xinference/thirdparty/omnilmm/model/__init__.py +0 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
- xinference/thirdparty/omnilmm/model/resampler.py +0 -166
- xinference/thirdparty/omnilmm/model/utils.py +0 -578
- xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
- xinference/thirdparty/omnilmm/utils.py +0 -134
- xinference/web/ui/build/static/css/main.337afe76.css +0 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
- xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
- xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
- /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
- /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
-    "date": "2025-
+    "date": "2025-06-13T18:51:07+0800",
     "dirty": false,
     "error": null,
-    "full-revisionid": "
-    "version": "1.
+    "full-revisionid": "a362dba7334ef08c758bbc4a3d4904fe53cefe78",
+    "version": "1.7.0"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py
CHANGED
@@ -387,6 +387,7 @@ class RESTfulAPI(CancelMixin):
         self._router.add_api_route(
             "/v1/cluster/auth", self.is_cluster_authenticated, methods=["GET"]
         )
+        # just for compatibility, LLM only
         self._router.add_api_route(
             "/v1/engines/{model_name}",
             self.query_engines_by_model_name,
@@ -397,6 +398,17 @@ class RESTfulAPI(CancelMixin):
                 else None
             ),
         )
+        # engines for all model types
+        self._router.add_api_route(
+            "/v1/engines/{model_type}/{model_name}",
+            self.query_engines_by_model_name,
+            methods=["GET"],
+            dependencies=(
+                [Security(self._auth_service, scopes=["models:list"])]
+                if self.is_authenticated()
+                else None
+            ),
+        )
         # running instances
         self._router.add_api_route(
             "/v1/models/instances",
@@ -708,6 +720,17 @@ class RESTfulAPI(CancelMixin):
                 else None
             ),
         )
+        self._router.add_api_route(
+            "/v1/video/generations/flf",
+            self.create_videos_from_first_last_frame,
+            methods=["POST"],
+            response_model=VideoList,
+            dependencies=(
+                [Security(self._auth_service, scopes=["models:read"])]
+                if self.is_authenticated()
+                else None
+            ),
+        )
         self._router.add_api_route(
             "/v1/chat/completions",
             self.create_chat_completion,
@@ -2084,6 +2107,57 @@ class RESTfulAPI(CancelMixin):
             self.handle_request_limit_error(e)
             raise HTTPException(status_code=500, detail=str(e))
 
+    async def create_videos_from_first_last_frame(
+        self,
+        model: str = Form(...),
+        first_frame: UploadFile = File(media_type="application/octet-stream"),
+        last_frame: UploadFile = File(media_type="application/octet-stream"),
+        prompt: Optional[Union[str, List[str]]] = Form(None),
+        negative_prompt: Optional[Union[str, List[str]]] = Form(None),
+        n: Optional[int] = Form(1),
+        kwargs: Optional[str] = Form(None),
+    ) -> Response:
+        model_uid = model
+        try:
+            model_ref = await (await self._get_supervisor_ref()).get_model(model_uid)
+        except ValueError as ve:
+            logger.error(str(ve), exc_info=True)
+            await self._report_error_event(model_uid, str(ve))
+            raise HTTPException(status_code=400, detail=str(ve))
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            await self._report_error_event(model_uid, str(e))
+            raise HTTPException(status_code=500, detail=str(e))
+
+        request_id = None
+        try:
+            if kwargs is not None:
+                parsed_kwargs = json.loads(kwargs)
+            else:
+                parsed_kwargs = {}
+            request_id = parsed_kwargs.get("request_id")
+            self._add_running_task(request_id)
+            video_list = await model_ref.flf_to_video(
+                first_frame=Image.open(first_frame.file),
+                last_frame=Image.open(last_frame.file),
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                n=n,
+                **parsed_kwargs,
+            )
+            return Response(content=video_list, media_type="application/json")
+        except asyncio.CancelledError:
+            err_str = f"The request has been cancelled: {request_id}"
+            logger.error(err_str)
+            await self._report_error_event(model_uid, err_str)
+            raise HTTPException(status_code=409, detail=err_str)
+        except Exception as e:
+            e = await self._get_model_last_error(model_ref.uid, e)
+            logger.error(e, exc_info=True)
+            await self._report_error_event(model_uid, str(e))
+            self.handle_request_limit_error(e)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def create_chat_completion(self, request: Request) -> Response:
         raw_body = await request.json()
         body = CreateChatCompletion.parse_obj(raw_body)
@@ -2234,11 +2308,14 @@ class RESTfulAPI(CancelMixin):
             self.handle_request_limit_error(e)
             raise HTTPException(status_code=500, detail=str(e))
 
-    async def query_engines_by_model_name(
+    async def query_engines_by_model_name(
+        self, request: Request, model_name: str, model_type: Optional[str] = None
+    ) -> JSONResponse:
         try:
+            model_type = model_type or request.path_params.get("model_type", "LLM")
             content = await (
                 await self._get_supervisor_ref()
-            ).query_engines_by_model_name(model_name)
+            ).query_engines_by_model_name(model_name, model_type=model_type)
             return JSONResponse(content=content)
         except ValueError as re:
             logger.error(re, exc_info=True)
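
Together these hunks expose a typed engines query route and a first/last-frame video generation endpoint. A minimal sketch of calling them over HTTP, assuming a locally running server on the default endpoint and placeholder model names ("bge-m3", "my-video-model"):

    import json
    import requests

    base = "http://127.0.0.1:9997"

    # Engine parameters can now be queried per model type, not just for LLMs.
    print(requests.get(f"{base}/v1/engines/embedding/bge-m3").json())

    # First/last-frame video generation: multipart form mirroring
    # create_videos_from_first_last_frame above.
    with open("first.png", "rb") as f1, open("last.png", "rb") as f2:
        resp = requests.post(
            f"{base}/v1/video/generations/flf",
            data={
                "model": "my-video-model",
                "prompt": "a sunrise time-lapse",
                "n": 1,
                "kwargs": json.dumps({}),
            },
            files={
                "first_frame": ("image", f1, "application/octet-stream"),
                "last_frame": ("image", f2, "application/octet-stream"),
            },
        )
    print(resp.json())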
xinference/client/restful/restful_client.py
CHANGED
@@ -510,6 +510,59 @@ class RESTfulVideoModelHandle(RESTfulModelHandle):
         response_data = response.json()
         return response_data
 
+    def flf_to_video(
+        self,
+        first_frame: Union[str, bytes],
+        last_frame: Union[str, bytes],
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        n: int = 1,
+        **kwargs,
+    ) -> "VideoList":
+        """
+        Creates a video by the first frame, last frame and text.
+
+        Parameters
+        ----------
+        first_frame: `Union[str, bytes]`
+            The first frame to condition the generation on.
+        last_frame: `Union[str, bytes]`
+            The last frame to condition the generation on.
+        prompt: `str` or `List[str]`
+            The prompt or prompts to guide video generation. If not defined, you need to pass `prompt_embeds`.
+        negative_prompt (`str` or `List[str]`, *optional*):
+            The prompt or prompts not to guide the image generation.
+        n: `int`, defaults to 1
+            The number of videos to generate per prompt. Must be between 1 and 10.
+        Returns
+        -------
+        VideoList
+            A list of video objects.
+        """
+        url = f"{self._base_url}/v1/video/generations/flf"
+        params = {
+            "model": self._model_uid,
+            "prompt": prompt,
+            "negative_prompt": negative_prompt,
+            "n": n,
+            "kwargs": json.dumps(kwargs),
+        }
+        files: List[Any] = []
+        for key, value in params.items():
+            files.append((key, (None, value)))
+        files.append(
+            ("first_frame", ("image", first_frame, "application/octet-stream"))
+        )
+        files.append(("last_frame", ("image", last_frame, "application/octet-stream")))
+        response = requests.post(url, files=files, headers=self.auth_headers)
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to create the video from image, detail: {_get_error_string(response)}"
+            )
+
+        response_data = response.json()
+        return response_data
+
 
 class RESTfulGenerateModelHandle(RESTfulModelHandle):
     def generate(
@@ -637,6 +690,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         response_format: Optional[str] = "json",
         temperature: Optional[float] = 0,
         timestamp_granularities: Optional[List[str]] = None,
+        **kwargs,
     ):
         """
         Transcribes audio into the input language.
@@ -678,6 +732,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             "response_format": response_format,
             "temperature": temperature,
             "timestamp_granularities[]": timestamp_granularities,
+            "kwargs": json.dumps(kwargs),
         }
         files: List[Any] = []
         files.append(("file", ("file", audio, "application/octet-stream")))
@@ -1017,7 +1072,7 @@ class Client:
         model_path: Optional[str]
             Model path, if gguf format, should be the file path, otherwise, should be directory of the model.
         **kwargs:
-            Any other parameters been specified.
+            Any other parameters been specified. e.g. multimodal_projector for multimodal inference with the llama.cpp backend.
 
         Returns
         -------
@@ -1502,7 +1557,9 @@ class Client:
         response_data = response.json()
         return response_data
 
-    def query_engine_by_model_name(
+    def query_engine_by_model_name(
+        self, model_name: str, model_type: Optional[str] = "LLM"
+    ):
         """
         Get the engine parameters with the model name registered on the server.
 
@@ -1510,12 +1567,17 @@ class Client:
        ----------
        model_name: str
            The name of the model.
+        model_type: str
+            Model type, LLM by default.
        Returns
        -------
        Dict[str, List[Dict[str, Any]]]
            The supported engine parameters of registered models on the server.
        """
-
+        if not model_type:
+            url = f"{self.base_url}/v1/engines/{model_name}"
+        else:
+            url = f"{self.base_url}/v1/engines/{model_type}/{model_name}"
         response = requests.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
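
A usage sketch of the new client surface, assuming a running server and a launched model that reports the firstlastframe2video ability; the model names below are placeholders:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")

    # query_engine_by_model_name now accepts a model_type; "LLM" remains the default.
    embedding_engines = client.query_engine_by_model_name("bge-m3", model_type="embedding")

    # flf_to_video on a video model handle posts to /v1/video/generations/flf.
    model = client.get_model("my-video-model")
    with open("first.png", "rb") as f1, open("last.png", "rb") as f2:
        video_list = model.flf_to_video(
            first_frame=f1.read(),
            last_frame=f2.read(),
            prompt="a sunrise time-lapse",
            n=1,
        )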
xinference/conftest.py
CHANGED

xinference/core/media_interface.py
CHANGED
@@ -19,7 +19,7 @@ import os
 import threading
 import time
 import uuid
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import gradio as gr
 import PIL.Image
@@ -463,7 +463,7 @@ class MediaInterface:
 
     def image2video_interface(self) -> "gr.Blocks":
         def image_generate_video(
-            image: "PIL.Image",
+            image: "PIL.Image.Image",
             prompt: str,
             negative_prompt: str,
             num_frames: int,
@@ -577,6 +577,126 @@ class MediaInterface:
 
         return image2video_ui
 
+    def flf2video_interface(self) -> "gr.Blocks":
+        def generate_video_from_flf(
+            first_frame: "PIL.Image.Image",
+            last_frame: "PIL.Image.Image",
+            prompt: str,
+            negative_prompt: str,
+            num_frames: int,
+            fps: int,
+            num_inference_steps: int,
+            guidance_scale: float,
+            width: int,
+            height: int,
+            progress=gr.Progress(),
+        ) -> List[Tuple[str, str]]:
+            from ..client import RESTfulClient
+
+            client = RESTfulClient(self.endpoint)
+            client._set_token(self.access_token)
+            model = client.get_model(self.model_uid)
+            assert hasattr(model, "flf_to_video")
+
+            request_id = str(uuid.uuid4())
+            response = None
+            exc = None
+
+            buffer_first = io.BytesIO()
+            buffer_last = io.BytesIO()
+            first_frame.save(buffer_first, format="PNG")
+            last_frame.save(buffer_last, format="PNG")
+
+            def run_in_thread():
+                nonlocal exc, response
+                try:
+                    response = model.flf_to_video(
+                        first_frame=buffer_first.getvalue(),
+                        last_frame=buffer_last.getvalue(),
+                        prompt=prompt,
+                        negative_prompt=negative_prompt,
+                        n=1,
+                        num_frames=num_frames,
+                        fps=fps,
+                        num_inference_steps=num_inference_steps,
+                        guidance_scale=guidance_scale,
+                        width=width,
+                        height=height,
+                        response_format="b64_json",
+                        request_id=request_id,
+                    )
+                except Exception as e:
+                    exc = e
+
+            t = threading.Thread(target=run_in_thread)
+            t.start()
+
+            while t.is_alive():
+                try:
+                    cur_progress = client.get_progress(request_id)["progress"]
+                except Exception:
+                    cur_progress = 0.0
+                progress(cur_progress, desc="Generating video from first/last frames")
+                time.sleep(1)
+
+            if exc:
+                raise exc
+
+            videos = []
+            for video_dict in response["data"]:  # type: ignore
+                video_data = base64.b64decode(video_dict["b64_json"])
+                video_path = f"/tmp/{uuid.uuid4()}.mp4"
+                with open(video_path, "wb") as f:
+                    f.write(video_data)
+                videos.append((video_path, "Generated Video"))
+
+            return videos
+
+        # Gradio UI
+        with gr.Blocks() as flf2video_ui:
+            with gr.Row():
+                first_frame = gr.Image(label="First Frame", type="pil")
+                last_frame = gr.Image(label="Last Frame", type="pil")
+
+            prompt = gr.Textbox(label="Prompt", placeholder="Enter video prompt")
+            negative_prompt = gr.Textbox(
+                label="Negative Prompt", placeholder="Enter negative prompt"
+            )
+
+            with gr.Row():
+                with gr.Column():
+                    width = gr.Number(label="Width", value=512)
+                    num_frames = gr.Number(label="Frames", value=16)
+                    steps = gr.Number(label="Inference Steps", value=25)
+                with gr.Column():
+                    height = gr.Number(label="Height", value=512)
+                    fps = gr.Number(label="FPS", value=8)
+                    guidance_scale = gr.Slider(
+                        label="Guidance Scale", minimum=1, maximum=20, value=7.5
+                    )
+
+            generate = gr.Button("Generate")
+            gallery = gr.Gallery(label="Generated Videos", columns=2)
+
+            generate.click(
+                fn=generate_video_from_flf,
+                inputs=[
+                    first_frame,
+                    last_frame,
+                    prompt,
+                    negative_prompt,
+                    num_frames,
+                    fps,
+                    steps,
+                    guidance_scale,
+                    width,
+                    height,
+                ],
+                outputs=gallery,
+            )
+
+        return flf2video_ui
+
     def audio2text_interface(self) -> "gr.Blocks":
         def transcribe_audio(
             audio_path: str,
@@ -653,13 +773,14 @@ class MediaInterface:
             with open(prompt_speech_file, "rb") as f:
                 prompt_speech_bytes = f.read()
 
+            kw: Dict[str, Any] = {}
+            if prompt_speech_bytes:
+                kw["prompt_speech"] = prompt_speech_bytes
+            if prompt_text:
+                kw["prompt_text"] = prompt_text
+
             response = model.speech(
-                input=input_text,
-                voice=voice,
-                speed=speed,
-                response_format="mp3",
-                prompt_speech=prompt_speech_bytes,
-                prompt_text=prompt_text,
+                input=input_text, voice=voice, speed=speed, response_format="mp3", **kw
             )
 
             # Write to a temp .mp3 file and return its path
@@ -749,6 +870,9 @@ class MediaInterface:
             if "image2video" in self.model_ability:
                 with gr.Tab("Image to Video"):
                     self.image2video_interface()
+            if "firstlastframe2video" in self.model_ability:
+                with gr.Tab("FirstLastFrame to Video"):
+                    self.flf2video_interface()
             if "audio2text" in self.model_ability:
                 with gr.Tab("Audio to Text"):
                     self.audio2text_interface()
xinference/core/model.py
CHANGED
@@ -71,12 +71,8 @@ except ImportError:
     OutOfMemoryError = _OutOfMemoryError
 
 
-
-
-    "cogvlm2",
-    "glm-4v",
-    "MiniCPM-V-2.6",
-]
+# !!!!! DO NOT add model_name to this list, using `register_batching_multimodal_models` below instead.
+XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = []
 
 XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS = ["FLUX.1-dev", "FLUX.1-schnell"]
 XINFERENCE_TEST_OUT_OF_MEMORY_ERROR = bool(
@@ -84,6 +80,16 @@ XINFERENCE_TEST_OUT_OF_MEMORY_ERROR = bool(
 )
 
 
+def register_batching_multimodal_models(*model_names: str):
+    def decorator(cls):
+        for name in model_names:
+            if name not in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS:
+                XINFERENCE_BATCHING_ALLOWED_VISION_MODELS.append(name)
+        return cls
+
+    return decorator
+
+
 def request_limit(fn):
     """
     Used by ModelActor.
@@ -977,6 +983,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
                 response_format,
                 temperature,
                 timestamp_granularities,
+                **kwargs,
             )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating transcriptions."
@@ -1282,6 +1289,37 @@ class ModelActor(xo.StatelessActor, CancelMixin):
             f"Model {self._model.model_spec} is not for creating video from image."
         )
 
+    @request_limit
+    @log_async(logger=logger)
+    async def flf_to_video(
+        self,
+        first_frame: "PIL.Image.Image",
+        last_frame: "PIL.Image.Image",
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        n: int = 1,
+        *args,
+        **kwargs,
+    ):
+        kwargs["negative_prompt"] = negative_prompt
+        progressor = kwargs["progressor"] = await self._get_progressor(
+            kwargs.pop("request_id", None)
+        )
+        with progressor:
+            if hasattr(self._model, "firstlastframe_to_video"):
+                return await self._call_wrapper_json(
+                    self._model.firstlastframe_to_video,
+                    first_frame,
+                    last_frame,
+                    prompt,
+                    n,
+                    *args,
+                    **kwargs,
+                )
+        raise AttributeError(
+            f"Model {self._model.model_spec} is not for creating video from first-last-frame."
+        )
+
     async def record_metrics(self, name, op, kwargs):
         worker_ref = await self._get_worker_ref()
         await worker_ref.record_metrics(name, op, kwargs)
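
The hard-coded vision-model whitelist is replaced by a registration decorator, so multimodal model classes now opt in to continuous batching themselves. A hypothetical registration (the class and model name below are illustrative, not taken from this diff):

    from xinference.core.model import (
        XINFERENCE_BATCHING_ALLOWED_VISION_MODELS,
        register_batching_multimodal_models,
    )


    @register_batching_multimodal_models("my-vision-chat-model")
    class MyVisionChatModel:  # hypothetical transformers multimodal model class
        pass


    assert "my-vision-chat-model" in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS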
xinference/core/scheduler.py
CHANGED
@@ -272,15 +272,6 @@ class InferenceRequest:
         )
 
 
-def _get_valid_batch_kv_cache(cache, skipped_indexes: Set[int]):
-    batch_size = cache.key_cache[0].shape[0]
-    batch_slices = [num for num in range(batch_size) if num not in skipped_indexes]
-    for idx in range(len(cache)):
-        cache.key_cache[idx] = cache.key_cache[idx][batch_slices, ::].contiguous()
-        cache.value_cache[idx] = cache.value_cache[idx][batch_slices, ::].contiguous()
-    return cache
-
-
 class SchedulerActor(xo.StatelessActor):
     @classmethod
     def gen_uid(cls, model_uid: str, replica_id: str):
@@ -409,7 +400,7 @@ class SchedulerActor(xo.StatelessActor):
         # Some requests have been completed. Batch size needs to be reduced for kv cache.
         if stopped_batch_indexes and len(self._running_queue) > 0:
             kv_cache = self._running_queue[0].kv_cache
-            reduced_kv_cache =
+            reduced_kv_cache = self._model.build_reduced_kv_cache(
                 kv_cache, stopped_batch_indexes
             )
             for r in self._running_queue:
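
The module-level _get_valid_batch_kv_cache helper is removed; the scheduler now asks the model for a reduced cache via build_reduced_kv_cache. A sketch of what such a method could look like, adapted from the removed helper (the method name comes from this diff, the class and body are assumptions):

    from typing import Set


    class ReducedKVCacheMixin:  # hypothetical mixin on the transformers model side
        def build_reduced_kv_cache(self, cache, skipped_indexes: Set[int]):
            # Drop the batch rows whose requests have finished, keeping the rest.
            batch_size = cache.key_cache[0].shape[0]
            batch_slices = [num for num in range(batch_size) if num not in skipped_indexes]
            for idx in range(len(cache)):
                cache.key_cache[idx] = cache.key_cache[idx][batch_slices, ::].contiguous()
                cache.value_cache[idx] = cache.value_cache[idx][batch_slices, ::].contiguous()
            return cache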
xinference/core/supervisor.py
CHANGED
@@ -45,6 +45,7 @@ from ..constants import (
 )
 from ..core.model import ModelActor
 from ..core.status_guard import InstanceInfo, LaunchStatus
+from ..model.utils import get_engine_params_by_name
 from ..types import PeftModelConfig
 from .metrics import record_metrics
 from .resource import GPUStatus, ResourceStatus
@@ -780,29 +781,19 @@ class SupervisorActor(xo.StatelessActor):
             raise ValueError(f"Unsupported model type: {model_type}")
 
     @log_async(logger=logger)
-    async def query_engines_by_model_name(
-
-
-        from ..model.llm.llm_family import LLM_ENGINES
-
+    async def query_engines_by_model_name(
+        self, model_name: str, model_type: Optional[str] = None
+    ):
         # search in worker first
         workers = list(self._worker_address_to_worker.values())
         for worker in workers:
-            res = await worker.query_engines_by_model_name(
+            res = await worker.query_engines_by_model_name(
+                model_name, model_type=model_type
+            )
             if res is not None:
                 return res
 
-
-            raise ValueError(f"Model {model_name} not found")
-
-        # filter llm_class
-        engine_params = deepcopy(LLM_ENGINES[model_name])
-        for engine in engine_params:
-            params = engine_params[engine]
-            for param in params:
-                del param["llm_class"]
-
-        return engine_params
+        return get_engine_params_by_name(model_type, model_name)
 
     @log_async(logger=logger)
     async def register_model(
xinference/core/worker.py
CHANGED
@@ -53,7 +53,7 @@ from ..core.model import ModelActor
 from ..core.status_guard import LaunchStatus
 from ..device_utils import get_available_device_env_name, gpu_count
 from ..model.core import ModelDescription, VirtualEnvSettings, create_model_instance
-from ..model.utils import CancellableDownloader
+from ..model.utils import CancellableDownloader, get_engine_params_by_name
 from ..types import PeftModelConfig
 from ..utils import get_pip_config_args, get_real_path
 from .cache_tracker import CacheTrackerActor
@@ -533,16 +533,6 @@ class WorkerActor(xo.StatelessActor):
             existing_model_uids.append(rep_uid)
         if idx in self._gpu_to_embedding_model_uids:
             existing_model_uids.extend(self._gpu_to_embedding_model_uids[idx])
-        # If user has run the vLLM model on the GPU that was forced to be specified,
-        # it is not possible to force this GPU to be allocated again
-        if idx in self._user_specified_gpu_to_model_uids:
-            for rep_uid, _ in self._user_specified_gpu_to_model_uids[idx]:
-                is_vllm_model = await self.is_model_vllm_backend(rep_uid)
-                if is_vllm_model:
-                    raise RuntimeError(
-                        f"User specified GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
-                        f"therefore cannot allocate GPU memory for a new model."
-                    )
 
         if existing_model_uids:
             logger.warning(
@@ -757,22 +747,10 @@ class WorkerActor(xo.StatelessActor):
         return None
 
     @log_async(logger=logger)
-    async def query_engines_by_model_name(
-
-
-
-
-        if model_name not in LLM_ENGINES:
-            return None
-
-        # filter llm_class
-        engine_params = deepcopy(LLM_ENGINES[model_name])
-        for engine in engine_params:
-            params = engine_params[engine]
-            for param in params:
-                del param["llm_class"]
-
-        return engine_params
+    async def query_engines_by_model_name(
+        self, model_name: str, model_type: Optional[str] = None
+    ):
+        return get_engine_params_by_name(model_type, model_name)
 
     async def _get_model_ability(self, model: Any, model_type: str) -> List[str]:
         from ..model.llm.core import LLM
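
Both the supervisor and the worker now delegate engine lookup to get_engine_params_by_name from xinference/model/utils.py (+38 -1 in this release). Its implementation is not shown in this diff; a rough sketch of the shape implied by the code it replaces (the per-type dispatch and the llm_class filtering are assumptions):

    from copy import deepcopy
    from typing import Any, Dict, List, Optional


    def get_engine_params_by_name(
        model_type: Optional[str], model_name: str
    ) -> Optional[Dict[str, List[Dict[str, Any]]]]:
        if model_type in (None, "LLM"):
            from xinference.model.llm.llm_family import LLM_ENGINES

            if model_name not in LLM_ENGINES:
                return None
            # Strip the non-serializable class references, as the old worker code did.
            engine_params = deepcopy(LLM_ENGINES[model_name])
            for params in engine_params.values():
                for param in params:
                    param.pop("llm_class", None)
            return engine_params
        # Embedding/rerank engines would be resolved from their own registries here.
        return None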
xinference/deploy/cmdline.py
CHANGED
@@ -1315,8 +1315,12 @@ def model_chat(
                 if "content" not in delta:
                     continue
                 else:
-
-
+                    # The first chunk of stream output may have no content (None). Related PRs:
+                    # https://github.com/ggml-org/llama.cpp/pull/13634
+                    # https://github.com/ggml-org/llama.cpp/pull/12379
+                    content = delta["content"] or ""
+                    response_content += content
+                    print(content, end="", flush=True, file=sys.stdout)
             print("", file=sys.stdout)
             messages.append(dict(role="assistant", content=response_content))
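
The same defensive pattern applies to any OpenAI-style streaming consumer: with the llama.cpp backend the first delta may carry a content key whose value is None. A standalone sketch of the loop (the chunk layout is assumed to follow the chat-completion chunk format):

    import sys


    def print_stream(chunks) -> str:
        response_content = ""
        for chunk in chunks:
            delta = chunk["choices"][0]["delta"]
            if "content" not in delta:
                continue
            # "content" may be present but None on the first chunk.
            content = delta["content"] or ""
            response_content += content
            print(content, end="", flush=True, file=sys.stdout)
        print("", file=sys.stdout)
        return response_content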