xinference 0.14.1__py3-none-any.whl → 0.14.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +15 -34
- xinference/client/restful/restful_client.py +2 -2
- xinference/core/chat_interface.py +44 -9
- xinference/core/model.py +4 -4
- xinference/core/scheduler.py +1 -2
- xinference/core/worker.py +1 -1
- xinference/deploy/cmdline.py +2 -2
- xinference/deploy/test/test_cmdline.py +7 -7
- xinference/model/llm/__init__.py +20 -27
- xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
- xinference/model/llm/llm_family.json +448 -1153
- xinference/model/llm/llm_family.py +14 -139
- xinference/model/llm/llm_family_modelscope.json +230 -313
- xinference/model/llm/memory.py +9 -9
- xinference/model/llm/sglang/core.py +2 -2
- xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
- xinference/model/llm/{pytorch → transformers}/core.py +2 -10
- xinference/model/llm/transformers/intern_vl.py +457 -0
- xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
- xinference/model/llm/{pytorch → transformers}/minicpmv26.py +67 -22
- xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
- xinference/model/llm/utils.py +76 -70
- xinference/model/llm/vllm/core.py +110 -11
- xinference/model/utils.py +1 -95
- xinference/thirdparty/internvl/__init__.py +0 -0
- xinference/thirdparty/internvl/conversation.py +393 -0
- xinference/thirdparty/omnilmm/model/utils.py +16 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.ffc26121.js +3 -0
- xinference/web/ui/build/static/js/main.ffc26121.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/METADATA +12 -15
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/RECORD +63 -70
- xinference/locale/utils.py +0 -39
- xinference/locale/zh_CN.json +0 -26
- xinference/model/llm/ggml/tools/__init__.py +0 -15
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
- xinference/model/llm/ggml/tools/gguf.py +0 -884
- xinference/model/llm/pytorch/__init__.py +0 -13
- xinference/model/llm/pytorch/baichuan.py +0 -81
- xinference/model/llm/pytorch/falcon.py +0 -138
- xinference/model/llm/pytorch/intern_vl.py +0 -352
- xinference/model/llm/pytorch/vicuna.py +0 -69
- xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
- xinference/web/ui/build/static/js/main.17ca0398.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
- /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
- /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/cogvlm2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/glm4v.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/minicpmv25.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/yi_vl.py +0 -0
- /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.ffc26121.js.LICENSE.txt} +0 -0
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/LICENSE +0 -0
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/WHEEL +0 -0
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
-    "date": "2024-08-
+    "date": "2024-08-16T18:10:38+0800",
     "dirty": false,
     "error": null,
-    "full-revisionid": "
-    "version": "0.14.1"
+    "full-revisionid": "e4d225774dc7a9a9260396bf833e03a1df8e8a92",
+    "version": "0.14.2"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py
CHANGED
@@ -1682,18 +1682,9 @@ class RESTfulAPI:
 
         model_family = desc.get("model_family", "")
         function_call_models = (
-            ["gorilla-openfunctions-v1"]
-            + QWEN_TOOL_CALL_FAMILY
-            + GLM4_TOOL_CALL_FAMILY
+            ["gorilla-openfunctions-v1"] + QWEN_TOOL_CALL_FAMILY + GLM4_TOOL_CALL_FAMILY
         )
 
-        is_qwen = desc.get("model_format") == "ggmlv3" and "qwen-chat" == model_family
-
-        if is_qwen and system_prompt is not None:
-            raise HTTPException(
-                status_code=400, detail="Qwen ggml does not have system prompt"
-            )
-
         if model_family not in function_call_models:
             if body.tools:
                 raise HTTPException(

@@ -1724,18 +1715,13 @@ class RESTfulAPI:
         iterator = None
         try:
             try:
-
-
-
-
-
-
-
-                    system_prompt,
-                    chat_history,
-                    kwargs,
-                    raw_params=raw_kwargs,
-                )
+                iterator = await model.chat(
+                    prompt,
+                    system_prompt,
+                    chat_history,
+                    kwargs,
+                    raw_params=raw_kwargs,
+                )
             except RuntimeError as re:
                 await self._report_error_event(model_uid, str(re))
                 self.handle_request_limit_error(re)

@@ -1763,18 +1749,13 @@ class RESTfulAPI:
             return EventSourceResponse(stream_results())
         else:
             try:
-
-
-
-
-
-
-
-                    system_prompt,
-                    chat_history,
-                    kwargs,
-                    raw_params=raw_kwargs,
-                )
+                data = await model.chat(
+                    prompt,
+                    system_prompt,
+                    chat_history,
+                    kwargs,
+                    raw_params=raw_kwargs,
+                )
                 return Response(content=data, media_type="application/json")
             except Exception as e:
                 logger.error(e, exc_info=True)
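With the Qwen ggml special case removed, every chat request now goes through the same model.chat(prompt, system_prompt, chat_history, kwargs, raw_params=raw_kwargs) call, and a system prompt is accepted uniformly. A minimal client-side sketch of exercising this path, assuming a local supervisor at http://localhost:9997 and a hypothetical model UID "my-chat-model":

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://localhost:9997")   # hypothetical endpoint
    model = client.get_model("my-chat-model")         # hypothetical model UID

    # system_prompt no longer triggers a 400 for any family; the old
    # "Qwen ggml does not have system prompt" guard is gone.
    completion = model.chat(
        prompt="What is the weather like today?",
        system_prompt="You are a helpful assistant.",
        chat_history=[],
        generate_config={"max_tokens": 128},
    )
    print(completion["choices"][0]["message"]["content"])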
xinference/client/restful/restful_client.py
CHANGED

@@ -426,7 +426,7 @@ class RESTfulGenerateModelHandle(RESTfulModelHandle):
             The user's message or user's input.
         generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]]
             Additional configuration for the chat generation.
-            "LlamaCppGenerateConfig" -> Configuration for
+            "LlamaCppGenerateConfig" -> Configuration for llama-cpp-python model
             "PytorchGenerateConfig" -> Configuration for pytorch model
 
         Returns

@@ -493,7 +493,7 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
             A tool list.
         generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]]
             Additional configuration for the chat generation.
-            "LlamaCppGenerateConfig" -> configuration for
+            "LlamaCppGenerateConfig" -> configuration for llama-cpp-python model
             "PytorchGenerateConfig" -> configuration for pytorch model
 
         Returns
xinference/core/chat_interface.py
CHANGED

@@ -236,8 +236,8 @@ class GradioInterface:
                 bot[-1][1] = history[-1]["content"]
                 yield history, bot
 
-        def add_text(history, bot, text, image):
-            logger.debug("Add text, text: %s, image: %s", text, image)
+        def add_text(history, bot, text, image, video):
+            logger.debug("Add text, text: %s, image: %s, video: %s", text, image, video)
             if image:
                 buffered = BytesIO()
                 with PIL.Image.open(image) as img:

@@ -257,16 +257,47 @@ class GradioInterface:
                         },
                     ],
                 }
+            elif video:
+
+                def video_to_base64(video_path):
+                    with open(video_path, "rb") as video_file:
+                        encoded_string = base64.b64encode(video_file.read()).decode(
+                            "utf-8"
+                        )
+                    return encoded_string
+
+                def generate_html_video(video_path):
+                    base64_video = video_to_base64(video_path)
+                    video_format = video_path.split(".")[-1]
+                    html_code = f"""
+                    <video controls>
+                        <source src="data:video/{video_format};base64,{base64_video}" type="video/{video_format}">
+                        Your browser does not support the video tag.
+                    </video>
+                    """
+                    return html_code
+
+                display_content = f"{generate_html_video(video)}\n{text}"
+                message = {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": text},
+                        {
+                            "type": "video_url",
+                            "video_url": {"url": video},
+                        },
+                    ],
+                }
             else:
                 display_content = text
                 message = {"role": "user", "content": text}
             history = history + [message]
             bot = bot + [[display_content, None]]
-            return history, bot, "", None
+            return history, bot, "", None, None
 
         def clear_history():
             logger.debug("Clear history.")
-            return [], None, "", None
+            return [], None, "", None, None
 
         def update_button(text):
             return gr.update(interactive=bool(text))

@@ -313,6 +344,7 @@ class GradioInterface:
                 )
                 with gr.Column(scale=3):
                     imagebox = gr.Image(type="filepath")
+                    videobox = gr.Video()
                     textbox = gr.Textbox(
                         show_label=False,
                         placeholder="Enter text and press ENTER",

@@ -340,8 +372,8 @@ class GradioInterface:
 
             textbox.submit(
                 add_text,
-                [state, chatbot, textbox, imagebox],
-                [state, chatbot, textbox, imagebox],
+                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox],
                 queue=False,
             ).then(
                 predict,

@@ -351,8 +383,8 @@ class GradioInterface:
 
             submit_btn.click(
                 add_text,
-                [state, chatbot, textbox, imagebox],
-                [state, chatbot, textbox, imagebox],
+                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox],
                 queue=False,
             ).then(
                 predict,

@@ -361,7 +393,10 @@ class GradioInterface:
             )
 
             clear_btn.click(
-                clear_history,
+                clear_history,
+                None,
+                [state, chatbot, textbox, imagebox, videobox],
+                queue=False,
             )
 
         return chat_vl_interface
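The Gradio chat interface now handles video the same way it already handled images: the file is inlined as a base64 <video> tag for display, and the message sent to the model carries a video_url content part. A standalone sketch of that pattern, mirroring the helpers added in the diff (the sample path and text are placeholders):

    import base64

    def video_to_base64(video_path: str) -> str:
        # Read the file and return its base64-encoded contents.
        with open(video_path, "rb") as video_file:
            return base64.b64encode(video_file.read()).decode("utf-8")

    def generate_html_video(video_path: str) -> str:
        # Inline the clip as a data URI so the chatbot widget can render it.
        video_format = video_path.split(".")[-1]
        return (
            f'<video controls>'
            f'<source src="data:video/{video_format};base64,{video_to_base64(video_path)}" '
            f'type="video/{video_format}">'
            f'</video>'
        )

    video, text = "/tmp/demo.mp4", "Describe this clip."  # placeholder inputs
    display_content = f"{generate_html_video(video)}\n{text}"
    message = {
        "role": "user",
        "content": [
            {"type": "text", "text": text},
            {"type": "video_url", "video_url": {"url": video}},
        ],
    }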
xinference/core/model.py
CHANGED
@@ -132,8 +132,8 @@ class ModelActor(xo.StatelessActor):
 
     async def __pre_destroy__(self):
         from ..model.embedding.core import EmbeddingModel
-        from ..model.llm.pytorch.core import PytorchModel as LLMPytorchModel
         from ..model.llm.sglang.core import SGLANGModel
+        from ..model.llm.transformers.core import PytorchModel as LLMPytorchModel
         from ..model.llm.vllm.core import VLLMModel as LLMVLLMModel
 
         if self.allow_batching():

@@ -177,8 +177,8 @@ class ModelActor(xo.StatelessActor):
         request_limits: Optional[int] = None,
     ):
         super().__init__()
-        from ..model.llm.pytorch.core import PytorchModel
         from ..model.llm.sglang.core import SGLANGModel
+        from ..model.llm.transformers.core import PytorchModel
         from ..model.llm.vllm.core import VLLMModel
 
         self._worker_address = worker_address

@@ -272,7 +272,7 @@ class ModelActor(xo.StatelessActor):
         return isinstance(self._model, VLLMModel)
 
     def allow_batching(self) -> bool:
-        from ..model.llm.pytorch.core import PytorchModel
+        from ..model.llm.transformers.core import PytorchModel
 
         model_ability = self._model_description.get("model_ability", [])
 

@@ -415,7 +415,7 @@ class ModelActor(xo.StatelessActor):
             ret = await asyncio.to_thread(fn, *args, **kwargs)
 
         if self._lock is not None and self._current_generator():
-            raise Exception("Parallel generation is not supported by
+            raise Exception("Parallel generation is not supported by llama-cpp-python.")
 
         if inspect.isgenerator(ret):
             gen = self._to_generator(output_type, ret)
xinference/core/scheduler.py
CHANGED
@@ -24,7 +24,6 @@ import xoscar as xo
 
 logger = logging.getLogger(__name__)
 
-XINFERENCE_BATCHING_CLEAN_CACHE_INTERVAL = 5
 XINFERENCE_STREAMING_DONE_FLAG = "<XINFERENCE_STREAMING_DONE>"
 XINFERENCE_STREAMING_ERROR_FLAG = "<XINFERENCE_STREAMING_ERROR>"
 XINFERENCE_STREAMING_ABORT_FLAG = "<XINFERENCE_STREAMING_ABORT>"

@@ -359,7 +358,7 @@ class SchedulerActor(xo.StatelessActor):
 
     @staticmethod
     def _empty_cache():
-        from ..model.llm.pytorch.utils import empty_cache
+        from ..model.llm.transformers.utils import empty_cache
 
         empty_cache()
xinference/core/worker.py
CHANGED
@@ -830,7 +830,7 @@ class WorkerActor(xo.StatelessActor):
                 raise ValueError(
                     f"PEFT adaptors cannot be applied to embedding or rerank models."
                 )
-            if model_type == "LLM" and model_format in ("ggufv2",
+            if model_type == "LLM" and model_format in ("ggufv2",):
                 raise ValueError(
                     f"PEFT adaptors can only be applied to pytorch-like models"
                 )
xinference/deploy/cmdline.py
CHANGED
@@ -750,7 +750,7 @@ def remove_cache(
     "-f",
     default=None,
     type=str,
-    help="Specify the format of the model, e.g. pytorch,
+    help="Specify the format of the model, e.g. pytorch, ggufv2, etc.",
 )
 @click.option(
     "--quantization",

@@ -1516,7 +1516,7 @@ def query_engine_by_model_name(
     "-f",
     type=str,
     required=True,
-    help="Specify the format of the model, e.g. pytorch,
+    help="Specify the format of the model, e.g. pytorch, ggufv2, etc.",
 )
 @click.option(
     "--quantization",
xinference/deploy/test/test_cmdline.py
CHANGED

@@ -66,10 +66,10 @@ def test_cmdline(setup, stream, model_uid):
     replica = 1
     original_model_uid = model_uid
     model_uid = client.launch_model(
-        model_name="
+        model_name="qwen1.5-chat",
         model_engine="llama.cpp",
         model_uid=model_uid,
-        model_size_in_billions=
+        model_size_in_billions="0_5",
         quantization="q4_0",
         replica=replica,
     )

@@ -249,10 +249,10 @@ def test_rotate_logs(setup_with_file_logging):
     runner = CliRunner()
     replica = 1 if os.name == "nt" else 2
     model_uid = client.launch_model(
-        model_name="
+        model_name="qwen1.5-chat",
         model_engine="llama.cpp",
         model_uid=None,
-        model_size_in_billions=
+        model_size_in_billions="0_5",
         quantization="q4_0",
         replica=replica,
     )

@@ -288,7 +288,7 @@ def test_list_cached_models(setup):
 
     result = runner.invoke(
         list_cached_models,
-        ["--endpoint", endpoint, "--model_name", "
+        ["--endpoint", endpoint, "--model_name", "qwen1.5-chat"],
     )
     assert "model_name" in result.stdout
     assert "model_format" in result.stdout

@@ -305,9 +305,9 @@ def test_remove_cache(setup):
 
     result = runner.invoke(
         remove_cache,
-        ["--endpoint", endpoint, "--model_version", "
+        ["--endpoint", endpoint, "--model_version", "qwen1.5-chat"],
         input="y\n",
     )
 
     assert result.exit_code == 0
-    assert "Cache directory
+    assert "Cache directory qwen1.5-chat has been deleted."
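The tests now launch the qwen1.5-chat ggufv2 build instead of the removed ggml model. The same launch works outside the test suite; a hedged sketch, assuming a supervisor is already running at http://localhost:9997:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://localhost:9997")  # hypothetical endpoint
    model_uid = client.launch_model(
        model_name="qwen1.5-chat",
        model_engine="llama.cpp",      # ggufv2 weights run on the llama.cpp engine
        model_size_in_billions="0_5",  # the 0.5B variant, written with an underscore
        quantization="q4_0",
    )
    print(model_uid)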
xinference/model/llm/__init__.py
CHANGED
@@ -40,7 +40,7 @@ from .llm_family import (
     TRANSFORMERS_CLASSES,
     VLLM_CLASSES,
     CustomLLMFamilyV1,
-
+    LlamaCppLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
     MLXLLMSpecV1,

@@ -55,10 +55,10 @@ from .llm_family import (
 
 
 def check_format_with_engine(model_format, engine):
-    # only llama-cpp-python support and only support ggufv2
-    if model_format in ["ggufv2"
+    # only llama-cpp-python support and only support ggufv2
+    if model_format in ["ggufv2"] and engine != "llama.cpp":
         return False
-    if model_format not in ["ggufv2"
+    if model_format not in ["ggufv2"] and engine == "llama.cpp":
         return False
     return True
 

@@ -112,28 +112,25 @@ def generate_engine_config_by_model_family(model_family):
 
 
 def _install():
-    from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
+    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
     from .mlx.core import MLXChatModel, MLXModel
-    from .pytorch.baichuan import BaichuanPytorchChatModel
-    from .pytorch.chatglm import ChatglmPytorchChatModel
-    from .pytorch.cogvlm2 import CogVLM2Model
-    from .pytorch.core import PytorchChatModel, PytorchModel
-    from .pytorch.deepseek_vl import DeepSeekVLChatModel
-    from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
-    from .pytorch.glm4v import Glm4VModel
-    from .pytorch.intern_vl import InternVLChatModel
-    from .pytorch.internlm2 import Internlm2PytorchChatModel
-    from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
-    from .pytorch.minicpmv25 import MiniCPMV25Model
-    from .pytorch.minicpmv26 import MiniCPMV26Model
-    from .pytorch.qwen_vl import QwenVLChatModel
-    from .pytorch.vicuna import VicunaPytorchChatModel
-    from .pytorch.yi_vl import YiVLChatModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
-    from .vllm.core import VLLMChatModel, VLLMModel
+    from .transformers.chatglm import ChatglmPytorchChatModel
+    from .transformers.cogvlm2 import CogVLM2Model
+    from .transformers.core import PytorchChatModel, PytorchModel
+    from .transformers.deepseek_vl import DeepSeekVLChatModel
+    from .transformers.glm4v import Glm4VModel
+    from .transformers.intern_vl import InternVLChatModel
+    from .transformers.internlm2 import Internlm2PytorchChatModel
+    from .transformers.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
+    from .transformers.minicpmv25 import MiniCPMV25Model
+    from .transformers.minicpmv26 import MiniCPMV26Model
+    from .transformers.qwen_vl import QwenVLChatModel
+    from .transformers.yi_vl import YiVLChatModel
+    from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
     try:
-        from .pytorch.omnilmm import OmniLMMModel
+        from .transformers.omnilmm import OmniLMMModel
     except ImportError as e:
         # For quite old transformers version,
         # import will generate error

@@ -148,18 +145,14 @@ def _install():
         ]
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
-    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
+    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
-            BaichuanPytorchChatModel,
-            VicunaPytorchChatModel,
-            FalconPytorchChatModel,
             ChatglmPytorchChatModel,
             LlamaPytorchModel,
             LlamaPytorchChatModel,
             PytorchChatModel,
-            FalconPytorchModel,
             Internlm2PytorchChatModel,
             QwenVLChatModel,
             YiVLChatModel,
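check_format_with_engine now encodes a single pairing: ggufv2 is the only format llama.cpp accepts, and llama.cpp is the only engine that accepts it. A small sketch of how the updated rule behaves (the non-llama.cpp engine names are illustrative):

    def check_format_with_engine(model_format, engine):
        # ggufv2 <-> llama.cpp; every other format goes to another engine.
        if model_format in ["ggufv2"] and engine != "llama.cpp":
            return False
        if model_format not in ["ggufv2"] and engine == "llama.cpp":
            return False
        return True

    assert check_format_with_engine("ggufv2", "llama.cpp")
    assert not check_format_with_engine("ggufv2", "vLLM")
    assert not check_format_with_engine("pytorch", "llama.cpp")
    assert check_format_with_engine("pytorch", "Transformers")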
xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py}
CHANGED

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import datetime
 import logging
 import os
 import time

@@ -104,35 +103,6 @@ class LlamaCppModel(LLM):
         generate_config.pop("lora_name", None)  # type: ignore
         return generate_config
 
-    def _convert_ggml_to_gguf(self, model_path: str) -> str:
-        from .tools import convert
-
-        root_dir = os.path.dirname(os.path.dirname(model_path))
-        gguf_dir = os.path.join(
-            root_dir,
-            "{}-ggufv2-{}b".format(
-                self.model_family.model_name, self.model_spec.model_size_in_billions
-            ),
-        )
-        os.makedirs(gguf_dir, exist_ok=True)
-        gguf_path = os.path.join(
-            gguf_dir,
-            "{}.{}.ggufv2".format(self.model_family.model_name, self.quantization),
-        )
-        # trick for validation, use a mark file to make sure the gguf file is converted
-        mark_file = os.path.join(gguf_dir, f"__valid_{self.quantization}")
-        if os.path.exists(mark_file):
-            return gguf_path
-        else:
-            logger.warning(
-                "You are using a model with ggmlv3, "
-                "and it will take some time to convert to ggufv2"
-            )
-            convert(model_path, gguf_path)
-            with open(mark_file, "w") as f:
-                f.write(str(datetime.datetime.now()))
-            return gguf_path
-
     def load(self):
         try:
             import llama_cpp

@@ -167,9 +137,6 @@ class LlamaCppModel(LLM):
         if os.path.exists(legacy_model_file_path):
             model_path = legacy_model_file_path
 
-        if self.model_spec.model_format == "ggmlv3":
-            model_path = self._convert_ggml_to_gguf(model_path)
-
         try:
             self._llm = Llama(
                 model_path=model_path,

@@ -183,7 +150,7 @@ class LlamaCppModel(LLM):
     def match(
         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["
+        if llm_spec.model_format not in ["ggufv2"]:
             return False
         if "qwen" in llm_family.model_name:
             return False

@@ -285,7 +252,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
     def match(
         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["
+        if llm_spec.model_format not in ["ggufv2"]:
             return False
         if "chat" not in llm_family.model_ability:
             return False