xinference 0.10.2.post1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +1 -1
- xinference/api/restful_api.py +53 -61
- xinference/client/restful/restful_client.py +52 -57
- xinference/conftest.py +1 -1
- xinference/core/cache_tracker.py +1 -1
- xinference/core/event.py +1 -1
- xinference/core/model.py +15 -4
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +58 -72
- xinference/core/worker.py +73 -102
- xinference/deploy/cmdline.py +175 -6
- xinference/deploy/test/test_cmdline.py +2 -0
- xinference/deploy/utils.py +1 -1
- xinference/device_utils.py +29 -3
- xinference/fields.py +5 -1
- xinference/model/audio/model_spec.json +8 -1
- xinference/model/audio/whisper.py +88 -12
- xinference/model/core.py +2 -2
- xinference/model/embedding/core.py +13 -0
- xinference/model/image/__init__.py +29 -0
- xinference/model/image/core.py +6 -0
- xinference/model/image/custom.py +109 -0
- xinference/model/llm/__init__.py +92 -32
- xinference/model/llm/core.py +57 -102
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
- xinference/model/llm/llm_family.json +446 -2
- xinference/model/llm/llm_family.py +45 -41
- xinference/model/llm/llm_family_modelscope.json +208 -1
- xinference/model/llm/pytorch/deepseek_vl.py +89 -33
- xinference/model/llm/pytorch/qwen_vl.py +67 -12
- xinference/model/llm/pytorch/yi_vl.py +62 -45
- xinference/model/llm/utils.py +45 -15
- xinference/model/llm/vllm/core.py +21 -4
- xinference/model/rerank/core.py +48 -20
- xinference/thirdparty/omnilmm/chat.py +2 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +6 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.54bca460.css +2 -0
- xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js +3 -0
- xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.8e44da4b.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +33 -0
- xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
- xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
- xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
- xinference/web/ui/node_modules/clipboard/bower.json +18 -0
- xinference/web/ui/node_modules/clipboard/composer.json +25 -0
- xinference/web/ui/node_modules/clipboard/package.json +63 -0
- xinference/web/ui/node_modules/delegate/package.json +31 -0
- xinference/web/ui/node_modules/good-listener/bower.json +11 -0
- xinference/web/ui/node_modules/good-listener/package.json +35 -0
- xinference/web/ui/node_modules/select/bower.json +13 -0
- xinference/web/ui/node_modules/select/package.json +29 -0
- xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
- xinference/web/ui/package-lock.json +34 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/METADATA +14 -13
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/RECORD +81 -60
- xinference/client/oscar/__init__.py +0 -13
- xinference/client/oscar/actor_client.py +0 -611
- xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
- xinference/model/llm/pytorch/spec_model.py +0 -186
- xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/LICENSE +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/WHEEL +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/top_level.txt +0 -0

xinference/model/llm/llm_family_modelscope.json

@@ -84,6 +84,96 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "llama-3",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "Llama 3 is an auto-regressive language model that uses an optimized transformer architecture",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "LLM-Research/Meta-Llama-3-8B",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "LLM-Research/Meta-Llama-3-70B",
+                "model_hub": "modelscope"
+            }
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "llama-3-instruct",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks..",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "LLM-Research/Meta-Llama-3-8B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "LLM-Research/Meta-Llama-3-70B-Instruct",
+                "model_hub": "modelscope"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "LLAMA3",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n\n",
+            "inter_message_sep": "<|eot_id|>",
+            "stop_token_ids": [
+                128001,
+                128009
+            ],
+            "stop": [
+                "<|end_of_text|>",
+                "<|eot_id|>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 2048,

@@ -323,7 +413,7 @@
                 ],
                 "model_hub": "modelscope",
                 "model_id": "ZhipuAI/chatglm3-6b",
-                "model_revision": "v1.0.
+                "model_revision": "v1.0.2"
             }
         ],
         "prompt_style": {

@@ -1847,6 +1937,17 @@
                 "model_id": "qwen/Qwen1.5-72B-Chat",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 110,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen1.5-110B-Chat",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "gptq",
                 "model_size_in_billions": "0_5",

@@ -1916,6 +2017,15 @@
                 "model_id": "qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 110,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen1.5-110B-Chat-GPTQ-Int4",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "awq",
                 "model_size_in_billions": "0_5",

@@ -1979,6 +2089,15 @@
                 "model_id": "qwen/Qwen1.5-72B-Chat-AWQ",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 110,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen1.5-110B-Chat-AWQ",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "ggufv2",
                 "model_size_in_billions": "0_5",

@@ -3205,5 +3324,93 @@
                 "model_revision": "master"
             }
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 128000,
+        "model_name": "phi-3-mini-128k-instruct",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "The Phi-3-Mini-128K-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 4,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "LLM-Research/Phi-3-mini-128k-instruct",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "PHI3",
+            "system_prompt": "You are a helpful AI assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "inter_message_sep": "<|end|>\n",
+            "stop_token_ids":[
+                32000,
+                32007
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|end|>"
+            ]
+        }
+    },
+    {
+        "version": 1,
+        "context_length": 4096,
+        "model_name": "phi-3-mini-4k-instruct",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "The Phi-3-Mini-4k-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 4,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "LLM-Research/Phi-3-mini-4k-instruct",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "PHI3",
+            "system_prompt": "You are a helpful AI assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "inter_message_sep": "<|end|>\n",
+            "stop_token_ids":[
+                32000,
+                32007
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|end|>"
+            ]
+        }
     }
 ]
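
The JSON hunks above only add registry metadata for the new ModelScope entries (llama-3, llama-3-instruct, the Qwen1.5-110B-Chat variants, phi-3-mini). For context, a minimal sketch of launching one of the newly registered entries through xinference's RESTful client; the endpoint URL and the chosen quantization are assumptions for illustration, not part of the diff:

# Minimal sketch, assuming a local supervisor at the default endpoint.
# model_name / model_format / model_size_in_billions / quantization come from
# the "llama-3-instruct" spec added in llm_family_modelscope.json above.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="llama-3-instruct",
    model_format="pytorch",
    model_size_in_billions=8,
    quantization="4-bit",
)
model = client.get_model(model_uid)
print(model.chat("What is the largest animal?"))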

xinference/model/llm/pytorch/deepseek_vl.py

@@ -27,9 +27,11 @@ import torch
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -67,12 +69,12 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._type = torch.float16 if self._device == "mps" else torch.bfloat16

         # specify the path to the model
-        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(  # type: ignore
             self.model_path
         )
         self._tokenizer = self._vl_chat_processor.tokenizer

-        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
             self.model_path, trust_remote_code=True, device_map=self._device
         )
         self._model = vl_gpt.to(self._type).eval()

@@ -149,10 +151,11 @@ class DeepSeekVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if
-
-
-
+        if not generate_config:
+            generate_config = {}
+
+        stream = generate_config.get("stream", False)
+
         prompt, images = self._message_content_to_deepseek(prompt)
         prompt_messages: List[Dict[str, Any]] = [
             {

@@ -184,6 +187,7 @@ class DeepSeekVLChatModel(PytorchChatModel):

         deepseek_history.extend(prompt_messages)

+        from ....thirdparty.deepseek_vl.serve.inference import generate
         from ....thirdparty.deepseek_vl.utils.io import load_pil_images

         # load images and prepare for inputs

@@ -192,41 +196,93 @@ class DeepSeekVLChatModel(PytorchChatModel):
             conversations=deepseek_history, images=pil_images, force_batchify=True
         ).to(self._model.device, self._model.dtype)

-
-
-
-
-
-
-
-
-            bos_token_id=self._tokenizer.bos_token_id,
-            eos_token_id=self._tokenizer.eos_token_id,
-            max_new_tokens=512,
-            do_sample=True,
-            top_p=0.95,
-            temperature=0.2,
-            repetition_penalty=1.1,
-            use_cache=True,
-        )
+        temperature = generate_config.get("temperature", 0.2)
+        top_p = generate_config.get("top_p", 0.95)
+        max_new_tokens = generate_config.get("max_tokens", 512)
+        repetition_penalty = generate_config.get("repetition_penalty", 1.1)
+
+        conversation = self._vl_chat_processor.new_chat_template()
+        stop_str = conversation.sep2
+        stop_words = [stop_str]

-
-
+        streamer = generate(
+            vl_gpt=self._model,
+            tokenizer=self._tokenizer,
+            prepare_inputs=prepare_inputs,
+            max_gen_len=max_new_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            top_p=top_p,
+            stop_words=stop_words,
         )

-
-
-
+        if stream:
+            it = self._generate_stream(streamer, stop_str)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
+        generated_text = ""
+        for new_text in streamer:
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            generated_text += new_text
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        for i, new_text in enumerate(streamer):
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            completion_choice = CompletionChoice(
+                text=new_text, index=0, logprobs=None, finish_reason=None
+            )
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+            chunk["usage"] = completion_usage
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
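
The deepseek_vl.py changes above replace the fixed generation arguments with values read from generate_config and add streaming via the new _generate / _generate_stream helpers, so generate_config={"stream": True} is now honoured. A rough sketch of consuming that stream through the RESTful client; the endpoint and model uid are placeholders, and the chunk layout shown assumes the OpenAI-style delta format used by xinference chat streams:

# Sketch only: assumes a deepseek-vl chat model is already launched as <model_uid>.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model = client.get_model("<model_uid>")

for chunk in model.chat(
    prompt="Describe this image.",
    generate_config={"stream": True, "max_tokens": 256},
):
    delta = chunk["choices"][0].get("delta", {})
    print(delta.get("content", ""), end="", flush=True)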

xinference/model/llm/pytorch/qwen_vl.py

@@ -22,9 +22,11 @@ from typing import Dict, Iterator, List, Optional, Union
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -116,10 +118,6 @@ class QwenVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
         prompt = self._message_content_to_qwen(prompt)
         # Convert openai history to qwen vl history
         qwen_history = []

@@ -134,22 +132,79 @@
             if len(query_to_response) == 2:
                 qwen_history.append(query_to_response)
                 query_to_response = []
+
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(prompt, qwen_history)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(prompt, qwen_history)
+            return self._to_chat_completion(c)
+
+    def _generate(self, prompt: str, qwen_history: List) -> Completion:
         response, history = self._model.chat(
             self._tokenizer, query=prompt, history=qwen_history
         )
-
-            id=
-            object="
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-
-                    index=0,
-                    message={"role": "assistant", "content": response},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=response, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(
+        self, prompt: str, qwen_history: List
+    ) -> Iterator[CompletionChunk]:
+        # response, history = model.chat(tokenizer, message, history=history)
+        response_generator = self._model.chat_stream(
+            self._tokenizer, query=prompt, history=qwen_history
+        )
+        full_response = ""
+        for response in response_generator:
+            inc_content = response[len(full_response) :]
+            full_response = response
+            completion_choice = CompletionChoice(
+                text=inc_content, index=0, logprobs=None, finish_reason=None
+            )
+            completion_chunk = CompletionChunk(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+            completion_chunk["usage"] = completion_usage
+            yield completion_chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        completion_chunk = CompletionChunk(
+            id=str(uuid.uuid1()),
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        completion_chunk["usage"] = completion_usage
+        yield completion_chunk

xinference/model/llm/pytorch/yi_vl.py

@@ -27,9 +27,11 @@ from PIL import Image
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -122,38 +124,6 @@ class YiVLChatModel(PytorchChatModel):
             raise RuntimeError("Only one image per message is supported by Yi VL.")
         return content

-    @staticmethod
-    def _parse_text(text):
-        lines = text.split("\n")
-        lines = [line for line in lines if line != ""]
-        count = 0
-        for i, line in enumerate(lines):
-            if "```" in line:
-                count += 1
-                items = line.split("`")
-                if count % 2 == 1:
-                    lines[i] = f'<pre><code class="language-{items[-1]}">'
-                else:
-                    lines[i] = f"<br></code></pre>"
-            else:
-                if i > 0:
-                    if count % 2 == 1:
-                        line = line.replace("`", r"\`")
-                        line = line.replace("<", "&lt;")
-                        line = line.replace(">", "&gt;")
-                        line = line.replace(" ", "&nbsp;")
-                        line = line.replace("*", "&ast;")
-                        line = line.replace("_", "&lowbar;")
-                        line = line.replace("-", "&#45;")
-                        line = line.replace(".", "&#46;")
-                        line = line.replace("!", "&#33;")
-                        line = line.replace("(", "&#40;")
-                        line = line.replace(")", "&#41;")
-                        line = line.replace("$", "&#36;")
-                    lines[i] = "<br>" + line
-        text = "".join(lines)
-        return text
-
     def chat(
         self,
         prompt: Union[str, List[Dict]],

@@ -164,12 +134,12 @@ class YiVLChatModel(PytorchChatModel):
         from transformers import TextIteratorStreamer

         # TODO(codingl2k1): implement stream mode.
-
-        raise Exception(
-            f"Chat with model {self.model_family.model_name} does not support stream."
-        )
+
         if not generate_config:
             generate_config = {}
+
+        stream = generate_config.get("stream", False)
+
         from ....thirdparty.llava.conversation import conv_templates
         from ....thirdparty.llava.mm_utils import (
             KeywordsStoppingCriteria,

@@ -229,25 +199,72 @@ class YiVLChatModel(PytorchChatModel):
         t = Thread(target=self._model.generate, kwargs=generate_kwargs)
         t.start()

+        if stream:
+            it = self._generate_stream(streamer, stop_str)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
         generated_text = ""
         for new_text in streamer:
             generated_text += new_text
             if generated_text.endswith(stop_str):
                 generated_text = generated_text[: -len(stop_str)]
-
-
-            id=
-            object="
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-
-                    index=0,
-                    message={"role": "assistant", "content": r},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        for i, new_text in enumerate(streamer):
+            if not new_text.endswith(stop_str):
+                completion_choice = CompletionChoice(
+                    text=new_text, index=0, logprobs=None, finish_reason=None
+                )
+                chunk = CompletionChunk(
+                    id=completion_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[completion_choice],
+                )
+                completion_usage = CompletionUsage(
+                    prompt_tokens=-1,
+                    completion_tokens=-1,
+                    total_tokens=-1,
+                )
+                chunk["usage"] = completion_usage
+                yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
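
Taken together, the qwen_vl.py and yi_vl.py changes mirror the deepseek_vl.py ones: each model drops the "does not support stream" exception, builds the non-streaming result as a Completion, and emits incremental CompletionChunks followed by a final empty chunk with finish_reason "stop". The helper below is an illustrative condensation of that shared pattern, not code from the package; stream_to_chunks and text_streamer are made-up names:

import time
import uuid
from typing import Iterable, Iterator, Optional

from xinference.types import CompletionChoice, CompletionChunk, CompletionUsage


def stream_to_chunks(
    model_uid: str, text_streamer: Iterable[str], stop_str: str
) -> Iterator[CompletionChunk]:
    """Condensed sketch of the _generate_stream helpers added in this release."""
    completion_id = str(uuid.uuid1())

    def make_chunk(text: str, finish_reason: Optional[str]) -> CompletionChunk:
        chunk = CompletionChunk(
            id=completion_id,
            object="text_completion",
            created=int(time.time()),
            model=model_uid,
            choices=[
                CompletionChoice(
                    text=text, index=0, logprobs=None, finish_reason=finish_reason
                )
            ],
        )
        # usage is reported as -1 because token counts are not tracked here,
        # matching the CompletionUsage(-1, -1, -1) placeholders in the diff
        chunk["usage"] = CompletionUsage(
            prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
        )
        return chunk

    for new_text in text_streamer:
        if new_text.endswith(stop_str):
            new_text = new_text[: -len(stop_str)]
        yield make_chunk(new_text, finish_reason=None)

    # closing chunk, mirroring the empty "stop" chunk the new helpers emit
    yield make_chunk("", finish_reason="stop")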