xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +79 -2
- xinference/client/restful/restful_client.py +65 -3
- xinference/conftest.py +0 -7
- xinference/core/media_interface.py +132 -8
- xinference/core/model.py +44 -6
- xinference/core/scheduler.py +1 -10
- xinference/core/supervisor.py +8 -17
- xinference/core/worker.py +5 -27
- xinference/deploy/cmdline.py +6 -2
- xinference/model/audio/chattts.py +24 -39
- xinference/model/audio/cosyvoice.py +18 -30
- xinference/model/audio/funasr.py +42 -0
- xinference/model/audio/model_spec.json +71 -1
- xinference/model/audio/model_spec_modelscope.json +76 -2
- xinference/model/audio/utils.py +75 -0
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +74 -18
- xinference/model/embedding/core.py +98 -589
- xinference/model/embedding/embed_family.py +133 -0
- xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
- xinference/model/embedding/flag/core.py +282 -0
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/embedding/sentence_transformers/__init__.py +13 -0
- xinference/model/embedding/sentence_transformers/core.py +399 -0
- xinference/model/embedding/vllm/core.py +95 -0
- xinference/model/image/model_spec.json +30 -3
- xinference/model/image/model_spec_modelscope.json +41 -2
- xinference/model/image/stable_diffusion/core.py +144 -53
- xinference/model/llm/__init__.py +6 -54
- xinference/model/llm/core.py +19 -5
- xinference/model/llm/llama_cpp/core.py +59 -3
- xinference/model/llm/llama_cpp/memory.py +457 -0
- xinference/model/llm/llm_family.json +247 -402
- xinference/model/llm/llm_family.py +88 -16
- xinference/model/llm/llm_family_modelscope.json +260 -421
- xinference/model/llm/llm_family_openmind_hub.json +0 -34
- xinference/model/llm/sglang/core.py +8 -0
- xinference/model/llm/transformers/__init__.py +27 -6
- xinference/model/llm/transformers/chatglm.py +4 -2
- xinference/model/llm/transformers/core.py +49 -28
- xinference/model/llm/transformers/deepseek_v2.py +6 -49
- xinference/model/llm/transformers/gemma3.py +119 -164
- xinference/model/llm/transformers/multimodal/__init__.py +13 -0
- xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
- xinference/model/llm/transformers/multimodal/core.py +205 -0
- xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
- xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
- xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
- xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
- xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
- xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
- xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
- xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
- xinference/model/llm/transformers/opt.py +4 -2
- xinference/model/llm/transformers/utils.py +6 -37
- xinference/model/llm/utils.py +11 -0
- xinference/model/llm/vllm/core.py +7 -0
- xinference/model/rerank/core.py +91 -3
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +24 -0
- xinference/model/rerank/utils.py +20 -2
- xinference/model/utils.py +38 -1
- xinference/model/video/diffusers.py +65 -3
- xinference/model/video/model_spec.json +31 -4
- xinference/model/video/model_spec_modelscope.json +32 -4
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.013f296b.css +2 -0
- xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
- xinference/web/ui/src/locales/en.json +21 -8
- xinference/web/ui/src/locales/ja.json +224 -0
- xinference/web/ui/src/locales/ko.json +224 -0
- xinference/web/ui/src/locales/zh.json +21 -8
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
- xinference/model/llm/transformers/cogvlm2.py +0 -442
- xinference/model/llm/transformers/cogvlm2_video.py +0 -333
- xinference/model/llm/transformers/deepseek_vl.py +0 -280
- xinference/model/llm/transformers/glm_edge_v.py +0 -213
- xinference/model/llm/transformers/intern_vl.py +0 -526
- xinference/model/llm/transformers/internlm2.py +0 -94
- xinference/model/llm/transformers/minicpmv25.py +0 -193
- xinference/model/llm/transformers/omnilmm.py +0 -132
- xinference/model/llm/transformers/qwen2_audio.py +0 -179
- xinference/model/llm/transformers/qwen_vl.py +0 -360
- xinference/thirdparty/omnilmm/LICENSE +0 -201
- xinference/thirdparty/omnilmm/chat.py +0 -218
- xinference/thirdparty/omnilmm/constants.py +0 -4
- xinference/thirdparty/omnilmm/conversation.py +0 -332
- xinference/thirdparty/omnilmm/model/__init__.py +0 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
- xinference/thirdparty/omnilmm/model/resampler.py +0 -166
- xinference/thirdparty/omnilmm/model/utils.py +0 -578
- xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
- xinference/thirdparty/omnilmm/utils.py +0 -134
- xinference/web/ui/build/static/css/main.337afe76.css +0 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
- xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
- xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
- /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
- /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
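The embedding-related entries above show that 1.7.0 reorganizes xinference/model/embedding into per-backend packages (flag, sentence_transformers, and vllm) alongside the new embed_family.py. As a quick post-upgrade smoke test, the sketch below launches an embedding model through the xinference Python client; the endpoint URL and model name are placeholder assumptions for illustration, not values taken from this diff.

```python
# Hedged post-upgrade smoke test (assumes an xinference server is already
# running locally; the endpoint and model name below are placeholders).
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local endpoint

# Launch an embedding model; 1.7.0 adds per-backend embedding packages
# (flag / sentence_transformers / vllm), per the file list above.
model_uid = client.launch_model(
    model_name="bge-small-en-v1.5",  # placeholder embedding model name
    model_type="embedding",
)

model = client.get_model(model_uid)
result = model.create_embedding("hello from xinference 1.7.0")
# The response follows the OpenAI embeddings schema: data[0]["embedding"]
print(len(result["data"][0]["embedding"]))
```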
--- a/xinference/thirdparty/omnilmm/chat.py
+++ /dev/null
@@ -1,218 +0,0 @@
-import base64
-import io
-import json
-import os
-
-import torch
-from PIL import Image
-from transformers import AutoModel, AutoTokenizer
-
-DEFAULT_IMAGE_TOKEN = "<image>"
-DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
-DEFAULT_IM_START_TOKEN = "<im_start>"
-DEFAULT_IM_END_TOKEN = "<im_end>"
-
-
-def init_omni_lmm(model_path, device_map):
-    from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-
-    from .model.omnilmm import OmniLMMForCausalLM
-    from .model.utils import build_transform
-    from .utils import disable_torch_init
-
-    torch.backends.cuda.matmul.allow_tf32 = True
-    disable_torch_init()
-    model_name = os.path.expanduser(model_path)
-    print(f"Load omni_lmm model and tokenizer from {model_name}")
-    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=2048)
-
-    if False:
-        # model on multiple devices for small size gpu memory (Nvidia 3090 24G x2)
-        with init_empty_weights():
-            model = OmniLMMForCausalLM.from_pretrained(
-                model_name, tune_clip=True, torch_dtype=torch.bfloat16
-            )
-        model = load_checkpoint_and_dispatch(
-            model,
-            model_name,
-            dtype=torch.bfloat16,
-            device_map="auto",
-            no_split_module_classes=[
-                "Eva",
-                "MistralDecoderLayer",
-                "ModuleList",
-                "Resampler",
-            ],
-        )
-    else:
-        model = OmniLMMForCausalLM.from_pretrained(
-            model_name,
-            tune_clip=True,
-            torch_dtype=torch.bfloat16,
-            device_map=device_map,
-        ).to(dtype=torch.bfloat16)
-
-    image_processor = build_transform(
-        is_train=False, input_size=model.model.config.image_size, std_mode="OPENAI_CLIP"
-    )
-
-    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
-    assert mm_use_im_start_end
-
-    tokenizer.add_tokens(
-        [DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN],
-        special_tokens=True,
-    )
-
-    vision_config = model.model.vision_config
-    vision_config.im_patch_token = tokenizer.convert_tokens_to_ids(
-        [DEFAULT_IMAGE_PATCH_TOKEN]
-    )[0]
-    vision_config.use_im_start_end = mm_use_im_start_end
-    (
-        vision_config.im_start_token,
-        vision_config.im_end_token,
-    ) = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
-    image_token_len = model.model.config.num_query
-
-    return model, image_processor, image_token_len, tokenizer
-
-
-def expand_question_into_multimodal(
-    question_text, image_token_len, im_st_token, im_ed_token, im_patch_token
-):
-    if "<image>" in question_text[0]["content"]:
-        question_text[0]["content"] = question_text[0]["content"].replace(
-            "<image>", im_st_token + im_patch_token * image_token_len + im_ed_token
-        )
-    else:
-        question_text[0]["content"] = (
-            im_st_token
-            + im_patch_token * image_token_len
-            + im_ed_token
-            + "\n"
-            + question_text[0]["content"]
-        )
-    return question_text
-
-
-def wrap_question_for_omni_lmm(question, image_token_len, tokenizer):
-    from .train.train_utils import omni_preprocess
-
-    question = expand_question_into_multimodal(
-        question,
-        image_token_len,
-        DEFAULT_IM_START_TOKEN,
-        DEFAULT_IM_END_TOKEN,
-        DEFAULT_IMAGE_PATCH_TOKEN,
-    )
-
-    conversation = question
-    data_dict = omni_preprocess(
-        sources=[conversation], tokenizer=tokenizer, generation=True
-    )
-
-    data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])
-    return data_dict
-
-
-class OmniLMM12B:
-    def __init__(self, model_path, device_map) -> None:
-        model, img_processor, image_token_len, tokenizer = init_omni_lmm(
-            model_path, device_map
-        )
-        self.model = model
-        self.image_token_len = image_token_len
-        self.image_transform = img_processor
-        self.tokenizer = tokenizer
-        self.model.eval()
-
-    def decode(self, image, input_ids):
-        with torch.inference_mode():
-            output = self.model.generate_vllm(
-                input_ids=input_ids.unsqueeze(0).cuda(),
-                images=image.unsqueeze(0).half().cuda(),
-                temperature=0.6,
-                max_new_tokens=1024,
-                # num_beams=num_beams,
-                do_sample=True,
-                output_scores=True,
-                return_dict_in_generate=True,
-                repetition_penalty=1.1,
-                top_k=30,
-                top_p=0.9,
-            )
-
-            response = self.tokenizer.decode(
-                output.sequences[0], skip_special_tokens=True
-            )
-            response = response.strip()
-            return response
-
-    def chat(self, input):
-        try:
-            image = Image.open(io.BytesIO(base64.b64decode(input["image"]))).convert(
-                "RGB"
-            )
-        except Exception as e:
-            return f"Image decode error: {e}"
-
-        msgs = json.loads(input["question"])
-        input_ids = wrap_question_for_omni_lmm(
-            msgs, self.image_token_len, self.tokenizer
-        )["input_ids"]
-        input_ids = torch.as_tensor(input_ids)
-        # print('input_ids', input_ids)
-        image = self.image_transform(image)
-
-        out = self.decode(image, input_ids)
-
-        return out
-
-
-def img2base64(file_name):
-    with open(file_name, "rb") as f:
-        encoded_string = base64.b64encode(f.read())
-        return encoded_string
-
-
-class OmniLMM3B:
-    def __init__(self, model_path, device_map) -> None:
-        self.model = AutoModel.from_pretrained(
-            model_path, trust_remote_code=True, device_map=device_map
-        ).to(dtype=torch.bfloat16)
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            model_path, trust_remote_code=True
-        )
-        self.model.eval().cuda()
-
-    def chat(self, input):
-        try:
-            image = Image.open(io.BytesIO(base64.b64decode(input["image"]))).convert(
-                "RGB"
-            )
-        except Exception as e:
-            return f"Image decode error: {e}"
-
-        msgs = json.loads(input["question"])
-
-        answer, context, _ = self.model.chat(
-            image=image,
-            msgs=msgs,
-            context=None,
-            tokenizer=self.tokenizer,
-            sampling=True,
-            temperature=0.7,
-        )
-        return answer
-
-
-class OmniLMMChat:
-    def __init__(self, model_path, device_map) -> None:
-        if "12b" in model_path:
-            self.model = OmniLMM12B(model_path, device_map)
-        else:
-            self.model = OmniLMM3B(model_path, device_map)
-
-    def chat(self, input):
-        return self.model.chat(input)
--- a/xinference/thirdparty/omnilmm/conversation.py
+++ /dev/null
@@ -1,332 +0,0 @@
-import dataclasses
-from enum import Enum, auto
-from typing import List
-
-
-class SeparatorStyle(Enum):
-    """Different separator style."""
-
-    SINGLE = auto()
-    TWO = auto()
-
-
-@dataclasses.dataclass
-class Conversation:
-    """A class that keeps all conversation history."""
-
-    system: str
-    roles: List[str]
-    messages: List[List[str]]
-    offset: int
-    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
-    sep: str = "###"
-    sep2: str = None
-    version: str = "Unknown"
-
-    skip_next: bool = False
-
-    def get_prompt(self):
-        if self.sep_style == SeparatorStyle.SINGLE:
-            ret = self.system + self.sep
-            for role, message in self.messages:
-                if message:
-                    if type(message) is tuple:
-                        message, _, _ = message
-                    ret += role + ": " + message + self.sep
-                else:
-                    ret += role + ":"
-            return ret
-        elif self.sep_style == SeparatorStyle.TWO:
-            seps = [self.sep, self.sep2]
-            ret = self.system + seps[0]
-            for i, (role, message) in enumerate(self.messages):
-                if message:
-                    if type(message) is tuple:
-                        message, _, _ = message
-                    ret += role + ": " + message + seps[i % 2]
-                else:
-                    ret += role + ":"
-            return ret
-        else:
-            raise ValueError(f"Invalid style: {self.sep_style}")
-
-    def append_message(self, role, message):
-        self.messages.append([role, message])
-
-    def get_images(self, return_pil=False):
-        images = []
-        for i, (role, msg) in enumerate(self.messages[self.offset :]):
-            if i % 2 == 0:
-                if type(msg) is tuple:
-                    import base64
-                    from io import BytesIO
-
-                    from PIL import Image
-
-                    msg, image, image_process_mode = msg
-                    if image_process_mode == "Pad":
-
-                        def expand2square(pil_img, background_color=(122, 116, 104)):
-                            width, height = pil_img.size
-                            if width == height:
-                                return pil_img
-                            elif width > height:
-                                result = Image.new(
-                                    pil_img.mode, (width, width), background_color
-                                )
-                                result.paste(pil_img, (0, (width - height) // 2))
-                                return result
-                            else:
-                                result = Image.new(
-                                    pil_img.mode, (height, height), background_color
-                                )
-                                result.paste(pil_img, ((height - width) // 2, 0))
-                                return result
-
-                        image = expand2square(image)
-                    elif image_process_mode == "Crop":
-                        pass
-                    elif image_process_mode == "Resize":
-                        image = image.resize((224, 224))
-                    else:
-                        raise ValueError(
-                            f"Invalid image_process_mode: {image_process_mode}"
-                        )
-                    max_hw, min_hw = max(image.size), min(image.size)
-                    aspect_ratio = max_hw / min_hw
-                    max_len, min_len = 800, 400
-                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
-                    longest_edge = int(shortest_edge * aspect_ratio)
-                    W, H = image.size
-                    if H > W:
-                        H, W = longest_edge, shortest_edge
-                    else:
-                        H, W = shortest_edge, longest_edge
-                    image = image.resize((W, H))
-                    if return_pil:
-                        images.append(image)
-                    else:
-                        buffered = BytesIO()
-                        image.save(buffered, format="JPEG")
-                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
-                        images.append(img_b64_str)
-        return images
-
-    def to_gradio_chatbot(self):
-        ret = []
-        for i, (role, msg) in enumerate(self.messages[self.offset :]):
-            if i % 2 == 0:
-                if type(msg) is tuple:
-                    import base64
-                    from io import BytesIO
-
-                    msg, image, image_process_mode = msg
-                    max_hw, min_hw = max(image.size), min(image.size)
-                    aspect_ratio = max_hw / min_hw
-                    max_len, min_len = 800, 400
-                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
-                    longest_edge = int(shortest_edge * aspect_ratio)
-                    W, H = image.size
-                    if H > W:
-                        H, W = longest_edge, shortest_edge
-                    else:
-                        H, W = shortest_edge, longest_edge
-                    image = image.resize((W, H))
-                    # image = image.resize((224, 224))
-                    buffered = BytesIO()
-                    image.save(buffered, format="JPEG")
-                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
-                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
-                    msg = msg.replace("<image>", img_str)
-                    ret.append([msg, None])
-                else:
-                    ret[-1][-1] = msg
-        return ret
-
-    def copy(self):
-        return Conversation(
-            system=self.system,
-            roles=self.roles,
-            messages=[[x, y] for x, y in self.messages],
-            offset=self.offset,
-            sep_style=self.sep_style,
-            sep=self.sep,
-            sep2=self.sep2,
-        )
-
-    def dict(self):
-        if len(self.get_images()) > 0:
-            return {
-                "system": self.system,
-                "roles": self.roles,
-                "messages": [
-                    [x, y[0] if type(y) is tuple else y] for x, y in self.messages
-                ],
-                "offset": self.offset,
-                "sep": self.sep,
-                "sep2": self.sep2,
-            }
-        return {
-            "system": self.system,
-            "roles": self.roles,
-            "messages": self.messages,
-            "offset": self.offset,
-            "sep": self.sep,
-            "sep2": self.sep2,
-        }
-
-
-conv_v1 = Conversation(
-    system="A chat between a curious human and an artificial intelligence assistant. "
-    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
-    roles=("Human", "Assistant"),
-    messages=(
-        ("Human", "Give three tips for staying healthy."),
-        (
-            "Assistant",
-            "Sure, here are three tips for staying healthy:\n"
-            "1. Exercise regularly: Regular physical activity can help improve your overall health and wellbeing. "
-            "It can also help reduce your risk of chronic conditions such as obesity, diabetes, heart disease, "
-            "and certain cancers. Aim for at least 150 minutes of moderate-intensity aerobic exercise or "
-            "75 minutes of vigorous-intensity aerobic exercise per week, along with muscle-strengthening "
-            "activities at least two days per week.\n"
-            "2. Eat a balanced diet: Eating a balanced diet that is rich in fruits, "
-            "vegetables, whole grains, lean proteins, and healthy fats can help support "
-            "your overall health. Try to limit your intake of processed and high-sugar foods, "
-            "and aim to drink plenty of water throughout the day.\n"
-            "3. Get enough sleep: Getting enough quality sleep is essential for your physical "
-            "and mental health. Adults should aim for seven to nine hours of sleep per night. "
-            "Establish a regular sleep schedule and try to create a relaxing bedtime routine to "
-            "help improve the quality of your sleep.",
-        ),
-    ),
-    offset=2,
-    sep_style=SeparatorStyle.SINGLE,
-    sep="###",
-)
-
-conv_v1_2 = Conversation(
-    system="A chat between a curious human and an artificial intelligence assistant. "
-    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
-    roles=("Human", "Assistant"),
-    messages=(
-        (
-            "Human",
-            "What are the key differences between renewable and non-renewable energy sources?",
-        ),
-        (
-            "Assistant",
-            "Renewable energy sources are those that can be replenished naturally in a relatively "
-            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
-            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
-            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
-            "renewable and non-renewable energy sources:\n"
-            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
-            "energy sources are finite and will eventually run out.\n"
-            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
-            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
-            "and other negative effects.\n"
-            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
-            "have lower operational costs than non-renewable sources.\n"
-            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
-            "locations than non-renewable sources.\n"
-            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
-            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
-            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
-            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
-        ),
-    ),
-    offset=2,
-    sep_style=SeparatorStyle.SINGLE,
-    sep="###",
-)
-
-conv_vicuna_v1_1 = Conversation(
-    system="A chat between a curious user and an artificial intelligence assistant. "
-    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
-    roles=("USER", "ASSISTANT"),
-    version="v1",
-    messages=(),
-    offset=0,
-    sep_style=SeparatorStyle.TWO,
-    sep=" ",
-    sep2="</s>",
-)
-
-conv_bair_v1 = Conversation(
-    system="BEGINNING OF CONVERSATION:",
-    roles=("USER", "GPT"),
-    messages=(),
-    offset=0,
-    sep_style=SeparatorStyle.TWO,
-    sep=" ",
-    sep2="</s>",
-)
-
-simple_conv = Conversation(
-    system="You are LLaVA, a large language model trained by UW Madison WAIV Lab, based on LLaMA architecture."
-    "You are designed to assist human with a variety of tasks using natural language."
-    "Follow the instructions carefully.",
-    roles=("Human", "Assistant"),
-    messages=(
-        ("Human", "Hi!"),
-        ("Assistant", "Hi there! How can I help you today?\n"),
-    ),
-    offset=2,
-    sep_style=SeparatorStyle.SINGLE,
-    sep="###",
-)
-
-simple_conv_multimodal = Conversation(
-    system="A chat between a curious user and an artificial intelligence assistant. "
-    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
-    roles=("Human", "Assistant"),
-    messages=(),
-    offset=0,
-    sep_style=SeparatorStyle.SINGLE,
-    sep="###",
-)
-
-simple_conv_legacy = Conversation(
-    system="You are LLaVA, a large language model trained by UW Madison WAIV Lab."
-    "You are designed to assist human with a variety of tasks using natural language."
-    "Follow the instructions carefully.",
-    roles=("Human", "Assistant"),
-    messages=(
-        ("Human", "Hi!\n\n### Response:"),
-        ("Assistant", "Hi there! How can I help you today?\n"),
-    ),
-    offset=2,
-    sep_style=SeparatorStyle.SINGLE,
-    sep="###",
-)
-
-conv_llava_v1 = Conversation(
-    system="You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab."
-    "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
-    "Follow the instructions carefully and explain your answers in detail.",
-    roles=("USER", "ASSISTANT"),
-    version="v1",
-    messages=(),
-    offset=0,
-    sep_style=SeparatorStyle.TWO,
-    sep=" ",
-    sep2="</s>",
-)
-
-default_conversation = conv_v1_2
-conv_templates = {
-    "default": conv_v1_2,
-    "simple": simple_conv,
-    "simple_legacy": simple_conv_legacy,
-    "multimodal": simple_conv_multimodal,
-    "llava_v1": conv_llava_v1,
-    # fastchat
-    "v1": conv_v1_2,
-    "bair_v1": conv_bair_v1,
-    "vicuna_v1_1": conv_vicuna_v1_1,
-}
-
-
-if __name__ == "__main__":
-    print(default_conversation.get_prompt())
--- a/xinference/thirdparty/omnilmm/model/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .omnilmm import OmniLMMForCausalLM