sglang 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +13 -1
- sglang/bench_latency.py +10 -5
- sglang/bench_serving.py +50 -26
- sglang/check_env.py +15 -0
- sglang/global_config.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +60 -49
- sglang/lang/chat_template.py +10 -5
- sglang/lang/compiler.py +4 -0
- sglang/lang/interpreter.py +5 -2
- sglang/lang/ir.py +22 -4
- sglang/launch_server.py +8 -1
- sglang/srt/constrained/jump_forward.py +13 -2
- sglang/srt/conversation.py +50 -1
- sglang/srt/hf_transformers_utils.py +22 -23
- sglang/srt/layers/activation.py +24 -2
- sglang/srt/layers/decode_attention.py +338 -50
- sglang/srt/layers/extend_attention.py +3 -1
- sglang/srt/layers/fused_moe/__init__.py +1 -0
- sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
- sglang/srt/layers/fused_moe/layer.py +587 -0
- sglang/srt/layers/layernorm.py +3 -0
- sglang/srt/layers/logits_processor.py +64 -27
- sglang/srt/layers/radix_attention.py +41 -18
- sglang/srt/layers/sampler.py +154 -0
- sglang/srt/managers/controller_multi.py +2 -8
- sglang/srt/managers/controller_single.py +7 -10
- sglang/srt/managers/detokenizer_manager.py +20 -9
- sglang/srt/managers/io_struct.py +44 -11
- sglang/srt/managers/policy_scheduler.py +5 -2
- sglang/srt/managers/schedule_batch.py +59 -179
- sglang/srt/managers/tokenizer_manager.py +193 -84
- sglang/srt/managers/tp_worker.py +131 -50
- sglang/srt/mem_cache/memory_pool.py +82 -8
- sglang/srt/mm_utils.py +79 -7
- sglang/srt/model_executor/cuda_graph_runner.py +97 -28
- sglang/srt/model_executor/forward_batch_info.py +188 -82
- sglang/srt/model_executor/model_runner.py +269 -87
- sglang/srt/models/chatglm.py +6 -14
- sglang/srt/models/commandr.py +6 -2
- sglang/srt/models/dbrx.py +5 -1
- sglang/srt/models/deepseek.py +7 -3
- sglang/srt/models/deepseek_v2.py +12 -7
- sglang/srt/models/gemma.py +6 -2
- sglang/srt/models/gemma2.py +22 -8
- sglang/srt/models/gpt_bigcode.py +5 -1
- sglang/srt/models/grok.py +66 -398
- sglang/srt/models/internlm2.py +5 -1
- sglang/srt/models/llama2.py +7 -3
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/llama_embedding.py +4 -0
- sglang/srt/models/llava.py +176 -59
- sglang/srt/models/minicpm.py +7 -3
- sglang/srt/models/mixtral.py +61 -255
- sglang/srt/models/mixtral_quant.py +6 -5
- sglang/srt/models/qwen.py +7 -4
- sglang/srt/models/qwen2.py +15 -5
- sglang/srt/models/qwen2_moe.py +7 -16
- sglang/srt/models/stablelm.py +6 -2
- sglang/srt/openai_api/adapter.py +149 -58
- sglang/srt/sampling/sampling_batch_info.py +209 -0
- sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -4
- sglang/srt/server.py +107 -71
- sglang/srt/server_args.py +49 -15
- sglang/srt/utils.py +27 -18
- sglang/test/runners.py +38 -38
- sglang/test/simple_eval_common.py +9 -10
- sglang/test/simple_eval_gpqa.py +2 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_math.py +2 -1
- sglang/test/simple_eval_mmlu.py +2 -1
- sglang/test/test_activation.py +55 -0
- sglang/test/test_programs.py +32 -5
- sglang/test/test_utils.py +37 -50
- sglang/version.py +1 -1
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/METADATA +102 -27
- sglang-0.2.14.dist-info/RECORD +114 -0
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
- sglang/launch_server_llavavid.py +0 -29
- sglang/srt/model_loader/model_loader.py +0 -292
- sglang/srt/model_loader/utils.py +0 -275
- sglang-0.2.12.dist-info/RECORD +0 -112
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
@@ -21,7 +21,7 @@ import dataclasses
|
|
21
21
|
import logging
|
22
22
|
import multiprocessing as mp
|
23
23
|
import os
|
24
|
-
from typing import Dict, List, Tuple, Union
|
24
|
+
from typing import Dict, List, Optional, Tuple, Union
|
25
25
|
|
26
26
|
import numpy as np
|
27
27
|
import transformers
|
@@ -46,9 +46,11 @@ from sglang.srt.managers.io_struct import (
|
|
46
46
|
GenerateReqInput,
|
47
47
|
TokenizedEmbeddingReqInput,
|
48
48
|
TokenizedGenerateReqInput,
|
49
|
+
UpdateWeightReqInput,
|
50
|
+
UpdateWeightReqOutput,
|
49
51
|
)
|
50
52
|
from sglang.srt.mm_utils import expand2square, process_anyres_image
|
51
|
-
from sglang.srt.sampling_params import SamplingParams
|
53
|
+
from sglang.srt.sampling.sampling_params import SamplingParams
|
52
54
|
from sglang.srt.server_args import PortArgs, ServerArgs
|
53
55
|
from sglang.srt.utils import is_generation_model, is_multimodal_model, load_image
|
54
56
|
from sglang.utils import get_exception_traceback
|
@@ -60,12 +62,16 @@ logger = logging.getLogger(__name__)
|
|
60
62
|
|
61
63
|
@dataclasses.dataclass
|
62
64
|
class ReqState:
|
65
|
+
"""Store the state a request."""
|
66
|
+
|
63
67
|
out_list: List
|
64
68
|
finished: bool
|
65
69
|
event: asyncio.Event
|
66
70
|
|
67
71
|
|
68
72
|
class TokenizerManager:
|
73
|
+
"""TokenizerManager is a process that tokenizes the text."""
|
74
|
+
|
69
75
|
def __init__(
|
70
76
|
self,
|
71
77
|
server_args: ServerArgs,
|
@@ -74,6 +80,7 @@ class TokenizerManager:
|
|
74
80
|
):
|
75
81
|
self.server_args = server_args
|
76
82
|
|
83
|
+
# Init inter-process communication
|
77
84
|
context = zmq.asyncio.Context(2)
|
78
85
|
self.recv_from_detokenizer = context.socket(zmq.PULL)
|
79
86
|
self.recv_from_detokenizer.bind(f"tcp://127.0.0.1:{port_args.tokenizer_port}")
|
@@ -81,6 +88,7 @@ class TokenizerManager:
|
|
81
88
|
self.send_to_router = context.socket(zmq.PUSH)
|
82
89
|
self.send_to_router.connect(f"tcp://127.0.0.1:{port_args.controller_port}")
|
83
90
|
|
91
|
+
# Read model args
|
84
92
|
self.model_path = server_args.model_path
|
85
93
|
self.served_model_name = server_args.served_model_name
|
86
94
|
self.hf_config = get_config(
|
@@ -88,13 +96,17 @@ class TokenizerManager:
|
|
88
96
|
trust_remote_code=server_args.trust_remote_code,
|
89
97
|
model_overide_args=model_overide_args,
|
90
98
|
)
|
91
|
-
|
99
|
+
|
100
|
+
self.is_generation = is_generation_model(
|
101
|
+
self.hf_config.architectures, self.server_args.is_embedding
|
102
|
+
)
|
92
103
|
|
93
104
|
if server_args.context_length is not None:
|
94
105
|
self.context_len = server_args.context_length
|
95
106
|
else:
|
96
107
|
self.context_len = get_context_length(self.hf_config)
|
97
108
|
|
109
|
+
# Create tokenizer
|
98
110
|
if server_args.skip_tokenizer_init:
|
99
111
|
self.tokenizer = self.processor = None
|
100
112
|
else:
|
@@ -118,27 +130,13 @@ class TokenizerManager:
|
|
118
130
|
trust_remote_code=server_args.trust_remote_code,
|
119
131
|
)
|
120
132
|
|
133
|
+
# Store states
|
121
134
|
self.to_create_loop = True
|
122
135
|
self.rid_to_state: Dict[str, ReqState] = {}
|
123
136
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
self.hf_config.image_grid_pinpoints if aspect_ratio == "anyres" else None
|
128
|
-
)
|
129
|
-
if self.executor is not None:
|
130
|
-
loop = asyncio.get_event_loop()
|
131
|
-
return await loop.run_in_executor(
|
132
|
-
self.executor,
|
133
|
-
get_pixel_values,
|
134
|
-
image_data,
|
135
|
-
aspect_ratio,
|
136
|
-
grid_pinpoints,
|
137
|
-
)
|
138
|
-
else:
|
139
|
-
return get_pixel_values(
|
140
|
-
image_data, aspect_ratio, grid_pinpoints, self.processor
|
141
|
-
)
|
137
|
+
# for update model weights
|
138
|
+
self.model_update_lock = asyncio.Lock()
|
139
|
+
self.model_update_result = None
|
142
140
|
|
143
141
|
async def generate_request(
|
144
142
|
self, obj: Union[GenerateReqInput, EmbeddingReqInput], request=None
|
@@ -146,6 +144,9 @@ class TokenizerManager:
|
|
146
144
|
if self.to_create_loop:
|
147
145
|
self.create_handle_loop()
|
148
146
|
|
147
|
+
while self.model_update_lock.locked():
|
148
|
+
await asyncio.sleep(0.001)
|
149
|
+
|
149
150
|
obj.post_init()
|
150
151
|
is_single = obj.is_single
|
151
152
|
|
@@ -153,9 +154,6 @@ class TokenizerManager:
|
|
153
154
|
async for response in self._handle_single_request(obj, request):
|
154
155
|
yield response
|
155
156
|
else:
|
156
|
-
if hasattr(obj, "stream") and obj.stream:
|
157
|
-
raise ValueError("Do not support stream for batch mode.")
|
158
|
-
|
159
157
|
async for response in self._handle_batch_request(obj, request):
|
160
158
|
yield response
|
161
159
|
|
@@ -163,8 +161,8 @@ class TokenizerManager:
|
|
163
161
|
self,
|
164
162
|
obj: Union[GenerateReqInput, EmbeddingReqInput],
|
165
163
|
request,
|
166
|
-
index=None,
|
167
|
-
is_cache_for_prefill=False,
|
164
|
+
index: Optional[int] = None,
|
165
|
+
is_cache_for_prefill: Optional[bool] = False,
|
168
166
|
):
|
169
167
|
if not is_cache_for_prefill: # The normal case with a single prompt
|
170
168
|
not_use_index = index is None
|
@@ -185,7 +183,7 @@ class TokenizerManager:
|
|
185
183
|
|
186
184
|
if self.is_generation:
|
187
185
|
pixel_values, image_hash, image_size = await self._get_pixel_values(
|
188
|
-
obj.image_data
|
186
|
+
obj.image_data
|
189
187
|
)
|
190
188
|
return_logprob = (
|
191
189
|
obj.return_logprob if not_use_index else obj.return_logprob[index]
|
@@ -195,6 +193,9 @@ class TokenizerManager:
|
|
195
193
|
if not_use_index
|
196
194
|
else obj.logprob_start_len[index]
|
197
195
|
)
|
196
|
+
if return_logprob and logprob_start_len == -1:
|
197
|
+
logprob_start_len = len(input_ids) - 1
|
198
|
+
|
198
199
|
top_logprobs_num = (
|
199
200
|
obj.top_logprobs_num
|
200
201
|
if not_use_index
|
@@ -245,6 +246,8 @@ class TokenizerManager:
|
|
245
246
|
top_logprobs_num = obj.top_logprobs_num[0]
|
246
247
|
|
247
248
|
if self.is_generation:
|
249
|
+
if return_logprob and logprob_start_len == -1:
|
250
|
+
logprob_start_len = len(input_ids) - 1
|
248
251
|
tokenized_obj = TokenizedGenerateReqInput(
|
249
252
|
rid,
|
250
253
|
input_text,
|
@@ -289,7 +292,7 @@ class TokenizerManager:
|
|
289
292
|
parallel_sample_num = obj.parallel_sample_num
|
290
293
|
|
291
294
|
if parallel_sample_num != 1:
|
292
|
-
# Send prefill requests to cache the common
|
295
|
+
# Send prefill requests to cache the common prefix
|
293
296
|
parallel_sample_num += 1
|
294
297
|
input_id_result = [] if obj.input_ids is None else None
|
295
298
|
for i in range(batch_size):
|
@@ -306,6 +309,7 @@ class TokenizerManager:
|
|
306
309
|
parallel_sample_num = 1
|
307
310
|
|
308
311
|
# First send out all requests
|
312
|
+
generators = []
|
309
313
|
for i in range(batch_size):
|
310
314
|
for j in range(parallel_sample_num):
|
311
315
|
if j == 0 and parallel_sample_num != 1:
|
@@ -334,6 +338,8 @@ class TokenizerManager:
|
|
334
338
|
sampling_params = self._get_sampling_params(obj.sampling_params[index])
|
335
339
|
|
336
340
|
if self.is_generation:
|
341
|
+
if obj.return_logprob[index] and obj.logprob_start_len[index] == -1:
|
342
|
+
obj.logprob_start_len[index] = len(input_ids) - 1
|
337
343
|
pixel_values, image_hash, image_size = await self._get_pixel_values(
|
338
344
|
obj.image_data[index]
|
339
345
|
)
|
@@ -364,42 +370,47 @@ class TokenizerManager:
|
|
364
370
|
state = ReqState([], False, event)
|
365
371
|
self.rid_to_state[rid] = state
|
366
372
|
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
rid = obj.rid[index]
|
377
|
-
state = self.rid_to_state[rid]
|
378
|
-
|
379
|
-
while True:
|
380
|
-
try:
|
381
|
-
await asyncio.wait_for(state.event.wait(), timeout=4)
|
382
|
-
break
|
383
|
-
except asyncio.TimeoutError:
|
384
|
-
if request is not None and await request.is_disconnected():
|
385
|
-
for rid in obj.rid:
|
386
|
-
self.abort_request(rid)
|
387
|
-
raise ValueError(f"Abort request {rid}")
|
388
|
-
continue
|
389
|
-
if self.is_generation:
|
390
|
-
output_list.append(
|
391
|
-
self.convert_logprob_style(
|
392
|
-
state.out_list[-1],
|
393
|
-
obj.return_logprob[index],
|
394
|
-
obj.top_logprobs_num[index],
|
395
|
-
obj.return_text_in_logprobs,
|
396
|
-
)
|
373
|
+
generators.append(
|
374
|
+
self._wait_for_response(
|
375
|
+
event,
|
376
|
+
state,
|
377
|
+
obj,
|
378
|
+
rid,
|
379
|
+
request,
|
380
|
+
index=index,
|
381
|
+
response_index=len(generators),
|
397
382
|
)
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
383
|
+
)
|
384
|
+
|
385
|
+
# Then process the responses based on streaming option
|
386
|
+
is_stream = hasattr(obj, "stream") and obj.stream
|
387
|
+
|
388
|
+
tasks = [asyncio.create_task(gen.__anext__()) for gen in generators]
|
389
|
+
output_list = [None] * len(tasks)
|
390
|
+
|
391
|
+
while tasks:
|
392
|
+
done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
|
393
|
+
|
394
|
+
for task in done:
|
395
|
+
cur_index = tasks.index(task)
|
396
|
+
|
397
|
+
try:
|
398
|
+
result = task.result()
|
399
|
+
|
400
|
+
if is_stream:
|
401
|
+
yield result
|
402
|
+
else:
|
403
|
+
output_list[result["index"]] = result
|
404
|
+
|
405
|
+
tasks[cur_index] = asyncio.create_task(
|
406
|
+
generators[cur_index].__anext__()
|
407
|
+
)
|
408
|
+
except StopAsyncIteration:
|
409
|
+
del generators[cur_index]
|
410
|
+
del tasks[cur_index]
|
411
|
+
|
412
|
+
if not is_stream:
|
413
|
+
yield output_list
|
403
414
|
|
404
415
|
def _validate_input_length(self, input_ids: List[int]):
|
405
416
|
if len(input_ids) >= self.context_len:
|
@@ -416,12 +427,10 @@ class TokenizerManager:
|
|
416
427
|
return sampling_params
|
417
428
|
|
418
429
|
async def _get_pixel_values(self, image_data):
|
419
|
-
if
|
420
|
-
return await self.get_pixel_values(image_data[0])
|
421
|
-
elif isinstance(image_data, str):
|
422
|
-
return await self.get_pixel_values(image_data)
|
423
|
-
else:
|
430
|
+
if image_data is None:
|
424
431
|
return None, None, None
|
432
|
+
else:
|
433
|
+
return await self._get_pixel_values_internal(image_data)
|
425
434
|
|
426
435
|
async def _wait_for_response(
|
427
436
|
self,
|
@@ -430,33 +439,38 @@ class TokenizerManager:
|
|
430
439
|
obj: Union[GenerateReqInput, EmbeddingReqInput],
|
431
440
|
rid: str,
|
432
441
|
request,
|
442
|
+
index: int = None,
|
443
|
+
response_index: int = 0,
|
433
444
|
):
|
434
445
|
while True:
|
435
446
|
try:
|
436
447
|
await asyncio.wait_for(event.wait(), timeout=4)
|
437
448
|
except asyncio.TimeoutError:
|
438
449
|
if request is not None and await request.is_disconnected():
|
439
|
-
|
450
|
+
for rid in [obj.rid] if obj.is_single else obj.rid:
|
451
|
+
self.abort_request(rid)
|
440
452
|
raise ValueError(f"Abort request {rid}")
|
441
453
|
continue
|
442
454
|
|
443
455
|
if self.is_generation:
|
444
456
|
out = self.convert_logprob_style(
|
445
457
|
state.out_list[-1],
|
446
|
-
obj.return_logprob,
|
447
|
-
|
458
|
+
obj.return_logprob if index is None else obj.return_logprob[index],
|
459
|
+
(
|
460
|
+
obj.top_logprobs_num
|
461
|
+
if index is None
|
462
|
+
else obj.top_logprobs_num[index]
|
463
|
+
),
|
448
464
|
obj.return_text_in_logprobs,
|
449
465
|
)
|
450
466
|
else: # isinstance(obj, EmbeddingReqInput)
|
451
467
|
out = state.out_list[-1]
|
452
468
|
|
469
|
+
out["index"] = response_index
|
470
|
+
|
453
471
|
# Log requests
|
454
472
|
if self.server_args.log_requests and state.finished:
|
455
|
-
|
456
|
-
in_obj = {"input_ids": obj.input_ids}
|
457
|
-
else:
|
458
|
-
in_obj = {"text": obj.text}
|
459
|
-
logger.info(f"in={in_obj}, out={out}")
|
473
|
+
logger.info(f"in={obj}, out={out}")
|
460
474
|
|
461
475
|
state.out_list = []
|
462
476
|
if state.finished:
|
@@ -500,6 +514,30 @@ class TokenizerManager:
|
|
500
514
|
req = AbortReq(rid)
|
501
515
|
self.send_to_router.send_pyobj(req)
|
502
516
|
|
517
|
+
async def update_weights(self, obj: UpdateWeightReqInput, request):
|
518
|
+
if self.to_create_loop:
|
519
|
+
self.create_handle_loop()
|
520
|
+
|
521
|
+
# default the load format to the server_args
|
522
|
+
if obj.load_format is None:
|
523
|
+
obj.load_format = self.server_args.load_format
|
524
|
+
|
525
|
+
if not self.model_update_lock.locked():
|
526
|
+
async with self.model_update_lock:
|
527
|
+
# wait for the previous generation requests to finish
|
528
|
+
while len(self.rid_to_state) > 0:
|
529
|
+
await asyncio.sleep(0)
|
530
|
+
self.send_to_router.send_pyobj(obj)
|
531
|
+
self.model_update_result = asyncio.Future()
|
532
|
+
result = await self.model_update_result
|
533
|
+
if result.success:
|
534
|
+
self.server_args.model_path = obj.model_path
|
535
|
+
self.server_args.load_format = obj.load_format
|
536
|
+
self.model_path = obj.model_path
|
537
|
+
return result.success, result.message
|
538
|
+
else:
|
539
|
+
return False, "Another update is in progress. Please try again later."
|
540
|
+
|
503
541
|
def create_abort_task(self, obj: GenerateReqInput):
|
504
542
|
# Abort the request if the client is disconnected.
|
505
543
|
async def abort_request():
|
@@ -507,7 +545,7 @@ class TokenizerManager:
|
|
507
545
|
if obj.is_single:
|
508
546
|
self.abort_request(obj.rid)
|
509
547
|
else:
|
510
|
-
for rid in obj.
|
548
|
+
for rid in obj.rid:
|
511
549
|
self.abort_request(rid)
|
512
550
|
|
513
551
|
background_tasks = BackgroundTasks()
|
@@ -515,18 +553,29 @@ class TokenizerManager:
|
|
515
553
|
return background_tasks
|
516
554
|
|
517
555
|
def create_handle_loop(self):
|
556
|
+
if not self.to_create_loop:
|
557
|
+
return
|
558
|
+
|
518
559
|
self.to_create_loop = False
|
519
560
|
loop = asyncio.get_event_loop()
|
520
561
|
loop.create_task(self.handle_loop())
|
521
562
|
|
522
563
|
async def handle_loop(self):
|
564
|
+
"""The event loop that handles requests"""
|
565
|
+
|
523
566
|
while True:
|
524
|
-
recv_obj: Union[
|
525
|
-
|
526
|
-
)
|
567
|
+
recv_obj: Union[
|
568
|
+
BatchStrOut, BatchEmbeddingOut, BatchTokenIDOut, UpdateWeightReqOutput
|
569
|
+
] = await self.recv_from_detokenizer.recv_pyobj()
|
570
|
+
|
571
|
+
if isinstance(recv_obj, UpdateWeightReqOutput):
|
572
|
+
self.model_update_result.set_result(recv_obj)
|
573
|
+
continue
|
574
|
+
|
527
575
|
assert isinstance(
|
528
576
|
recv_obj, (BatchStrOut, BatchEmbeddingOut, BatchTokenIDOut)
|
529
577
|
), f"Unexpected obj received: {type(recv_obj)}"
|
578
|
+
|
530
579
|
for i, rid in enumerate(recv_obj.rids):
|
531
580
|
state = self.rid_to_state.get(rid, None)
|
532
581
|
if state is None:
|
@@ -610,11 +659,69 @@ class TokenizerManager:
|
|
610
659
|
)
|
611
660
|
return top_logprobs
|
612
661
|
|
662
|
+
async def _get_pixel_values_internal(self, image_data, aspect_ratio=None):
|
663
|
+
aspect_ratio = (
|
664
|
+
getattr(self.hf_config, "image_aspect_ratio", None)
|
665
|
+
if aspect_ratio is None
|
666
|
+
else aspect_ratio
|
667
|
+
)
|
668
|
+
grid_pinpoints = (
|
669
|
+
self.hf_config.image_grid_pinpoints
|
670
|
+
if hasattr(self.hf_config, "image_grid_pinpoints")
|
671
|
+
and "anyres" in aspect_ratio
|
672
|
+
else None
|
673
|
+
)
|
674
|
+
|
675
|
+
if isinstance(image_data, list) and len(image_data) > 0:
|
676
|
+
pixel_values, image_hash, image_size = [], [], []
|
677
|
+
if len(image_data) > 1:
|
678
|
+
aspect_ratio = "pad" # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
|
679
|
+
for img_data in image_data:
|
680
|
+
pixel_v, image_h, image_s = await self._process_single_image(
|
681
|
+
img_data, aspect_ratio, grid_pinpoints
|
682
|
+
)
|
683
|
+
pixel_values.append(pixel_v)
|
684
|
+
image_hash.append(image_h)
|
685
|
+
image_size.append(image_s)
|
686
|
+
pixel_values = np.stack(pixel_values, axis=0)
|
687
|
+
else:
|
688
|
+
pixel_values, image_hash, image_size = await self._process_single_image(
|
689
|
+
image_data[0], aspect_ratio, grid_pinpoints
|
690
|
+
)
|
691
|
+
image_hash = [image_hash]
|
692
|
+
image_size = [image_size]
|
693
|
+
elif isinstance(image_data, str):
|
694
|
+
pixel_values, image_hash, image_size = await self._process_single_image(
|
695
|
+
image_data, aspect_ratio, grid_pinpoints
|
696
|
+
)
|
697
|
+
image_hash = [image_hash]
|
698
|
+
image_size = [image_size]
|
699
|
+
else:
|
700
|
+
pixel_values, image_hash, image_size = None, None, None
|
701
|
+
|
702
|
+
return pixel_values, image_hash, image_size
|
703
|
+
|
704
|
+
async def _process_single_image(self, image_data, aspect_ratio, grid_pinpoints):
|
705
|
+
if self.executor is not None:
|
706
|
+
loop = asyncio.get_event_loop()
|
707
|
+
return await loop.run_in_executor(
|
708
|
+
self.executor,
|
709
|
+
_process_single_image_task,
|
710
|
+
image_data,
|
711
|
+
aspect_ratio,
|
712
|
+
grid_pinpoints,
|
713
|
+
)
|
714
|
+
else:
|
715
|
+
return _process_single_image_task(
|
716
|
+
image_data, aspect_ratio, grid_pinpoints, self.processor
|
717
|
+
)
|
718
|
+
|
613
719
|
|
614
720
|
global global_processor
|
615
721
|
|
616
722
|
|
617
723
|
def init_global_processor(server_args: ServerArgs):
|
724
|
+
"""Init the global processor for multi modal models."""
|
618
725
|
global global_processor
|
619
726
|
transformers.logging.set_verbosity_error()
|
620
727
|
global_processor = get_processor(
|
@@ -624,7 +731,7 @@ def init_global_processor(server_args: ServerArgs):
|
|
624
731
|
)
|
625
732
|
|
626
733
|
|
627
|
-
def
|
734
|
+
def _process_single_image_task(
|
628
735
|
image_data, image_aspect_ratio=None, image_grid_pinpoints=None, processor=None
|
629
736
|
):
|
630
737
|
try:
|
@@ -644,8 +751,10 @@ def get_pixel_values(
|
|
644
751
|
image,
|
645
752
|
tuple(int(x * 255) for x in processor.image_processor.image_mean),
|
646
753
|
)
|
647
|
-
pixel_values = processor.image_processor(image
|
648
|
-
|
754
|
+
pixel_values = processor.image_processor(image.convert("RGB"))[
|
755
|
+
"pixel_values"
|
756
|
+
][0]
|
757
|
+
elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
|
649
758
|
pixel_values = process_anyres_image(
|
650
759
|
image, processor.image_processor, image_grid_pinpoints
|
651
760
|
)
|
@@ -654,4 +763,4 @@ def get_pixel_values(
|
|
654
763
|
pixel_values = pixel_values.astype(np.float16)
|
655
764
|
return pixel_values, image_hash, image.size
|
656
765
|
except Exception:
|
657
|
-
|
766
|
+
logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
|