sglang 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/__init__.py +2 -0
- sglang/api.py +23 -1
- sglang/bench_latency.py +46 -25
- sglang/bench_serving.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +14 -1
- sglang/lang/interpreter.py +16 -6
- sglang/lang/ir.py +20 -4
- sglang/srt/configs/model_config.py +11 -9
- sglang/srt/constrained/fsm_cache.py +9 -1
- sglang/srt/constrained/jump_forward.py +15 -2
- sglang/srt/layers/activation.py +4 -4
- sglang/srt/layers/attention/__init__.py +49 -0
- sglang/srt/layers/attention/flashinfer_backend.py +277 -0
- sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
- sglang/srt/layers/attention/triton_backend.py +161 -0
- sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
- sglang/srt/layers/layernorm.py +4 -4
- sglang/srt/layers/logits_processor.py +19 -15
- sglang/srt/layers/pooler.py +3 -3
- sglang/srt/layers/quantization/__init__.py +0 -2
- sglang/srt/layers/radix_attention.py +6 -4
- sglang/srt/layers/sampler.py +6 -4
- sglang/srt/layers/torchao_utils.py +18 -0
- sglang/srt/lora/lora.py +20 -21
- sglang/srt/lora/lora_manager.py +97 -25
- sglang/srt/managers/detokenizer_manager.py +31 -18
- sglang/srt/managers/image_processor.py +187 -0
- sglang/srt/managers/io_struct.py +99 -75
- sglang/srt/managers/schedule_batch.py +184 -63
- sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
- sglang/srt/managers/scheduler.py +1021 -0
- sglang/srt/managers/tokenizer_manager.py +120 -248
- sglang/srt/managers/tp_worker.py +28 -925
- sglang/srt/mem_cache/memory_pool.py +34 -52
- sglang/srt/model_executor/cuda_graph_runner.py +15 -19
- sglang/srt/model_executor/forward_batch_info.py +94 -95
- sglang/srt/model_executor/model_runner.py +76 -75
- sglang/srt/models/baichuan.py +10 -10
- sglang/srt/models/chatglm.py +12 -12
- sglang/srt/models/commandr.py +10 -10
- sglang/srt/models/dbrx.py +12 -12
- sglang/srt/models/deepseek.py +10 -10
- sglang/srt/models/deepseek_v2.py +14 -15
- sglang/srt/models/exaone.py +10 -10
- sglang/srt/models/gemma.py +10 -10
- sglang/srt/models/gemma2.py +11 -11
- sglang/srt/models/gpt_bigcode.py +10 -10
- sglang/srt/models/grok.py +10 -10
- sglang/srt/models/internlm2.py +10 -10
- sglang/srt/models/llama.py +14 -10
- sglang/srt/models/llama_classification.py +5 -5
- sglang/srt/models/llama_embedding.py +4 -4
- sglang/srt/models/llama_reward.py +142 -0
- sglang/srt/models/llava.py +39 -33
- sglang/srt/models/llavavid.py +31 -28
- sglang/srt/models/minicpm.py +10 -10
- sglang/srt/models/minicpm3.py +14 -15
- sglang/srt/models/mixtral.py +10 -10
- sglang/srt/models/mixtral_quant.py +10 -10
- sglang/srt/models/olmoe.py +10 -10
- sglang/srt/models/qwen.py +10 -10
- sglang/srt/models/qwen2.py +11 -11
- sglang/srt/models/qwen2_moe.py +10 -10
- sglang/srt/models/stablelm.py +10 -10
- sglang/srt/models/torch_native_llama.py +506 -0
- sglang/srt/models/xverse.py +10 -10
- sglang/srt/models/xverse_moe.py +10 -10
- sglang/srt/sampling/sampling_batch_info.py +36 -27
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +170 -119
- sglang/srt/server_args.py +54 -27
- sglang/srt/utils.py +101 -128
- sglang/test/runners.py +71 -26
- sglang/test/test_programs.py +38 -5
- sglang/test/test_utils.py +18 -9
- sglang/version.py +1 -1
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/METADATA +37 -19
- sglang-0.3.3.dist-info/RECORD +139 -0
- sglang/srt/layers/attention_backend.py +0 -474
- sglang/srt/managers/controller_multi.py +0 -207
- sglang/srt/managers/controller_single.py +0 -164
- sglang-0.3.2.dist-info/RECORD +0 -135
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_manager.py
@@ -16,17 +16,13 @@ limitations under the License.
 """TokenizerManager is a process that tokenizes the text."""
 
 import asyncio
-import concurrent.futures
 import dataclasses
 import json
 import logging
-import multiprocessing as mp
 import os
 from typing import Dict, List, Optional, Tuple, Union
 
 import fastapi
-import numpy as np
-import transformers
 import uvloop
 import zmq
 import zmq.asyncio
@@ -38,6 +34,10 @@ from sglang.srt.hf_transformers_utils import (
     get_processor,
     get_tokenizer,
 )
+from sglang.srt.managers.image_processor import (
+    get_dummy_image_processor,
+    get_image_processor,
+)
 from sglang.srt.managers.io_struct import (
     AbortReq,
     BatchEmbeddingOut,
@@ -46,16 +46,16 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     FlushCacheReq,
     GenerateReqInput,
+    RewardReqInput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
+    TokenizedRewardReqInput,
     UpdateWeightReqInput,
     UpdateWeightReqOutput,
 )
-from sglang.srt.mm_utils import expand2square, process_anyres_image
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import is_generation_model, is_multimodal_model
-from sglang.utils import get_exception_traceback
+from sglang.srt.utils import is_generation_model, is_multimodal_model
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
@@ -84,10 +84,10 @@ class TokenizerManager:
         # Init inter-process communication
         context = zmq.asyncio.Context(2)
         self.recv_from_detokenizer = context.socket(zmq.PULL)
-        self.recv_from_detokenizer.bind(f"
+        self.recv_from_detokenizer.bind(f"ipc://{port_args.tokenizer_ipc_name}")
 
-        self.
-        self.
+        self.send_to_scheduler = context.socket(zmq.PUSH)
+        self.send_to_scheduler.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
 
         # Read model args
         self.model_path = server_args.model_path
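
Note: the hunk above rewires TokenizerManager's inter-process communication to a single send_to_scheduler PUSH socket and moves the transport to IPC endpoints. As background, here is a minimal, self-contained pyzmq sketch of one PUSH/PULL channel over IPC; the endpoint path and payload are made up for illustration and are not sglang's actual configuration.

# Sketch of a PUSH/PULL IPC channel with pyzmq's asyncio sockets (illustrative only).
import asyncio

import zmq
import zmq.asyncio


async def main():
    context = zmq.asyncio.Context()

    # Receiving side: bind a PULL socket on an IPC endpoint.
    pull = context.socket(zmq.PULL)
    pull.bind("ipc:///tmp/example_scheduler_input")

    # Sending side: connect a PUSH socket to the same endpoint.
    push = context.socket(zmq.PUSH)
    push.connect("ipc:///tmp/example_scheduler_input")

    # send_pyobj/recv_pyobj pickle and unpickle arbitrary Python objects.
    await push.send_pyobj({"rid": "req-0", "input_ids": [1, 2, 3]})
    print(await pull.recv_pyobj())

    push.close()
    pull.close()
    context.term()


asyncio.run(main())
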
@@ -103,6 +103,8 @@ class TokenizerManager:
         self.context_len = server_args.context_length or get_context_length(
             self.hf_config
         )
+        # Create image processor placeholder
+        self.image_processor = get_dummy_image_processor()
 
         # Create tokenizer
         if server_args.skip_tokenizer_init:
@@ -117,13 +119,9 @@ class TokenizerManager:
             self.tokenizer = self.processor.tokenizer
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-            # We want to parallelize the image pre-processing so we
-
-
-                initializer=init_global_processor,
-                mp_context=mp.get_context("fork"),
-                initargs=(server_args,),
-                max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
+            # We want to parallelize the image pre-processing so we create an executor for it
+            self.image_processor = get_image_processor(
+                self.hf_config, server_args, self.processor.image_processor
             )
         else:
             self.tokenizer = get_tokenizer(
@@ -142,7 +140,7 @@ class TokenizerManager:
 
     async def generate_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         request: Optional[fastapi.Request] = None,
     ):
         if self.to_create_loop:
@@ -161,53 +159,72 @@ class TokenizerManager:
             async for response in self._handle_batch_request(obj, request):
                 yield response
 
-    async def
+    async def _send_single_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
-        request: Optional[fastapi.Request] = None,
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         index: Optional[int] = None,
+        input_id_index: Optional[int] = None,
         is_cache_for_prefill: Optional[bool] = False,
     ):
         if not is_cache_for_prefill:  # The normal case with a single prompt
-
+            if index is None:
+                rid = obj.rid
+                if hasattr(obj, "conv"):
+                    # reward model
+                    conv = obj.conv
+                    input_text = self.tokenizer.apply_chat_template(
+                        conv, tokenize=False
+                    )
+                    input_ids = self.tokenizer.encode(input_text)
+                elif obj.input_ids is None:
+                    input_text = obj.text
+                    input_ids = self.tokenizer.encode(input_text)
+                else:
+                    input_text = obj.text if obj.text is not None else None
+                    input_ids = obj.input_ids
 
-
-
-
-
-
+                sampling_params = self._get_sampling_params(obj.sampling_params)
+                if self.is_generation:
+                    image_inputs = await self.image_processor.process_images_async(
+                        obj.image_data, obj
+                    )
+                    return_logprob = obj.return_logprob
+                    logprob_start_len = obj.logprob_start_len
+                    top_logprobs_num = obj.top_logprobs_num
             else:
-
+                rid = obj.rid[index]
+                if hasattr(obj, "conv"):
+                    # reward model
+                    conv = obj.conv[index]
+                    input_text = self.tokenizer.apply_chat_template(
+                        conv, tokenize=False
+                    )
+                    input_ids = self.tokenizer.encode(input_text)
+                elif obj.input_ids is None:
+                    input_text = obj.text[input_id_index]
+                    input_ids = self.tokenizer.encode(input_text)
+                else:
+                    input_text = (
+                        obj.text[input_id_index] if obj.text is not None else None
+                    )
+                    input_ids = obj.input_ids[input_id_index]
 
-
+                sampling_params = self._get_sampling_params(obj.sampling_params[index])
+                if self.is_generation:
+                    image_inputs = await self.image_processor.process_images_async(
+                        obj.image_data[index], obj
+                    )
+                    return_logprob = obj.return_logprob[index]
+                    logprob_start_len = obj.logprob_start_len[index]
+                    top_logprobs_num = obj.top_logprobs_num[index]
 
-
-                obj.sampling_params if not_use_index else obj.sampling_params[index]
-            )
+            self._validate_input_length(input_ids)
 
-            if self.is_generation:
-                pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
-                    obj.image_data if not_use_index else obj.image_data[index]
-                )
-                modalities = obj.modalities
-                return_logprob = (
-                    obj.return_logprob if not_use_index else obj.return_logprob[index]
-                )
-                logprob_start_len = (
-                    obj.logprob_start_len
-                    if not_use_index
-                    else obj.logprob_start_len[index]
-                )
-                top_logprobs_num = (
-                    obj.top_logprobs_num
-                    if not_use_index
-                    else obj.top_logprobs_num[index]
-                )
         else:  # A prefill request to cache the common prompt for parallel sampling
             assert self.is_generation
             if obj.text is not None:
                 if isinstance(obj.text, list):
-                    input_text = obj.text[
+                    input_text = obj.text[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_text = obj.text
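
Note: in the new reward-model branch above, the prompt is built by rendering the conversation with the tokenizer's chat template and then encoding the rendered text. The snippet below is a standalone sketch of that two-step flow using the Hugging Face tokenizer API; the model name and messages are placeholders, not values used by sglang.

# Sketch: conversation -> chat-template text -> input_ids, mirroring the flow above.
from transformers import AutoTokenizer

# Placeholder model; any chat model that ships a chat template works the same way.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

conv = [
    {"role": "user", "content": "Is the sky blue?"},
    {"role": "assistant", "content": "Yes, on a clear day."},
]

# Render the conversation with the model's chat template, without tokenizing yet.
input_text = tokenizer.apply_chat_template(conv, tokenize=False)

# Encode the rendered prompt into token ids.
input_ids = tokenizer.encode(input_text)
print(len(input_ids))
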
@@ -221,7 +238,7 @@ class TokenizerManager:
                     obj.input_ids[0], list
                 ):
                     # when obj["input_ids"] is List[List[int]]
-                    input_ids = obj.input_ids[
+                    input_ids = obj.input_ids[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_ids = obj.input_ids
@@ -232,7 +249,7 @@ class TokenizerManager:
                     obj.input_ids[0], list
                 ):
                     # when obj["input_ids"] is List[List[int]]
-                    input_ids = obj.input_ids[
+                    input_ids = obj.input_ids[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_ids = obj.input_ids
@@ -240,10 +257,9 @@ class TokenizerManager:
 
             sampling_params = SamplingParams(**obj.sampling_params[0])
             sampling_params.max_new_tokens = 0
-
-                obj.image_data[0]
+            image_inputs = await self.image_processor.process_images_async(
+                obj.image_data[0], obj
             )
-            modalities = obj.modalities
             return_logprob = obj.return_logprob[0]
             logprob_start_len = obj.logprob_start_len[0]
             top_logprobs_num = obj.top_logprobs_num[0]
@@ -254,34 +270,57 @@ class TokenizerManager:
                 rid,
                 input_text,
                 input_ids,
-
-                image_hashes,
-                image_sizes,
+                image_inputs,
                 sampling_params,
                 return_logprob,
                 logprob_start_len,
                 top_logprobs_num,
                 obj.stream,
-                modalities,
                 (
-                    obj.lora_path[
+                    obj.lora_path[input_id_index]
                     if isinstance(obj.lora_path, list)
                     else obj.lora_path
                 ),
             )
-
+        elif isinstance(obj, EmbeddingReqInput):
             tokenized_obj = TokenizedEmbeddingReqInput(
                 rid,
                 input_text,
                 input_ids,
                 sampling_params,
             )
-
+        else:
+            assert isinstance(obj, RewardReqInput)
+            tokenized_obj = TokenizedRewardReqInput(
+                rid,
+                input_text,
+                input_ids,
+                sampling_params,
+            )
+
+        self.send_to_scheduler.send_pyobj(tokenized_obj)
+        return rid, input_ids
+
+    async def _handle_single_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
+        request: Optional[fastapi.Request] = None,
+        index: Optional[int] = None,
+        input_id_index: Optional[int] = None,
+        is_cache_for_prefill: Optional[bool] = False,
+    ):
+        rid, input_ids = await self._send_single_request(
+            obj,
+            index,
+            input_id_index=input_id_index,
+            is_cache_for_prefill=is_cache_for_prefill,
+        )
 
         # Recv results
         event = asyncio.Event()
         state = ReqState([], False, event)
         self.rid_to_state[rid] = state
+
         if not is_cache_for_prefill:
             async for response in self._wait_for_response(state, obj, rid, request):
                 yield response
@@ -292,7 +331,7 @@ class TokenizerManager:
 
     async def _handle_batch_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         request: Optional[fastapi.Request] = None,
     ):
         batch_size = obj.batch_size
@@ -305,14 +344,16 @@ class TokenizerManager:
             input_id_result = [] if obj.input_ids is None else None
             for i in range(batch_size):
                 async for input_id in self._handle_single_request(
-                    obj,
+                    obj,
+                    request,
+                    index=i,
+                    input_id_index=i,
+                    is_cache_for_prefill=True,
                 ):
                     if input_id_result is not None:
                         input_id_result.append(input_id)
-            if input_id_result is not None
+            if input_id_result is not None:
                 obj.input_ids = input_id_result
-            elif input_id_result is not None:
-                obj.input_ids = input_id_result[0]
         else:
             parallel_sample_num = 1
 
@@ -326,58 +367,10 @@ class TokenizerManager:
                 if parallel_sample_num != 1:
                     # Here when using parallel sampling we should consider prefill stage so the index is : j + i * (parallel_sample_num-1) + batch_size - 1
                     index += batch_size - 1 - i
-                rid = obj.rid[index]
-                if parallel_sample_num == 1:
-                    ## select operation
-                    if obj.input_ids is None:
-                        input_text = obj.text[i]
-                        input_ids = self.tokenizer.encode(obj.text[i])
-                    else:
-                        input_text = None
-                        input_ids = obj.input_ids[i]
-                else:
-                    assert obj.input_ids is not None
-                    if batch_size == 1:
-                        input_text = None
-                        input_ids = obj.input_ids
-                    else:
-                        input_text = None
-                        input_ids = obj.input_ids[i]
-                sampling_params = self._get_sampling_params(obj.sampling_params[index])
-
-                if self.is_generation:
-                    pixel_values, image_hashes, image_sizes = (
-                        await self._get_pixel_values(obj.image_data[index])
-                    )
-                    modalities = obj.modalities
 
-
-
-
-                        input_ids,
-                        pixel_values,
-                        image_hashes,
-                        image_sizes,
-                        sampling_params,
-                        obj.return_logprob[index],
-                        obj.logprob_start_len[index],
-                        obj.top_logprobs_num[index],
-                        obj.stream,
-                        modalities,
-                        (
-                            obj.lora_path[index]
-                            if isinstance(obj.lora_path, list)
-                            else obj.lora_path
-                        ),
-                    )
-                else:
-                    tokenized_obj = TokenizedEmbeddingReqInput(
-                        rid,
-                        input_text,
-                        input_ids,
-                        sampling_params,
-                    )
-                self.send_to_controller.send_pyobj(tokenized_obj)
+                rid, _ = await self._send_single_request(
+                    obj, index, input_id_index=i, is_cache_for_prefill=False
+                )
 
                 event = asyncio.Event()
                 state = ReqState([], False, event)
@@ -400,7 +393,7 @@ class TokenizerManager:
         tasks = [asyncio.create_task(gen.__anext__()) for gen in generators]
         output_list = [None] * len(tasks)
 
-        #
+        # Fetch results
         while tasks:
            done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
 
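
Note: the context lines above drain several async generators concurrently by wrapping each generator's __anext__() in a task and waiting with FIRST_COMPLETED. Below is a small self-contained sketch of that fan-in pattern; the dummy generators and names are illustrative and are not sglang code.

# Sketch of the fan-in loop: poll many async generators, re-arm whichever finished first.
import asyncio


async def dummy_stream(name: str, n: int):
    for i in range(n):
        await asyncio.sleep(0.01)
        yield f"{name}-{i}"


async def main():
    generators = [dummy_stream("a", 3), dummy_stream("b", 2)]
    tasks = [asyncio.create_task(gen.__anext__()) for gen in generators]
    task_to_gen = dict(zip(tasks, generators))

    while tasks:
        done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            tasks.remove(task)
            gen = task_to_gen.pop(task)
            try:
                print(task.result())
            except StopAsyncIteration:
                continue  # this generator is exhausted
            # Re-arm: ask the same generator for its next item.
            new_task = asyncio.create_task(gen.__anext__())
            tasks.append(new_task)
            task_to_gen[new_task] = gen


asyncio.run(main())
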
@@ -442,7 +435,7 @@ class TokenizerManager:
     async def _wait_for_response(
         self,
         state: ReqState,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         rid: str,
         request: Optional[fastapi.Request] = None,
         index: Optional[int] = None,
@@ -469,7 +462,7 @@ class TokenizerManager:
                     ),
                     obj.return_text_in_logprobs,
                 )
-            else:  # isinstance(obj, EmbeddingReqInput)
+            else:  # isinstance(obj, (EmbeddingReqInput, RewardReqInput))
                 out = state.out_list[-1]
 
             out["index"] = response_index
@@ -510,14 +503,14 @@ class TokenizerManager:
 
     def flush_cache(self):
         req = FlushCacheReq()
-        self.
+        self.send_to_scheduler.send_pyobj(req)
 
     def abort_request(self, rid: str):
        if rid not in self.rid_to_state:
            return
        del self.rid_to_state[rid]
        req = AbortReq(rid)
-        self.
+        self.send_to_scheduler.send_pyobj(req)
 
     async def update_weights(
         self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
@@ -533,8 +526,8 @@ class TokenizerManager:
         async with self.model_update_lock:
             # wait for the previous generation requests to finish
             while len(self.rid_to_state) > 0:
-                await asyncio.sleep(0)
-            self.
+                await asyncio.sleep(0.001)
+            self.send_to_scheduler.send_pyobj(obj)
             self.model_update_result = asyncio.Future()
             result = await self.model_update_result
             if result.success:
@@ -645,6 +638,7 @@ class TokenizerManager:
     def detokenize_logprob_tokens(
         self, token_logprobs: List[Tuple[float, int]], decode_to_text: bool
     ):
+        # TODO(lianmin): This should run on DetokenizerManager
         if not decode_to_text:
             return [(logprob, token_id, None) for logprob, token_id in token_logprobs]
 
@@ -665,125 +659,3 @@ class TokenizerManager:
             token_top_logprobs, decode_to_text
         )
         return top_logprobs
-
-    async def _get_pixel_values(self, image_data: List[Union[str, bytes]]):
-        if not image_data:
-            return None, None, None
-
-        aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
-        grid_pinpoints = (
-            self.hf_config.image_grid_pinpoints
-            if hasattr(self.hf_config, "image_grid_pinpoints")
-            and "anyres" in aspect_ratio
-            else None
-        )
-
-        if isinstance(image_data, list) and len(image_data) > 0:
-            # Multiple images
-            if len(image_data) > 1:
-                aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
-                pixel_values, image_hashes, image_sizes = [], [], []
-                for img_data in image_data:
-                    pixel_v, image_h, image_s = await self._process_single_image(
-                        img_data, aspect_ratio, grid_pinpoints
-                    )
-                    pixel_values.append(pixel_v)
-                    image_hashes.append(image_h)
-                    image_sizes.append(image_s)
-
-                if isinstance(pixel_values[0], np.ndarray):
-                    pixel_values = np.stack(pixel_values, axis=0)
-            else:
-                # A single image
-                pixel_values, image_hash, image_size = await self._process_single_image(
-                    image_data[0], aspect_ratio, grid_pinpoints
-                )
-                image_hashes = [image_hash]
-                image_sizes = [image_size]
-        elif isinstance(image_data, str):
-            # A single image
-            pixel_values, image_hash, image_size = await self._process_single_image(
-                image_data, aspect_ratio, grid_pinpoints
-            )
-            image_hashes = [image_hash]
-            image_sizes = [image_size]
-        else:
-            raise ValueError(f"Invalid image data: {image_data}")
-
-        return pixel_values, image_hashes, image_sizes
-
-    async def _process_single_image(
-        self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str
-    ):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            return await loop.run_in_executor(
-                self.executor,
-                _process_single_image_task,
-                image_data,
-                aspect_ratio,
-                grid_pinpoints,
-            )
-        else:
-            return _process_single_image_task(
-                image_data, aspect_ratio, grid_pinpoints, self.processor
-            )
-
-
-global global_processor
-
-
-def init_global_processor(server_args: ServerArgs):
-    """Init the global processor for multi modal models."""
-    global global_processor
-    transformers.logging.set_verbosity_error()
-    global_processor = get_processor(
-        server_args.tokenizer_path,
-        tokenizer_mode=server_args.tokenizer_mode,
-        trust_remote_code=server_args.trust_remote_code,
-    )
-
-
-def _process_single_image_task(
-    image_data: Union[str, bytes],
-    image_aspect_ratio: Optional[str] = None,
-    image_grid_pinpoints: Optional[str] = None,
-    processor=None,
-):
-    try:
-        processor = processor or global_processor
-        image, image_size = load_image(image_data)
-        if image_size is not None:
-            # It is a video with multiple images
-            image_hash = hash(image_data)
-            pixel_values = processor.image_processor(image)["pixel_values"]
-            for _ in range(len(pixel_values)):
-                pixel_values[_] = pixel_values[_].astype(np.float16)
-            pixel_values = np.stack(pixel_values, axis=0)
-            return pixel_values, image_hash, image_size
-        else:
-            # It is an image
-            image_hash = hash(image_data)
-            if image_aspect_ratio == "pad":
-                image = expand2square(
-                    image,
-                    tuple(int(x * 255) for x in processor.image_processor.image_mean),
-                )
-                pixel_values = processor.image_processor(image.convert("RGB"))[
-                    "pixel_values"
-                ][0]
-            elif image_aspect_ratio == "anyres" or (
-                image_aspect_ratio is not None and "anyres_max" in image_aspect_ratio
-            ):
-                pixel_values = process_anyres_image(
-                    image, processor.image_processor, image_grid_pinpoints
-                )
-            else:
-                pixel_values = processor.image_processor(image)["pixel_values"][0]
-
-            if isinstance(pixel_values, np.ndarray):
-                pixel_values = pixel_values.astype(np.float16)
-
-            return pixel_values, image_hash, image.size
-    except Exception:
-        logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
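
Note: the block removed above (_get_pixel_values, _process_single_image, and the process-pool helpers) corresponds to image preprocessing that, per the file list, now lives in the new sglang/srt/managers/image_processor.py. The core idea of the removed code was to keep the event loop responsive by running CPU-bound preprocessing in an executor. The sketch below shows that pattern with a stand-in preprocess_image function; it is not sglang's implementation.

# Sketch: offload blocking, CPU-bound work from asyncio to a ProcessPoolExecutor.
# preprocess_image is a stand-in for real pixel processing.
import asyncio
import concurrent.futures
import hashlib


def preprocess_image(image_bytes: bytes) -> dict:
    # Pretend this is expensive pixel work; here we just hash the payload.
    digest = hashlib.sha256(image_bytes).hexdigest()
    return {"image_hash": digest, "num_bytes": len(image_bytes)}


async def process_images(images: list) -> list:
    loop = asyncio.get_running_loop()
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        # run_in_executor keeps the event loop free while workers do the heavy lifting.
        futures = [loop.run_in_executor(executor, preprocess_image, img) for img in images]
        return await asyncio.gather(*futures)


if __name__ == "__main__":
    print(asyncio.run(process_images([b"fake-image-1", b"fake-image-2"])))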