sglang 0.3.1.post3__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +23 -1
- sglang/bench_latency.py +48 -33
- sglang/bench_server_latency.py +0 -6
- sglang/bench_serving.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +14 -1
- sglang/lang/interpreter.py +16 -6
- sglang/lang/ir.py +20 -4
- sglang/srt/configs/model_config.py +11 -9
- sglang/srt/constrained/fsm_cache.py +9 -1
- sglang/srt/constrained/jump_forward.py +15 -2
- sglang/srt/hf_transformers_utils.py +1 -0
- sglang/srt/layers/activation.py +4 -4
- sglang/srt/layers/attention/__init__.py +49 -0
- sglang/srt/layers/attention/flashinfer_backend.py +277 -0
- sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
- sglang/srt/layers/attention/triton_backend.py +161 -0
- sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
- sglang/srt/layers/fused_moe/patch.py +117 -0
- sglang/srt/layers/layernorm.py +4 -4
- sglang/srt/layers/logits_processor.py +19 -15
- sglang/srt/layers/pooler.py +3 -3
- sglang/srt/layers/quantization/__init__.py +0 -2
- sglang/srt/layers/radix_attention.py +6 -4
- sglang/srt/layers/sampler.py +6 -4
- sglang/srt/layers/torchao_utils.py +18 -0
- sglang/srt/lora/lora.py +20 -21
- sglang/srt/lora/lora_manager.py +97 -25
- sglang/srt/managers/detokenizer_manager.py +31 -18
- sglang/srt/managers/image_processor.py +187 -0
- sglang/srt/managers/io_struct.py +99 -75
- sglang/srt/managers/schedule_batch.py +187 -68
- sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
- sglang/srt/managers/scheduler.py +1021 -0
- sglang/srt/managers/tokenizer_manager.py +120 -247
- sglang/srt/managers/tp_worker.py +28 -925
- sglang/srt/mem_cache/memory_pool.py +34 -52
- sglang/srt/mem_cache/radix_cache.py +5 -5
- sglang/srt/model_executor/cuda_graph_runner.py +25 -25
- sglang/srt/model_executor/forward_batch_info.py +94 -97
- sglang/srt/model_executor/model_runner.py +76 -78
- sglang/srt/models/baichuan.py +10 -10
- sglang/srt/models/chatglm.py +12 -12
- sglang/srt/models/commandr.py +10 -10
- sglang/srt/models/dbrx.py +12 -12
- sglang/srt/models/deepseek.py +10 -10
- sglang/srt/models/deepseek_v2.py +14 -15
- sglang/srt/models/exaone.py +10 -10
- sglang/srt/models/gemma.py +10 -10
- sglang/srt/models/gemma2.py +11 -11
- sglang/srt/models/gpt_bigcode.py +10 -10
- sglang/srt/models/grok.py +10 -10
- sglang/srt/models/internlm2.py +10 -10
- sglang/srt/models/llama.py +22 -10
- sglang/srt/models/llama_classification.py +5 -5
- sglang/srt/models/llama_embedding.py +4 -4
- sglang/srt/models/llama_reward.py +142 -0
- sglang/srt/models/llava.py +39 -33
- sglang/srt/models/llavavid.py +31 -28
- sglang/srt/models/minicpm.py +10 -10
- sglang/srt/models/minicpm3.py +14 -15
- sglang/srt/models/mixtral.py +10 -10
- sglang/srt/models/mixtral_quant.py +10 -10
- sglang/srt/models/olmoe.py +10 -10
- sglang/srt/models/qwen.py +10 -10
- sglang/srt/models/qwen2.py +11 -11
- sglang/srt/models/qwen2_moe.py +10 -10
- sglang/srt/models/stablelm.py +10 -10
- sglang/srt/models/torch_native_llama.py +506 -0
- sglang/srt/models/xverse.py +10 -10
- sglang/srt/models/xverse_moe.py +10 -10
- sglang/srt/openai_api/adapter.py +7 -0
- sglang/srt/sampling/sampling_batch_info.py +36 -27
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +170 -119
- sglang/srt/server_args.py +54 -27
- sglang/srt/utils.py +101 -128
- sglang/test/runners.py +76 -33
- sglang/test/test_programs.py +38 -5
- sglang/test/test_utils.py +53 -9
- sglang/version.py +1 -1
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/METADATA +42 -23
- sglang-0.3.3.dist-info/RECORD +139 -0
- sglang/srt/layers/attention_backend.py +0 -482
- sglang/srt/managers/controller_multi.py +0 -207
- sglang/srt/managers/controller_single.py +0 -164
- sglang-0.3.1.post3.dist-info/RECORD +0 -134
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_manager.py

```diff
@@ -16,17 +16,13 @@ limitations under the License.
 """TokenizerManager is a process that tokenizes the text."""
 
 import asyncio
-import concurrent.futures
 import dataclasses
 import json
 import logging
-import multiprocessing as mp
 import os
 from typing import Dict, List, Optional, Tuple, Union
 
 import fastapi
-import numpy as np
-import transformers
 import uvloop
 import zmq
 import zmq.asyncio
```
```diff
@@ -38,6 +34,10 @@ from sglang.srt.hf_transformers_utils import (
     get_processor,
     get_tokenizer,
 )
+from sglang.srt.managers.image_processor import (
+    get_dummy_image_processor,
+    get_image_processor,
+)
 from sglang.srt.managers.io_struct import (
     AbortReq,
     BatchEmbeddingOut,
```
```diff
@@ -46,16 +46,16 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     FlushCacheReq,
     GenerateReqInput,
+    RewardReqInput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
+    TokenizedRewardReqInput,
     UpdateWeightReqInput,
     UpdateWeightReqOutput,
 )
-from sglang.srt.mm_utils import expand2square, process_anyres_image
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import is_generation_model, is_multimodal_model
-from sglang.utils import get_exception_traceback
+from sglang.srt.utils import is_generation_model, is_multimodal_model
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
```
```diff
@@ -84,10 +84,10 @@ class TokenizerManager:
         # Init inter-process communication
         context = zmq.asyncio.Context(2)
         self.recv_from_detokenizer = context.socket(zmq.PULL)
-        self.recv_from_detokenizer.bind(f"
+        self.recv_from_detokenizer.bind(f"ipc://{port_args.tokenizer_ipc_name}")
 
-        self.
-        self.
+        self.send_to_scheduler = context.socket(zmq.PUSH)
+        self.send_to_scheduler.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
 
         # Read model args
         self.model_path = server_args.model_path
```
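This hunk is the core of the 0.3.3 process-model change visible in the file list above: `controller_multi.py` and `controller_single.py` are gone, `scheduler.py` is new, and the TokenizerManager now pushes tokenized requests straight to the scheduler over ZeroMQ IPC instead of a controller socket. Below is a minimal, self-contained sketch of the same PULL/PUSH wiring; the `ipc://` path and payload are made up for illustration, and only the socket pattern mirrors the diff.

```python
import asyncio

import zmq
import zmq.asyncio


async def main():
    context = zmq.asyncio.Context(2)

    # Receiver binds a PULL socket on a named IPC endpoint
    # (as recv_from_detokenizer does above).
    recv_sock = context.socket(zmq.PULL)
    recv_sock.bind("ipc:///tmp/demo_scheduler_input")  # hypothetical endpoint

    # Sender connects a PUSH socket to the same endpoint
    # (as send_to_scheduler does above).
    send_sock = context.socket(zmq.PUSH)
    send_sock.connect("ipc:///tmp/demo_scheduler_input")

    # Python objects travel as pickles via send_pyobj/recv_pyobj,
    # the same calls the manager uses for its tokenized request objects.
    await send_sock.send_pyobj({"rid": "demo", "input_ids": [1, 2, 3]})
    print(await recv_sock.recv_pyobj())

    context.destroy()


asyncio.run(main())
```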
```diff
@@ -103,6 +103,8 @@ class TokenizerManager:
         self.context_len = server_args.context_length or get_context_length(
             self.hf_config
         )
+        # Create image processor placeholder
+        self.image_processor = get_dummy_image_processor()
 
         # Create tokenizer
         if server_args.skip_tokenizer_init:
@@ -117,12 +119,9 @@
             self.tokenizer = self.processor.tokenizer
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-            # We want to parallelize the image pre-processing so we
-
-
-                initializer=init_global_processor,
-                mp_context=mp.get_context("fork"),
-                initargs=(server_args,),
+            # We want to parallelize the image pre-processing so we create an executor for it
+            self.image_processor = get_image_processor(
+                self.hf_config, server_args, self.processor.image_processor
             )
         else:
             self.tokenizer = get_tokenizer(
@@ -141,7 +140,7 @@
 
     async def generate_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         request: Optional[fastapi.Request] = None,
     ):
         if self.to_create_loop:
```
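The constructor hunk above also drops the inline `concurrent.futures`/`multiprocessing` executor setup (note the orphaned `initializer=init_global_processor` and `mp_context=mp.get_context("fork")` arguments) and delegates to `get_image_processor` from the new `sglang/srt/managers/image_processor.py`. The underlying idea stays the same: keep CPU-bound image preprocessing off the asyncio event loop by fanning it out to a process pool. Below is a generic sketch of that pattern, with a stand-in `preprocess` function rather than sglang's real one (the `"fork"` context assumes a POSIX host).

```python
import asyncio
import concurrent.futures
import multiprocessing as mp


def preprocess(image_bytes: bytes) -> int:
    # Stand-in for real pixel-value extraction; any CPU-bound work goes here.
    return len(image_bytes)


class AsyncImagePreprocessor:
    def __init__(self):
        # "fork" keeps worker startup cheap, echoing the removed mp_context choice.
        self.executor = concurrent.futures.ProcessPoolExecutor(
            max_workers=2, mp_context=mp.get_context("fork")
        )

    async def process_images_async(self, images):
        loop = asyncio.get_running_loop()
        # Offload every image to the pool without blocking the event loop.
        futures = [loop.run_in_executor(self.executor, preprocess, im) for im in images]
        return await asyncio.gather(*futures)


async def main():
    prep = AsyncImagePreprocessor()
    print(await prep.process_images_async([b"fake-image-1", b"fake-image-2"]))


if __name__ == "__main__":
    asyncio.run(main())
```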
```diff
@@ -160,53 +159,72 @@
             async for response in self._handle_batch_request(obj, request):
                 yield response
 
-    async def
+    async def _send_single_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
-        request: Optional[fastapi.Request] = None,
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         index: Optional[int] = None,
+        input_id_index: Optional[int] = None,
         is_cache_for_prefill: Optional[bool] = False,
     ):
         if not is_cache_for_prefill:  # The normal case with a single prompt
-
+            if index is None:
+                rid = obj.rid
+                if hasattr(obj, "conv"):
+                    # reward model
+                    conv = obj.conv
+                    input_text = self.tokenizer.apply_chat_template(
+                        conv, tokenize=False
+                    )
+                    input_ids = self.tokenizer.encode(input_text)
+                elif obj.input_ids is None:
+                    input_text = obj.text
+                    input_ids = self.tokenizer.encode(input_text)
+                else:
+                    input_text = obj.text if obj.text is not None else None
+                    input_ids = obj.input_ids
 
-
-
-
-
-
+                sampling_params = self._get_sampling_params(obj.sampling_params)
+                if self.is_generation:
+                    image_inputs = await self.image_processor.process_images_async(
+                        obj.image_data, obj
+                    )
+                    return_logprob = obj.return_logprob
+                    logprob_start_len = obj.logprob_start_len
+                    top_logprobs_num = obj.top_logprobs_num
             else:
-
+                rid = obj.rid[index]
+                if hasattr(obj, "conv"):
+                    # reward model
+                    conv = obj.conv[index]
+                    input_text = self.tokenizer.apply_chat_template(
+                        conv, tokenize=False
+                    )
+                    input_ids = self.tokenizer.encode(input_text)
+                elif obj.input_ids is None:
+                    input_text = obj.text[input_id_index]
+                    input_ids = self.tokenizer.encode(input_text)
+                else:
+                    input_text = (
+                        obj.text[input_id_index] if obj.text is not None else None
+                    )
+                    input_ids = obj.input_ids[input_id_index]
 
-
+                sampling_params = self._get_sampling_params(obj.sampling_params[index])
+                if self.is_generation:
+                    image_inputs = await self.image_processor.process_images_async(
+                        obj.image_data[index], obj
+                    )
+                    return_logprob = obj.return_logprob[index]
+                    logprob_start_len = obj.logprob_start_len[index]
+                    top_logprobs_num = obj.top_logprobs_num[index]
 
-
-                obj.sampling_params if not_use_index else obj.sampling_params[index]
-            )
+            self._validate_input_length(input_ids)
 
-            if self.is_generation:
-                pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
-                    obj.image_data if not_use_index else obj.image_data[index]
-                )
-                modalities = obj.modalities
-                return_logprob = (
-                    obj.return_logprob if not_use_index else obj.return_logprob[index]
-                )
-                logprob_start_len = (
-                    obj.logprob_start_len
-                    if not_use_index
-                    else obj.logprob_start_len[index]
-                )
-                top_logprobs_num = (
-                    obj.top_logprobs_num
-                    if not_use_index
-                    else obj.top_logprobs_num[index]
-                )
         else:  # A prefill request to cache the common prompt for parallel sampling
             assert self.is_generation
             if obj.text is not None:
                 if isinstance(obj.text, list):
-                    input_text = obj.text[
+                    input_text = obj.text[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_text = obj.text
@@ -220,7 +238,7 @@
                     obj.input_ids[0], list
                 ):
                     # when obj["input_ids"] is List[List[int]]
-                    input_ids = obj.input_ids[
+                    input_ids = obj.input_ids[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_ids = obj.input_ids
@@ -231,7 +249,7 @@
                     obj.input_ids[0], list
                 ):
                     # when obj["input_ids"] is List[List[int]]
-                    input_ids = obj.input_ids[
+                    input_ids = obj.input_ids[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_ids = obj.input_ids
@@ -239,10 +257,9 @@
 
             sampling_params = SamplingParams(**obj.sampling_params[0])
             sampling_params.max_new_tokens = 0
-
-                obj.image_data[0]
+            image_inputs = await self.image_processor.process_images_async(
+                obj.image_data[0], obj
             )
-            modalities = obj.modalities
             return_logprob = obj.return_logprob[0]
             logprob_start_len = obj.logprob_start_len[0]
             top_logprobs_num = obj.top_logprobs_num[0]
@@ -253,34 +270,57 @@
                 rid,
                 input_text,
                 input_ids,
-
-                image_hashes,
-                image_sizes,
+                image_inputs,
                 sampling_params,
                 return_logprob,
                 logprob_start_len,
                 top_logprobs_num,
                 obj.stream,
-                modalities,
                 (
-                    obj.lora_path[
+                    obj.lora_path[input_id_index]
                     if isinstance(obj.lora_path, list)
                     else obj.lora_path
                 ),
             )
-
+        elif isinstance(obj, EmbeddingReqInput):
             tokenized_obj = TokenizedEmbeddingReqInput(
                 rid,
                 input_text,
                 input_ids,
                 sampling_params,
             )
-
+        else:
+            assert isinstance(obj, RewardReqInput)
+            tokenized_obj = TokenizedRewardReqInput(
+                rid,
+                input_text,
+                input_ids,
+                sampling_params,
+            )
+
+        self.send_to_scheduler.send_pyobj(tokenized_obj)
+        return rid, input_ids
+
+    async def _handle_single_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
+        request: Optional[fastapi.Request] = None,
+        index: Optional[int] = None,
+        input_id_index: Optional[int] = None,
+        is_cache_for_prefill: Optional[bool] = False,
+    ):
+        rid, input_ids = await self._send_single_request(
+            obj,
+            index,
+            input_id_index=input_id_index,
+            is_cache_for_prefill=is_cache_for_prefill,
+        )
 
         # Recv results
         event = asyncio.Event()
         state = ReqState([], False, event)
         self.rid_to_state[rid] = state
+
         if not is_cache_for_prefill:
             async for response in self._wait_for_response(state, obj, rid, request):
                 yield response
@@ -291,7 +331,7 @@
 
     async def _handle_batch_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         request: Optional[fastapi.Request] = None,
     ):
         batch_size = obj.batch_size
@@ -304,14 +344,16 @@
             input_id_result = [] if obj.input_ids is None else None
             for i in range(batch_size):
                 async for input_id in self._handle_single_request(
-                    obj,
+                    obj,
+                    request,
+                    index=i,
+                    input_id_index=i,
+                    is_cache_for_prefill=True,
                 ):
                     if input_id_result is not None:
                         input_id_result.append(input_id)
-            if input_id_result is not None
+            if input_id_result is not None:
                 obj.input_ids = input_id_result
-            elif input_id_result is not None:
-                obj.input_ids = input_id_result[0]
         else:
             parallel_sample_num = 1
 
@@ -325,58 +367,10 @@
                 if parallel_sample_num != 1:
                     # Here when using parallel sampling we should consider prefill stage so the index is : j + i * (parallel_sample_num-1) + batch_size - 1
                     index += batch_size - 1 - i
-                rid = obj.rid[index]
-                if parallel_sample_num == 1:
-                    ## select operation
-                    if obj.input_ids is None:
-                        input_text = obj.text[i]
-                        input_ids = self.tokenizer.encode(obj.text[i])
-                    else:
-                        input_text = None
-                        input_ids = obj.input_ids[i]
-                else:
-                    assert obj.input_ids is not None
-                    if batch_size == 1:
-                        input_text = None
-                        input_ids = obj.input_ids
-                    else:
-                        input_text = None
-                        input_ids = obj.input_ids[i]
-                sampling_params = self._get_sampling_params(obj.sampling_params[index])
-
-                if self.is_generation:
-                    pixel_values, image_hashes, image_sizes = (
-                        await self._get_pixel_values(obj.image_data[index])
-                    )
-                    modalities = obj.modalities
 
-
-
-
-                        input_ids,
-                        pixel_values,
-                        image_hashes,
-                        image_sizes,
-                        sampling_params,
-                        obj.return_logprob[index],
-                        obj.logprob_start_len[index],
-                        obj.top_logprobs_num[index],
-                        obj.stream,
-                        modalities,
-                        (
-                            obj.lora_path[index]
-                            if isinstance(obj.lora_path, list)
-                            else obj.lora_path
-                        ),
-                    )
-                else:
-                    tokenized_obj = TokenizedEmbeddingReqInput(
-                        rid,
-                        input_text,
-                        input_ids,
-                        sampling_params,
-                    )
-                self.send_to_controller.send_pyobj(tokenized_obj)
+                rid, _ = await self._send_single_request(
+                    obj, index, input_id_index=i, is_cache_for_prefill=False
+                )
 
                 event = asyncio.Event()
                 state = ReqState([], False, event)
@@ -399,7 +393,7 @@
         tasks = [asyncio.create_task(gen.__anext__()) for gen in generators]
         output_list = [None] * len(tasks)
 
-        #
+        # Fetch results
         while tasks:
             done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
 
@@ -441,7 +435,7 @@
     async def _wait_for_response(
         self,
        state: ReqState,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         rid: str,
         request: Optional[fastapi.Request] = None,
         index: Optional[int] = None,
@@ -468,7 +462,7 @@
                 ),
                 obj.return_text_in_logprobs,
             )
-        else:  # isinstance(obj, EmbeddingReqInput)
+        else:  # isinstance(obj, (EmbeddingReqInput, RewardReqInput))
             out = state.out_list[-1]
 
         out["index"] = response_index
@@ -509,14 +503,14 @@
 
     def flush_cache(self):
         req = FlushCacheReq()
-        self.
+        self.send_to_scheduler.send_pyobj(req)
 
     def abort_request(self, rid: str):
         if rid not in self.rid_to_state:
             return
         del self.rid_to_state[rid]
         req = AbortReq(rid)
-        self.
+        self.send_to_scheduler.send_pyobj(req)
 
     async def update_weights(
         self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
@@ -532,8 +526,8 @@
         async with self.model_update_lock:
             # wait for the previous generation requests to finish
             while len(self.rid_to_state) > 0:
-                await asyncio.sleep(0)
-            self.
+                await asyncio.sleep(0.001)
+            self.send_to_scheduler.send_pyobj(obj)
         self.model_update_result = asyncio.Future()
         result = await self.model_update_result
         if result.success:
@@ -644,6 +638,7 @@
     def detokenize_logprob_tokens(
         self, token_logprobs: List[Tuple[float, int]], decode_to_text: bool
     ):
+        # TODO(lianmin): This should run on DetokenizerManager
        if not decode_to_text:
            return [(logprob, token_id, None) for logprob, token_id in token_logprobs]
 
@@ -664,125 +659,3 @@
             token_top_logprobs, decode_to_text
         )
         return top_logprobs
-
-    async def _get_pixel_values(self, image_data: List[Union[str, bytes]]):
-        if not image_data:
-            return None, None, None
-
-        aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
-        grid_pinpoints = (
-            self.hf_config.image_grid_pinpoints
-            if hasattr(self.hf_config, "image_grid_pinpoints")
-            and "anyres" in aspect_ratio
-            else None
-        )
-
-        if isinstance(image_data, list) and len(image_data) > 0:
-            # Multiple images
-            if len(image_data) > 1:
-                aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
-                pixel_values, image_hashes, image_sizes = [], [], []
-                for img_data in image_data:
-                    pixel_v, image_h, image_s = await self._process_single_image(
-                        img_data, aspect_ratio, grid_pinpoints
-                    )
-                    pixel_values.append(pixel_v)
-                    image_hashes.append(image_h)
-                    image_sizes.append(image_s)
-
-                if isinstance(pixel_values[0], np.ndarray):
-                    pixel_values = np.stack(pixel_values, axis=0)
-            else:
-                # A single image
-                pixel_values, image_hash, image_size = await self._process_single_image(
-                    image_data[0], aspect_ratio, grid_pinpoints
-                )
-                image_hashes = [image_hash]
-                image_sizes = [image_size]
-        elif isinstance(image_data, str):
-            # A single image
-            pixel_values, image_hash, image_size = await self._process_single_image(
-                image_data, aspect_ratio, grid_pinpoints
-            )
-            image_hashes = [image_hash]
-            image_sizes = [image_size]
-        else:
-            raise ValueError(f"Invalid image data: {image_data}")
-
-        return pixel_values, image_hashes, image_sizes
-
-    async def _process_single_image(
-        self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str
-    ):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            return await loop.run_in_executor(
-                self.executor,
-                _process_single_image_task,
-                image_data,
-                aspect_ratio,
-                grid_pinpoints,
-            )
-        else:
-            return _process_single_image_task(
-                image_data, aspect_ratio, grid_pinpoints, self.processor
-            )
-
-
-global global_processor
-
-
-def init_global_processor(server_args: ServerArgs):
-    """Init the global processor for multi modal models."""
-    global global_processor
-    transformers.logging.set_verbosity_error()
-    global_processor = get_processor(
-        server_args.tokenizer_path,
-        tokenizer_mode=server_args.tokenizer_mode,
-        trust_remote_code=server_args.trust_remote_code,
-    )
-
-
-def _process_single_image_task(
-    image_data: Union[str, bytes],
-    image_aspect_ratio: Optional[str] = None,
-    image_grid_pinpoints: Optional[str] = None,
-    processor=None,
-):
-    try:
-        processor = processor or global_processor
-        image, image_size = load_image(image_data)
-        if image_size is not None:
-            # It is a video with multiple images
-            image_hash = hash(image_data)
-            pixel_values = processor.image_processor(image)["pixel_values"]
-            for _ in range(len(pixel_values)):
-                pixel_values[_] = pixel_values[_].astype(np.float16)
-            pixel_values = np.stack(pixel_values, axis=0)
-            return pixel_values, image_hash, image_size
-        else:
-            # It is an image
-            image_hash = hash(image_data)
-            if image_aspect_ratio == "pad":
-                image = expand2square(
-                    image,
-                    tuple(int(x * 255) for x in processor.image_processor.image_mean),
-                )
-                pixel_values = processor.image_processor(image.convert("RGB"))[
-                    "pixel_values"
-                ][0]
-            elif image_aspect_ratio == "anyres" or (
-                image_aspect_ratio is not None and "anyres_max" in image_aspect_ratio
-            ):
-                pixel_values = process_anyres_image(
-                    image, processor.image_processor, image_grid_pinpoints
-                )
-            else:
-                pixel_values = processor.image_processor(image)["pixel_values"][0]
-
-            if isinstance(pixel_values, np.ndarray):
-                pixel_values = pixel_values.astype(np.float16)
-
-            return pixel_values, image_hash, image.size
-    except Exception:
-        logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
```