sglang 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (87)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +23 -1
  3. sglang/bench_latency.py +46 -25
  4. sglang/bench_serving.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +14 -1
  6. sglang/lang/interpreter.py +16 -6
  7. sglang/lang/ir.py +20 -4
  8. sglang/srt/configs/model_config.py +11 -9
  9. sglang/srt/constrained/fsm_cache.py +9 -1
  10. sglang/srt/constrained/jump_forward.py +15 -2
  11. sglang/srt/layers/activation.py +4 -4
  12. sglang/srt/layers/attention/__init__.py +49 -0
  13. sglang/srt/layers/attention/flashinfer_backend.py +277 -0
  14. sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
  15. sglang/srt/layers/attention/triton_backend.py +161 -0
  16. sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
  17. sglang/srt/layers/layernorm.py +4 -4
  18. sglang/srt/layers/logits_processor.py +19 -15
  19. sglang/srt/layers/pooler.py +3 -3
  20. sglang/srt/layers/quantization/__init__.py +0 -2
  21. sglang/srt/layers/radix_attention.py +6 -4
  22. sglang/srt/layers/sampler.py +6 -4
  23. sglang/srt/layers/torchao_utils.py +18 -0
  24. sglang/srt/lora/lora.py +20 -21
  25. sglang/srt/lora/lora_manager.py +97 -25
  26. sglang/srt/managers/detokenizer_manager.py +31 -18
  27. sglang/srt/managers/image_processor.py +187 -0
  28. sglang/srt/managers/io_struct.py +99 -75
  29. sglang/srt/managers/schedule_batch.py +184 -63
  30. sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
  31. sglang/srt/managers/scheduler.py +1021 -0
  32. sglang/srt/managers/tokenizer_manager.py +120 -248
  33. sglang/srt/managers/tp_worker.py +28 -925
  34. sglang/srt/mem_cache/memory_pool.py +34 -52
  35. sglang/srt/model_executor/cuda_graph_runner.py +15 -19
  36. sglang/srt/model_executor/forward_batch_info.py +94 -95
  37. sglang/srt/model_executor/model_runner.py +76 -75
  38. sglang/srt/models/baichuan.py +10 -10
  39. sglang/srt/models/chatglm.py +12 -12
  40. sglang/srt/models/commandr.py +10 -10
  41. sglang/srt/models/dbrx.py +12 -12
  42. sglang/srt/models/deepseek.py +10 -10
  43. sglang/srt/models/deepseek_v2.py +14 -15
  44. sglang/srt/models/exaone.py +10 -10
  45. sglang/srt/models/gemma.py +10 -10
  46. sglang/srt/models/gemma2.py +11 -11
  47. sglang/srt/models/gpt_bigcode.py +10 -10
  48. sglang/srt/models/grok.py +10 -10
  49. sglang/srt/models/internlm2.py +10 -10
  50. sglang/srt/models/llama.py +14 -10
  51. sglang/srt/models/llama_classification.py +5 -5
  52. sglang/srt/models/llama_embedding.py +4 -4
  53. sglang/srt/models/llama_reward.py +142 -0
  54. sglang/srt/models/llava.py +39 -33
  55. sglang/srt/models/llavavid.py +31 -28
  56. sglang/srt/models/minicpm.py +10 -10
  57. sglang/srt/models/minicpm3.py +14 -15
  58. sglang/srt/models/mixtral.py +10 -10
  59. sglang/srt/models/mixtral_quant.py +10 -10
  60. sglang/srt/models/olmoe.py +10 -10
  61. sglang/srt/models/qwen.py +10 -10
  62. sglang/srt/models/qwen2.py +11 -11
  63. sglang/srt/models/qwen2_moe.py +10 -10
  64. sglang/srt/models/stablelm.py +10 -10
  65. sglang/srt/models/torch_native_llama.py +506 -0
  66. sglang/srt/models/xverse.py +10 -10
  67. sglang/srt/models/xverse_moe.py +10 -10
  68. sglang/srt/sampling/sampling_batch_info.py +36 -27
  69. sglang/srt/sampling/sampling_params.py +3 -1
  70. sglang/srt/server.py +170 -119
  71. sglang/srt/server_args.py +54 -27
  72. sglang/srt/utils.py +101 -128
  73. sglang/test/runners.py +71 -26
  74. sglang/test/test_programs.py +38 -5
  75. sglang/test/test_utils.py +18 -9
  76. sglang/version.py +1 -1
  77. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/METADATA +37 -19
  78. sglang-0.3.3.dist-info/RECORD +139 -0
  79. sglang/srt/layers/attention_backend.py +0 -474
  80. sglang/srt/managers/controller_multi.py +0 -207
  81. sglang/srt/managers/controller_single.py +0 -164
  82. sglang-0.3.2.dist-info/RECORD +0 -135
  83. /sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
  84. /sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
  85. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
  86. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
  87. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
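
The only diff expanded below is sglang/srt/managers/tokenizer_manager.py (entry 32, +120 -248); the other files are listed above but collapsed. Annotated sketches of the main patterns this change introduces follow the relevant hunks.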
@@ -16,17 +16,13 @@ limitations under the License.
 """TokenizerManager is a process that tokenizes the text."""
 
 import asyncio
-import concurrent.futures
 import dataclasses
 import json
 import logging
-import multiprocessing as mp
 import os
 from typing import Dict, List, Optional, Tuple, Union
 
 import fastapi
-import numpy as np
-import transformers
 import uvloop
 import zmq
 import zmq.asyncio
@@ -38,6 +34,10 @@ from sglang.srt.hf_transformers_utils import (
     get_processor,
     get_tokenizer,
 )
+from sglang.srt.managers.image_processor import (
+    get_dummy_image_processor,
+    get_image_processor,
+)
 from sglang.srt.managers.io_struct import (
     AbortReq,
     BatchEmbeddingOut,
@@ -46,16 +46,16 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     FlushCacheReq,
     GenerateReqInput,
+    RewardReqInput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
+    TokenizedRewardReqInput,
     UpdateWeightReqInput,
     UpdateWeightReqOutput,
 )
-from sglang.srt.mm_utils import expand2square, process_anyres_image
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import is_generation_model, is_multimodal_model, load_image
-from sglang.utils import get_exception_traceback
+from sglang.srt.utils import is_generation_model, is_multimodal_model
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
@@ -84,10 +84,10 @@ class TokenizerManager:
         # Init inter-process communication
         context = zmq.asyncio.Context(2)
         self.recv_from_detokenizer = context.socket(zmq.PULL)
-        self.recv_from_detokenizer.bind(f"tcp://127.0.0.1:{port_args.tokenizer_port}")
+        self.recv_from_detokenizer.bind(f"ipc://{port_args.tokenizer_ipc_name}")
 
-        self.send_to_controller = context.socket(zmq.PUSH)
-        self.send_to_controller.connect(f"tcp://127.0.0.1:{port_args.controller_port}")
+        self.send_to_scheduler = context.socket(zmq.PUSH)
+        self.send_to_scheduler.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
 
         # Read model args
         self.model_path = server_args.model_path
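
This hunk replaces TCP loopback sockets with ZeroMQ IPC endpoints and renames send_to_controller to send_to_scheduler (the controller processes are removed in this release; see entries 80 and 81 above). A minimal sketch of the same PUSH/PULL pattern over an ipc:// transport; the endpoint name below is illustrative, not the one sglang derives from PortArgs:

```python
import zmq

IPC_NAME = "/tmp/demo_scheduler_input"  # illustrative endpoint name

ctx = zmq.Context(2)

# Consumer side, analogous to the scheduler's input socket.
pull = ctx.socket(zmq.PULL)
pull.bind(f"ipc://{IPC_NAME}")

# Producer side, analogous to TokenizerManager.send_to_scheduler.
push = ctx.socket(zmq.PUSH)
push.connect(f"ipc://{IPC_NAME}")

# send_pyobj/recv_pyobj pickle and unpickle Python objects, which is
# how the tokenized request objects travel between processes.
push.send_pyobj({"rid": "req-0", "input_ids": [1, 2, 3]})
print(pull.recv_pyobj())
```

Unlike tcp://127.0.0.1:port, ipc:// endpoints are Unix domain sockets, so they avoid port allocation conflicts when several servers run on one machine.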
@@ -103,6 +103,8 @@ class TokenizerManager:
         self.context_len = server_args.context_length or get_context_length(
             self.hf_config
         )
+        # Create image processor placeholder
+        self.image_processor = get_dummy_image_processor()
 
         # Create tokenizer
         if server_args.skip_tokenizer_init:
@@ -117,13 +119,9 @@ class TokenizerManager:
                 self.tokenizer = self.processor.tokenizer
                 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-                # We want to parallelize the image pre-processing so we
-                # create an executor for it
-                self.executor = concurrent.futures.ProcessPoolExecutor(
-                    initializer=init_global_processor,
-                    mp_context=mp.get_context("fork"),
-                    initargs=(server_args,),
-                    max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
+                # We want to parallelize the image pre-processing so we create an executor for it
+                self.image_processor = get_image_processor(
+                    self.hf_config, server_args, self.processor.image_processor
                 )
             else:
                 self.tokenizer = get_tokenizer(
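
The inline ProcessPoolExecutor setup moves into the new sglang/srt/managers/image_processor.py (entry 27 above), behind get_image_processor and get_dummy_image_processor. The underlying pattern is unchanged: keep CPU-bound image preprocessing off the event loop by pushing it into worker processes. A self-contained sketch of that pattern, with a placeholder _preprocess instead of sglang's real processor:

```python
import asyncio
import concurrent.futures


def _preprocess(image_bytes: bytes) -> int:
    # Stand-in for CPU-bound work (decode, resize, normalize).
    return len(image_bytes)


async def process_images_async(executor, images):
    # Fan images out to worker processes without blocking the event loop.
    loop = asyncio.get_running_loop()
    return await asyncio.gather(
        *(loop.run_in_executor(executor, _preprocess, img) for img in images)
    )


if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as pool:
        print(asyncio.run(process_images_async(pool, [b"img-a", b"img-bb"])))
```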
@@ -142,7 +140,7 @@
 
     async def generate_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         request: Optional[fastapi.Request] = None,
     ):
         if self.to_create_loop:
@@ -161,53 +159,72 @@
         async for response in self._handle_batch_request(obj, request):
             yield response
 
-    async def _handle_single_request(
+    async def _send_single_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
-        request: Optional[fastapi.Request] = None,
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         index: Optional[int] = None,
+        input_id_index: Optional[int] = None,
         is_cache_for_prefill: Optional[bool] = False,
     ):
         if not is_cache_for_prefill:  # The normal case with a single prompt
-            not_use_index = index is None
+            if index is None:
+                rid = obj.rid
+                if hasattr(obj, "conv"):
+                    # reward model
+                    conv = obj.conv
+                    input_text = self.tokenizer.apply_chat_template(
+                        conv, tokenize=False
+                    )
+                    input_ids = self.tokenizer.encode(input_text)
+                elif obj.input_ids is None:
+                    input_text = obj.text
+                    input_ids = self.tokenizer.encode(input_text)
+                else:
+                    input_text = obj.text if obj.text is not None else None
+                    input_ids = obj.input_ids
 
-            rid = obj.rid if not_use_index else obj.rid[index]
-            input_text = obj.text if not_use_index else obj.text[index]
-            if obj.input_ids is None:
-                assert self.tokenizer is not None
-                input_ids = self.tokenizer.encode(input_text)
+                sampling_params = self._get_sampling_params(obj.sampling_params)
+                if self.is_generation:
+                    image_inputs = await self.image_processor.process_images_async(
+                        obj.image_data, obj
+                    )
+                    return_logprob = obj.return_logprob
+                    logprob_start_len = obj.logprob_start_len
+                    top_logprobs_num = obj.top_logprobs_num
             else:
-                input_ids = obj.input_ids if not_use_index else obj.input_ids[index]
+                rid = obj.rid[index]
+                if hasattr(obj, "conv"):
+                    # reward model
+                    conv = obj.conv[index]
+                    input_text = self.tokenizer.apply_chat_template(
+                        conv, tokenize=False
+                    )
+                    input_ids = self.tokenizer.encode(input_text)
+                elif obj.input_ids is None:
+                    input_text = obj.text[input_id_index]
+                    input_ids = self.tokenizer.encode(input_text)
+                else:
+                    input_text = (
+                        obj.text[input_id_index] if obj.text is not None else None
+                    )
+                    input_ids = obj.input_ids[input_id_index]
 
-            self._validate_input_length(input_ids)
+                sampling_params = self._get_sampling_params(obj.sampling_params[index])
+                if self.is_generation:
+                    image_inputs = await self.image_processor.process_images_async(
+                        obj.image_data[index], obj
+                    )
+                    return_logprob = obj.return_logprob[index]
+                    logprob_start_len = obj.logprob_start_len[index]
+                    top_logprobs_num = obj.top_logprobs_num[index]
 
-            sampling_params = self._get_sampling_params(
-                obj.sampling_params if not_use_index else obj.sampling_params[index]
-            )
+            self._validate_input_length(input_ids)
 
-            if self.is_generation:
-                pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
-                    obj.image_data if not_use_index else obj.image_data[index]
-                )
-                modalities = obj.modalities
-                return_logprob = (
-                    obj.return_logprob if not_use_index else obj.return_logprob[index]
-                )
-                logprob_start_len = (
-                    obj.logprob_start_len
-                    if not_use_index
-                    else obj.logprob_start_len[index]
-                )
-                top_logprobs_num = (
-                    obj.top_logprobs_num
-                    if not_use_index
-                    else obj.top_logprobs_num[index]
-                )
         else:  # A prefill request to cache the common prompt for parallel sampling
             assert self.is_generation
             if obj.text is not None:
                 if isinstance(obj.text, list):
-                    input_text = obj.text[index]
+                    input_text = obj.text[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_text = obj.text
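
The new reward-model branch renders the conversation with the tokenizer's chat template before encoding, rather than tokenizing raw text. The same two-step call in isolation (the checkpoint name is illustrative; any tokenizer that ships a chat template behaves alike):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

conv = [
    {"role": "user", "content": "Is the sky blue?"},
    {"role": "assistant", "content": "Yes, on a clear day."},
]

# Mirrors the reward-model branch above: render first, then encode.
input_text = tokenizer.apply_chat_template(conv, tokenize=False)
input_ids = tokenizer.encode(input_text)
print(input_text)
print(len(input_ids))
```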
@@ -221,7 +238,7 @@
                     obj.input_ids[0], list
                 ):
                     # when obj["input_ids"] is List[List[int]]
-                    input_ids = obj.input_ids[index]
+                    input_ids = obj.input_ids[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_ids = obj.input_ids
@@ -232,7 +249,7 @@
                     obj.input_ids[0], list
                 ):
                     # when obj["input_ids"] is List[List[int]]
-                    input_ids = obj.input_ids[index]
+                    input_ids = obj.input_ids[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_ids = obj.input_ids
@@ -240,10 +257,9 @@
 
             sampling_params = SamplingParams(**obj.sampling_params[0])
             sampling_params.max_new_tokens = 0
-            pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
-                obj.image_data[0]
+            image_inputs = await self.image_processor.process_images_async(
+                obj.image_data[0], obj
             )
-            modalities = obj.modalities
             return_logprob = obj.return_logprob[0]
             logprob_start_len = obj.logprob_start_len[0]
             top_logprobs_num = obj.top_logprobs_num[0]
@@ -254,34 +270,57 @@
                 rid,
                 input_text,
                 input_ids,
-                pixel_values,
-                image_hashes,
-                image_sizes,
+                image_inputs,
                 sampling_params,
                 return_logprob,
                 logprob_start_len,
                 top_logprobs_num,
                 obj.stream,
-                modalities,
                 (
-                    obj.lora_path[index]
+                    obj.lora_path[input_id_index]
                     if isinstance(obj.lora_path, list)
                     else obj.lora_path
                 ),
             )
-        else:  # is embedding
+        elif isinstance(obj, EmbeddingReqInput):
             tokenized_obj = TokenizedEmbeddingReqInput(
                 rid,
                 input_text,
                 input_ids,
                 sampling_params,
             )
-        self.send_to_controller.send_pyobj(tokenized_obj)
+        else:
+            assert isinstance(obj, RewardReqInput)
+            tokenized_obj = TokenizedRewardReqInput(
+                rid,
+                input_text,
+                input_ids,
+                sampling_params,
+            )
+
+        self.send_to_scheduler.send_pyobj(tokenized_obj)
+        return rid, input_ids
+
+    async def _handle_single_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
+        request: Optional[fastapi.Request] = None,
+        index: Optional[int] = None,
+        input_id_index: Optional[int] = None,
+        is_cache_for_prefill: Optional[bool] = False,
+    ):
+        rid, input_ids = await self._send_single_request(
+            obj,
+            index,
+            input_id_index=input_id_index,
+            is_cache_for_prefill=is_cache_for_prefill,
+        )
 
         # Recv results
         event = asyncio.Event()
         state = ReqState([], False, event)
         self.rid_to_state[rid] = state
+
         if not is_cache_for_prefill:
             async for response in self._wait_for_response(state, obj, rid, request):
                 yield response
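
TokenizedRewardReqInput is defined in sglang/srt/managers/io_struct.py, which this page does not expand. Judging only from the call site above, it carries the same four positional fields as the embedding request; a hedged sketch of its shape:

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class TokenizedRewardReqInput:
    # Field order inferred from the constructor call above; the real
    # definition lives in sglang/srt/managers/io_struct.py.
    rid: str                   # request id
    input_text: Optional[str]  # conversation rendered via the chat template
    input_ids: List[int]       # token ids for the reward model
    sampling_params: object    # parsed SamplingParams
```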
@@ -292,7 +331,7 @@
 
     async def _handle_batch_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         request: Optional[fastapi.Request] = None,
     ):
         batch_size = obj.batch_size
@@ -305,14 +344,16 @@
                 input_id_result = [] if obj.input_ids is None else None
                 for i in range(batch_size):
                     async for input_id in self._handle_single_request(
-                        obj, request, index=i, is_cache_for_prefill=True
+                        obj,
+                        request,
+                        index=i,
+                        input_id_index=i,
+                        is_cache_for_prefill=True,
                     ):
                         if input_id_result is not None:
                             input_id_result.append(input_id)
-                if input_id_result is not None and len(input_id_result) > 1:
+                if input_id_result is not None:
                     obj.input_ids = input_id_result
-                elif input_id_result is not None:
-                    obj.input_ids = input_id_result[0]
         else:
             parallel_sample_num = 1
 
@@ -326,58 +367,10 @@
                 if parallel_sample_num != 1:
                     # Here when using parallel sampling we should consider prefill stage so the index is : j + i * (parallel_sample_num-1) + batch_size - 1
                     index += batch_size - 1 - i
-                rid = obj.rid[index]
-                if parallel_sample_num == 1:
-                    ## select operation
-                    if obj.input_ids is None:
-                        input_text = obj.text[i]
-                        input_ids = self.tokenizer.encode(obj.text[i])
-                    else:
-                        input_text = None
-                        input_ids = obj.input_ids[i]
-                else:
-                    assert obj.input_ids is not None
-                    if batch_size == 1:
-                        input_text = None
-                        input_ids = obj.input_ids
-                    else:
-                        input_text = None
-                        input_ids = obj.input_ids[i]
-                sampling_params = self._get_sampling_params(obj.sampling_params[index])
-
-                if self.is_generation:
-                    pixel_values, image_hashes, image_sizes = (
-                        await self._get_pixel_values(obj.image_data[index])
-                    )
-                    modalities = obj.modalities
 
-                    tokenized_obj = TokenizedGenerateReqInput(
-                        rid,
-                        input_text,
-                        input_ids,
-                        pixel_values,
-                        image_hashes,
-                        image_sizes,
-                        sampling_params,
-                        obj.return_logprob[index],
-                        obj.logprob_start_len[index],
-                        obj.top_logprobs_num[index],
-                        obj.stream,
-                        modalities,
-                        (
-                            obj.lora_path[index]
-                            if isinstance(obj.lora_path, list)
-                            else obj.lora_path
-                        ),
-                    )
-                else:
-                    tokenized_obj = TokenizedEmbeddingReqInput(
-                        rid,
-                        input_text,
-                        input_ids,
-                        sampling_params,
-                    )
-                self.send_to_controller.send_pyobj(tokenized_obj)
+                rid, _ = await self._send_single_request(
+                    obj, index, input_id_index=i, is_cache_for_prefill=False
+                )
 
                 event = asyncio.Event()
                 state = ReqState([], False, event)
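
The index formula in the comment above, j + i * (parallel_sample_num - 1) + batch_size - 1, is the in-place adjustment folded into closed form, assuming the loop first flattens to index = i * parallel_sample_num + j (that assignment sits just above this hunk and is not shown). A quick self-check:

```python
def adjusted(i, j, batch_size, parallel_sample_num):
    index = i * parallel_sample_num + j  # assumed flattened position
    if parallel_sample_num != 1:
        index += batch_size - 1 - i      # the adjustment in the hunk
    return index


def closed_form(i, j, batch_size, parallel_sample_num):
    return j + i * (parallel_sample_num - 1) + batch_size - 1


for bs in (1, 2, 3):
    for psn in (2, 4):
        for i in range(bs):
            for j in range(psn):
                assert adjusted(i, j, bs, psn) == closed_form(i, j, bs, psn)
print("index formulas agree")
```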
@@ -400,7 +393,7 @@
         tasks = [asyncio.create_task(gen.__anext__()) for gen in generators]
         output_list = [None] * len(tasks)
 
-        # Recv results
+        # Fetch results
         while tasks:
             done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
 
@@ -442,7 +435,7 @@
     async def _wait_for_response(
         self,
         state: ReqState,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         rid: str,
         request: Optional[fastapi.Request] = None,
         index: Optional[int] = None,
@@ -469,7 +462,7 @@
                     ),
                     obj.return_text_in_logprobs,
                 )
-            else:  # isinstance(obj, EmbeddingReqInput)
+            else:  # isinstance(obj, (EmbeddingReqInput, RewardReqInput))
                 out = state.out_list[-1]
 
             out["index"] = response_index
@@ -510,14 +503,14 @@
 
     def flush_cache(self):
         req = FlushCacheReq()
-        self.send_to_controller.send_pyobj(req)
+        self.send_to_scheduler.send_pyobj(req)
 
     def abort_request(self, rid: str):
         if rid not in self.rid_to_state:
             return
         del self.rid_to_state[rid]
         req = AbortReq(rid)
-        self.send_to_controller.send_pyobj(req)
+        self.send_to_scheduler.send_pyobj(req)
 
     async def update_weights(
         self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
@@ -533,8 +526,8 @@
         async with self.model_update_lock:
             # wait for the previous generation requests to finish
             while len(self.rid_to_state) > 0:
-                await asyncio.sleep(0)
-            self.send_to_controller.send_pyobj(obj)
+                await asyncio.sleep(0.001)
+            self.send_to_scheduler.send_pyobj(obj)
             self.model_update_result = asyncio.Future()
             result = await self.model_update_result
             if result.success:
@@ -645,6 +638,7 @@
     def detokenize_logprob_tokens(
         self, token_logprobs: List[Tuple[float, int]], decode_to_text: bool
     ):
+        # TODO(lianmin): This should run on DetokenizerManager
        if not decode_to_text:
            return [(logprob, token_id, None) for logprob, token_id in token_logprobs]
 
@@ -665,125 +659,3 @@
             token_top_logprobs, decode_to_text
         )
         return top_logprobs
-
-    async def _get_pixel_values(self, image_data: List[Union[str, bytes]]):
-        if not image_data:
-            return None, None, None
-
-        aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
-        grid_pinpoints = (
-            self.hf_config.image_grid_pinpoints
-            if hasattr(self.hf_config, "image_grid_pinpoints")
-            and "anyres" in aspect_ratio
-            else None
-        )
-
-        if isinstance(image_data, list) and len(image_data) > 0:
-            # Multiple images
-            if len(image_data) > 1:
-                aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
-                pixel_values, image_hashes, image_sizes = [], [], []
-                for img_data in image_data:
-                    pixel_v, image_h, image_s = await self._process_single_image(
-                        img_data, aspect_ratio, grid_pinpoints
-                    )
-                    pixel_values.append(pixel_v)
-                    image_hashes.append(image_h)
-                    image_sizes.append(image_s)
-
-                if isinstance(pixel_values[0], np.ndarray):
-                    pixel_values = np.stack(pixel_values, axis=0)
-            else:
-                # A single image
-                pixel_values, image_hash, image_size = await self._process_single_image(
-                    image_data[0], aspect_ratio, grid_pinpoints
-                )
-                image_hashes = [image_hash]
-                image_sizes = [image_size]
-        elif isinstance(image_data, str):
-            # A single image
-            pixel_values, image_hash, image_size = await self._process_single_image(
-                image_data, aspect_ratio, grid_pinpoints
-            )
-            image_hashes = [image_hash]
-            image_sizes = [image_size]
-        else:
-            raise ValueError(f"Invalid image data: {image_data}")
-
-        return pixel_values, image_hashes, image_sizes
-
-    async def _process_single_image(
-        self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str
-    ):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            return await loop.run_in_executor(
-                self.executor,
-                _process_single_image_task,
-                image_data,
-                aspect_ratio,
-                grid_pinpoints,
-            )
-        else:
-            return _process_single_image_task(
-                image_data, aspect_ratio, grid_pinpoints, self.processor
-            )
-
-
-global global_processor
-
-
-def init_global_processor(server_args: ServerArgs):
-    """Init the global processor for multi modal models."""
-    global global_processor
-    transformers.logging.set_verbosity_error()
-    global_processor = get_processor(
-        server_args.tokenizer_path,
-        tokenizer_mode=server_args.tokenizer_mode,
-        trust_remote_code=server_args.trust_remote_code,
-    )
-
-
-def _process_single_image_task(
-    image_data: Union[str, bytes],
-    image_aspect_ratio: Optional[str] = None,
-    image_grid_pinpoints: Optional[str] = None,
-    processor=None,
-):
-    try:
-        processor = processor or global_processor
-        image, image_size = load_image(image_data)
-        if image_size is not None:
-            # It is a video with multiple images
-            image_hash = hash(image_data)
-            pixel_values = processor.image_processor(image)["pixel_values"]
-            for _ in range(len(pixel_values)):
-                pixel_values[_] = pixel_values[_].astype(np.float16)
-            pixel_values = np.stack(pixel_values, axis=0)
-            return pixel_values, image_hash, image_size
-        else:
-            # It is an image
-            image_hash = hash(image_data)
-            if image_aspect_ratio == "pad":
-                image = expand2square(
-                    image,
-                    tuple(int(x * 255) for x in processor.image_processor.image_mean),
-                )
-                pixel_values = processor.image_processor(image.convert("RGB"))[
-                    "pixel_values"
-                ][0]
-            elif image_aspect_ratio == "anyres" or (
-                image_aspect_ratio is not None and "anyres_max" in image_aspect_ratio
-            ):
-                pixel_values = process_anyres_image(
-                    image, processor.image_processor, image_grid_pinpoints
-                )
-            else:
-                pixel_values = processor.image_processor(image)["pixel_values"][0]
-
-            if isinstance(pixel_values, np.ndarray):
-                pixel_values = pixel_values.astype(np.float16)
-
-            return pixel_values, image_hash, image.size
-    except Exception:
-        logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())