sglang 0.3.1.post3__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +23 -1
  3. sglang/bench_latency.py +48 -33
  4. sglang/bench_server_latency.py +0 -6
  5. sglang/bench_serving.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +14 -1
  7. sglang/lang/interpreter.py +16 -6
  8. sglang/lang/ir.py +20 -4
  9. sglang/srt/configs/model_config.py +11 -9
  10. sglang/srt/constrained/fsm_cache.py +9 -1
  11. sglang/srt/constrained/jump_forward.py +15 -2
  12. sglang/srt/hf_transformers_utils.py +1 -0
  13. sglang/srt/layers/activation.py +4 -4
  14. sglang/srt/layers/attention/__init__.py +49 -0
  15. sglang/srt/layers/attention/flashinfer_backend.py +277 -0
  16. sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
  17. sglang/srt/layers/attention/triton_backend.py +161 -0
  18. sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
  19. sglang/srt/layers/fused_moe/patch.py +117 -0
  20. sglang/srt/layers/layernorm.py +4 -4
  21. sglang/srt/layers/logits_processor.py +19 -15
  22. sglang/srt/layers/pooler.py +3 -3
  23. sglang/srt/layers/quantization/__init__.py +0 -2
  24. sglang/srt/layers/radix_attention.py +6 -4
  25. sglang/srt/layers/sampler.py +6 -4
  26. sglang/srt/layers/torchao_utils.py +18 -0
  27. sglang/srt/lora/lora.py +20 -21
  28. sglang/srt/lora/lora_manager.py +97 -25
  29. sglang/srt/managers/detokenizer_manager.py +31 -18
  30. sglang/srt/managers/image_processor.py +187 -0
  31. sglang/srt/managers/io_struct.py +99 -75
  32. sglang/srt/managers/schedule_batch.py +187 -68
  33. sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
  34. sglang/srt/managers/scheduler.py +1021 -0
  35. sglang/srt/managers/tokenizer_manager.py +120 -247
  36. sglang/srt/managers/tp_worker.py +28 -925
  37. sglang/srt/mem_cache/memory_pool.py +34 -52
  38. sglang/srt/mem_cache/radix_cache.py +5 -5
  39. sglang/srt/model_executor/cuda_graph_runner.py +25 -25
  40. sglang/srt/model_executor/forward_batch_info.py +94 -97
  41. sglang/srt/model_executor/model_runner.py +76 -78
  42. sglang/srt/models/baichuan.py +10 -10
  43. sglang/srt/models/chatglm.py +12 -12
  44. sglang/srt/models/commandr.py +10 -10
  45. sglang/srt/models/dbrx.py +12 -12
  46. sglang/srt/models/deepseek.py +10 -10
  47. sglang/srt/models/deepseek_v2.py +14 -15
  48. sglang/srt/models/exaone.py +10 -10
  49. sglang/srt/models/gemma.py +10 -10
  50. sglang/srt/models/gemma2.py +11 -11
  51. sglang/srt/models/gpt_bigcode.py +10 -10
  52. sglang/srt/models/grok.py +10 -10
  53. sglang/srt/models/internlm2.py +10 -10
  54. sglang/srt/models/llama.py +22 -10
  55. sglang/srt/models/llama_classification.py +5 -5
  56. sglang/srt/models/llama_embedding.py +4 -4
  57. sglang/srt/models/llama_reward.py +142 -0
  58. sglang/srt/models/llava.py +39 -33
  59. sglang/srt/models/llavavid.py +31 -28
  60. sglang/srt/models/minicpm.py +10 -10
  61. sglang/srt/models/minicpm3.py +14 -15
  62. sglang/srt/models/mixtral.py +10 -10
  63. sglang/srt/models/mixtral_quant.py +10 -10
  64. sglang/srt/models/olmoe.py +10 -10
  65. sglang/srt/models/qwen.py +10 -10
  66. sglang/srt/models/qwen2.py +11 -11
  67. sglang/srt/models/qwen2_moe.py +10 -10
  68. sglang/srt/models/stablelm.py +10 -10
  69. sglang/srt/models/torch_native_llama.py +506 -0
  70. sglang/srt/models/xverse.py +10 -10
  71. sglang/srt/models/xverse_moe.py +10 -10
  72. sglang/srt/openai_api/adapter.py +7 -0
  73. sglang/srt/sampling/sampling_batch_info.py +36 -27
  74. sglang/srt/sampling/sampling_params.py +3 -1
  75. sglang/srt/server.py +170 -119
  76. sglang/srt/server_args.py +54 -27
  77. sglang/srt/utils.py +101 -128
  78. sglang/test/runners.py +76 -33
  79. sglang/test/test_programs.py +38 -5
  80. sglang/test/test_utils.py +53 -9
  81. sglang/version.py +1 -1
  82. {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/METADATA +42 -23
  83. sglang-0.3.3.dist-info/RECORD +139 -0
  84. sglang/srt/layers/attention_backend.py +0 -482
  85. sglang/srt/managers/controller_multi.py +0 -207
  86. sglang/srt/managers/controller_single.py +0 -164
  87. sglang-0.3.1.post3.dist-info/RECORD +0 -134
  88. sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
  89. sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
  90. {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
  91. {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
  92. {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
@@ -16,17 +16,13 @@ limitations under the License.
16
16
  """TokenizerManager is a process that tokenizes the text."""
17
17
 
18
18
  import asyncio
19
- import concurrent.futures
20
19
  import dataclasses
21
20
  import json
22
21
  import logging
23
- import multiprocessing as mp
24
22
  import os
25
23
  from typing import Dict, List, Optional, Tuple, Union
26
24
 
27
25
  import fastapi
28
- import numpy as np
29
- import transformers
30
26
  import uvloop
31
27
  import zmq
32
28
  import zmq.asyncio
@@ -38,6 +34,10 @@ from sglang.srt.hf_transformers_utils import (
38
34
  get_processor,
39
35
  get_tokenizer,
40
36
  )
37
+ from sglang.srt.managers.image_processor import (
38
+ get_dummy_image_processor,
39
+ get_image_processor,
40
+ )
41
41
  from sglang.srt.managers.io_struct import (
42
42
  AbortReq,
43
43
  BatchEmbeddingOut,
@@ -46,16 +46,16 @@ from sglang.srt.managers.io_struct import (
46
46
  EmbeddingReqInput,
47
47
  FlushCacheReq,
48
48
  GenerateReqInput,
49
+ RewardReqInput,
49
50
  TokenizedEmbeddingReqInput,
50
51
  TokenizedGenerateReqInput,
52
+ TokenizedRewardReqInput,
51
53
  UpdateWeightReqInput,
52
54
  UpdateWeightReqOutput,
53
55
  )
54
- from sglang.srt.mm_utils import expand2square, process_anyres_image
55
56
  from sglang.srt.sampling.sampling_params import SamplingParams
56
57
  from sglang.srt.server_args import PortArgs, ServerArgs
57
- from sglang.srt.utils import is_generation_model, is_multimodal_model, load_image
58
- from sglang.utils import get_exception_traceback
58
+ from sglang.srt.utils import is_generation_model, is_multimodal_model
59
59
 
60
60
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
61
61
 
@@ -84,10 +84,10 @@ class TokenizerManager:
84
84
  # Init inter-process communication
85
85
  context = zmq.asyncio.Context(2)
86
86
  self.recv_from_detokenizer = context.socket(zmq.PULL)
87
- self.recv_from_detokenizer.bind(f"tcp://127.0.0.1:{port_args.tokenizer_port}")
87
+ self.recv_from_detokenizer.bind(f"ipc://{port_args.tokenizer_ipc_name}")
88
88
 
89
- self.send_to_controller = context.socket(zmq.PUSH)
90
- self.send_to_controller.connect(f"tcp://127.0.0.1:{port_args.controller_port}")
89
+ self.send_to_scheduler = context.socket(zmq.PUSH)
90
+ self.send_to_scheduler.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
91
91
 
92
92
  # Read model args
93
93
  self.model_path = server_args.model_path
@@ -103,6 +103,8 @@ class TokenizerManager:
103
103
  self.context_len = server_args.context_length or get_context_length(
104
104
  self.hf_config
105
105
  )
106
+ # Create image processor placeholder
107
+ self.image_processor = get_dummy_image_processor()
106
108
 
107
109
  # Create tokenizer
108
110
  if server_args.skip_tokenizer_init:
@@ -117,12 +119,9 @@ class TokenizerManager:
117
119
  self.tokenizer = self.processor.tokenizer
118
120
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
119
121
 
120
- # We want to parallelize the image pre-processing so we
121
- # create an executor for it
122
- self.executor = concurrent.futures.ProcessPoolExecutor(
123
- initializer=init_global_processor,
124
- mp_context=mp.get_context("fork"),
125
- initargs=(server_args,),
122
+ # We want to parallelize the image pre-processing so we create an executor for it
123
+ self.image_processor = get_image_processor(
124
+ self.hf_config, server_args, self.processor.image_processor
126
125
  )
127
126
  else:
128
127
  self.tokenizer = get_tokenizer(
@@ -141,7 +140,7 @@ class TokenizerManager:
141
140
 
142
141
  async def generate_request(
143
142
  self,
144
- obj: Union[GenerateReqInput, EmbeddingReqInput],
143
+ obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
145
144
  request: Optional[fastapi.Request] = None,
146
145
  ):
147
146
  if self.to_create_loop:
@@ -160,53 +159,72 @@ class TokenizerManager:
160
159
  async for response in self._handle_batch_request(obj, request):
161
160
  yield response
162
161
 
163
- async def _handle_single_request(
162
+ async def _send_single_request(
164
163
  self,
165
- obj: Union[GenerateReqInput, EmbeddingReqInput],
166
- request: Optional[fastapi.Request] = None,
164
+ obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
167
165
  index: Optional[int] = None,
166
+ input_id_index: Optional[int] = None,
168
167
  is_cache_for_prefill: Optional[bool] = False,
169
168
  ):
170
169
  if not is_cache_for_prefill: # The normal case with a single prompt
171
- not_use_index = index is None
170
+ if index is None:
171
+ rid = obj.rid
172
+ if hasattr(obj, "conv"):
173
+ # reward model
174
+ conv = obj.conv
175
+ input_text = self.tokenizer.apply_chat_template(
176
+ conv, tokenize=False
177
+ )
178
+ input_ids = self.tokenizer.encode(input_text)
179
+ elif obj.input_ids is None:
180
+ input_text = obj.text
181
+ input_ids = self.tokenizer.encode(input_text)
182
+ else:
183
+ input_text = obj.text if obj.text is not None else None
184
+ input_ids = obj.input_ids
172
185
 
173
- rid = obj.rid if not_use_index else obj.rid[index]
174
- input_text = obj.text if not_use_index else obj.text[index]
175
- if obj.input_ids is None:
176
- assert self.tokenizer is not None
177
- input_ids = self.tokenizer.encode(input_text)
186
+ sampling_params = self._get_sampling_params(obj.sampling_params)
187
+ if self.is_generation:
188
+ image_inputs = await self.image_processor.process_images_async(
189
+ obj.image_data, obj
190
+ )
191
+ return_logprob = obj.return_logprob
192
+ logprob_start_len = obj.logprob_start_len
193
+ top_logprobs_num = obj.top_logprobs_num
178
194
  else:
179
- input_ids = obj.input_ids if not_use_index else obj.input_ids[index]
195
+ rid = obj.rid[index]
196
+ if hasattr(obj, "conv"):
197
+ # reward model
198
+ conv = obj.conv[index]
199
+ input_text = self.tokenizer.apply_chat_template(
200
+ conv, tokenize=False
201
+ )
202
+ input_ids = self.tokenizer.encode(input_text)
203
+ elif obj.input_ids is None:
204
+ input_text = obj.text[input_id_index]
205
+ input_ids = self.tokenizer.encode(input_text)
206
+ else:
207
+ input_text = (
208
+ obj.text[input_id_index] if obj.text is not None else None
209
+ )
210
+ input_ids = obj.input_ids[input_id_index]
180
211
 
181
- self._validate_input_length(input_ids)
212
+ sampling_params = self._get_sampling_params(obj.sampling_params[index])
213
+ if self.is_generation:
214
+ image_inputs = await self.image_processor.process_images_async(
215
+ obj.image_data[index], obj
216
+ )
217
+ return_logprob = obj.return_logprob[index]
218
+ logprob_start_len = obj.logprob_start_len[index]
219
+ top_logprobs_num = obj.top_logprobs_num[index]
182
220
 
183
- sampling_params = self._get_sampling_params(
184
- obj.sampling_params if not_use_index else obj.sampling_params[index]
185
- )
221
+ self._validate_input_length(input_ids)
186
222
 
187
- if self.is_generation:
188
- pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
189
- obj.image_data if not_use_index else obj.image_data[index]
190
- )
191
- modalities = obj.modalities
192
- return_logprob = (
193
- obj.return_logprob if not_use_index else obj.return_logprob[index]
194
- )
195
- logprob_start_len = (
196
- obj.logprob_start_len
197
- if not_use_index
198
- else obj.logprob_start_len[index]
199
- )
200
- top_logprobs_num = (
201
- obj.top_logprobs_num
202
- if not_use_index
203
- else obj.top_logprobs_num[index]
204
- )
205
223
  else: # A prefill request to cache the common prompt for parallel sampling
206
224
  assert self.is_generation
207
225
  if obj.text is not None:
208
226
  if isinstance(obj.text, list):
209
- input_text = obj.text[index]
227
+ input_text = obj.text[input_id_index]
210
228
  rid = obj.rid[index]
211
229
  else:
212
230
  input_text = obj.text
@@ -220,7 +238,7 @@ class TokenizerManager:
220
238
  obj.input_ids[0], list
221
239
  ):
222
240
  # when obj["input_ids"] is List[List[int]]
223
- input_ids = obj.input_ids[index]
241
+ input_ids = obj.input_ids[input_id_index]
224
242
  rid = obj.rid[index]
225
243
  else:
226
244
  input_ids = obj.input_ids
@@ -231,7 +249,7 @@ class TokenizerManager:
231
249
  obj.input_ids[0], list
232
250
  ):
233
251
  # when obj["input_ids"] is List[List[int]]
234
- input_ids = obj.input_ids[index]
252
+ input_ids = obj.input_ids[input_id_index]
235
253
  rid = obj.rid[index]
236
254
  else:
237
255
  input_ids = obj.input_ids
@@ -239,10 +257,9 @@ class TokenizerManager:
239
257
 
240
258
  sampling_params = SamplingParams(**obj.sampling_params[0])
241
259
  sampling_params.max_new_tokens = 0
242
- pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
243
- obj.image_data[0]
260
+ image_inputs = await self.image_processor.process_images_async(
261
+ obj.image_data[0], obj
244
262
  )
245
- modalities = obj.modalities
246
263
  return_logprob = obj.return_logprob[0]
247
264
  logprob_start_len = obj.logprob_start_len[0]
248
265
  top_logprobs_num = obj.top_logprobs_num[0]
@@ -253,34 +270,57 @@ class TokenizerManager:
253
270
  rid,
254
271
  input_text,
255
272
  input_ids,
256
- pixel_values,
257
- image_hashes,
258
- image_sizes,
273
+ image_inputs,
259
274
  sampling_params,
260
275
  return_logprob,
261
276
  logprob_start_len,
262
277
  top_logprobs_num,
263
278
  obj.stream,
264
- modalities,
265
279
  (
266
- obj.lora_path[index]
280
+ obj.lora_path[input_id_index]
267
281
  if isinstance(obj.lora_path, list)
268
282
  else obj.lora_path
269
283
  ),
270
284
  )
271
- else: # is embedding
285
+ elif isinstance(obj, EmbeddingReqInput):
272
286
  tokenized_obj = TokenizedEmbeddingReqInput(
273
287
  rid,
274
288
  input_text,
275
289
  input_ids,
276
290
  sampling_params,
277
291
  )
278
- self.send_to_controller.send_pyobj(tokenized_obj)
292
+ else:
293
+ assert isinstance(obj, RewardReqInput)
294
+ tokenized_obj = TokenizedRewardReqInput(
295
+ rid,
296
+ input_text,
297
+ input_ids,
298
+ sampling_params,
299
+ )
300
+
301
+ self.send_to_scheduler.send_pyobj(tokenized_obj)
302
+ return rid, input_ids
303
+
304
+ async def _handle_single_request(
305
+ self,
306
+ obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
307
+ request: Optional[fastapi.Request] = None,
308
+ index: Optional[int] = None,
309
+ input_id_index: Optional[int] = None,
310
+ is_cache_for_prefill: Optional[bool] = False,
311
+ ):
312
+ rid, input_ids = await self._send_single_request(
313
+ obj,
314
+ index,
315
+ input_id_index=input_id_index,
316
+ is_cache_for_prefill=is_cache_for_prefill,
317
+ )
279
318
 
280
319
  # Recv results
281
320
  event = asyncio.Event()
282
321
  state = ReqState([], False, event)
283
322
  self.rid_to_state[rid] = state
323
+
284
324
  if not is_cache_for_prefill:
285
325
  async for response in self._wait_for_response(state, obj, rid, request):
286
326
  yield response
@@ -291,7 +331,7 @@ class TokenizerManager:
291
331
 
292
332
  async def _handle_batch_request(
293
333
  self,
294
- obj: Union[GenerateReqInput, EmbeddingReqInput],
334
+ obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
295
335
  request: Optional[fastapi.Request] = None,
296
336
  ):
297
337
  batch_size = obj.batch_size
@@ -304,14 +344,16 @@ class TokenizerManager:
304
344
  input_id_result = [] if obj.input_ids is None else None
305
345
  for i in range(batch_size):
306
346
  async for input_id in self._handle_single_request(
307
- obj, request, index=i, is_cache_for_prefill=True
347
+ obj,
348
+ request,
349
+ index=i,
350
+ input_id_index=i,
351
+ is_cache_for_prefill=True,
308
352
  ):
309
353
  if input_id_result is not None:
310
354
  input_id_result.append(input_id)
311
- if input_id_result is not None and len(input_id_result) > 1:
355
+ if input_id_result is not None:
312
356
  obj.input_ids = input_id_result
313
- elif input_id_result is not None:
314
- obj.input_ids = input_id_result[0]
315
357
  else:
316
358
  parallel_sample_num = 1
317
359
 
@@ -325,58 +367,10 @@ class TokenizerManager:
325
367
  if parallel_sample_num != 1:
326
368
  # Here when using parallel sampling we should consider prefill stage so the index is : j + i * (parallel_sample_num-1) + batch_size - 1
327
369
  index += batch_size - 1 - i
328
- rid = obj.rid[index]
329
- if parallel_sample_num == 1:
330
- ## select operation
331
- if obj.input_ids is None:
332
- input_text = obj.text[i]
333
- input_ids = self.tokenizer.encode(obj.text[i])
334
- else:
335
- input_text = None
336
- input_ids = obj.input_ids[i]
337
- else:
338
- assert obj.input_ids is not None
339
- if batch_size == 1:
340
- input_text = None
341
- input_ids = obj.input_ids
342
- else:
343
- input_text = None
344
- input_ids = obj.input_ids[i]
345
- sampling_params = self._get_sampling_params(obj.sampling_params[index])
346
-
347
- if self.is_generation:
348
- pixel_values, image_hashes, image_sizes = (
349
- await self._get_pixel_values(obj.image_data[index])
350
- )
351
- modalities = obj.modalities
352
370
 
353
- tokenized_obj = TokenizedGenerateReqInput(
354
- rid,
355
- input_text,
356
- input_ids,
357
- pixel_values,
358
- image_hashes,
359
- image_sizes,
360
- sampling_params,
361
- obj.return_logprob[index],
362
- obj.logprob_start_len[index],
363
- obj.top_logprobs_num[index],
364
- obj.stream,
365
- modalities,
366
- (
367
- obj.lora_path[index]
368
- if isinstance(obj.lora_path, list)
369
- else obj.lora_path
370
- ),
371
- )
372
- else:
373
- tokenized_obj = TokenizedEmbeddingReqInput(
374
- rid,
375
- input_text,
376
- input_ids,
377
- sampling_params,
378
- )
379
- self.send_to_controller.send_pyobj(tokenized_obj)
371
+ rid, _ = await self._send_single_request(
372
+ obj, index, input_id_index=i, is_cache_for_prefill=False
373
+ )
380
374
 
381
375
  event = asyncio.Event()
382
376
  state = ReqState([], False, event)
@@ -399,7 +393,7 @@ class TokenizerManager:
399
393
  tasks = [asyncio.create_task(gen.__anext__()) for gen in generators]
400
394
  output_list = [None] * len(tasks)
401
395
 
402
- # Recv results
396
+ # Fetch results
403
397
  while tasks:
404
398
  done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
405
399
 
@@ -441,7 +435,7 @@ class TokenizerManager:
441
435
  async def _wait_for_response(
442
436
  self,
443
437
  state: ReqState,
444
- obj: Union[GenerateReqInput, EmbeddingReqInput],
438
+ obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
445
439
  rid: str,
446
440
  request: Optional[fastapi.Request] = None,
447
441
  index: Optional[int] = None,
@@ -468,7 +462,7 @@ class TokenizerManager:
468
462
  ),
469
463
  obj.return_text_in_logprobs,
470
464
  )
471
- else: # isinstance(obj, EmbeddingReqInput)
465
+ else: # isinstance(obj, (EmbeddingReqInput, RewardReqInput))
472
466
  out = state.out_list[-1]
473
467
 
474
468
  out["index"] = response_index
@@ -509,14 +503,14 @@ class TokenizerManager:
509
503
 
510
504
  def flush_cache(self):
511
505
  req = FlushCacheReq()
512
- self.send_to_controller.send_pyobj(req)
506
+ self.send_to_scheduler.send_pyobj(req)
513
507
 
514
508
  def abort_request(self, rid: str):
515
509
  if rid not in self.rid_to_state:
516
510
  return
517
511
  del self.rid_to_state[rid]
518
512
  req = AbortReq(rid)
519
- self.send_to_controller.send_pyobj(req)
513
+ self.send_to_scheduler.send_pyobj(req)
520
514
 
521
515
  async def update_weights(
522
516
  self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
@@ -532,8 +526,8 @@ class TokenizerManager:
532
526
  async with self.model_update_lock:
533
527
  # wait for the previous generation requests to finish
534
528
  while len(self.rid_to_state) > 0:
535
- await asyncio.sleep(0)
536
- self.send_to_controller.send_pyobj(obj)
529
+ await asyncio.sleep(0.001)
530
+ self.send_to_scheduler.send_pyobj(obj)
537
531
  self.model_update_result = asyncio.Future()
538
532
  result = await self.model_update_result
539
533
  if result.success:
@@ -644,6 +638,7 @@ class TokenizerManager:
644
638
  def detokenize_logprob_tokens(
645
639
  self, token_logprobs: List[Tuple[float, int]], decode_to_text: bool
646
640
  ):
641
+ # TODO(lianmin): This should run on DetokenizerManager
647
642
  if not decode_to_text:
648
643
  return [(logprob, token_id, None) for logprob, token_id in token_logprobs]
649
644
 
@@ -664,125 +659,3 @@ class TokenizerManager:
664
659
  token_top_logprobs, decode_to_text
665
660
  )
666
661
  return top_logprobs
667
-
668
- async def _get_pixel_values(self, image_data: List[Union[str, bytes]]):
669
- if not image_data:
670
- return None, None, None
671
-
672
- aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
673
- grid_pinpoints = (
674
- self.hf_config.image_grid_pinpoints
675
- if hasattr(self.hf_config, "image_grid_pinpoints")
676
- and "anyres" in aspect_ratio
677
- else None
678
- )
679
-
680
- if isinstance(image_data, list) and len(image_data) > 0:
681
- # Multiple images
682
- if len(image_data) > 1:
683
- aspect_ratio = "pad" # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
684
- pixel_values, image_hashes, image_sizes = [], [], []
685
- for img_data in image_data:
686
- pixel_v, image_h, image_s = await self._process_single_image(
687
- img_data, aspect_ratio, grid_pinpoints
688
- )
689
- pixel_values.append(pixel_v)
690
- image_hashes.append(image_h)
691
- image_sizes.append(image_s)
692
-
693
- if isinstance(pixel_values[0], np.ndarray):
694
- pixel_values = np.stack(pixel_values, axis=0)
695
- else:
696
- # A single image
697
- pixel_values, image_hash, image_size = await self._process_single_image(
698
- image_data[0], aspect_ratio, grid_pinpoints
699
- )
700
- image_hashes = [image_hash]
701
- image_sizes = [image_size]
702
- elif isinstance(image_data, str):
703
- # A single image
704
- pixel_values, image_hash, image_size = await self._process_single_image(
705
- image_data, aspect_ratio, grid_pinpoints
706
- )
707
- image_hashes = [image_hash]
708
- image_sizes = [image_size]
709
- else:
710
- raise ValueError(f"Invalid image data: {image_data}")
711
-
712
- return pixel_values, image_hashes, image_sizes
713
-
714
- async def _process_single_image(
715
- self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str
716
- ):
717
- if self.executor is not None:
718
- loop = asyncio.get_event_loop()
719
- return await loop.run_in_executor(
720
- self.executor,
721
- _process_single_image_task,
722
- image_data,
723
- aspect_ratio,
724
- grid_pinpoints,
725
- )
726
- else:
727
- return _process_single_image_task(
728
- image_data, aspect_ratio, grid_pinpoints, self.processor
729
- )
730
-
731
-
732
- global global_processor
733
-
734
-
735
- def init_global_processor(server_args: ServerArgs):
736
- """Init the global processor for multi modal models."""
737
- global global_processor
738
- transformers.logging.set_verbosity_error()
739
- global_processor = get_processor(
740
- server_args.tokenizer_path,
741
- tokenizer_mode=server_args.tokenizer_mode,
742
- trust_remote_code=server_args.trust_remote_code,
743
- )
744
-
745
-
746
- def _process_single_image_task(
747
- image_data: Union[str, bytes],
748
- image_aspect_ratio: Optional[str] = None,
749
- image_grid_pinpoints: Optional[str] = None,
750
- processor=None,
751
- ):
752
- try:
753
- processor = processor or global_processor
754
- image, image_size = load_image(image_data)
755
- if image_size is not None:
756
- # It is a video with multiple images
757
- image_hash = hash(image_data)
758
- pixel_values = processor.image_processor(image)["pixel_values"]
759
- for _ in range(len(pixel_values)):
760
- pixel_values[_] = pixel_values[_].astype(np.float16)
761
- pixel_values = np.stack(pixel_values, axis=0)
762
- return pixel_values, image_hash, image_size
763
- else:
764
- # It is an image
765
- image_hash = hash(image_data)
766
- if image_aspect_ratio == "pad":
767
- image = expand2square(
768
- image,
769
- tuple(int(x * 255) for x in processor.image_processor.image_mean),
770
- )
771
- pixel_values = processor.image_processor(image.convert("RGB"))[
772
- "pixel_values"
773
- ][0]
774
- elif image_aspect_ratio == "anyres" or (
775
- image_aspect_ratio is not None and "anyres_max" in image_aspect_ratio
776
- ):
777
- pixel_values = process_anyres_image(
778
- image, processor.image_processor, image_grid_pinpoints
779
- )
780
- else:
781
- pixel_values = processor.image_processor(image)["pixel_values"][0]
782
-
783
- if isinstance(pixel_values, np.ndarray):
784
- pixel_values = pixel_values.astype(np.float16)
785
-
786
- return pixel_values, image_hash, image.size
787
- except Exception:
788
- logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())