sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sglang/bench_one_batch.py +1 -11
  2. sglang/bench_serving.py +149 -1
  3. sglang/lang/chat_template.py +44 -0
  4. sglang/srt/configs/deepseekvl2.py +3 -0
  5. sglang/srt/configs/device_config.py +1 -1
  6. sglang/srt/configs/internvl.py +696 -0
  7. sglang/srt/configs/janus_pro.py +3 -0
  8. sglang/srt/configs/model_config.py +17 -0
  9. sglang/srt/constrained/xgrammar_backend.py +11 -19
  10. sglang/srt/conversation.py +30 -3
  11. sglang/srt/disaggregation/decode.py +4 -1
  12. sglang/srt/disaggregation/mini_lb.py +74 -23
  13. sglang/srt/disaggregation/mooncake/conn.py +9 -18
  14. sglang/srt/disaggregation/nixl/conn.py +241 -71
  15. sglang/srt/disaggregation/utils.py +44 -1
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  17. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  18. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  19. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  20. sglang/srt/distributed/parallel_state.py +22 -1
  21. sglang/srt/entrypoints/engine.py +14 -2
  22. sglang/srt/entrypoints/http_server.py +28 -1
  23. sglang/srt/entrypoints/verl_engine.py +3 -2
  24. sglang/srt/hf_transformers_utils.py +20 -1
  25. sglang/srt/layers/attention/flashattention_backend.py +146 -50
  26. sglang/srt/layers/attention/flashinfer_backend.py +23 -13
  27. sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
  28. sglang/srt/layers/attention/merge_state.py +46 -0
  29. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  30. sglang/srt/layers/attention/vision.py +290 -163
  31. sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
  32. sglang/srt/layers/moe/ep_moe/layer.py +120 -1
  33. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +4 -1
  37. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  38. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  39. sglang/srt/layers/quantization/deep_gemm.py +5 -0
  40. sglang/srt/layers/quantization/fp8.py +108 -95
  41. sglang/srt/layers/quantization/fp8_kernel.py +79 -60
  42. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  43. sglang/srt/layers/quantization/kv_cache.py +3 -10
  44. sglang/srt/layers/quantization/utils.py +0 -5
  45. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  46. sglang/srt/lora/lora_manager.py +10 -13
  47. sglang/srt/managers/cache_controller.py +115 -119
  48. sglang/srt/managers/io_struct.py +10 -0
  49. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  50. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  51. sglang/srt/managers/schedule_batch.py +19 -1
  52. sglang/srt/managers/schedule_policy.py +11 -5
  53. sglang/srt/managers/scheduler.py +28 -13
  54. sglang/srt/managers/tokenizer_manager.py +24 -13
  55. sglang/srt/managers/tp_worker.py +9 -12
  56. sglang/srt/mem_cache/chunk_cache.py +2 -0
  57. sglang/srt/mem_cache/memory_pool.py +2 -2
  58. sglang/srt/model_executor/model_runner.py +44 -33
  59. sglang/srt/model_loader/loader.py +18 -11
  60. sglang/srt/models/clip.py +4 -4
  61. sglang/srt/models/deepseek_janus_pro.py +1 -1
  62. sglang/srt/models/deepseek_nextn.py +1 -20
  63. sglang/srt/models/deepseek_v2.py +55 -20
  64. sglang/srt/models/gemma3_mm.py +1 -1
  65. sglang/srt/models/internlm2.py +3 -0
  66. sglang/srt/models/internvl.py +670 -0
  67. sglang/srt/models/llama.py +1 -1
  68. sglang/srt/models/llama4.py +53 -7
  69. sglang/srt/models/minicpmv.py +1 -1
  70. sglang/srt/models/mllama.py +1 -1
  71. sglang/srt/models/phi3_small.py +16 -2
  72. sglang/srt/models/qwen2_5_vl.py +8 -4
  73. sglang/srt/models/qwen2_vl.py +4 -4
  74. sglang/srt/models/xiaomi_mimo.py +171 -0
  75. sglang/srt/openai_api/adapter.py +24 -40
  76. sglang/srt/openai_api/protocol.py +28 -16
  77. sglang/srt/reasoning_parser.py +2 -2
  78. sglang/srt/sampling/sampling_batch_info.py +54 -2
  79. sglang/srt/sampling/sampling_params.py +2 -0
  80. sglang/srt/server_args.py +30 -6
  81. sglang/srt/utils.py +35 -1
  82. sglang/test/test_block_fp8.py +2 -2
  83. sglang/test/test_deepep_utils.py +219 -0
  84. sglang/test/test_utils.py +3 -1
  85. sglang/version.py +1 -1
  86. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +14 -6
  87. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +90 -80
  88. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
  89. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
  90. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -137,17 +137,7 @@ def load_model(server_args, port_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
-    model_config = ModelConfig(
-        server_args.model_path,
-        trust_remote_code=server_args.trust_remote_code,
-        revision=server_args.revision,
-        context_length=server_args.context_length,
-        model_override_args=server_args.json_model_override_args,
-        is_embedding=server_args.is_embedding,
-        enable_multimodal=server_args.enable_multimodal,
-        dtype=server_args.dtype,
-        quantization=server_args.quantization,
-    )
+    model_config = ModelConfig.from_server_args(server_args)
     model_runner = ModelRunner(
         model_config=model_config,
         mem_fraction_static=server_args.mem_fraction_static,
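This release replaces the ten-argument ModelConfig construction with a factory classmethod. model_config.py gains 17 lines in the same diff (file 8 above), consistent with a helper along these lines; what follows is a hedged sketch inferred from the deleted call site, not the verified implementation:

    class ModelConfig:
        @classmethod
        def from_server_args(cls, server_args, **kwargs):
            # Sketch: forward the same fields the old bench_one_batch call site passed.
            return cls(
                server_args.model_path,
                trust_remote_code=server_args.trust_remote_code,
                revision=server_args.revision,
                context_length=server_args.context_length,
                model_override_args=server_args.json_model_override_args,
                is_embedding=server_args.is_embedding,
                enable_multimodal=server_args.enable_multimodal,
                dtype=server_args.dtype,
                quantization=server_args.quantization,
                **kwargs,
            )

Whatever its exact body, the factory gives every caller a single construction path instead of a duplicated argument list.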
sglang/bench_serving.py CHANGED
@@ -58,6 +58,7 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
+    image_data: str
     extra_request_body: Dict[str, Any]
@@ -347,6 +348,11 @@ async def async_request_sglang_generate(
         "logprob_start_len": -1,
         **request_func_input.extra_request_body,
     }
+
+    # Add image data if available
+    if request_func_input.image_data:
+        payload["image_data"] = request_func_input.image_data
+
     headers = get_auth_headers()
 
     output = RequestFuncOutput()
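With this field in place the benchmark client forwards the image to the /generate endpoint alongside the text. The resulting request body looks roughly like this (a hedged illustration; only image_data comes from the hunk above, the surrounding keys are the usual bench_serving payload):

    payload = {
        "text": "Question: ...\n\nAnswer: ",
        "sampling_params": {"max_new_tokens": 256},
        "logprob_start_len": -1,
        "image_data": "data:image/jpeg;base64,/9j/4AAQ...",  # present only when set
    }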
@@ -510,6 +516,13 @@ def get_dataset(args, tokenizer):
             tokenizer=tokenizer,
             args=args,
         )
+    elif args.dataset_name == "mmmu":
+        input_requests = sample_mmmu_requests(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.random_output_len,
+            random_sample=True,
+        )
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
     return input_requests
@@ -597,6 +610,121 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
     return filename
 
 
+def sample_mmmu_requests(
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+    random_sample: bool = True,
+) -> List[Tuple[str, int, int]]:
+    """
+    Sample requests from the MMMU dataset using HuggingFace datasets.
+
+    Args:
+        num_requests: Number of requests to sample.
+        tokenizer: Tokenizer to use for token counting.
+        fixed_output_len: If provided, use this fixed output length for all requests.
+        random_sample: Whether to randomly sample or take the first N.
+
+    Returns:
+        List of tuples (prompt, prompt_token_len, output_token_len).
+    """
+    try:
+        import base64
+        import io
+
+        from datasets import load_dataset
+    except ImportError:
+        raise ImportError("Please install datasets: pip install datasets")
+
+    print("Loading MMMU dataset from HuggingFace...")
+
+    try:
+        print("Attempting to load MMMU Math dataset...")
+        mmmu_dataset = load_dataset("MMMU/MMMU", "Math", split="test")
+        print(
+            f"Successfully loaded MMMU Math dataset from HuggingFace with {len(mmmu_dataset)} examples"
+        )
+    except Exception as e:
+        print(f"Failed to load MMMU Math dataset: {e}")
+        raise ValueError(f"Failed to load MMMU dataset: {e}")
+
+    # Sample from the dataset
+    if len(mmmu_dataset) > num_requests:
+        if random_sample:
+            # Random sample
+            indices = random.sample(range(len(mmmu_dataset)), num_requests)
+            sample_dataset = mmmu_dataset.select(indices)
+        else:
+            # Take first N
+            sample_dataset = mmmu_dataset.select(
+                range(min(num_requests, len(mmmu_dataset)))
+            )
+    else:
+        print(f"Dataset has less than {num_requests} examples, using all examples")
+        sample_dataset = mmmu_dataset
+
+    print(f"Selected {len(sample_dataset)} examples for benchmarking")
+
+    # Create prompts
+    filtered_dataset = []
+
+    for i, example in enumerate(sample_dataset):
+        try:
+            # Extract image_1
+            image = example.get("image_1")
+
+            if image is not None:
+                if hasattr(image, "save"):
+                    # Convert RGBA images to RGB before encoding
+                    if image.mode == "RGBA":
+                        image = image.convert("RGB")
+
+                    # Encode image to base64
+                    buffered = io.BytesIO()
+                    image.save(buffered, format="JPEG")
+                    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                    image_path = f"data:image/jpeg;base64,{img_str}"
+                else:
+                    continue
+
+                # Extract the question
+                question = example.get("question")
+
+                # Create the prompt with image, question
+                prompt = f"Question: {question}\n\nAnswer: "
+                prompt = tokenizer.apply_chat_template(
+                    [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "image_url", "image_url": {"url": image_path}},
+                                {"type": "text", "text": prompt},
+                            ],
+                        }
+                    ],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+                prompt = f"<image>{image_path}</image>{prompt}"
+
+                # Calculate token lengths
+                # Note: This is approximate since we're not rendering the actual image tokens
+                prompt_token_ids = tokenizer.encode(prompt)
+                prompt_len = (
+                    len(prompt_token_ids) + 512
+                )  # Add estimate for image tokens
+
+                output_len = fixed_output_len if fixed_output_len is not None else 256
+
+                filtered_dataset.append((prompt, prompt_len, output_len))
+
+        except Exception as e:
+            print(f"Error processing example {i}: {e}")
+
+    print(f"\nCreated {len(filtered_dataset)} MMMU prompts")
+    return filtered_dataset
+
+
 def sample_sharegpt_requests(
     dataset_path: str,
     num_requests: int,
@@ -1004,6 +1132,15 @@ async def benchmark(
     else:
         lora_name = None
 
+    if "<image>" in test_prompt:
+        import re
+
+        image_match = re.search(r"<image>(.*?)</image>(.*)", test_prompt)
+        image_data = image_match.group(1) if image_match else None
+        test_prompt = image_match.group(2) if image_match else test_prompt
+    else:
+        image_data = None
+
     # Create the test input once
     test_input = RequestFuncInput(
         model=model_id,
@@ -1012,6 +1149,7 @@ async def benchmark(
         prompt_len=test_prompt_len,
         output_len=min(test_output_len, 32),
         lora_name=lora_name,
+        image_data=image_data,
         extra_request_body=extra_request_body,
     )
@@ -1063,6 +1201,15 @@ async def benchmark(
     else:
         lora_name = None
 
+    if "<image>" in prompt:
+        import re
+
+        image_match = re.search(r"<image>(.*?)</image>(.*)", prompt)
+        image_data = image_match.group(1) if image_match else None
+        prompt = image_match.group(2) if image_match else prompt
+    else:
+        image_data = None
+
     request_func_input = RequestFuncInput(
         model=model_id,
         prompt=prompt,
@@ -1070,6 +1217,7 @@ async def benchmark(
         prompt_len=prompt_len,
         output_len=output_len,
         lora_name=lora_name,
+        image_data=image_data,
         extra_request_body=extra_request_body,
     )
     tasks.append(
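Both hunks rely on the same convention sample_mmmu_requests uses when building prompts: the base64 data URL travels inside the text prompt between <image> and </image> markers and is split back out here before the request object is built. A standalone round trip (values illustrative):

    import re

    # A prompt in the form produced by sample_mmmu_requests (shortened, single line).
    prompt = "<image>data:image/jpeg;base64,/9j/4AAQ</image>Question: What is 2+2? Answer: "
    m = re.search(r"<image>(.*?)</image>(.*)", prompt)
    image_data = m.group(1) if m else None  # "data:image/jpeg;base64,/9j/4AAQ"
    prompt = m.group(2) if m else prompt    # "Question: What is 2+2? Answer: "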
@@ -1444,7 +1592,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix"],
+        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
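Together these bench_serving.py changes add an end-to-end MMMU image benchmark. A plausible invocation against a locally served VLM (flags grounded in the diff: --dataset-name gains the mmmu choice, and sample_mmmu_requests reads args.num_prompts and args.random_output_len):

    python3 -m sglang.bench_serving --backend sglang \
        --dataset-name mmmu --num-prompts 100 --random-output-len 256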
sglang/lang/chat_template.py CHANGED
@@ -270,6 +270,29 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="janus",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "user": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+        image_token="<image_placeholder>\n",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
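Given the prefix/suffix table registered for "janus" above, a one-turn conversation with an image should render roughly as follows (hand-derived from the table, not output captured from sglang; the assistant text is illustrative):

    <|User|><image_placeholder>
    Describe the image.<|Assistant|>It shows a small cat.<|end▁of▁sentence|>

The empty system prefix and suffix mean a system message contributes nothing to the rendered prompt, and decoding stops at <|end▁of▁sentence|>.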
@@ -395,6 +418,20 @@ register_chat_template(
     )
 )
 
+# Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
+register_chat_template(
+    ChatTemplate(
+        name="internvl-2-5",
+        default_system_prompt="你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="granite-3-instruct",
@@ -565,6 +602,13 @@ def match_gemma3_instruct(model_path: str):
     return get_chat_template("gemma-it")
 
 
+@register_chat_template_matching_function
+def match_internvl_chat(model_path: str):
+    model_path = model_path.lower()
+    if "internvl" in model_path:
+        return get_chat_template("internvl-2-5")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
sglang/srt/configs/deepseekvl2.py CHANGED
@@ -48,6 +48,9 @@ class DictOutput(object):
     def __getitem__(self, item):
         return self.__dict__[item]
 
+    def __contains__(self, key):
+        return key in self.__dict__
+
     def __setitem__(self, key, value):
         self.__dict__[key] = value
 
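DictOutput already exposed dict-style reads and writes over the instance __dict__; the new __contains__ lets callers probe for optional fields with `in`. A minimal illustration (the subclass and field names are made up):

    class VLMImageOutput(DictOutput):
        def __init__(self):
            self.pixel_values = [1, 2, 3]

    out = VLMImageOutput()
    print("pixel_values" in out)  # True, via the new __contains__
    print(out["pixel_values"])    # [1, 2, 3], via the existing __getitem__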
sglang/srt/configs/device_config.py CHANGED
@@ -10,7 +10,7 @@ class DeviceConfig:
     device: Optional[torch.device]
 
     def __init__(self, device: str = "cuda") -> None:
-        if device in ["cuda", "xpu", "hpu", "cpu"]:
+        if device in ["cuda", "xpu", "hpu", "cpu", "npu"]:
             self.device_type = device
         else:
             raise RuntimeError(f"Not supported device type: {device}")
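The new "npu" entry lands alongside this release's NpuCommunicator (file 17 above, npu_communicator.py +39), extending device selection to NPUs, presumably Ascend hardware via torch_npu. A quick check of the changed behavior (assuming the import path matches the file location):

    from sglang.srt.configs.device_config import DeviceConfig

    cfg = DeviceConfig("npu")  # accepted as of 0.4.6.post3; raised RuntimeError in post2
    print(cfg.device_type)     # "npu"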