sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. sglang/bench_serving.py +56 -12
  2. sglang/launch_server.py +2 -0
  3. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +101 -4
  4. sglang/srt/compilation/backend.py +1 -1
  5. sglang/srt/configs/model_config.py +5 -5
  6. sglang/srt/distributed/parallel_state.py +0 -7
  7. sglang/srt/entrypoints/engine.py +18 -15
  8. sglang/srt/entrypoints/grpc_server.py +0 -1
  9. sglang/srt/entrypoints/http_server.py +75 -94
  10. sglang/srt/environ.py +16 -2
  11. sglang/srt/eplb/expert_distribution.py +30 -0
  12. sglang/srt/function_call/function_call_parser.py +2 -0
  13. sglang/srt/function_call/minimax_m2.py +367 -0
  14. sglang/srt/layers/activation.py +6 -0
  15. sglang/srt/layers/attention/flashattention_backend.py +12 -2
  16. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  17. sglang/srt/layers/attention/flashinfer_mla_backend.py +18 -10
  18. sglang/srt/layers/attention/trtllm_mla_backend.py +1 -13
  19. sglang/srt/layers/attention/utils.py +78 -0
  20. sglang/srt/layers/communicator.py +1 -0
  21. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  22. sglang/srt/layers/layernorm.py +19 -4
  23. sglang/srt/layers/logits_processor.py +5 -0
  24. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  25. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  26. sglang/srt/layers/moe/ep_moe/layer.py +79 -272
  27. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  28. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  29. sglang/srt/layers/moe/moe_runner/deep_gemm.py +287 -22
  30. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  31. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  32. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  33. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  34. sglang/srt/layers/moe/token_dispatcher/deepep.py +18 -14
  35. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  36. sglang/srt/layers/moe/topk.py +4 -4
  37. sglang/srt/layers/moe/utils.py +3 -4
  38. sglang/srt/layers/quantization/__init__.py +3 -5
  39. sglang/srt/layers/quantization/awq.py +0 -3
  40. sglang/srt/layers/quantization/base_config.py +7 -0
  41. sglang/srt/layers/quantization/fp8.py +68 -63
  42. sglang/srt/layers/quantization/gguf.py +566 -0
  43. sglang/srt/layers/quantization/mxfp4.py +30 -38
  44. sglang/srt/layers/quantization/unquant.py +23 -45
  45. sglang/srt/layers/quantization/w4afp8.py +38 -2
  46. sglang/srt/layers/radix_attention.py +5 -2
  47. sglang/srt/layers/rotary_embedding.py +13 -1
  48. sglang/srt/layers/sampler.py +12 -1
  49. sglang/srt/managers/io_struct.py +3 -0
  50. sglang/srt/managers/multi_tokenizer_mixin.py +17 -1
  51. sglang/srt/managers/scheduler.py +21 -15
  52. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  53. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  54. sglang/srt/managers/tokenizer_manager.py +11 -19
  55. sglang/srt/mem_cache/hicache_storage.py +7 -1
  56. sglang/srt/mem_cache/memory_pool.py +82 -0
  57. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  58. sglang/srt/model_executor/forward_batch_info.py +44 -3
  59. sglang/srt/model_executor/model_runner.py +1 -149
  60. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  61. sglang/srt/models/deepseek_v2.py +147 -44
  62. sglang/srt/models/glm4_moe.py +322 -354
  63. sglang/srt/models/glm4_moe_nextn.py +4 -14
  64. sglang/srt/models/glm4v_moe.py +29 -196
  65. sglang/srt/models/minimax_m2.py +922 -0
  66. sglang/srt/models/nvila.py +355 -0
  67. sglang/srt/models/nvila_lite.py +184 -0
  68. sglang/srt/models/qwen2.py +22 -1
  69. sglang/srt/models/qwen3.py +34 -4
  70. sglang/srt/models/qwen3_moe.py +2 -4
  71. sglang/srt/multimodal/processors/base_processor.py +1 -0
  72. sglang/srt/multimodal/processors/glm4v.py +1 -1
  73. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  74. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  75. sglang/srt/parser/reasoning_parser.py +28 -1
  76. sglang/srt/server_args.py +365 -186
  77. sglang/srt/single_batch_overlap.py +2 -7
  78. sglang/srt/utils/common.py +87 -42
  79. sglang/srt/utils/hf_transformers_utils.py +7 -3
  80. sglang/test/test_deterministic.py +235 -12
  81. sglang/test/test_deterministic_utils.py +2 -1
  82. sglang/version.py +1 -1
  83. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +7 -6
  84. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +87 -82
  85. sglang/srt/models/vila.py +0 -306
  86. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  87. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  88. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/{vila.py → nvila.py}
@@ -1,64 +1,72 @@
-from typing import Any, Dict, List, Optional, Type
+from typing import Any
 
 import torch.nn as nn
 from transformers.configuration_utils import PretrainedConfig
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
-from sglang.srt.managers.io_struct import (
-    EmbeddingReqInput,
-    GenerateReqInput,
-    ImageDataInputItem,
-)
-from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.models.nvila import NVILAForConditionalGeneration
+from sglang.srt.models.nvila_lite import NVILALiteForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
 from sglang.srt.server_args import ServerArgs
 
+NUM_VIDEO_FRAMES = 8
 
-class VILAProcessor(ProcessorMixin):
-    """A stub class for the VILA processor."""
-
-    tokenizer: PreTrainedTokenizerBase
-
-
-class VILAMultimodalProcessor(BaseMultimodalProcessor):
-    models: List[Type[nn.Module]] = [VILAForConditionalGeneration]
 
-    _processor: VILAProcessor
+class NVILAMultimodalProcessor(BaseMultimodalProcessor):
+    models: list[type[nn.Module]] = [
+        NVILAForConditionalGeneration,
+        NVILALiteForConditionalGeneration,
+    ]
 
     def __init__(
         self,
         hf_config: PretrainedConfig,
         server_args: ServerArgs,
-        _processor: VILAProcessor,
+        _processor: ProcessorMixin,
         *args,
         **kwargs,
     ) -> None:
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+        self._processor: ProcessorMixin
+
+        tokenizer: PreTrainedTokenizerBase = getattr(self._processor, "tokenizer")
+
         self.mm_tokens = MultimodalSpecialTokens(
-            image_token=self._processor.tokenizer.image_token,
+            image_token=tokenizer.image_token,
             image_token_id=hf_config.image_token_id,
+            video_token=tokenizer.video_token,
             video_token_id=hf_config.video_token_id,
         ).build(_processor)
 
     async def process_mm_data_async(
         self,
-        image_data: Optional[ImageDataInputItem | List[ImageDataInputItem]],
-        input_text: str | List[int],
-        request_obj: GenerateReqInput | EmbeddingReqInput,
+        image_data,
+        audio_data,
+        input_text,
+        request_obj: GenerateReqInput,
         **kwargs,
-    ) -> Optional[Dict[str, Any]]:
+    ) -> dict[str, Any] | None:
         base_output = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.mm_tokens,
-            image_data=image_data,
+            image_data=request_obj.image_data,  # type: ignore
+            video_data=request_obj.video_data,  # type: ignore
         )
 
+        for i, video in enumerate(base_output.videos):  # type: ignore
+            base_output.videos[i] = [x.asnumpy() for x in video]  # type: ignore
+
         mm_items, input_ids, _ = self.process_and_combine_mm_data(
-            base_output, self.mm_tokens
+            base_output,
+            self.mm_tokens,
+            do_sample_frames=True,
+            num_frames=NUM_VIDEO_FRAMES,
         )
 
         return {
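
The rewritten processor adds a video path: each decoded frame is converted to a NumPy array via `.asnumpy()` (decord-style frame objects, by the look of it), and `process_and_combine_mm_data` is asked to sample `NUM_VIDEO_FRAMES` frames per clip via `do_sample_frames=True`. A minimal standalone sketch of what uniform sampling to 8 frames looks like, assuming an evenly-spaced strategy and a hypothetical helper name (the real sampling lives inside `process_and_combine_mm_data` and may differ):

```python
import numpy as np

NUM_VIDEO_FRAMES = 8  # same constant the new nvila.py introduces


def sample_frames_uniform(frames, num_frames=NUM_VIDEO_FRAMES):
    """Hypothetical helper: keep num_frames evenly spaced frames of a clip."""
    if len(frames) <= num_frames:
        return frames
    indices = np.linspace(0, len(frames) - 1, num=num_frames).round().astype(int)
    return [frames[i] for i in indices]


# A dummy 120-frame clip collapses to 8 evenly spaced frames.
clip = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(120)]
assert len(sample_frames_uniform(clip)) == NUM_VIDEO_FRAMES
```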
sglang/srt/multimodal/processors/points_v15_chat.py
@@ -7,12 +7,12 @@ from PIL import Image
 
 from sglang.srt.models.points_v15_chat import POINTSV15ChatModel
 from sglang.srt.multimodal.processors.qwen_vl import (
-    Qwen2_5VLImageProcessor,
+    QwenVLImageProcessor,
     resize_image_async,
 )
 
 
-class POINTSV15ChatProcessor(Qwen2_5VLImageProcessor):
+class POINTSV15ChatProcessor(QwenVLImageProcessor):
     models = [POINTSV15ChatModel]
 
     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
sglang/srt/parser/reasoning_parser.py
@@ -249,6 +249,31 @@ class GptOssDetector(BaseReasoningFormatDetector):
         )
 
 
+class MiniMaxAppendThinkDetector(BaseReasoningFormatDetector):
+    """
+    Append the `<think>` token to the beginning of the text.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+        # scheduler.py needs `reasoning_parser.detector.think_end_token`
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+        )
+        self.is_first_chunk = False
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        if not self.is_first_chunk:
+            self.is_first_chunk = True
+            new_text = self.think_start_token + new_text
+        return StreamingParseResult(normal_text=new_text)
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        return StreamingParseResult(normal_text=self.think_start_token + text)
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
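
The new detector never strips reasoning; its name and docstring suggest it compensates for outputs that open without a `<think>` tag by re-inserting the tag and passing everything through as normal text. A self-contained re-creation of just the streaming increment logic, with `StreamingParseResult` stubbed as a plain dataclass (the real types live in `reasoning_parser.py`):

```python
from dataclasses import dataclass


@dataclass
class StreamingParseResult:  # stub of the real result type
    normal_text: str = ""
    reasoning_text: str = ""


class AppendThinkSketch:
    """Mirrors only the increment logic of MiniMaxAppendThinkDetector."""

    think_start_token = "<think>"

    def __init__(self) -> None:
        self.is_first_chunk = False

    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
        if not self.is_first_chunk:
            self.is_first_chunk = True  # prefix fires once, on the first chunk
            new_text = self.think_start_token + new_text
        return StreamingParseResult(normal_text=new_text)


detector = AppendThinkSketch()
print(detector.parse_streaming_increment("Let me check.").normal_text)
# -> <think>Let me check.
print(detector.parse_streaming_increment(" Done.").normal_text)
# ->  Done.
```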
@@ -268,6 +293,8 @@ class ReasoningParser:
         "kimi": KimiDetector,
         "qwen3": Qwen3Detector,
         "qwen3-thinking": Qwen3Detector,
+        "minimax": Qwen3Detector,
+        "minimax-append-think": MiniMaxAppendThinkDetector,
         "step3": DeepSeekR1Detector,
     }
 
@@ -285,7 +312,7 @@ class ReasoningParser:
             raise ValueError(f"Unsupported model type: {model_type}")
 
         # Special cases where we override force_reasoning
-        if model_type.lower() in {"qwen3-thinking", "gpt-oss"}:
+        if model_type.lower() in {"qwen3-thinking", "gpt-oss", "minimax"}:
             force_reasoning = True
 
         # Only pass force_reasoning if explicitly set, let detectors use their defaults
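
Net effect of the three reasoning-parser hunks: `minimax` reuses `Qwen3Detector` and joins the `force_reasoning` override set, so text preceding `</think>` counts as reasoning even when no opening tag was emitted, while `minimax-append-think` instead re-attaches the tag and streams everything as normal text. A hedged usage sketch; the constructor argument and the `parse_non_stream` return shape are assumptions based on the class shown above, so check `reasoning_parser.py` in the release for the authoritative API:

```python
# Assumed import path and call signatures.
from sglang.srt.parser.reasoning_parser import ReasoningParser

parser = ReasoningParser(model_type="minimax")  # maps to Qwen3Detector, force_reasoning=True

# Everything before </think> should come back as reasoning text even though
# the model never emitted an opening <think> tag.
reasoning_text, normal_text = parser.parse_non_stream(
    "Weighing both options carefully.</think>The answer is 42."
)
```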