sglang 0.4.7.post1__py3-none-any.whl → 0.4.8.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. sglang/bench_one_batch.py +8 -6
  2. sglang/srt/_custom_ops.py +2 -2
  3. sglang/srt/code_completion_parser.py +2 -44
  4. sglang/srt/configs/model_config.py +1 -0
  5. sglang/srt/constants.py +3 -0
  6. sglang/srt/conversation.py +14 -3
  7. sglang/srt/custom_op.py +11 -1
  8. sglang/srt/disaggregation/base/conn.py +2 -0
  9. sglang/srt/disaggregation/decode.py +22 -28
  10. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  11. sglang/srt/disaggregation/mini_lb.py +34 -4
  12. sglang/srt/disaggregation/mooncake/conn.py +301 -64
  13. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
  14. sglang/srt/disaggregation/nixl/conn.py +94 -46
  15. sglang/srt/disaggregation/prefill.py +20 -15
  16. sglang/srt/disaggregation/utils.py +47 -18
  17. sglang/srt/distributed/parallel_state.py +12 -4
  18. sglang/srt/entrypoints/engine.py +27 -31
  19. sglang/srt/entrypoints/http_server.py +149 -79
  20. sglang/srt/entrypoints/http_server_engine.py +0 -3
  21. sglang/srt/entrypoints/openai/__init__.py +0 -0
  22. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +115 -34
  23. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  24. sglang/srt/entrypoints/openai/serving_chat.py +897 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +425 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +170 -0
  27. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  28. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  29. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  30. sglang/srt/entrypoints/openai/utils.py +72 -0
  31. sglang/srt/function_call/base_format_detector.py +7 -4
  32. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  33. sglang/srt/function_call/ebnf_composer.py +64 -10
  34. sglang/srt/function_call/function_call_parser.py +6 -6
  35. sglang/srt/function_call/llama32_detector.py +1 -1
  36. sglang/srt/function_call/mistral_detector.py +1 -1
  37. sglang/srt/function_call/pythonic_detector.py +1 -1
  38. sglang/srt/function_call/qwen25_detector.py +1 -1
  39. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  40. sglang/srt/layers/activation.py +28 -3
  41. sglang/srt/layers/attention/aiter_backend.py +5 -2
  42. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  43. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
  44. sglang/srt/layers/attention/flashattention_backend.py +43 -23
  45. sglang/srt/layers/attention/flashinfer_backend.py +9 -6
  46. sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
  47. sglang/srt/layers/attention/flashmla_backend.py +5 -2
  48. sglang/srt/layers/attention/tbo_backend.py +3 -3
  49. sglang/srt/layers/attention/triton_backend.py +19 -11
  50. sglang/srt/layers/communicator.py +5 -5
  51. sglang/srt/layers/dp_attention.py +11 -2
  52. sglang/srt/layers/layernorm.py +44 -2
  53. sglang/srt/layers/linear.py +18 -1
  54. sglang/srt/layers/logits_processor.py +14 -5
  55. sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
  56. sglang/srt/layers/moe/ep_moe/layer.py +286 -13
  57. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +19 -2
  58. sglang/srt/layers/moe/fused_moe_native.py +7 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +13 -2
  61. sglang/srt/layers/moe/fused_moe_triton/layer.py +148 -26
  62. sglang/srt/layers/moe/topk.py +117 -4
  63. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  64. sglang/srt/layers/quantization/fp8.py +25 -17
  65. sglang/srt/layers/quantization/fp8_utils.py +5 -4
  66. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  67. sglang/srt/layers/quantization/utils.py +5 -2
  68. sglang/srt/layers/rotary_embedding.py +144 -12
  69. sglang/srt/layers/sampler.py +1 -1
  70. sglang/srt/layers/vocab_parallel_embedding.py +14 -1
  71. sglang/srt/lora/lora_manager.py +173 -74
  72. sglang/srt/lora/mem_pool.py +49 -45
  73. sglang/srt/lora/utils.py +1 -1
  74. sglang/srt/managers/cache_controller.py +33 -15
  75. sglang/srt/managers/expert_distribution.py +21 -0
  76. sglang/srt/managers/io_struct.py +19 -14
  77. sglang/srt/managers/multimodal_processors/base_processor.py +44 -9
  78. sglang/srt/managers/multimodal_processors/gemma3n.py +97 -0
  79. sglang/srt/managers/schedule_batch.py +49 -32
  80. sglang/srt/managers/schedule_policy.py +70 -56
  81. sglang/srt/managers/scheduler.py +189 -68
  82. sglang/srt/managers/template_manager.py +226 -0
  83. sglang/srt/managers/tokenizer_manager.py +11 -8
  84. sglang/srt/managers/tp_worker.py +12 -2
  85. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  86. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  87. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  88. sglang/srt/mem_cache/chunk_cache.py +11 -16
  89. sglang/srt/mem_cache/hiradix_cache.py +34 -23
  90. sglang/srt/mem_cache/memory_pool.py +118 -114
  91. sglang/srt/mem_cache/radix_cache.py +20 -16
  92. sglang/srt/model_executor/cuda_graph_runner.py +77 -46
  93. sglang/srt/model_executor/forward_batch_info.py +18 -5
  94. sglang/srt/model_executor/model_runner.py +27 -8
  95. sglang/srt/model_loader/loader.py +50 -8
  96. sglang/srt/model_loader/weight_utils.py +100 -2
  97. sglang/srt/models/deepseek_nextn.py +35 -30
  98. sglang/srt/models/deepseek_v2.py +255 -30
  99. sglang/srt/models/gemma3n_audio.py +949 -0
  100. sglang/srt/models/gemma3n_causal.py +1009 -0
  101. sglang/srt/models/gemma3n_mm.py +511 -0
  102. sglang/srt/models/glm4.py +312 -0
  103. sglang/srt/models/hunyuan.py +771 -0
  104. sglang/srt/models/mimo_mtp.py +2 -18
  105. sglang/srt/reasoning_parser.py +21 -11
  106. sglang/srt/server_args.py +51 -9
  107. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
  108. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
  109. sglang/srt/speculative/eagle_utils.py +80 -8
  110. sglang/srt/speculative/eagle_worker.py +124 -41
  111. sglang/srt/torch_memory_saver_adapter.py +19 -15
  112. sglang/srt/two_batch_overlap.py +4 -1
  113. sglang/srt/utils.py +248 -11
  114. sglang/test/test_block_fp8_ep.py +1 -0
  115. sglang/test/test_utils.py +1 -0
  116. sglang/version.py +1 -1
  117. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/METADATA +4 -10
  118. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/RECORD +121 -105
  119. sglang/srt/entrypoints/verl_engine.py +0 -179
  120. sglang/srt/openai_api/adapter.py +0 -2148
  121. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/WHEEL +0 -0
  122. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/licenses/LICENSE +0 -0
  123. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,149 @@
1
+ import json
2
+ import logging
3
+ import uuid
4
+ from abc import ABC, abstractmethod
5
+ from typing import Any, Optional, Union
6
+
7
+ from fastapi import Request
8
+ from fastapi.responses import ORJSONResponse, StreamingResponse
9
+
10
+ from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
11
+ from sglang.srt.managers.io_struct import GenerateReqInput
12
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
# Base class for specific endpoint handlers
class OpenAIServingBase(ABC):
    """Abstract base class for OpenAI-compatible endpoint handlers.

    Drives the common validate -> convert -> dispatch flow in
    ``handle_request``; subclasses implement ``_convert_to_internal_request``
    and override the streaming and/or non-streaming handlers they support.
    Any uncaught exception is converted into a 500 error response.
    """

    def __init__(self, tokenizer_manager: TokenizerManager):
        self.tokenizer_manager = tokenizer_manager

    async def handle_request(
        self, request: OpenAIServingRequest, raw_request: Request
    ) -> Union[Any, StreamingResponse, ErrorResponse]:
        """Handle the specific request type with the common pattern.

        Args:
            request: The parsed OpenAI-style request object.
            raw_request: The underlying FastAPI request (used by handlers to
                detect client disconnects).

        Returns:
            A response object, a ``StreamingResponse``, or an error response.
        """
        try:
            # Validate request; a non-None return value is the rejection message.
            error_msg = self._validate_request(request)
            if error_msg:
                return self.create_error_response(error_msg)

            # Convert to internal format
            adapted_request, processed_request = self._convert_to_internal_request(
                request
            )

            # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
            if hasattr(request, "stream") and request.stream:
                return await self._handle_streaming_request(
                    adapted_request, processed_request, raw_request
                )
            else:
                return await self._handle_non_streaming_request(
                    adapted_request, processed_request, raw_request
                )

        except Exception as e:
            logger.exception(f"Error in request: {e}")
            return self.create_error_response(
                message=f"Internal server error: {str(e)}",
                err_type="InternalServerError",
                status_code=500,
            )

    @abstractmethod
    def _request_id_prefix(self) -> str:
        """Return the prefix prepended to generated request IDs."""
        pass

    def _generate_request_id_base(self, request: OpenAIServingRequest) -> Optional[str]:
        """Generate a request ID for this request, or None to defer assignment.

        TODO(chang): the rid is used in the io_struct check and often violates
        the `The rid should be a list` AssertionError. Temporarily return None
        in this function until the rid logic is clear.
        """
        return None
        # Disabled until the rid semantics are settled (see TODO above):
        # if rid := getattr(request, "rid", None):
        #     return rid
        # return f"{self._request_id_prefix()}{uuid.uuid4().hex}"

    @abstractmethod
    def _convert_to_internal_request(
        self,
        request: OpenAIServingRequest,
    ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
        """Convert OpenAI request to internal format"""
        pass

    async def _handle_streaming_request(
        self,
        adapted_request: GenerateReqInput,
        request: OpenAIServingRequest,
        raw_request: Request,
    ) -> Union[StreamingResponse, ErrorResponse, ORJSONResponse]:
        """Handle streaming request

        Override this method in child classes that support streaming requests.
        """
        return self.create_error_response(
            message=f"{self.__class__.__name__} does not support streaming requests",
            err_type="NotImplementedError",
            status_code=501,
        )

    async def _handle_non_streaming_request(
        self,
        adapted_request: GenerateReqInput,
        request: OpenAIServingRequest,
        raw_request: Request,
    ) -> Union[Any, ErrorResponse, ORJSONResponse]:
        """Handle non-streaming request

        Override this method in child classes that support non-streaming requests.
        """
        return self.create_error_response(
            message=f"{self.__class__.__name__} does not support non-streaming requests",
            err_type="NotImplementedError",
            status_code=501,
        )

    def _validate_request(self, _: OpenAIServingRequest) -> Optional[str]:
        """Validate the request; return an error message if invalid, else None."""
        return None

    def create_error_response(
        self,
        message: str,
        err_type: str = "BadRequestError",
        status_code: int = 400,
        param: Optional[str] = None,
    ) -> ORJSONResponse:
        """Create a JSON error response with the given message and status code."""
        # TODO: remove fastapi dependency in openai and move response handling to the entrypoint
        error = ErrorResponse(
            object="error",
            message=message,
            type=err_type,
            param=param,
            code=status_code,
        )
        return ORJSONResponse(content=error.model_dump(), status_code=status_code)

    def create_streaming_error_response(
        self,
        message: str,
        err_type: str = "BadRequestError",
        status_code: int = 400,
    ) -> str:
        """Create a JSON-encoded error payload string for streaming responses."""
        error = ErrorResponse(
            object="error",
            message=message,
            type=err_type,
            param=None,
            code=status_code,
        )
        return json.dumps({"error": error.model_dump()})