sglang 0.4.5__py3-none-any.whl → 0.4.5.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. sglang/bench_one_batch.py +21 -0
  2. sglang/bench_serving.py +10 -4
  3. sglang/srt/configs/model_config.py +37 -5
  4. sglang/srt/constrained/base_grammar_backend.py +26 -5
  5. sglang/srt/constrained/llguidance_backend.py +1 -0
  6. sglang/srt/constrained/outlines_backend.py +1 -0
  7. sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  8. sglang/srt/constrained/xgrammar_backend.py +1 -0
  9. sglang/srt/disaggregation/base/__init__.py +8 -0
  10. sglang/srt/disaggregation/base/conn.py +113 -0
  11. sglang/srt/disaggregation/decode.py +18 -5
  12. sglang/srt/disaggregation/mini_lb.py +53 -122
  13. sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  14. sglang/srt/disaggregation/mooncake/conn.py +615 -0
  15. sglang/srt/disaggregation/mooncake/transfer_engine.py +108 -0
  16. sglang/srt/disaggregation/prefill.py +43 -19
  17. sglang/srt/disaggregation/utils.py +31 -0
  18. sglang/srt/entrypoints/EngineBase.py +53 -0
  19. sglang/srt/entrypoints/engine.py +36 -8
  20. sglang/srt/entrypoints/http_server.py +37 -8
  21. sglang/srt/entrypoints/http_server_engine.py +142 -0
  22. sglang/srt/entrypoints/verl_engine.py +37 -10
  23. sglang/srt/hf_transformers_utils.py +4 -0
  24. sglang/srt/layers/attention/flashattention_backend.py +330 -200
  25. sglang/srt/layers/attention/flashinfer_backend.py +13 -7
  26. sglang/srt/layers/attention/vision.py +1 -1
  27. sglang/srt/layers/dp_attention.py +2 -4
  28. sglang/srt/layers/elementwise.py +15 -2
  29. sglang/srt/layers/linear.py +1 -0
  30. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +34 -34
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +38 -21
  38. sglang/srt/layers/moe/router.py +7 -1
  39. sglang/srt/layers/moe/topk.py +37 -16
  40. sglang/srt/layers/quantization/__init__.py +12 -5
  41. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +4 -0
  42. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +68 -45
  43. sglang/srt/layers/quantization/fp8.py +25 -13
  44. sglang/srt/layers/quantization/fp8_kernel.py +130 -4
  45. sglang/srt/layers/quantization/fp8_utils.py +34 -6
  46. sglang/srt/layers/quantization/kv_cache.py +43 -52
  47. sglang/srt/layers/quantization/modelopt_quant.py +271 -4
  48. sglang/srt/layers/quantization/w8a8_fp8.py +154 -4
  49. sglang/srt/layers/quantization/w8a8_int8.py +1 -0
  50. sglang/srt/layers/radix_attention.py +13 -1
  51. sglang/srt/layers/rotary_embedding.py +12 -1
  52. sglang/srt/managers/io_struct.py +254 -97
  53. sglang/srt/managers/mm_utils.py +3 -2
  54. sglang/srt/managers/multimodal_processors/base_processor.py +114 -77
  55. sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  56. sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
  57. sglang/srt/managers/schedule_batch.py +62 -21
  58. sglang/srt/managers/scheduler.py +71 -14
  59. sglang/srt/managers/tokenizer_manager.py +17 -3
  60. sglang/srt/managers/tp_worker.py +1 -0
  61. sglang/srt/mem_cache/memory_pool.py +14 -1
  62. sglang/srt/metrics/collector.py +9 -0
  63. sglang/srt/model_executor/cuda_graph_runner.py +7 -4
  64. sglang/srt/model_executor/forward_batch_info.py +234 -15
  65. sglang/srt/model_executor/model_runner.py +48 -9
  66. sglang/srt/model_loader/loader.py +31 -4
  67. sglang/srt/model_loader/weight_utils.py +4 -2
  68. sglang/srt/models/baichuan.py +2 -0
  69. sglang/srt/models/chatglm.py +1 -0
  70. sglang/srt/models/commandr.py +1 -0
  71. sglang/srt/models/dbrx.py +1 -0
  72. sglang/srt/models/deepseek.py +1 -0
  73. sglang/srt/models/deepseek_v2.py +248 -61
  74. sglang/srt/models/exaone.py +1 -0
  75. sglang/srt/models/gemma.py +1 -0
  76. sglang/srt/models/gemma2.py +1 -0
  77. sglang/srt/models/gemma3_causal.py +1 -0
  78. sglang/srt/models/gpt2.py +1 -0
  79. sglang/srt/models/gpt_bigcode.py +1 -0
  80. sglang/srt/models/granite.py +1 -0
  81. sglang/srt/models/grok.py +1 -0
  82. sglang/srt/models/internlm2.py +1 -0
  83. sglang/srt/models/llama.py +1 -0
  84. sglang/srt/models/llama4.py +101 -34
  85. sglang/srt/models/minicpm.py +1 -0
  86. sglang/srt/models/minicpm3.py +2 -0
  87. sglang/srt/models/mixtral.py +1 -0
  88. sglang/srt/models/mixtral_quant.py +1 -0
  89. sglang/srt/models/mllama.py +51 -8
  90. sglang/srt/models/mllama4.py +102 -29
  91. sglang/srt/models/olmo.py +1 -0
  92. sglang/srt/models/olmo2.py +1 -0
  93. sglang/srt/models/olmoe.py +1 -0
  94. sglang/srt/models/phi3_small.py +1 -0
  95. sglang/srt/models/qwen.py +1 -0
  96. sglang/srt/models/qwen2.py +1 -0
  97. sglang/srt/models/qwen2_5_vl.py +35 -70
  98. sglang/srt/models/qwen2_moe.py +1 -0
  99. sglang/srt/models/qwen2_vl.py +27 -25
  100. sglang/srt/models/stablelm.py +1 -0
  101. sglang/srt/models/xverse.py +1 -0
  102. sglang/srt/models/xverse_moe.py +1 -0
  103. sglang/srt/openai_api/adapter.py +4 -1
  104. sglang/srt/patch_torch.py +11 -0
  105. sglang/srt/server_args.py +34 -0
  106. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  107. sglang/srt/speculative/eagle_utils.py +1 -11
  108. sglang/srt/speculative/eagle_worker.py +6 -2
  109. sglang/srt/utils.py +120 -9
  110. sglang/test/attention/test_flashattn_backend.py +259 -221
  111. sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  112. sglang/test/attention/test_prefix_chunk_info.py +224 -0
  113. sglang/test/test_block_fp8.py +57 -0
  114. sglang/test/test_utils.py +19 -8
  115. sglang/version.py +1 -1
  116. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/METADATA +14 -4
  117. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/RECORD +120 -106
  118. sglang/srt/disaggregation/conn.py +0 -81
  119. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/WHEEL +0 -0
  120. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/licenses/LICENSE +0 -0
  121. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  """
2
- Minimal HTTP load balancer for prefill and decode servers for testing purpose.
2
+ Minimal HTTP load balancer for prefill and decode servers for testing.
3
3
  """
4
4
 
5
5
  import asyncio
@@ -22,64 +22,59 @@ class MiniLoadBalancer:
22
22
  def select_pair(self):
23
23
  return random.choice(self.prefill_servers), random.choice(self.decode_servers)
24
24
 
25
- async def generate_request(self, request_data):
26
- prefill_server, decode_server = self.select_pair()
27
-
28
- # Parse and transform prefill_server
29
- parsed_url = urllib.parse.urlparse(prefill_server)
30
- hostname = parsed_url.hostname
31
- bootstrap_host = f"{hostname}"
32
-
33
- modified_request = request_data.copy()
34
- modified_request.update(
35
- {
36
- "bootstrap_host": bootstrap_host,
37
- "bootstrap_room": random.randint(0, 2**63 - 1),
38
- }
39
- )
25
+ async def generate(
26
+ self, modified_request, prefill_server, decode_server
27
+ ) -> ORJSONResponse:
40
28
 
41
29
  async with aiohttp.ClientSession() as session:
42
- # Create the tasks
43
30
  tasks = [
44
31
  session.post(f"{prefill_server}/generate", json=modified_request),
45
32
  session.post(f"{decode_server}/generate", json=modified_request),
46
33
  ]
34
+ # Wait for both responses to complete. Prefill should end first.
35
+ prefill_response, decode_response = await asyncio.gather(*tasks)
36
+
37
+ return ORJSONResponse(
38
+ content=await decode_response.json(),
39
+ status_code=decode_response.status,
40
+ )
41
+
42
+ async def generate_stream(self, modified_request, prefill_server, decode_server):
43
+ async def stream_results():
44
+ async with aiohttp.ClientSession(
45
+ timeout=aiohttp.ClientTimeout(
46
+ total=3600
47
+ ) # Add timeout for request reliability
48
+ ) as session:
49
+ try:
50
+ # Create the tasks for both prefill and decode requests
51
+ tasks = [
52
+ session.post(
53
+ f"{prefill_server}/generate", json=modified_request
54
+ ),
55
+ session.post(
56
+ f"{decode_server}/generate", json=modified_request
57
+ ),
58
+ ]
59
+ # Wait for both responses to complete. Since this is streaming, they return immediately.
60
+ prefill_response, decode_response = await asyncio.gather(*tasks)
61
+ async for chunk in decode_response.content:
62
+ yield chunk
63
+ except Exception as e:
64
+ error_msg = {
65
+ "error": {"message": f"Stream processing error: {str(e)}"}
66
+ }
67
+ yield b"data: " + orjson.dumps(
68
+ error_msg, option=orjson.OPT_NON_STR_KEYS
69
+ ) + b"\n\n"
70
+ finally:
71
+ if prefill_response is not None:
72
+ await prefill_response.release()
47
73
 
48
- prefill_response = None
49
- decode_response = None
50
-
51
- # Process responses as they arrive
52
- for i, response in enumerate(asyncio.as_completed(tasks)):
53
- response = await response
54
- # Check if this is the prefill or decode response based on order created
55
- if i == 0: # First completed task
56
- if str(response.url).startswith(prefill_server):
57
- prefill_response = response
58
- if response.status != 200:
59
- raise HTTPException(
60
- status_code=response.status,
61
- detail=f"Prefill server error: Status {response.status} Details: {await response.text()}",
62
- )
63
- else:
64
- decode_response = response
65
- if response.status != 200:
66
- raise HTTPException(
67
- status_code=response.status,
68
- detail=f"Decode server error: Status {response.status} Details: {await response.text()}",
69
- )
70
- else: # Second completed task
71
- if str(response.url).startswith(prefill_server):
72
- prefill_response = response
73
- else:
74
- decode_response = response
75
-
76
- if response.status != 200:
77
- raise HTTPException(
78
- status_code=response.status,
79
- detail=f"{'Prefill' if str(response.url).startswith(prefill_server) else 'Decode'} server error: Status {response.status} Details: {await response.text()}",
80
- )
81
-
82
- return await decode_response.json()
74
+ return StreamingResponse(
75
+ stream_results(),
76
+ media_type="text/event-stream",
77
+ )
83
78
 
84
79
 
85
80
  app = FastAPI()
@@ -169,78 +164,14 @@ async def handle_generate_request(request_data: dict):
169
164
  }
170
165
  )
171
166
 
172
- # Check if streaming is requested
173
167
  if request_data.get("stream", False):
174
-
175
- async def stream_results():
176
- async with aiohttp.ClientSession(
177
- timeout=aiohttp.ClientTimeout(total=3600)
178
- ) as session:
179
- try:
180
- # Create the tasks
181
- tasks = [
182
- session.post(
183
- f"{prefill_server}/generate", json=modified_request
184
- ),
185
- session.post(
186
- f"{decode_server}/generate", json=modified_request
187
- ),
188
- ]
189
-
190
- prefill_response = None
191
- decode_response = None
192
-
193
- # Process responses as they arrive
194
- for i, response_task in enumerate(asyncio.as_completed(tasks)):
195
- response = await response_task
196
-
197
- # Check the response immediately
198
- if str(response.url).startswith(prefill_server):
199
- prefill_response = response
200
- if response.status != 200:
201
- error_msg = {
202
- "error": {
203
- "message": f"Prefill server error: Status {response.status}, Details: {await response.text()}"
204
- }
205
- }
206
- yield b"data: " + orjson.dumps(
207
- error_msg, option=orjson.OPT_NON_STR_KEYS
208
- ) + b"\n\n"
209
- return
210
- else:
211
- decode_response = response
212
- if response.status != 200:
213
- error_msg = {
214
- "error": {
215
- "message": f"Decode server error: Status {response.status}"
216
- }
217
- }
218
- yield b"data: " + orjson.dumps(
219
- error_msg, option=orjson.OPT_NON_STR_KEYS
220
- ) + b"\n\n"
221
- return
222
-
223
- # Stream successful decode server response
224
- async for line in decode_response.content:
225
- yield line
226
- yield b"data: [DONE]\n\n"
227
-
228
- except Exception as e:
229
- error_msg = {
230
- "error": {"message": f"Stream processing error: {str(e)}"}
231
- }
232
- yield b"data: " + orjson.dumps(
233
- error_msg, option=orjson.OPT_NON_STR_KEYS
234
- ) + b"\n\n"
235
-
236
- return StreamingResponse(
237
- stream_results(),
238
- media_type="text/event-stream",
168
+ return await load_balancer.generate_stream(
169
+ modified_request, prefill_server, decode_server
170
+ )
171
+ else:
172
+ return await load_balancer.generate(
173
+ modified_request, prefill_server, decode_server
239
174
  )
240
-
241
- # Non-streaming case
242
- result = await load_balancer.generate_request(request_data)
243
- return ORJSONResponse(content=result)
244
175
 
245
176
 
246
177
  @app.get("/v1/models")
@@ -0,0 +1,6 @@
1
+ from .conn import (
2
+ MooncakeKVBootstrapServer,
3
+ MooncakeKVManager,
4
+ MooncakeKVReceiver,
5
+ MooncakeKVSender,
6
+ )