sglang 0.4.7.post1__py3-none-any.whl → 0.4.8.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. sglang/bench_one_batch.py +8 -6
  2. sglang/srt/_custom_ops.py +2 -2
  3. sglang/srt/code_completion_parser.py +2 -44
  4. sglang/srt/configs/model_config.py +1 -0
  5. sglang/srt/constants.py +3 -0
  6. sglang/srt/conversation.py +14 -3
  7. sglang/srt/custom_op.py +11 -1
  8. sglang/srt/disaggregation/base/conn.py +2 -0
  9. sglang/srt/disaggregation/decode.py +22 -28
  10. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  11. sglang/srt/disaggregation/mini_lb.py +34 -4
  12. sglang/srt/disaggregation/mooncake/conn.py +301 -64
  13. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
  14. sglang/srt/disaggregation/nixl/conn.py +94 -46
  15. sglang/srt/disaggregation/prefill.py +20 -15
  16. sglang/srt/disaggregation/utils.py +47 -18
  17. sglang/srt/distributed/parallel_state.py +12 -4
  18. sglang/srt/entrypoints/engine.py +27 -31
  19. sglang/srt/entrypoints/http_server.py +149 -79
  20. sglang/srt/entrypoints/http_server_engine.py +0 -3
  21. sglang/srt/entrypoints/openai/__init__.py +0 -0
  22. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +115 -34
  23. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  24. sglang/srt/entrypoints/openai/serving_chat.py +897 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +425 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +170 -0
  27. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  28. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  29. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  30. sglang/srt/entrypoints/openai/utils.py +72 -0
  31. sglang/srt/function_call/base_format_detector.py +7 -4
  32. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  33. sglang/srt/function_call/ebnf_composer.py +64 -10
  34. sglang/srt/function_call/function_call_parser.py +6 -6
  35. sglang/srt/function_call/llama32_detector.py +1 -1
  36. sglang/srt/function_call/mistral_detector.py +1 -1
  37. sglang/srt/function_call/pythonic_detector.py +1 -1
  38. sglang/srt/function_call/qwen25_detector.py +1 -1
  39. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  40. sglang/srt/layers/activation.py +28 -3
  41. sglang/srt/layers/attention/aiter_backend.py +5 -2
  42. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  43. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
  44. sglang/srt/layers/attention/flashattention_backend.py +43 -23
  45. sglang/srt/layers/attention/flashinfer_backend.py +9 -6
  46. sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
  47. sglang/srt/layers/attention/flashmla_backend.py +5 -2
  48. sglang/srt/layers/attention/tbo_backend.py +3 -3
  49. sglang/srt/layers/attention/triton_backend.py +19 -11
  50. sglang/srt/layers/communicator.py +5 -5
  51. sglang/srt/layers/dp_attention.py +11 -2
  52. sglang/srt/layers/layernorm.py +44 -2
  53. sglang/srt/layers/linear.py +18 -1
  54. sglang/srt/layers/logits_processor.py +14 -5
  55. sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
  56. sglang/srt/layers/moe/ep_moe/layer.py +286 -13
  57. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +19 -2
  58. sglang/srt/layers/moe/fused_moe_native.py +7 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +13 -2
  61. sglang/srt/layers/moe/fused_moe_triton/layer.py +148 -26
  62. sglang/srt/layers/moe/topk.py +117 -4
  63. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  64. sglang/srt/layers/quantization/fp8.py +25 -17
  65. sglang/srt/layers/quantization/fp8_utils.py +5 -4
  66. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  67. sglang/srt/layers/quantization/utils.py +5 -2
  68. sglang/srt/layers/rotary_embedding.py +144 -12
  69. sglang/srt/layers/sampler.py +1 -1
  70. sglang/srt/layers/vocab_parallel_embedding.py +14 -1
  71. sglang/srt/lora/lora_manager.py +173 -74
  72. sglang/srt/lora/mem_pool.py +49 -45
  73. sglang/srt/lora/utils.py +1 -1
  74. sglang/srt/managers/cache_controller.py +33 -15
  75. sglang/srt/managers/expert_distribution.py +21 -0
  76. sglang/srt/managers/io_struct.py +19 -14
  77. sglang/srt/managers/multimodal_processors/base_processor.py +44 -9
  78. sglang/srt/managers/multimodal_processors/gemma3n.py +97 -0
  79. sglang/srt/managers/schedule_batch.py +49 -32
  80. sglang/srt/managers/schedule_policy.py +70 -56
  81. sglang/srt/managers/scheduler.py +189 -68
  82. sglang/srt/managers/template_manager.py +226 -0
  83. sglang/srt/managers/tokenizer_manager.py +11 -8
  84. sglang/srt/managers/tp_worker.py +12 -2
  85. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  86. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  87. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  88. sglang/srt/mem_cache/chunk_cache.py +11 -16
  89. sglang/srt/mem_cache/hiradix_cache.py +34 -23
  90. sglang/srt/mem_cache/memory_pool.py +118 -114
  91. sglang/srt/mem_cache/radix_cache.py +20 -16
  92. sglang/srt/model_executor/cuda_graph_runner.py +77 -46
  93. sglang/srt/model_executor/forward_batch_info.py +18 -5
  94. sglang/srt/model_executor/model_runner.py +27 -8
  95. sglang/srt/model_loader/loader.py +50 -8
  96. sglang/srt/model_loader/weight_utils.py +100 -2
  97. sglang/srt/models/deepseek_nextn.py +35 -30
  98. sglang/srt/models/deepseek_v2.py +255 -30
  99. sglang/srt/models/gemma3n_audio.py +949 -0
  100. sglang/srt/models/gemma3n_causal.py +1009 -0
  101. sglang/srt/models/gemma3n_mm.py +511 -0
  102. sglang/srt/models/glm4.py +312 -0
  103. sglang/srt/models/hunyuan.py +771 -0
  104. sglang/srt/models/mimo_mtp.py +2 -18
  105. sglang/srt/reasoning_parser.py +21 -11
  106. sglang/srt/server_args.py +51 -9
  107. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
  108. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
  109. sglang/srt/speculative/eagle_utils.py +80 -8
  110. sglang/srt/speculative/eagle_worker.py +124 -41
  111. sglang/srt/torch_memory_saver_adapter.py +19 -15
  112. sglang/srt/two_batch_overlap.py +4 -1
  113. sglang/srt/utils.py +248 -11
  114. sglang/test/test_block_fp8_ep.py +1 -0
  115. sglang/test/test_utils.py +1 -0
  116. sglang/version.py +1 -1
  117. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/METADATA +4 -10
  118. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/RECORD +121 -105
  119. sglang/srt/entrypoints/verl_engine.py +0 -179
  120. sglang/srt/openai_api/adapter.py +0 -2148
  121. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/WHEEL +0 -0
  122. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/licenses/LICENSE +0 -0
  123. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/schedule_policy.py
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  # Copyright 2023-2024 SGLang Team
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -18,15 +20,17 @@ import random
  from collections import defaultdict
  from contextlib import contextmanager
  from enum import Enum, auto
- from typing import Dict, List, Optional, Set, Union
+ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union

  import torch

  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
  from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
- from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
  from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode

+ if TYPE_CHECKING:
+     from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+
  # Clip the estimation of max_new_tokens for the request whose max_new_tokens is very large.
  # This can prevent the server from being too conservative.
  # Note that this only clips the estimation in the scheduler but does not change the stop
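For context: the new import block follows the standard postponed-annotations pattern, where a module needed only for type hints is imported under TYPE_CHECKING so it never loads at runtime. A minimal sketch of the same pattern, with check_allocator as a hypothetical consumer (not part of the diff):

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated only by static type checkers, never at runtime,
        # so importing the allocator module cannot create an import cycle.
        from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator

    def check_allocator(allocator: BaseTokenToKVPoolAllocator) -> bool:
        # With postponed annotations the hint above stays a string at runtime,
        # so the TYPE_CHECKING-only import is never needed for execution.
        return hasattr(allocator, "available_size")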
@@ -51,6 +55,9 @@ IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD = int(
  )


+ IGNORE_EOS_RESERVE_TOKENS = 1
+
+
  class CacheAwarePolicy(Enum):
      """Scheduling policies that are aware of the tree cache."""

@@ -90,7 +97,7 @@ class SchedulePolicy:
      def calc_priority(self, waiting_queue: List[Req]) -> bool:
          if self.policy == CacheAgnosticPolicy.FCFS:
              # A shortcut for FCFS
-             return
+             return False

          policy = self._determine_active_policy(waiting_queue)

@@ -134,7 +141,7 @@ class SchedulePolicy:
          """
          try:
              policy_enum = CacheAwarePolicy(policy)
-             if tree_cache.disable:
+             if getattr(tree_cache, "disable", True):
                  # If tree_cache is disabled, using CacheAgnosticPolicy policy
                  return CacheAgnosticPolicy.FCFS
              return policy_enum
@@ -158,14 +165,9 @@ class SchedulePolicy:
              prefix_ids = r.adjust_max_prefix_ids()

              # NOTE: the prefix_indices must always be aligned with last_node
-             if self.enable_hierarchical_cache:
-                 r.prefix_indices, r.last_node, r.last_node_global = (
-                     self.tree_cache.match_prefix(key=prefix_ids, include_evicted=True)
-                 )
-             else:
-                 r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
-                     rid=r.rid, key=prefix_ids
-                 )
+             r.prefix_indices, r.last_node, r.last_host_node, r.host_hit_length = (
+                 self.tree_cache.match_prefix(rid=r.rid, key=prefix_ids)
+             )

              # NOTE(sang): This logic is for in-batch prefix caching;
              # If there are more than 1 request that have small matching prefix from
@@ -175,7 +177,7 @@ class SchedulePolicy:
              # threshold means we cannot use in-batch prefix caching for short prefixes.
              # It is kind of common when the engine is long running (e.g., imagine the prefix "the").
              if len(r.prefix_indices) <= IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD:
-                 in_batch_matching_prefixes, _ = (
+                 in_batch_matching_prefixes, _, _, _ = (
                      self.waiting_queue_radix_tree.match_prefix(
                          rid=r.rid, key=prefix_ids
                      )
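For context: both hunks above reflect the same interface change, where match_prefix now returns four values for every cache type, so hierarchical and non-hierarchical callers share one code path. A hedged sketch of the new return shape, using a stub cache rather than the real radix tree (not part of the diff):

    from typing import Any, List, Tuple

    class StubTreeCache:
        """Stand-in for a radix-tree cache, only to show the four-value return."""

        def match_prefix(self, rid: str, key: List[int]) -> Tuple[List[int], Any, Any, int]:
            # Pretend the first two tokens are cached on the device and none on the host.
            return key[:2], None, None, 0

    prefix_indices, last_node, last_host_node, host_hit_length = StubTreeCache().match_prefix(
        rid="req-0", key=[11, 22, 33, 44]
    )
    assert host_hit_length == 0  # nothing needs to be loaded back from host memory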
@@ -268,14 +270,16 @@ class AddReqResult(Enum):
  class PrefillAdder:
      def __init__(
          self,
+         page_size: int,
          tree_cache: BasePrefixCache,
-         token_to_kv_pool_allocator: TokenToKVPoolAllocator,
+         token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
          running_batch: ScheduleBatch,
          new_token_ratio: float,
          rem_input_tokens: int,
          rem_chunk_tokens: Optional[int],
          mixed_with_decode_tokens: int = 0,
      ):
+         self.page_size = page_size
          self.tree_cache = tree_cache
          self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
          self.running_batch = running_batch
@@ -292,6 +296,7 @@ class PrefillAdder:
          self.can_run_list = []
          self.new_chunked_req = None
          self.log_hit_tokens = 0
+         # TODO(lsyin): report the real input tokens excluding page alignment
          self.log_input_tokens = 0

          if running_batch is not None:
@@ -322,6 +327,9 @@ class PrefillAdder:
              - self.cur_rem_token_offset
          )

+     def ceil_paged_tokens(self, tokens: int) -> int:
+         return -(-tokens // self.page_size) * self.page_size
+
      def budget_state(self):
          if self.rem_total_tokens <= 0 or self.cur_rem_tokens <= 0:
              return AddReqResult.NO_TOKEN
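For context: ceil_paged_tokens rounds a token count up to a whole number of KV-cache pages using the negate-floor-negate trick, i.e. ceil(a / b) written as -(-a // b). A minimal standalone sketch of the same arithmetic (not part of the diff):

    def ceil_paged_tokens(tokens: int, page_size: int) -> int:
        # -(-tokens // page_size) is ceil(tokens / page_size) for a positive page_size
        return -(-tokens // page_size) * page_size

    assert ceil_paged_tokens(1, 16) == 16    # a partial page still costs a full page
    assert ceil_paged_tokens(16, 16) == 16   # an exact multiple is unchanged
    assert ceil_paged_tokens(17, 16) == 32   # one token over spills into a second page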
@@ -333,9 +341,12 @@ class PrefillAdder:

          return AddReqResult.CONTINUE

-     def _prefill_one_req(
+     def _update_prefill_budget(
          self, prefix_len: int, extend_input_len: int, max_new_tokens: int
      ):
+         # TODO(lsyin): check this workaround logic, which only ensures the prefill will not out of memory, and may be too conservative
+         extend_input_len = self.ceil_paged_tokens(extend_input_len)
+
          self.rem_total_token_offset += extend_input_len + max_new_tokens
          self.cur_rem_token_offset += extend_input_len
          self.rem_input_tokens -= extend_input_len
@@ -350,7 +361,7 @@ class PrefillAdder:
          req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
          req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
          self.can_run_list.append(req)
-         self._prefill_one_req(
+         self._update_prefill_budget(
              0,
              req.extend_input_len,
              (
@@ -372,6 +383,12 @@ class PrefillAdder:
              self.tree_cache.dec_lock_ref(last_node)

      def add_one_req_ignore_eos(self, req: Req, has_chunked_req: bool):
+         # Early exit if no enough tokens for the input tokens
+         if self.ceil_paged_tokens(req.extend_input_len) > min(
+             self.cur_rem_tokens, self.rem_total_tokens
+         ):
+             return AddReqResult.NO_TOKEN
+
          def add_req_state(r, insert_sort=False):
              new_token_ratio = (
                  1.0 if r.sampling_params.ignore_eos else self.new_token_ratio
@@ -381,15 +398,17 @@ class PrefillAdder:
              )
              tokens_occupied = len(r.origin_input_ids) + len(r.output_ids)

-             if tokens_left > 0:
-                 if not insert_sort:
-                     self.req_states.append((tokens_left, tokens_occupied))
-                 else:
-                     i = 0
-                     for i in range(len(self.req_states)):
-                         if tokens_left <= self.req_states[i][0]:
-                             break
-                     self.req_states.insert(i, (tokens_left, tokens_occupied))
+             if tokens_left <= 0:
+                 return
+
+             if not insert_sort:
+                 self.req_states.append((tokens_left, tokens_occupied))
+             else:
+                 i = 0
+                 for i in range(len(self.req_states)):
+                     if tokens_left <= self.req_states[i][0]:
+                         break
+                 self.req_states.insert(i, (tokens_left, tokens_occupied))

          if self.req_states is None:
              self.req_states = []
@@ -406,13 +425,11 @@ class PrefillAdder:
          cur_rem_tokens = self.cur_rem_tokens - len(req.origin_input_ids)
          tokens_freed = 0
          for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
-             decode_steps = (
-                 self.req_states[i + 1][0]
-                 if i + 1 < len(self.req_states)
-                 else tokens_left
-             )
+             # tokens_left gives a reservative calculation as the last token is not stored
              bs = len(self.req_states) - i
-             if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
+             min_free_tokens = cur_rem_tokens + tokens_freed - tokens_left * bs
+             # reserve tokens for corner cases
+             if min_free_tokens <= IGNORE_EOS_RESERVE_TOKENS * bs:
                  return AddReqResult.NO_TOKEN
              tokens_freed += tokens_occupied

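For context: the rewritten check uses each running request's own tokens_left as the number of decode steps it must survive before its memory is freed, and additionally reserves IGNORE_EOS_RESERVE_TOKENS per still-running request. A worked sketch with invented numbers, showing the loop admitting a request (not part of the diff):

    IGNORE_EOS_RESERVE_TOKENS = 1
    # (tokens_left, tokens_occupied) for running ignore_eos requests, sorted by tokens_left
    req_states = [(10, 100), (20, 150), (30, 200)]
    cur_rem_tokens = 50   # free tokens after subtracting the new request's input
    tokens_freed = 0
    for i, (tokens_left, tokens_occupied) in enumerate(req_states):
        bs = len(req_states) - i
        min_free_tokens = cur_rem_tokens + tokens_freed - tokens_left * bs
        # i=0: 50 + 0   - 10*3 = 20  > 1*3 -> fits
        # i=1: 50 + 100 - 20*2 = 110 > 1*2 -> fits
        # i=2: 50 + 250 - 30*1 = 270 > 1*1 -> fits
        assert min_free_tokens > IGNORE_EOS_RESERVE_TOKENS * bs  # otherwise NO_TOKEN
        tokens_freed += tokens_occupied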
@@ -422,7 +439,7 @@ class PrefillAdder:
          ):
              # Non-chunked prefill
              self.can_run_list.append(req)
-             self._prefill_one_req(
+             self._update_prefill_budget(
                  0,
                  req.extend_input_len,
                  min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION),
@@ -438,55 +455,52 @@ class PrefillAdder:
              req.fill_ids = req.fill_ids[:trunc_len]
              self.can_run_list.append(req)
              self.new_chunked_req = req
-             self._prefill_one_req(0, trunc_len, 0)
+             self._update_prefill_budget(0, trunc_len, 0)

          return self.budget_state()

-     def add_one_req(
-         self, req: Req, has_chunked_req: bool, enable_hierarchical_cache: bool = False
-     ):
+     def add_one_req(self, req: Req, has_chunked_req: bool):
          if req.sampling_params.ignore_eos and getattr(self.tree_cache, "disable", True):
              return self.add_one_req_ignore_eos(req, has_chunked_req)

          total_tokens = req.extend_input_len + min(
              req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION
          )
-         input_tokens = (
-             -(-req.extend_input_len // self.tree_cache.page_size)
-             * self.tree_cache.page_size
-         )
+
+         # adjusting the input_tokens based on host_hit_length and page_size
+         real_input_tokens = req.extend_input_len - req.host_hit_length
+         real_input_tokens = self.ceil_paged_tokens(real_input_tokens)
          prefix_len = len(req.prefix_indices)

          if total_tokens >= self.rem_total_tokens:
              return AddReqResult.NO_TOKEN

-         if input_tokens > self.rem_input_tokens and len(self.can_run_list) != 0:
+         if real_input_tokens >= self.rem_input_tokens and len(self.can_run_list) != 0:
              return AddReqResult.OTHER

          with self._lock_node(req.last_node):
-             if total_tokens > self.rem_total_tokens:
+             # self.rem_total_tokens may decrease after the lock acquisition
+             if total_tokens >= self.rem_total_tokens:
                  return AddReqResult.NO_TOKEN

-             if (
-                 enable_hierarchical_cache
-                 and req.last_node_global is not None
-                 and req.last_node_global.evicted
-             ):
-                 req.last_node, req.prefix_indices = self.tree_cache.init_load_back(
-                     req.last_node_global, req.prefix_indices
+             if req.host_hit_length > 0:
+                 new_indices, req.last_node = self.tree_cache.init_load_back(
+                     req.last_host_node, req.host_hit_length
                  )
+                 req.prefix_indices = torch.cat([req.prefix_indices, new_indices])
                  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
-                 input_tokens = (
-                     -(-req.extend_input_len // self.tree_cache.page_size)
-                     * self.tree_cache.page_size
-                 )
                  prefix_len = len(req.prefix_indices)

+             input_tokens = self.ceil_paged_tokens(req.extend_input_len)
+
+             if input_tokens >= self.rem_input_tokens and len(self.can_run_list) != 0:
+                 return AddReqResult.OTHER
+
              if self.rem_chunk_tokens is None or input_tokens <= self.rem_chunk_tokens:
                  # Non-chunked prefill
                  self.can_run_list.append(req)
                  self.tree_cache.inc_lock_ref(req.last_node)
-                 self._prefill_one_req(
+                 self._update_prefill_budget(
                      prefix_len,
                      input_tokens,
                      min(
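For context: when host_hit_length > 0, the missing part of the prefix is loaded back from host memory and its new device indices are concatenated onto the indices already resident on the device. A hedged sketch of just that concatenation step, with the index tensors invented for illustration (not part of the diff):

    import torch

    device_prefix = torch.arange(0, 4)     # KV indices already on the device
    loaded_back = torch.arange(100, 105)   # indices returned by the load-back step
    prefix_indices = torch.cat([device_prefix, loaded_back])

    fill_ids_len = 16                      # total prompt length for the request
    extend_input_len = fill_ids_len - len(prefix_indices)   # 16 - 9 = 7 tokens left to prefill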
@@ -496,7 +510,7 @@ class PrefillAdder:
                  )
              else:
                  # Make sure at least one page is available
-                 trunc_len = self.rem_chunk_tokens - self.tree_cache.page_size + 1
+                 trunc_len = self.rem_chunk_tokens - self.page_size + 1
                  if trunc_len <= 0:
                      return AddReqResult.OTHER

@@ -507,6 +521,6 @@ class PrefillAdder:
                  self.can_run_list.append(req)
                  self.new_chunked_req = req
                  self.tree_cache.inc_lock_ref(req.last_node)
-                 self._prefill_one_req(prefix_len, trunc_len, 0)
+                 self._update_prefill_budget(prefix_len, trunc_len, 0)

          return self.budget_state()