sglang 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_one_batch.py +8 -6
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +19 -3
- sglang/srt/custom_op.py +5 -1
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +211 -72
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +30 -29
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +144 -55
- sglang/srt/disaggregation/utils.py +155 -123
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +37 -29
- sglang/srt/entrypoints/http_server.py +153 -72
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +84 -10
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +921 -0
- sglang/srt/entrypoints/openai/serving_completions.py +424 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +40 -3
- sglang/srt/layers/attention/aiter_backend.py +20 -4
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +39 -15
- sglang/srt/layers/attention/flashattention_backend.py +71 -72
- sglang/srt/layers/attention/flashinfer_backend.py +10 -8
- sglang/srt/layers/attention/flashinfer_mla_backend.py +29 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -12
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +138 -130
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +28 -10
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +29 -2
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +2 -14
- sglang/srt/layers/moe/ep_moe/kernels.py +165 -7
- sglang/srt/layers/moe/ep_moe/layer.py +249 -33
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -4
- sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
- sglang/srt/layers/moe/topk.py +107 -12
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +42 -2
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora_manager.py +249 -105
- sglang/srt/lora/mem_pool.py +53 -50
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -14
- sglang/srt/managers/io_struct.py +31 -10
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +79 -37
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +220 -79
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +40 -10
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -15
- sglang/srt/mem_cache/hiradix_cache.py +38 -25
- sglang/srt/mem_cache/memory_pool.py +213 -505
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +56 -28
- sglang/srt/model_executor/cuda_graph_runner.py +198 -100
- sglang/srt/model_executor/forward_batch_info.py +32 -10
- sglang/srt/model_executor/model_runner.py +28 -12
- sglang/srt/model_loader/loader.py +16 -2
- sglang/srt/model_loader/weight_utils.py +11 -2
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +29 -27
- sglang/srt/models/deepseek_v2.py +213 -173
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +351 -238
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -9
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +130 -14
- sglang/srt/speculative/eagle_utils.py +468 -116
- sglang/srt/speculative/eagle_worker.py +258 -84
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +235 -11
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +2 -0
- sglang/test/test_utils.py +4 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA +8 -14
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/RECORD +150 -128
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -1990
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
@@ -2,28 +2,23 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
"""Cache for chunked prefill, used when RadixCache is disabled."""
|
4
4
|
|
5
|
-
from typing import TYPE_CHECKING, Any
|
5
|
+
from typing import TYPE_CHECKING, Any
|
6
6
|
|
7
7
|
import torch
|
8
8
|
|
9
|
-
from sglang.srt.mem_cache.
|
10
|
-
from sglang.srt.mem_cache.
|
9
|
+
from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
|
10
|
+
from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
|
11
|
+
from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
|
11
12
|
|
12
13
|
if TYPE_CHECKING:
|
13
14
|
from sglang.srt.managers.schedule_batch import Req
|
14
15
|
|
15
16
|
|
16
|
-
class ChunkCacheEntry:
|
17
|
-
def __init__(self, rid: str, value: torch.Tensor):
|
18
|
-
self.rid = rid
|
19
|
-
self.value = value
|
20
|
-
|
21
|
-
|
22
17
|
class ChunkCache(BasePrefixCache):
|
23
18
|
def __init__(
|
24
19
|
self,
|
25
20
|
req_to_token_pool: ReqToTokenPool,
|
26
|
-
token_to_kv_pool_allocator:
|
21
|
+
token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
|
27
22
|
page_size: int,
|
28
23
|
):
|
29
24
|
self.req_to_token_pool = req_to_token_pool
|
@@ -33,8 +28,12 @@ class ChunkCache(BasePrefixCache):
|
|
33
28
|
def reset(self):
|
34
29
|
pass
|
35
30
|
|
36
|
-
def match_prefix(self, **unused_kwargs) ->
|
37
|
-
return
|
31
|
+
def match_prefix(self, **unused_kwargs) -> MatchResult:
|
32
|
+
return MatchResult(
|
33
|
+
device_indices=torch.empty((0,), dtype=torch.int64),
|
34
|
+
last_device_node=None,
|
35
|
+
last_host_node=None,
|
36
|
+
)
|
38
37
|
|
39
38
|
def cache_finished_req(self, req: Req):
|
40
39
|
kv_indices = self.req_to_token_pool.req_to_token[
|
@@ -53,9 +52,6 @@ class ChunkCache(BasePrefixCache):
|
|
53
52
|
# `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
|
54
53
|
req.prefix_indices = kv_indices
|
55
54
|
|
56
|
-
def insert(self):
|
57
|
-
raise NotImplementedError()
|
58
|
-
|
59
55
|
def evict(self, num_tokens: int):
|
60
56
|
pass
|
61
57
|
|
@@ -7,13 +7,16 @@ from typing import List, Optional
|
|
7
7
|
import torch
|
8
8
|
|
9
9
|
from sglang.srt.managers.cache_controller import HiCacheController
|
10
|
+
from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
|
11
|
+
from sglang.srt.mem_cache.base_prefix_cache import MatchResult
|
10
12
|
from sglang.srt.mem_cache.memory_pool import (
|
11
13
|
MHATokenToKVPool,
|
12
|
-
MHATokenToKVPoolHost,
|
13
14
|
MLATokenToKVPool,
|
14
|
-
MLATokenToKVPoolHost,
|
15
15
|
ReqToTokenPool,
|
16
|
-
|
16
|
+
)
|
17
|
+
from sglang.srt.mem_cache.memory_pool_host import (
|
18
|
+
MHATokenToKVPoolHost,
|
19
|
+
MLATokenToKVPoolHost,
|
17
20
|
)
|
18
21
|
from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
|
19
22
|
|
@@ -25,7 +28,7 @@ class HiRadixCache(RadixCache):
|
|
25
28
|
def __init__(
|
26
29
|
self,
|
27
30
|
req_to_token_pool: ReqToTokenPool,
|
28
|
-
token_to_kv_pool_allocator:
|
31
|
+
token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
|
29
32
|
tp_cache_group: torch.distributed.ProcessGroup,
|
30
33
|
page_size: int,
|
31
34
|
hicache_ratio: float,
|
@@ -281,39 +284,44 @@ class HiRadixCache(RadixCache):
|
|
281
284
|
def init_load_back(
|
282
285
|
self,
|
283
286
|
last_node: TreeNode,
|
284
|
-
|
287
|
+
host_hit_length: int,
|
285
288
|
mem_quota: Optional[int] = None,
|
286
289
|
):
|
287
|
-
|
288
|
-
len(prefix_indices) == 0 or prefix_indices.is_cuda
|
289
|
-
), "indices of device kV caches should be on GPU"
|
290
|
+
_ = host_hit_length # unused, but kept for compatibility
|
290
291
|
if last_node.evicted:
|
291
292
|
loading_values = self.load_back(last_node, mem_quota)
|
292
293
|
if loading_values is not None:
|
293
|
-
prefix_indices = (
|
294
|
-
loading_values
|
295
|
-
if len(prefix_indices) == 0
|
296
|
-
else torch.cat([prefix_indices, loading_values])
|
297
|
-
)
|
298
294
|
logger.debug(
|
299
295
|
f"loading back {len(loading_values)} tokens for node {last_node.id}"
|
300
296
|
)
|
297
|
+
return loading_values, last_node
|
301
298
|
|
302
299
|
while last_node.evicted:
|
303
300
|
last_node = last_node.parent
|
304
301
|
|
305
|
-
return
|
302
|
+
return (
|
303
|
+
torch.empty((0,), dtype=torch.int64, device=self.device),
|
304
|
+
last_node,
|
305
|
+
)
|
306
306
|
|
307
|
-
def
|
307
|
+
def ready_to_load_host_cache(self):
|
308
|
+
producer_index = self.cache_controller.layer_done_counter.next_producer()
|
308
309
|
self.load_cache_event.set()
|
310
|
+
return producer_index
|
311
|
+
|
312
|
+
def check_hicache_events(self):
|
313
|
+
self.writing_check()
|
314
|
+
self.loading_check()
|
309
315
|
|
310
|
-
def match_prefix(self, key: List[int],
|
316
|
+
def match_prefix(self, key: List[int], **kwargs):
|
311
317
|
empty_value = torch.empty((0,), dtype=torch.int64, device=self.device)
|
312
318
|
if self.disable or len(key) == 0:
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
319
|
+
return MatchResult(
|
320
|
+
device_indices=empty_value,
|
321
|
+
last_device_node=self.root_node,
|
322
|
+
last_host_node=self.root_node,
|
323
|
+
host_hit_length=0,
|
324
|
+
)
|
317
325
|
|
318
326
|
if self.page_size != 1:
|
319
327
|
page_aligned_len = len(key) // self.page_size * self.page_size
|
@@ -325,14 +333,18 @@ class HiRadixCache(RadixCache):
|
|
325
333
|
else:
|
326
334
|
value = empty_value
|
327
335
|
|
328
|
-
|
336
|
+
host_hit_length = 0
|
337
|
+
last_host_node = last_node
|
329
338
|
while last_node.evicted:
|
339
|
+
host_hit_length += len(last_node.host_value)
|
330
340
|
last_node = last_node.parent
|
331
341
|
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
342
|
+
return MatchResult(
|
343
|
+
device_indices=value,
|
344
|
+
last_device_node=last_node,
|
345
|
+
last_host_node=last_host_node,
|
346
|
+
host_hit_length=host_hit_length,
|
347
|
+
)
|
336
348
|
|
337
349
|
def _match_prefix_helper(self, node: TreeNode, key: List):
|
338
350
|
node.last_access_time = time.monotonic()
|
@@ -370,6 +382,7 @@ class HiRadixCache(RadixCache):
|
|
370
382
|
new_node.lock_ref = child.lock_ref
|
371
383
|
new_node.key = child.key[:split_len]
|
372
384
|
new_node.loading = child.loading
|
385
|
+
new_node.hit_count = child.hit_count
|
373
386
|
|
374
387
|
# split value and host value if exists
|
375
388
|
if child.evicted:
|