sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff shows the changes between the publicly released package versions listed above, as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +2 -2
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +9 -7
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +48 -43
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +7 -2
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +227 -120
- sglang/srt/disaggregation/nixl/conn.py +1 -0
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +7 -1
- sglang/srt/entrypoints/engine.py +17 -2
- sglang/srt/entrypoints/http_server.py +17 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +1 -1
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +72 -71
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +3 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +76 -24
- sglang/srt/managers/schedule_policy.py +0 -3
- sglang/srt/managers/scheduler.py +113 -88
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +133 -34
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/memory_pool.py +2 -0
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +19 -14
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +23 -20
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +5 -6
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +30 -4
- sglang/srt/openai_api/protocol.py +0 -8
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +34 -4
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +6 -5
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +89 -14
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +6 -5
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +107 -104
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tp_worker.py
CHANGED
```diff
@@ -20,7 +20,7 @@ from typing import Optional, Tuple, Union
 import torch
 
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.distributed import get_pp_group,
+from sglang.srt.distributed import get_pp_group, get_world_group
 from sglang.srt.hf_transformers_utils import (
     get_processor,
     get_tokenizer,
@@ -183,8 +183,11 @@ class TpModelWorker:
     def forward_batch_generation(
         self,
         model_worker_batch: ModelWorkerBatch,
+        launch_done: Optional[threading.Event] = None,
         skip_sample: bool = False,
-    ) -> Tuple[
+    ) -> Tuple[
+        Union[LogitsProcessorOutput, torch.Tensor], Optional[torch.Tensor], bool
+    ]:
         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
 
         pp_proxy_tensors = None
@@ -196,11 +199,11 @@
         )
 
         if self.pp_group.is_last_rank:
-            logits_output = self.model_runner.forward(
+            logits_output, can_run_cuda_graph = self.model_runner.forward(
                 forward_batch, pp_proxy_tensors=pp_proxy_tensors
             )
-            if
-
+            if launch_done is not None:
+                launch_done.set()
 
         if skip_sample:
             next_token_ids = None
@@ -209,17 +212,17 @@
                 logits_output, model_worker_batch
             )
 
-            return logits_output, next_token_ids
+            return logits_output, next_token_ids, can_run_cuda_graph
         else:
-            pp_proxy_tensors = self.model_runner.forward(
+            pp_proxy_tensors, can_run_cuda_graph = self.model_runner.forward(
                 forward_batch,
                 pp_proxy_tensors=pp_proxy_tensors,
             )
-            return pp_proxy_tensors.tensors, None
+            return pp_proxy_tensors.tensors, None, can_run_cuda_graph
 
     def forward_batch_embedding(self, model_worker_batch: ModelWorkerBatch):
         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
-        logits_output = self.model_runner.forward(forward_batch)
+        logits_output, _ = self.model_runner.forward(forward_batch)
         embeddings = logits_output.embeddings
         return embeddings
 
```
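The change above threads an optional `launch_done` event and a `can_run_cuda_graph` flag through `TpModelWorker.forward_batch_generation`. The snippet below is an illustrative, self-contained sketch of that calling contract only; the stub name and the list stand-ins are hypothetical and not sglang code.

```python
import threading
from typing import List, Optional, Tuple

def forward_batch_generation_stub(
    batch: dict,
    launch_done: Optional[threading.Event] = None,
    skip_sample: bool = False,
) -> Tuple[List[float], Optional[List[int]], bool]:
    logits = [0.1, 0.9]           # stand-in for LogitsProcessorOutput
    if launch_done is not None:
        launch_done.set()         # signal that the forward pass has been launched
    next_token_ids = None if skip_sample else [42]
    can_run_cuda_graph = True     # stand-in for the runner's CUDA-graph eligibility flag
    return logits, next_token_ids, can_run_cuda_graph

done = threading.Event()
logits, token_ids, can_run_cuda_graph = forward_batch_generation_stub(
    {"input_ids": [1, 2, 3]}, launch_done=done
)
assert done.is_set() and can_run_cuda_graph
```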
sglang/srt/managers/tp_worker_overlap_thread.py
CHANGED
```diff
@@ -18,7 +18,7 @@ import logging
 import signal
 import threading
 from queue import Queue
-from typing import Optional
+from typing import Optional, Tuple
 
 import psutil
 import torch
@@ -127,10 +127,12 @@ class TpModelWorkerClient:
         batch_lists = [None] * 2
 
         while True:
-            model_worker_batch, future_token_ids_ct = self.input_queue.get()
+            model_worker_batch, future_token_ids_ct, sync_event = self.input_queue.get()
             if not model_worker_batch:
                 break
 
+            sync_event.wait()
+
             # Keep a reference of model_worker_batch by storing it into a list.
             # Otherwise, the tensor members of model_worker_batch will be released
             # by pytorch and cause CUDA illegal memory access errors.
@@ -145,8 +147,10 @@ class TpModelWorkerClient:
             resolve_future_token_ids(input_ids, self.future_token_ids_map)
 
             # Run forward
-            logits_output, next_token_ids =
-
+            logits_output, next_token_ids, can_run_cuda_graph = (
+                self.worker.forward_batch_generation(
+                    model_worker_batch, model_worker_batch.launch_done
+                )
             )
 
             # Update the future token ids map
@@ -171,14 +175,18 @@ class TpModelWorkerClient:
             next_token_ids = next_token_ids.to("cpu", non_blocking=True)
             copy_done.record()
 
-            self.output_queue.put(
+            self.output_queue.put(
+                (copy_done, logits_output, next_token_ids, can_run_cuda_graph)
+            )
 
     def resolve_last_batch_result(self, launch_done: Optional[threading.Event] = None):
         """
        This function is called to resolve the last batch result and
        wait for the current batch to be launched. Used in overlap mode.
        """
-        copy_done, logits_output, next_token_ids =
+        copy_done, logits_output, next_token_ids, can_run_cuda_graph = (
+            self.output_queue.get()
+        )
 
         if launch_done is not None:
             launch_done.wait()
@@ -193,9 +201,11 @@ class TpModelWorkerClient:
                 logits_output.input_token_logprobs.tolist()
             )
             next_token_ids = next_token_ids.tolist()
-        return logits_output, next_token_ids
+        return logits_output, next_token_ids, can_run_cuda_graph
 
-    def forward_batch_generation(
+    def forward_batch_generation(
+        self, model_worker_batch: ModelWorkerBatch
+    ) -> Tuple[None, torch.Tensor, bool]:
         # Create a new copy of sampling_info because it will be updated in-place by the scheduler for the next batch.
         sampling_info = model_worker_batch.sampling_info
         sampling_info.update_penalties()
@@ -206,10 +216,11 @@ class TpModelWorkerClient:
         )
 
         # A cuda stream sync here to avoid the cuda illegal memory access error.
-        self.
+        sync_event = torch.get_device_module(self.device).Event()
+        sync_event.record(self.scheduler_stream)
 
         # Push a new batch to the queue
-        self.input_queue.put((model_worker_batch, self.future_token_ids_ct))
+        self.input_queue.put((model_worker_batch, self.future_token_ids_ct, sync_event))
 
         # Allocate output future objects
         bs = len(model_worker_batch.seq_lens)
@@ -223,7 +234,7 @@ class TpModelWorkerClient:
         self.future_token_ids_ct = (
             self.future_token_ids_ct + bs
         ) % self.future_token_ids_limit
-        return None, future_next_token_ids
+        return None, future_next_token_ids, False
 
     def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
         success, message = self.worker.update_weights_from_disk(recv_req)
```
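In `TpModelWorkerClient`, the scheduler thread previously blocked on a stream synchronize before enqueueing a batch; it now records a device event, pushes it into the input queue together with the batch, and lets the background forward thread wait on that event instead. The sketch below models that handshake with stdlib primitives only (`threading.Event` standing in for a torch device event); the names and queue payloads are illustrative, not sglang's actual classes.

```python
import queue
import threading

input_queue: queue.Queue = queue.Queue()
output_queue: queue.Queue = queue.Queue()

def forward_thread_func() -> None:
    while True:
        batch, future_ct, sync_event = input_queue.get()
        if batch is None:
            break
        sync_event.wait()                  # wait until the producer's work is visible
        result = sum(batch)                # stand-in for the model forward pass
        output_queue.put((result, future_ct, True))  # True ~ can_run_cuda_graph

worker = threading.Thread(target=forward_thread_func, daemon=True)
worker.start()

# Producer side: record a sync event and push it with the batch, then keep going
# instead of blocking the whole scheduler loop on a synchronize().
sync_event = threading.Event()
input_queue.put(([1, 2, 3], 0, sync_event))
sync_event.set()                           # analogous to Event.record() on the scheduler stream

print(output_queue.get())                  # -> (6, 0, True)
input_queue.put((None, None, None))
worker.join()
```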
sglang/srt/mem_cache/memory_pool.py
CHANGED
```diff
@@ -762,6 +762,8 @@ class HostKVCache(abc.ABC):
         self.size = int(device_pool.size * host_to_device_ratio)
         # Align the host memory pool size to the page size
         self.size = self.size - (self.size % self.page_size)
+        self.start_layer = device_pool.start_layer
+        self.end_layer = device_pool.end_layer
 
         assert (
             self.size > device_pool.size
```
sglang/srt/metrics/collector.py
CHANGED
```diff
@@ -15,7 +15,119 @@
 
 import time
 from dataclasses import dataclass
-from
+from enum import Enum
+from typing import Dict, List, Optional, Union
+
+from sglang.srt.utils import get_bool_env_var
+
+SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
+
+
+@dataclass
+class TimeStats:
+    """
+    Store the timestamps for each stage of a request.
+
+    Unified: wait_queue -> forward -> completion
+    Prefill: bootstrap_queue -> wait_queue -> forward -> transfer_queue -> completion
+    Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
+    """
+
+    lb_entry_time: float = 0.0
+    wait_queue_entry_time: float = 0.0
+    forward_entry_time: float = 0.0
+    completion_time: float = 0.0
+    prefill_bootstrap_queue_entry_time: float = 0.0
+    prefill_transfer_queue_entry_time: float = 0.0
+    decode_prealloc_queue_entry_time: float = 0.0
+    decode_transfer_queue_entry_time: float = 0.0
+
+    class RequestType(Enum):
+        UNIFIED = "unified"
+        PREFILL = "prefill"
+        DECODE = "decode"
+        INVALID = "invalid"
+
+    def __str__(self) -> str:
+        # if unified
+        _type = self.get_type()
+
+        if _type == self.RequestType.UNIFIED:
+            queue_duration = self.forward_entry_time - self.wait_queue_entry_time
+            forward_duration = self.completion_time - self.forward_entry_time
+
+            if SGLANG_TEST_REQUEST_TIME_STATS:
+                assert (
+                    queue_duration >= 0 and forward_duration >= 0
+                ), f"queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
+
+            return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time}"
+        elif _type == self.RequestType.PREFILL:
+            bootstrap_duration = (
+                self.wait_queue_entry_time - self.prefill_bootstrap_queue_entry_time
+            )
+
+            queue_duration = self.forward_entry_time - self.wait_queue_entry_time
+
+            forward_duration = self.completion_time - self.forward_entry_time
+
+            if SGLANG_TEST_REQUEST_TIME_STATS:
+                assert (
+                    bootstrap_duration >= 0
+                    and queue_duration >= 0
+                    and forward_duration >= 0
+                ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
+            return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time}"
+        # if decode
+        elif _type == self.RequestType.DECODE:
+            prealloc_duration = (
+                self.decode_transfer_queue_entry_time
+                - self.decode_prealloc_queue_entry_time
+            )
+
+            transfer_duration = (
+                self.wait_queue_entry_time - self.decode_transfer_queue_entry_time
+            )
+            queue_duration = self.forward_entry_time - self.wait_queue_entry_time
+            forward_duration = self.completion_time - self.forward_entry_time
+
+            if SGLANG_TEST_REQUEST_TIME_STATS:
+                assert (
+                    prealloc_duration >= 0
+                    and transfer_duration >= 0
+                    and queue_duration >= 0
+                    and forward_duration >= 0
+                ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
+
+            return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time}"
+        else:
+            return "Invalid Time Stats"
+
+    def format_duration(self, duration: float) -> str:
+        return f"{duration * 1e3:.2f}ms"
+
+    def get_type(self) -> RequestType:
+        """Determine the type of request based on timestamp values."""
+        if (
+            self.prefill_bootstrap_queue_entry_time == 0.0
+            and self.prefill_transfer_queue_entry_time == 0.0
+            and self.decode_prealloc_queue_entry_time == 0.0
+            and self.decode_transfer_queue_entry_time == 0.0
+        ):
+            return self.RequestType.UNIFIED
+        elif (
+            self.prefill_bootstrap_queue_entry_time > 0.0
+            and self.prefill_transfer_queue_entry_time > 0.0
+        ):
+            return self.RequestType.PREFILL
+        elif (
+            self.decode_prealloc_queue_entry_time > 0.0
+            and self.decode_transfer_queue_entry_time > 0.0
+            and self.wait_queue_entry_time > 0.0
+        ):
+            return self.RequestType.DECODE
+        else:
+            return self.RequestType.INVALID
 
 
 @dataclass
@@ -26,15 +138,20 @@ class SchedulerStats:
     gen_throughput: float = 0.0
     num_queue_reqs: int = 0
     cache_hit_rate: float = 0.0
+    num_grammar_queue_reqs: int = 0
     spec_accept_length: float = 0.0
     avg_request_queue_latency: float = 0.0
+    num_prefill_prealloc_queue_reqs: int = 0
+    num_prefill_infight_queue_reqs: int = 0
+    num_decode_prealloc_queue_reqs: int = 0
+    num_decode_transfer_queue_reqs: int = 0
 
 
 class SchedulerMetricsCollector:
 
     def __init__(self, labels: Dict[str, str]) -> None:
         # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
-        from prometheus_client import
+        from prometheus_client import Counter, Gauge
 
         self.labels = labels
         self.last_log_time = time.time()
@@ -74,6 +191,13 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )
 
+        self.num_grammar_queue_reqs = Gauge(
+            name="sglang:num_grammar_queue_reqs",
+            documentation="The number of requests in the grammar waiting queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
         self.cache_hit_rate = Gauge(
             name="sglang:cache_hit_rate",
             documentation="The prefix cache hit rate.",
@@ -95,28 +219,98 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )
 
+        # Disaggregation queue metrics
+        self.num_prefill_prealloc_queue_reqs = Gauge(
+            name="sglang:num_prefill_prealloc_queue_reqs",
+            documentation="The number of requests in the prefill prealloc queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.num_prefill_infight_queue_reqs = Gauge(
+            name="sglang:num_prefill_infight_queue_reqs",
+            documentation="The number of requests in the prefill infight queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.num_decode_prealloc_queue_reqs = Gauge(
+            name="sglang:num_decode_prealloc_queue_reqs",
+            documentation="The number of requests in the decode prealloc queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.num_decode_transfer_queue_reqs = Gauge(
+            name="sglang:num_decode_transfer_queue_reqs",
+            documentation="The number of requests in the decode transfer queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.num_bootstrap_failed_reqs = Counter(
+            name="sglang:num_bootstrap_failed_reqs",
+            documentation="The number of bootstrap failed requests.",
+            labelnames=labels.keys(),
+        )
+
+        self.num_transfer_failed_reqs = Counter(
+            name="sglang:num_transfer_failed_reqs",
+            documentation="The number of transfer failed requests.",
+            labelnames=labels.keys(),
+        )
+
     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
         # Convenience function for logging to gauge.
         gauge.labels(**self.labels).set(data)
 
+    def increment_bootstrap_failed_reqs(self) -> None:
+        self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
+
+    def increment_transfer_failed_reqs(self) -> None:
+        self.num_transfer_failed_reqs.labels(**self.labels).inc(1)
+
     def log_stats(self, stats: SchedulerStats) -> None:
         self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
         self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
         self._log_gauge(self.token_usage, stats.token_usage)
         self._log_gauge(self.gen_throughput, stats.gen_throughput)
         self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
+        self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
         self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
-
+
+        # Disaggregation metrics
+        self._log_gauge(
+            self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
+        )
+        self._log_gauge(
+            self.num_prefill_infight_queue_reqs, stats.num_prefill_infight_queue_reqs
+        )
+        self._log_gauge(
+            self.num_decode_prealloc_queue_reqs, stats.num_decode_prealloc_queue_reqs
+        )
+        self._log_gauge(
+            self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
+        )
+
         self.last_log_time = time.time()
 
 
 class TokenizerMetricsCollector:
-    def __init__(
+    def __init__(
+        self,
+        labels: Dict[str, str],
+        bucket_time_to_first_token: Optional[List[float]] = None,
+        bucket_inter_token_latency: Optional[List[float]] = None,
+        bucket_e2e_request_latency: Optional[List[float]] = None,
+        collect_tokens_histogram: bool = False,
+    ) -> None:
         # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
         from prometheus_client import Counter, Histogram
 
         self.labels = labels
+        self.collect_tokens_histogram = collect_tokens_histogram
 
         self.prompt_tokens_total = Counter(
             name="sglang:prompt_tokens_total",
@@ -130,6 +324,66 @@ class TokenizerMetricsCollector:
             labelnames=labels.keys(),
         )
 
+        if collect_tokens_histogram:
+            bucket_prompt_tokens = [
+                100,
+                300,
+                500,
+                700,
+                1000,
+                1500,
+                2000,
+                3000,
+                4000,
+                5000,
+                6000,
+                7000,
+                8000,
+                9000,
+                10000,
+                12000,
+                15000,
+                20000,
+                22000,
+                25000,
+                30000,
+                35000,
+                40000,
+            ]
+            self.prompt_tokens_histogram = Histogram(
+                name="sglang:prompt_tokens_histogram",
+                documentation="Histogram of prompt token length.",
+                labelnames=labels.keys(),
+                buckets=bucket_prompt_tokens,
+            )
+            bucket_generation_tokens = [
+                100,
+                300,
+                500,
+                1000,
+                1200,
+                1500,
+                1700,
+                2000,
+                2500,
+                3000,
+                3500,
+                4000,
+                4500,
+                5000,
+                6000,
+                7000,
+                8000,
+                9000,
+                10000,
+            ]
+            self.generation_tokens_histogram = Histogram(
+                name="sglang:generation_tokens_histogram",
+                documentation="Histogram of generation token length.",
+                labelnames=labels.keys(),
+                buckets=bucket_generation_tokens,
+            )
+
         self.cached_tokens_total = Counter(
             name="sglang:cached_tokens_total",
             documentation="Number of cached prompt tokens.",
@@ -142,11 +396,14 @@ class TokenizerMetricsCollector:
             labelnames=labels.keys(),
         )
 
-        self.
-            name="sglang:
-            documentation="
+        self.num_so_requests_total = Counter(
+            name="sglang:num_so_requests_total",
+            documentation="Number of structured output requests processed.",
             labelnames=labels.keys(),
-
+        )
+
+        if bucket_time_to_first_token is None:
+            bucket_time_to_first_token = [
                 0.1,
                 0.2,
                 0.4,
@@ -165,14 +422,33 @@ class TokenizerMetricsCollector:
                 100,
                 200,
                 400,
-            ]
-        )
+            ]
 
-
-
-
-
+        if bucket_e2e_request_latency is None:
+            bucket_e2e_request_latency = [
+                0.1,
+                0.2,
+                0.4,
+                0.6,
+                0.8,
+                1,
+                2,
+                4,
+                6,
+                8,
+                10,
+                20,
+                40,
+                60,
+                80,
+                100,
+                200,
+                400,
+                800,
+            ]
+
+        if bucket_inter_token_latency is None:
+            bucket_inter_token_latency = [
                 0.002,
                 0.004,
                 0.006,
@@ -196,34 +472,27 @@ class TokenizerMetricsCollector:
                 4.000,
                 6.000,
                 8.000,
-            ]
+            ]
+
+        self.histogram_time_to_first_token = Histogram(
+            name="sglang:time_to_first_token_seconds",
+            documentation="Histogram of time to first token in seconds.",
+            labelnames=labels.keys(),
+            buckets=bucket_time_to_first_token,
+        )
+
+        self.histogram_inter_token_latency_seconds = Histogram(
+            name="sglang:inter_token_latency_seconds",
+            documentation="Histogram of inter-token latency in seconds.",
+            labelnames=labels.keys(),
+            buckets=bucket_inter_token_latency,
         )
 
         self.histogram_e2e_request_latency = Histogram(
             name="sglang:e2e_request_latency_seconds",
            documentation="Histogram of End-to-end request latency in seconds",
            labelnames=labels.keys(),
-            buckets=
-                0.1,
-                0.2,
-                0.4,
-                0.6,
-                0.8,
-                1,
-                2,
-                4,
-                6,
-                8,
-                10,
-                20,
-                40,
-                60,
-                80,
-                100,
-                200,
-                400,
-                800,
-            ],
+            buckets=bucket_e2e_request_latency,
         )
 
     def _log_histogram(self, histogram, data: Union[int, float]) -> None:
@@ -235,13 +504,19 @@ class TokenizerMetricsCollector:
         generation_tokens: int,
         cached_tokens: int,
         e2e_latency: float,
+        has_grammar: bool,
     ):
         self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
         self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
         if cached_tokens > 0:
             self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
         self.num_requests_total.labels(**self.labels).inc(1)
+        if has_grammar:
+            self.num_so_requests_total.labels(**self.labels).inc(1)
         self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
+        if self.collect_tokens_histogram:
+            self._log_histogram(self.prompt_tokens_histogram, prompt_tokens)
+            self._log_histogram(self.generation_tokens_histogram, generation_tokens)
 
     def observe_time_to_first_token(self, value: float):
         self.histogram_time_to_first_token.labels(**self.labels).observe(value)
```
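The collector changes above make the tokenizer-side latency buckets configurable, falling back to built-in defaults when `None`, and add opt-in token-length histograms. Below is a minimal sketch of that bucket-override pattern using `prometheus_client` directly rather than sglang's collector classes; the metric name and helper function are hypothetical.

```python
from typing import Dict, List, Optional

from prometheus_client import Histogram

# Default TTFT buckets mirroring the values in the diff above.
DEFAULT_TTFT_BUCKETS = [
    0.1, 0.2, 0.4, 0.6, 0.8, 1, 2, 4, 6, 8, 10, 20, 40, 60, 80, 100, 200, 400,
]

def make_ttft_histogram(
    labels: Dict[str, str],
    bucket_time_to_first_token: Optional[List[float]] = None,
) -> Histogram:
    # Fall back to the defaults when the caller does not override the buckets.
    if bucket_time_to_first_token is None:
        bucket_time_to_first_token = DEFAULT_TTFT_BUCKETS
    return Histogram(
        name="demo:time_to_first_token_seconds",
        documentation="Histogram of time to first token in seconds.",
        labelnames=labels.keys(),
        buckets=bucket_time_to_first_token,
    )

labels = {"model_name": "demo-model"}
hist = make_ttft_histogram(labels, bucket_time_to_first_token=[0.05, 0.1, 0.5, 1, 5])
hist.labels(**labels).observe(0.35)
```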
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
```diff
@@ -19,7 +19,7 @@ import bisect
 import inspect
 import os
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Callable, Optional, Union
 
 import torch
 import tqdm
@@ -40,15 +40,12 @@ from sglang.srt.patch_torch import monkey_patch_torch_compile
 from sglang.srt.utils import (
     get_available_gpu_memory,
     get_device_memory_capacity,
-    is_hip,
     rank0_log,
 )
 
 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
 
-_is_hip = is_hip()
-
 
 def _to_torch(model: torch.nn.Module, reverse: bool, num_tokens: int):
     for sub in model._modules.values():
@@ -137,7 +134,6 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
     )
 
     gpu_mem = get_device_memory_capacity()
-    # Batch size of each rank will not become so large when DP is on
     if gpu_mem is not None and gpu_mem > 96 * 1024:
         capture_bs += list(range(160, 257, 8))
 
@@ -148,12 +144,15 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
             model_runner.req_to_token_pool.size
         ]
 
-    capture_bs = list(sorted(set(capture_bs)))
-
-    assert len(capture_bs) > 0 and capture_bs[0] > 0
-    capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
     if server_args.cuda_graph_max_bs:
         capture_bs = [bs for bs in capture_bs if bs <= server_args.cuda_graph_max_bs]
+        if max(capture_bs) < server_args.cuda_graph_max_bs:
+            capture_bs += list(
+                range(max(capture_bs), server_args.cuda_graph_max_bs + 1, 16)
+            )
+    capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
+    capture_bs = list(sorted(set(capture_bs)))
+    assert len(capture_bs) > 0 and capture_bs[0] > 0
     compile_bs = (
         [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
         if server_args.enable_torch_compile
@@ -296,12 +295,12 @@ class CudaGraphRunner:
             self.capture()
         except RuntimeError as e:
             raise Exception(
-                f"Capture
+                f"Capture CUDA graph failed: {e}\n"
                 "Possible solutions:\n"
                 "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                 "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
                 "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. disable
+                "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
            )
 
```
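The `get_batch_sizes_to_capture` change reorders the filtering: the preset list is clamped to `--cuda-graph-max-bs`, padded up to that limit in steps of 16, clamped to the request pool size, and only then deduplicated and sorted. A standalone sketch of that selection logic follows; the function name and arguments are illustrative, not sglang's API.

```python
from typing import List, Optional

def select_capture_batch_sizes(
    capture_bs: List[int],
    cuda_graph_max_bs: Optional[int],
    req_pool_size: int,
) -> List[int]:
    if cuda_graph_max_bs:
        capture_bs = [bs for bs in capture_bs if bs <= cuda_graph_max_bs]
        if max(capture_bs) < cuda_graph_max_bs:
            # Fill the gap between the largest preset size and the requested maximum.
            capture_bs += list(range(max(capture_bs), cuda_graph_max_bs + 1, 16))
    capture_bs = [bs for bs in capture_bs if bs <= req_pool_size]
    capture_bs = sorted(set(capture_bs))
    assert capture_bs and capture_bs[0] > 0
    return capture_bs

print(select_capture_batch_sizes([1, 2, 4, 8, 16, 32, 64], cuda_graph_max_bs=160, req_pool_size=4096))
# -> [1, 2, 4, 8, 16, 32, 64, 80, 96, 112, 128, 144, 160]
```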
sglang/srt/model_executor/forward_batch_info.py
CHANGED
```diff
@@ -58,7 +58,7 @@ class ForwardMode(IntEnum):
     DECODE = auto()
     # Contains both EXTEND and DECODE when doing chunked prefill.
     MIXED = auto()
-    # No sequence to forward. For data parallel attention, some workers
+    # No sequence to forward. For data parallel attention, some workers will be IDLE if no sequence are allocated.
     IDLE = auto()
 
     # Used in speculative decoding: verify a batch in the target model.
```