sglang 0.4.9__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- sglang/bench_serving.py +2 -2
- sglang/srt/configs/model_config.py +36 -2
- sglang/srt/conversation.py +56 -3
- sglang/srt/disaggregation/ascend/__init__.py +6 -0
- sglang/srt/disaggregation/ascend/conn.py +44 -0
- sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
- sglang/srt/disaggregation/mooncake/conn.py +50 -18
- sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
- sglang/srt/disaggregation/utils.py +25 -3
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/http_server.py +1 -0
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +11 -0
- sglang/srt/entrypoints/openai/serving_chat.py +7 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/kimik2_detector.py +220 -0
- sglang/srt/hf_transformers_utils.py +18 -0
- sglang/srt/jinja_template_utils.py +8 -0
- sglang/srt/layers/communicator.py +20 -5
- sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
- sglang/srt/layers/layernorm.py +2 -2
- sglang/srt/layers/linear.py +12 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -1
- sglang/srt/layers/moe/ep_moe/layer.py +141 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +141 -59
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/moe/topk.py +8 -2
- sglang/srt/layers/parameter.py +19 -3
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/fp8.py +28 -7
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -2
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/quantization/w8a8_int8.py +738 -14
- sglang/srt/layers/vocab_parallel_embedding.py +9 -3
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/io_struct.py +35 -3
- sglang/srt/managers/mm_utils.py +59 -96
- sglang/srt/managers/schedule_batch.py +17 -6
- sglang/srt/managers/scheduler.py +38 -6
- sglang/srt/managers/tokenizer_manager.py +16 -0
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +176 -101
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/model_executor/forward_batch_info.py +13 -1
- sglang/srt/model_loader/loader.py +23 -12
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +78 -19
- sglang/srt/models/deepseek_vl2.py +1 -1
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +6 -3
- sglang/srt/models/internvl.py +8 -2
- sglang/srt/models/kimi_vl.py +8 -2
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llava.py +3 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpmo.py +1 -2
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral_quant.py +4 -0
- sglang/srt/models/mllama4.py +372 -82
- sglang/srt/models/phi4mm.py +8 -2
- sglang/srt/models/phimoe.py +553 -0
- sglang/srt/models/qwen2.py +2 -0
- sglang/srt/models/qwen2_5_vl.py +10 -7
- sglang/srt/models/qwen2_vl.py +12 -1
- sglang/srt/models/vila.py +8 -2
- sglang/srt/multimodal/mm_utils.py +2 -2
- sglang/srt/multimodal/processors/base_processor.py +197 -137
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
- sglang/srt/multimodal/processors/gemma3.py +4 -2
- sglang/srt/multimodal/processors/gemma3n.py +1 -1
- sglang/srt/multimodal/processors/internvl.py +1 -1
- sglang/srt/multimodal/processors/janus_pro.py +1 -1
- sglang/srt/multimodal/processors/kimi_vl.py +1 -1
- sglang/srt/multimodal/processors/minicpm.py +4 -3
- sglang/srt/multimodal/processors/mllama4.py +63 -61
- sglang/srt/multimodal/processors/phi4mm.py +1 -1
- sglang/srt/multimodal/processors/pixtral.py +1 -1
- sglang/srt/multimodal/processors/qwen_vl.py +203 -80
- sglang/srt/multimodal/processors/vila.py +1 -1
- sglang/srt/server_args.py +26 -4
- sglang/srt/two_batch_overlap.py +3 -0
- sglang/srt/utils.py +191 -48
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +6 -4
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +99 -90
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/schedule_batch.py CHANGED

```diff
@@ -101,6 +101,7 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "triton_attention_reduce_in_fp32",
     "num_reserved_decode_tokens",
     "weight_loader_disable_mmap",
+    "enable_triton_kernel_moe",
 ]
 
 # Put some global args for easy access
```
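The new `enable_triton_kernel_moe` key follows the existing pattern in this module: keys listed in `GLOBAL_SERVER_ARGS_KEYS` are snapshotted off `ServerArgs` into a module-level dict that model code reads directly. A minimal, self-contained sketch of that pattern, assuming the usual key-by-key copy (the `ServerArgs` stand-in below is illustrative, not the real class):

```python
from dataclasses import dataclass

GLOBAL_SERVER_ARGS_KEYS = ["weight_loader_disable_mmap", "enable_triton_kernel_moe"]

@dataclass
class ServerArgs:  # illustrative stand-in for sglang.srt.server_args.ServerArgs
    weight_loader_disable_mmap: bool = False
    enable_triton_kernel_moe: bool = False

def snapshot_global_args(args: ServerArgs) -> dict:
    # Mirrors how flags are published for model code to read without
    # threading server_args through every call site.
    return {key: getattr(args, key) for key in GLOBAL_SERVER_ARGS_KEYS}

print(snapshot_global_args(ServerArgs(enable_triton_kernel_moe=True)))
# {'weight_loader_disable_mmap': False, 'enable_triton_kernel_moe': True}
```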
```diff
@@ -184,6 +185,10 @@ class Modality(Enum):
             f"Invalid modality string: {modality_str}. Valid modalities are: {[m.name for m in Modality]}"
         )
 
+    @staticmethod
+    def all():
+        return [Modality.IMAGE, Modality.VIDEO, Modality.AUDIO]
+
 
 @dataclasses.dataclass
 class MultimodalDataItem:
```
```diff
@@ -199,7 +204,7 @@ class MultimodalDataItem:
     hash: int = None
     pad_value: int = None
     image_sizes: Tuple[int, int] = None
-    image_offsets: Optional[list] = None
+    offsets: Optional[list] = None
 
     # the real data, pixel_values or audio_features
     # data: Union[List[torch.Tensor], List[np.ndarray]]
```
```diff
@@ -252,12 +257,17 @@ class MultimodalDataItem:
                 self.hash = hash_feature(self.audio_features)
             elif self.input_features is not None:
                 self.hash = hash_feature(self.input_features)
+            elif self.is_video():
+                self.hash = hash_feature(self.pixel_values_videos)
             else:
                 self.hash = hash_feature(self.pixel_values)
 
         assert self.hash is not None
         self.pad_value = self.hash % (1 << 30)
 
+    def is_modality(self, modality: Modality) -> bool:
+        return self.modality == modality
+
     def is_audio(self):
         return (self.modality == Modality.AUDIO) and (
             self.precomputed_features is not None
```
```diff
@@ -267,7 +277,7 @@ class MultimodalDataItem:
 
     def is_image(self):
         return (
-            self.modality == Modality.IMAGE or self.modality == Modality.MULTI_IMAGES
+            self.is_modality(Modality.IMAGE) or self.is_modality(Modality.MULTI_IMAGES)
         ) and (
             self.precomputed_features is not None
             or not MultimodalDataItem.is_empty_list(self.pixel_values)
```
```diff
@@ -276,7 +286,7 @@ class MultimodalDataItem:
     def is_video(self):
         return (self.modality == Modality.VIDEO) and (
             self.precomputed_features is not None
-            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+            or not MultimodalDataItem.is_empty_list(self.pixel_values_videos)
         )
 
     def is_valid(self) -> bool:
```
```diff
@@ -350,6 +360,7 @@ class MultimodalInputs:
             "im_token_id",
             "im_start_id",
             "im_end_id",
+            "video_token_id",
             "slice_start_id",
             "slice_end_id",
             "audio_start_id",
```
```diff
@@ -363,11 +374,12 @@ class MultimodalInputs:
         return ret
 
     def contains_image_inputs(self) -> bool:
-        """ """
         return any(item.is_image() for item in self.mm_items)
 
+    def contains_video_inputs(self) -> bool:
+        return any(item.is_video() for item in self.mm_items)
+
     def contains_audio_inputs(self) -> bool:
-        """ """
         return any(item.is_audio() for item in self.mm_items)
 
     def contains_mm_input(self) -> bool:
```
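Taken together, the hunks above add first-class video support to the multimodal data model: `pixel_values_videos` feeds hashing, `is_modality`/`Modality.all()` centralize the modality checks, and `contains_video_inputs` mirrors the existing image/audio queries. A condensed, self-contained sketch of the resulting behavior (trimmed stand-ins, not the real classes):

```python
from dataclasses import dataclass
from enum import Enum, auto
from typing import List, Optional

class Modality(Enum):
    IMAGE = auto()
    MULTI_IMAGES = auto()
    VIDEO = auto()
    AUDIO = auto()

    @staticmethod
    def all():
        return [Modality.IMAGE, Modality.VIDEO, Modality.AUDIO]

@dataclass
class Item:  # trimmed stand-in for MultimodalDataItem
    modality: Modality
    pixel_values_videos: Optional[list] = None

    def is_modality(self, modality: Modality) -> bool:
        return self.modality == modality

    def is_video(self) -> bool:
        # the real method also accepts precomputed_features; omitted here
        return self.is_modality(Modality.VIDEO) and bool(self.pixel_values_videos)

items: List[Item] = [Item(Modality.VIDEO, pixel_values_videos=[0.1, 0.2])]
print(any(item.is_video() for item in items))  # contains_video_inputs() analogue -> True
```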
```diff
@@ -842,7 +854,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     global_num_tokens_for_logprob: Optional[List[int]] = None
     is_extend_in_batch: bool = False
     can_run_dp_cuda_graph: bool = False
-    is_extend_in_batch: bool = False
     tbo_split_seq_index: Optional[int] = None
     global_forward_mode: Optional[ForwardMode] = None
 
```
sglang/srt/managers/scheduler.py CHANGED
```diff
@@ -13,6 +13,7 @@
 # ==============================================================================
 """A scheduler that manages a tensor parallel GPU worker."""
 
+import datetime
 import faulthandler
 import logging
 import os
```
```diff
@@ -484,6 +485,8 @@ class Scheduler(
             enable=server_args.enable_memory_saver
         )
         self.init_profier()
+
+        # Init metrics stats
         self.init_metrics()
         self.init_kv_events(server_args.kv_events_config)
 
```
```diff
@@ -590,6 +593,12 @@ class Scheduler(
                 hicache_ratio=server_args.hicache_ratio,
                 hicache_size=server_args.hicache_size,
                 hicache_write_policy=server_args.hicache_write_policy,
+                hicache_io_backend=(
+                    "direct"
+                    if server_args.attention_backend
+                    == "fa3"  # hot fix for incompatibility
+                    else server_args.hicache_io_backend
+                ),
             )
             self.tp_worker.register_hicache_layer_transfer_counter(
                 self.tree_cache.cache_controller.layer_done_counter
```
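The conditional above hard-codes the hot fix: when the attention backend is `fa3`, the hierarchical cache falls back to `direct` IO regardless of what was requested. Pulled out as a plain function for clarity (a sketch, not upstream code; `"kernel"` as the alternative backend value is an assumption based on the upstream default):

```python
def resolve_hicache_io_backend(attention_backend: str, requested: str) -> str:
    # Hot fix carried over from the hunk above: fa3 is incompatible with the
    # requested IO path, so "direct" is forced.
    if attention_backend == "fa3":
        return "direct"
    return requested

assert resolve_hicache_io_backend("fa3", "kernel") == "direct"
assert resolve_hicache_io_backend("flashinfer", "kernel") == "kernel"
```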
```diff
@@ -621,6 +630,7 @@ class Scheduler(
         self.torch_profiler_output_dir: Optional[str] = None
         self.profiler_activities: Optional[List[str]] = None
         self.profile_id: Optional[str] = None
+        self.profiler_start_forward_ct: Optional[int] = None
         self.profiler_target_forward_ct: Optional[int] = None
         self.profiler_target_prefill_ct: Optional[int] = None
         self.profiler_target_decode_ct: Optional[int] = None
```
```diff
@@ -1313,10 +1323,12 @@ class Scheduler(
             f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
             f += f"#queue-req: {len(self.waiting_queue)}, "
             f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
-            f += f"input throughput (token/s): {self.last_input_throughput:.2f} "
+            f += f"input throughput (token/s): {self.last_input_throughput:.2f}, "
         else:
             f += f"#running-req: {running_bs}, "
-            f += f"#queue-req: {len(self.waiting_queue)}"
+            f += f"#queue-req: {len(self.waiting_queue)}, "
+
+        f += f"timestamp: {datetime.datetime.now().isoformat()}"
 
         logger.info(f)
 
```
```diff
@@ -1378,7 +1390,8 @@ class Scheduler(
         msg += (
             f"cuda graph: {can_run_cuda_graph}, "
             f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
-            f"#queue-req: {len(self.waiting_queue)}"
+            f"#queue-req: {len(self.waiting_queue)}, "
+            f"timestamp: {datetime.datetime.now().isoformat()}"
         )
 
         logger.info(msg)
```
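Both the prefill and decode log lines now end with an ISO-8601 timestamp, which makes scheduler logs sortable and easier to correlate across nodes. A quick illustration of the resulting format:

```python
import datetime

waiting = 3
msg = (
    f"#queue-req: {waiting}, "
    f"timestamp: {datetime.datetime.now().isoformat()}"
)
print(msg)  # e.g. "#queue-req: 3, timestamp: 2025-07-14T09:30:12.345678"
```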
```diff
@@ -2333,9 +2346,8 @@ class Scheduler(
 
     def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput):
         tags = recv_req.tags
-        import subprocess
 
-        if tags is None:
+        if tags is None or len(tags) == 0:
             tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
 
         if GPU_MEMORY_TYPE_KV_CACHE in tags:
```
```diff
@@ -2346,17 +2358,20 @@ class Scheduler(
             self.stashed_model_static_state = _export_static_state(
                 self.tp_worker.worker.model_runner.model
             )
+            torch.distributed.barrier(self.tp_cpu_group)
             self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_WEIGHTS)
 
         return ReleaseMemoryOccupationReqOutput()
 
     def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput):
         tags = recv_req.tags
+
         if tags is None or len(tags) == 0:
             tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
 
         if GPU_MEMORY_TYPE_WEIGHTS in tags:
             self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_WEIGHTS)
+            torch.distributed.barrier(self.tp_cpu_group)
             _import_static_state(
                 self.tp_worker.worker.model_runner.model,
                 self.stashed_model_static_state,
```
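The two `torch.distributed.barrier(self.tp_cpu_group)` calls are symmetric: on release, every TP rank should finish exporting its static state before any rank pauses the weights; on resume, the weights should be back on all ranks before any rank re-imports state. That is the natural reading of their placement, at least. A single-process demo of the collective being used (gloo backend, so it runs on CPU with no GPU required):

```python
import torch.distributed as dist

# world_size=1 so this runs standalone; in sglang the group spans the TP ranks.
dist.init_process_group(
    backend="gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1
)
dist.barrier()  # returns once every rank in the group has arrived
dist.destroy_process_group()
```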
```diff
@@ -2377,9 +2392,10 @@ class Scheduler(
 
     def profile(self, recv_req: ProfileReq):
         if recv_req.type == ProfileReqType.START_PROFILE:
-            if recv_req.profile_by_stage:
+            if recv_req.profile_by_stage or recv_req.start_step:
                 return self.init_profile(
                     recv_req.output_dir,
+                    recv_req.start_step,
                     recv_req.num_steps,
                     recv_req.activities,
                     recv_req.with_stack,
@@ -2390,6 +2406,7 @@ class Scheduler(
             else:
                 self.init_profile(
                     recv_req.output_dir,
+                    recv_req.start_step,
                     recv_req.num_steps,
                     recv_req.activities,
                     recv_req.with_stack,
```
```diff
@@ -2404,6 +2421,7 @@ class Scheduler(
     def init_profile(
         self,
         output_dir: Optional[str],
+        start_step: Optional[int],
         num_steps: Optional[int],
         activities: Optional[List[str]],
         with_stack: Optional[bool],
```
```diff
@@ -2430,6 +2448,9 @@ class Scheduler(
         self.profiler_activities = activities
         self.profile_id = profile_id
 
+        if start_step:
+            self.profiler_start_forward_ct = max(start_step, self.forward_ct + 1)
+
         if num_steps:
             self.profile_steps = num_steps
             if self.profile_by_stage:
```
```diff
@@ -2437,6 +2458,10 @@ class Scheduler(
                 self.profiler_target_decode_ct = num_steps
                 self.profiler_prefill_ct = 0
                 self.profiler_decode_ct = 0
+            elif start_step:
+                self.profiler_target_forward_ct = (
+                    self.profiler_start_forward_ct + num_steps
+                )
             else:
                 self.profiler_target_forward_ct = self.forward_ct + num_steps
             # The caller will be notified when reaching profiler_target_forward_ct
```
```diff
@@ -2509,6 +2534,7 @@ class Scheduler(
 
         if "CUDA_PROFILER" in activities:
             torch.cuda.cudart().cudaProfilerStart()
+            self.profile_in_progress = True
 
         return ProfileReqOutput(success=True, message="Succeeded")
 
```
```diff
@@ -2572,6 +2598,7 @@ class Scheduler(
         )
         self.torch_profiler = None
         self.profile_in_progress = False
+        self.profiler_start_forward_ct = None
 
         return ProfileReqOutput(success=True, message="Succeeded.")
```
```diff
@@ -2605,6 +2632,11 @@ class Scheduler(
             and self.profiler_target_forward_ct <= self.forward_ct
         ):
             self.stop_profile()
+        if (
+            self.profiler_start_forward_ct
+            and self.profiler_start_forward_ct == self.forward_ct
+        ):
+            self.start_profile()
 
     def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
         if recv_req == ExpertDistributionReq.START_RECORD:
```
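Together these hunks implement delayed profiling: `init_profile` records `profiler_start_forward_ct = max(start_step, forward_ct + 1)`, the per-step check fires `start_profile()` when the forward counter reaches it, and (when not profiling by stage) `num_steps` is now measured from the delayed start rather than from the request. The control flow, condensed into a standalone sketch (not the real `Scheduler`):

```python
class DelayedProfileTrigger:
    """Mirrors the forward-counter logic added to Scheduler, simplified."""

    def __init__(self, start_step: int, num_steps: int, forward_ct: int = 0):
        self.forward_ct = forward_ct
        # never schedule the start in the past: see max(start_step, forward_ct + 1)
        self.start_ct = max(start_step, forward_ct + 1)
        self.target_ct = self.start_ct + num_steps

    def after_forward(self) -> str:
        self.forward_ct += 1
        if self.forward_ct == self.start_ct:
            return "start_profile"
        if self.forward_ct >= self.target_ct:
            return "stop_profile"
        return "noop"

trigger = DelayedProfileTrigger(start_step=3, num_steps=2)
print([trigger.after_forward() for _ in range(5)])
# ['noop', 'noop', 'start_profile', 'noop', 'stop_profile']
```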
sglang/srt/managers/tokenizer_manager.py CHANGED

```diff
@@ -285,6 +285,20 @@ class TokenizerManager:
             self.bootstrap_server = kv_bootstrap_server_class(
                 self.server_args.disaggregation_bootstrap_port
             )
+            is_create_store = (
+                self.server_args.node_rank == 0
+                and self.server_args.disaggregation_transfer_backend == "ascend"
+            )
+            if is_create_store:
+                try:
+                    from mf_adapter import create_config_store
+
+                    ascend_url = os.getenv("ASCEND_MF_STORE_URL")
+                    create_config_store(ascend_url)
+                except Exception as e:
+                    error_message = f"Failed create mf store, invalid ascend_url."
+                    error_message += f" With exception {e}"
+                    raise error_message
 
         # For load balancing
         self.current_load = 0
```
```diff
@@ -863,6 +877,7 @@ class TokenizerManager:
     async def start_profile(
         self,
         output_dir: Optional[str] = None,
+        start_step: Optional[int] = None,
         num_steps: Optional[int] = None,
         activities: Optional[List[str]] = None,
         with_stack: Optional[bool] = None,
```
```diff
@@ -875,6 +890,7 @@ class TokenizerManager:
         req = ProfileReq(
             type=ProfileReqType.START_PROFILE,
             output_dir=output_dir,
+            start_step=start_step,
             num_steps=num_steps,
             activities=activities,
             with_stack=with_stack,
```
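End to end, this lets a client request profiling that begins at a future forward step. sglang exposes `start_profile` on its HTTP server; assuming the request schema in `io_struct.py` gained the same `start_step` field (the field name is taken from this diff; the exact endpoint payload is not shown here), a call might look like:

```python
import requests

# Hypothetical invocation against a locally running sglang server.
resp = requests.post(
    "http://127.0.0.1:30000/start_profile",
    json={
        "output_dir": "/tmp/sglang_profile",
        "start_step": 100,  # begin profiling once forward_ct reaches 100
        "num_steps": 10,    # then profile for 10 forward steps
    },
)
print(resp.status_code)
```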
sglang/srt/mem_cache/hiradix_cache.py CHANGED

```diff
@@ -34,6 +34,7 @@ class HiRadixCache(RadixCache):
         hicache_ratio: float,
         hicache_size: int,
         hicache_write_policy: str,
+        hicache_io_backend: str,
     ):
         self.kv_cache = token_to_kv_pool_allocator.get_kvcache()
         if isinstance(self.kv_cache, MHATokenToKVPool):
```
```diff
@@ -56,6 +57,7 @@ class HiRadixCache(RadixCache):
             page_size,
             load_cache_event=self.load_cache_event,
             write_policy=hicache_write_policy,
+            io_backend=hicache_io_backend,
         )
 
         # record the nodes with ongoing write through
```
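With these two hunks, `HiRadixCache` simply threads the scheduler's decision through to its cache controller; the fallback rule itself lives in the scheduler (see the `fa3` hunk earlier). A minimal sketch of the plumbing, using stand-in classes rather than the upstream ones:

```python
class CacheControllerStub:  # stand-in for the controller that receives io_backend
    def __init__(self, write_policy: str, io_backend: str):
        self.write_policy = write_policy
        self.io_backend = io_backend

class HiRadixCacheStub:  # stand-in for HiRadixCache.__init__, other args omitted
    def __init__(self, hicache_write_policy: str, hicache_io_backend: str):
        self.cache_controller = CacheControllerStub(
            write_policy=hicache_write_policy,
            io_backend=hicache_io_backend,  # new parameter, forwarded verbatim
        )

cache = HiRadixCacheStub("write_through", "direct")
print(cache.cache_controller.io_backend)  # direct
```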