sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +321 -31
  3. sglang/bench_serving.py +10 -3
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +4 -0
  11. sglang/srt/configs/dots_ocr.py +64 -0
  12. sglang/srt/configs/falcon_h1.py +360 -0
  13. sglang/srt/configs/load_config.py +8 -0
  14. sglang/srt/configs/model_config.py +160 -105
  15. sglang/srt/configs/qwen3_vl.py +586 -0
  16. sglang/srt/constrained/base_grammar_backend.py +1 -0
  17. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  18. sglang/srt/constrained/xgrammar_backend.py +6 -4
  19. sglang/srt/debug_utils/dumper.py +10 -3
  20. sglang/srt/disaggregation/ascend/conn.py +2 -2
  21. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  22. sglang/srt/disaggregation/common/conn.py +266 -98
  23. sglang/srt/disaggregation/decode.py +50 -9
  24. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  26. sglang/srt/disaggregation/mooncake/conn.py +51 -541
  27. sglang/srt/disaggregation/nixl/conn.py +148 -39
  28. sglang/srt/disaggregation/prefill.py +31 -14
  29. sglang/srt/disaggregation/utils.py +36 -5
  30. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  31. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  32. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  33. sglang/srt/distributed/parallel_state.py +135 -80
  34. sglang/srt/entrypoints/engine.py +23 -3
  35. sglang/srt/entrypoints/grpc_request_manager.py +330 -55
  36. sglang/srt/entrypoints/grpc_server.py +232 -102
  37. sglang/srt/entrypoints/http_server.py +49 -9
  38. sglang/srt/entrypoints/openai/protocol.py +110 -5
  39. sglang/srt/entrypoints/openai/serving_base.py +25 -6
  40. sglang/srt/entrypoints/openai/serving_chat.py +178 -49
  41. sglang/srt/entrypoints/openai/serving_completions.py +5 -3
  42. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  43. sglang/srt/entrypoints/openai/serving_responses.py +42 -0
  44. sglang/srt/environ.py +285 -0
  45. sglang/srt/eplb/expert_location.py +30 -5
  46. sglang/srt/function_call/function_call_parser.py +3 -2
  47. sglang/srt/function_call/glm4_moe_detector.py +3 -3
  48. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  49. sglang/srt/function_call/json_array_parser.py +63 -0
  50. sglang/srt/function_call/kimik2_detector.py +17 -4
  51. sglang/srt/function_call/utils.py +96 -5
  52. sglang/srt/grpc/compile_proto.py +245 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
  56. sglang/srt/layers/activation.py +7 -6
  57. sglang/srt/layers/attention/aiter_backend.py +14 -15
  58. sglang/srt/layers/attention/ascend_backend.py +108 -9
  59. sglang/srt/layers/attention/attention_registry.py +206 -0
  60. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  61. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  62. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  63. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  66. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  67. sglang/srt/layers/attention/flashinfer_backend.py +112 -194
  68. sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
  69. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  70. sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
  71. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
  72. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
  73. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
  74. sglang/srt/layers/attention/mamba/mamba.py +566 -1
  75. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  76. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  77. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  78. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  79. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  80. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  81. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  82. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  83. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  84. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  85. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  86. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  87. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  88. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  89. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  90. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  91. sglang/srt/layers/attention/nsa/utils.py +24 -0
  92. sglang/srt/layers/attention/nsa_backend.py +887 -0
  93. sglang/srt/layers/attention/tbo_backend.py +6 -6
  94. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  95. sglang/srt/layers/attention/triton_backend.py +42 -9
  96. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  97. sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
  98. sglang/srt/layers/attention/vision.py +58 -0
  99. sglang/srt/layers/attention/wave_backend.py +4 -4
  100. sglang/srt/layers/communicator.py +8 -0
  101. sglang/srt/layers/dp_attention.py +11 -1
  102. sglang/srt/layers/elementwise.py +3 -1
  103. sglang/srt/layers/layernorm.py +2 -0
  104. sglang/srt/layers/linear.py +21 -4
  105. sglang/srt/layers/logits_processor.py +15 -2
  106. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  107. sglang/srt/layers/moe/ep_moe/layer.py +147 -74
  108. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
  113. sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
  114. sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
  115. sglang/srt/layers/moe/utils.py +10 -0
  116. sglang/srt/layers/parameter.py +23 -6
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  119. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  121. sglang/srt/layers/quantization/fp8.py +2 -2
  122. sglang/srt/layers/quantization/fp8_utils.py +1 -1
  123. sglang/srt/layers/quantization/modelopt_quant.py +44 -9
  124. sglang/srt/layers/quantization/mxfp4.py +12 -4
  125. sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
  126. sglang/srt/layers/quantization/w4afp8.py +0 -4
  127. sglang/srt/layers/quantization/w8a8_int8.py +15 -3
  128. sglang/srt/layers/rotary_embedding.py +78 -31
  129. sglang/srt/layers/sampler.py +52 -4
  130. sglang/srt/layers/utils.py +23 -0
  131. sglang/srt/lora/backend/base_backend.py +3 -3
  132. sglang/srt/lora/backend/chunked_backend.py +348 -0
  133. sglang/srt/lora/backend/triton_backend.py +10 -4
  134. sglang/srt/lora/lora.py +7 -5
  135. sglang/srt/lora/lora_manager.py +17 -6
  136. sglang/srt/lora/mem_pool.py +1 -1
  137. sglang/srt/lora/triton_ops/__init__.py +4 -0
  138. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  139. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  140. sglang/srt/lora/utils.py +7 -5
  141. sglang/srt/managers/cache_controller.py +42 -142
  142. sglang/srt/managers/data_parallel_controller.py +11 -46
  143. sglang/srt/managers/detokenizer_manager.py +11 -11
  144. sglang/srt/managers/io_struct.py +162 -118
  145. sglang/srt/managers/mm_utils.py +43 -6
  146. sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
  147. sglang/srt/managers/multimodal_processor.py +1 -2
  148. sglang/srt/managers/overlap_utils.py +53 -0
  149. sglang/srt/managers/schedule_batch.py +167 -86
  150. sglang/srt/managers/schedule_policy.py +143 -16
  151. sglang/srt/managers/scheduler.py +359 -214
  152. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  153. sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
  154. sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
  155. sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
  156. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  157. sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
  158. sglang/srt/managers/tokenizer_manager.py +84 -136
  159. sglang/srt/managers/tp_worker.py +39 -29
  160. sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
  161. sglang/srt/managers/utils.py +1 -45
  162. sglang/srt/mem_cache/allocator.py +14 -20
  163. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  164. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  165. sglang/srt/mem_cache/chunk_cache.py +8 -1
  166. sglang/srt/mem_cache/evict_policy.py +23 -0
  167. sglang/srt/mem_cache/hicache_storage.py +40 -1
  168. sglang/srt/mem_cache/hiradix_cache.py +119 -32
  169. sglang/srt/mem_cache/memory_pool.py +188 -10
  170. sglang/srt/mem_cache/memory_pool_host.py +134 -182
  171. sglang/srt/mem_cache/radix_cache.py +222 -71
  172. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  173. sglang/srt/mem_cache/storage/__init__.py +10 -0
  174. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  175. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  176. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  177. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  178. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  179. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
  180. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
  181. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
  182. sglang/srt/mem_cache/swa_radix_cache.py +25 -34
  183. sglang/srt/metrics/collector.py +82 -120
  184. sglang/srt/metrics/func_timer.py +2 -7
  185. sglang/srt/metrics/utils.py +8 -1
  186. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  187. sglang/srt/model_executor/cuda_graph_runner.py +39 -32
  188. sglang/srt/model_executor/forward_batch_info.py +23 -38
  189. sglang/srt/model_executor/model_runner.py +131 -183
  190. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  191. sglang/srt/model_loader/loader.py +14 -10
  192. sglang/srt/model_loader/weight_utils.py +156 -2
  193. sglang/srt/models/bailing_moe.py +27 -4
  194. sglang/srt/models/deepseek_nextn.py +6 -1
  195. sglang/srt/models/deepseek_v2.py +536 -153
  196. sglang/srt/models/dots_ocr.py +173 -0
  197. sglang/srt/models/falcon_h1.py +576 -0
  198. sglang/srt/models/gemma3_causal.py +0 -2
  199. sglang/srt/models/gemma3_mm.py +1 -1
  200. sglang/srt/models/gemma3n_mm.py +1 -1
  201. sglang/srt/models/glm4_moe.py +3 -3
  202. sglang/srt/models/glm4_moe_nextn.py +2 -2
  203. sglang/srt/models/glm4v.py +1 -1
  204. sglang/srt/models/glm4v_moe.py +1 -1
  205. sglang/srt/models/gpt_oss.py +7 -30
  206. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  207. sglang/srt/models/llama.py +4 -0
  208. sglang/srt/models/longcat_flash.py +1 -1
  209. sglang/srt/models/longcat_flash_nextn.py +1 -1
  210. sglang/srt/models/mllama4.py +15 -4
  211. sglang/srt/models/qwen2.py +0 -7
  212. sglang/srt/models/qwen2_5_vl.py +2 -2
  213. sglang/srt/models/qwen2_audio.py +1 -1
  214. sglang/srt/models/qwen2_moe.py +64 -1
  215. sglang/srt/models/qwen2_vl.py +1 -1
  216. sglang/srt/models/qwen3.py +18 -3
  217. sglang/srt/models/qwen3_moe.py +31 -3
  218. sglang/srt/models/qwen3_next.py +36 -9
  219. sglang/srt/models/qwen3_vl.py +787 -0
  220. sglang/srt/models/qwen3_vl_moe.py +471 -0
  221. sglang/srt/models/registry.py +15 -3
  222. sglang/srt/models/sarashina2_vision.py +269 -0
  223. sglang/srt/models/solar.py +505 -0
  224. sglang/srt/models/starcoder2.py +357 -0
  225. sglang/srt/models/torch_native_llama.py +9 -2
  226. sglang/srt/models/utils.py +51 -0
  227. sglang/srt/multimodal/processors/base_processor.py +15 -7
  228. sglang/srt/multimodal/processors/dots_vlm.py +2 -3
  229. sglang/srt/multimodal/processors/internvl.py +20 -8
  230. sglang/srt/multimodal/processors/qwen_vl.py +8 -1
  231. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  232. sglang/srt/parser/jinja_template_utils.py +6 -0
  233. sglang/srt/sampling/sampling_batch_info.py +20 -2
  234. sglang/srt/sampling/sampling_params.py +7 -0
  235. sglang/srt/server_args.py +753 -295
  236. sglang/srt/server_args_config_parser.py +146 -0
  237. sglang/srt/single_batch_overlap.py +151 -0
  238. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  239. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  240. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  241. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  242. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  243. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  244. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
  245. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
  246. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
  247. sglang/srt/speculative/eagle_worker.py +57 -25
  248. sglang/srt/speculative/ngram_utils.py +428 -0
  249. sglang/srt/speculative/ngram_worker.py +245 -0
  250. sglang/srt/speculative/spec_info.py +47 -0
  251. sglang/srt/speculative/spec_utils.py +606 -0
  252. sglang/srt/torch_memory_saver_adapter.py +5 -7
  253. sglang/srt/tracing/trace.py +32 -6
  254. sglang/srt/two_batch_overlap.py +8 -5
  255. sglang/srt/utils/__init__.py +2 -0
  256. sglang/srt/{utils.py → utils/common.py} +399 -74
  257. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
  258. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  259. sglang/srt/utils/rpd_utils.py +452 -0
  260. sglang/srt/utils/slow_rank_detector.py +71 -0
  261. sglang/srt/warmup.py +8 -4
  262. sglang/srt/weight_sync/utils.py +1 -1
  263. sglang/test/get_logits_ut.py +57 -0
  264. sglang/test/run_eval.py +79 -11
  265. sglang/test/runners.py +1 -1
  266. sglang/test/simple_eval_common.py +5 -2
  267. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  268. sglang/test/test_block_fp8.py +2 -2
  269. sglang/test/test_deterministic.py +297 -0
  270. sglang/test/test_disaggregation_utils.py +12 -1
  271. sglang/test/test_programs.py +1 -1
  272. sglang/test/test_utils.py +355 -4
  273. sglang/utils.py +10 -1
  274. sglang/version.py +1 -1
  275. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
  276. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
  277. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  278. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  279. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  280. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  281. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  282. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py CHANGED
@@ -9,15 +9,18 @@ import os
9
9
  import random
10
10
  import re
11
11
  import subprocess
12
+ import sys
12
13
  import threading
13
14
  import time
14
15
  import unittest
15
16
  from concurrent.futures import ThreadPoolExecutor
16
17
  from dataclasses import dataclass
18
+ from datetime import datetime
17
19
  from functools import partial
18
20
  from pathlib import Path
19
21
  from types import SimpleNamespace
20
- from typing import Awaitable, Callable, List, Optional, Tuple
22
+ from typing import Any, Awaitable, Callable, List, Optional, Tuple
23
+ from urllib.parse import quote
21
24
 
22
25
  import aiohttp
23
26
  import numpy as np
@@ -41,6 +44,7 @@ from sglang.utils import get_exception_traceback
41
44
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
42
45
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
43
46
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
47
+ DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
44
48
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
45
49
  DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
46
50
  DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
@@ -75,11 +79,13 @@ DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
75
79
  # EAGLE
76
80
  DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
77
81
  DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
78
- DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
82
+ DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST_EAGLE3 = "meta-llama/Llama-3.1-8B-Instruct"
83
+ DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B"
79
84
  DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST = (
80
85
  "meta-llama/Llama-3.1-8B-Instruct"
81
86
  )
82
87
  DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
88
+ DEFAULT_NGRAM_SPECULATIVE_TARGET_MODEL_FOR_TEST = "Qwen/Qwen2.5-Coder-7B-Instruct"
83
89
 
84
90
  # Other use cases
85
91
  DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
@@ -561,11 +567,30 @@ def popen_launch_server(
561
567
  if return_stdout_stderr:
562
568
  process = subprocess.Popen(
563
569
  command,
564
- stdout=return_stdout_stderr[0],
565
- stderr=return_stdout_stderr[1],
570
+ stdout=subprocess.PIPE,
571
+ stderr=subprocess.PIPE,
566
572
  env=env,
567
573
  text=True,
574
+ bufsize=1,
568
575
  )
576
+
577
+ def _dump(src, sinks):
578
+ for line in iter(src.readline, ""):
579
+ for sink in sinks:
580
+ sink.write(line)
581
+ sink.flush()
582
+ src.close()
583
+
584
+ threading.Thread(
585
+ target=_dump,
586
+ args=(process.stdout, [return_stdout_stderr[0], sys.stdout]),
587
+ daemon=True,
588
+ ).start()
589
+ threading.Thread(
590
+ target=_dump,
591
+ args=(process.stderr, [return_stdout_stderr[1], sys.stderr]),
592
+ daemon=True,
593
+ ).start()
569
594
  else:
570
595
  process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
571
596
 
@@ -869,6 +894,154 @@ def run_bench_serving(
869
894
  return res
870
895
 
871
896
 
897
def run_score_benchmark(
    model,
    num_requests=100,
    batch_size=5,
    other_server_args=None,
    need_warmup=False,
    device="auto",
):
    """Score API benchmark function compatible with run_bench_serving pattern.

    Launches a server for ``model``, sends ``num_requests`` POST requests to
    ``/v1/score`` one after another (each scoring ``batch_size`` items), and
    kills the server process tree afterwards, even if the benchmark raises.

    Args:
        model: Model path; used to launch the server, to load the tokenizer
            and as the ``model`` field of every request.
        num_requests: Number of timed score requests to send.
        batch_size: Number of items per score request.
        other_server_args: Extra CLI arguments for the server (default: none).
        need_warmup: When true, send one untimed 3-item request first; any
            error from that request is ignored.
        device: ``"auto"`` resolves through ``auto_config_device()``. The
            resolved value is not forwarded to the launcher in this function.

    Returns:
        dict with ``completed``, ``total_requests``, ``throughput``
        (successful requests per second of wall time), ``avg_latency_ms``,
        ``p95_latency_ms`` and ``successful_requests``. All metrics are 0
        when no request succeeded.
    """
    if other_server_args is None:
        other_server_args = []

    if device == "auto":
        device = auto_config_device()

    # Launch the server (consistent with run_bench_serving)
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )

    async def _run_benchmark():
        # Load tokenizer for generating test data
        from sglang.srt.utils.hf_transformers_utils import get_tokenizer

        tokenizer = get_tokenizer(model)

        # Score API configuration
        score_query_tokens = 120
        score_item_tokens = 180
        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
        special_token = "<|im_start|>"
        score_url = f"{base_url}/v1/score"
        request_timeout = aiohttp.ClientTimeout(total=30)

        def generate_text_with_token_count(num_tokens):
            """Generate text with precise token count using replicated token."""
            text = special_token * num_tokens
            actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
            if actual_tokens != num_tokens:
                # The marker encodes to more than one token: repeat it fewer
                # times so the total lands near the requested count.
                text = special_token * (
                    num_tokens
                    // len(tokenizer.encode(special_token, add_special_tokens=False))
                )
            return text

        def build_score_request(num_items):
            """Build one /v1/score payload carrying ``num_items`` items."""
            return {
                "query": generate_text_with_token_count(score_query_tokens),
                "items": [
                    generate_text_with_token_count(score_item_tokens)
                    for _ in range(num_items)
                ],
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }

        if need_warmup:
            async with aiohttp.ClientSession() as session:
                try:
                    # Enter the response context so the connection is released.
                    async with session.post(
                        score_url,
                        json=build_score_request(3),
                        timeout=request_timeout,
                    ):
                        pass
                except Exception:
                    # Warmup is best-effort. Catching Exception (not a bare
                    # except) lets task cancellation and KeyboardInterrupt
                    # propagate.
                    pass

        test_requests = [build_score_request(batch_size) for _ in range(num_requests)]

        start_time = time.monotonic()
        successful_requests = 0
        total_latency = 0
        latencies = []

        async with aiohttp.ClientSession() as session:
            for request_data in test_requests:
                try:
                    request_start = time.monotonic()
                    async with session.post(
                        score_url,
                        json=request_data,
                        timeout=request_timeout,
                    ) as response:
                        if response.status == 200:
                            response_data = await response.json()
                            request_end = time.monotonic()

                            # Only responses that carry a result count as
                            # successful.
                            if "scores" in response_data or "logprobs" in response_data:
                                latency_ms = (request_end - request_start) * 1000
                                latencies.append(latency_ms)
                                total_latency += latency_ms
                                successful_requests += 1
                except Exception:
                    # A failed request is simply not counted.
                    continue

        end_time = time.monotonic()
        total_time = end_time - start_time

        if successful_requests > 0:
            throughput = successful_requests / total_time
            avg_latency = total_latency / successful_requests
            latencies.sort()
            p95_latency = latencies[int(len(latencies) * 0.95)]
        else:
            throughput = 0
            avg_latency = 0
            p95_latency = 0

        return {
            "completed": successful_requests,
            "total_requests": num_requests,
            "throughput": throughput,
            "avg_latency_ms": avg_latency,
            "p95_latency_ms": p95_latency,
            "successful_requests": successful_requests,
        }

    try:
        res = asyncio.run(_run_benchmark())
    finally:
        kill_process_tree(process.pid)

    assert res["completed"] == res["successful_requests"]
    return res
1043
+
1044
+
872
1045
  def run_bench_serving_multi(
873
1046
  model,
874
1047
  base_url,
@@ -1390,6 +1563,41 @@ async def send_concurrent_generate_requests(
1390
1563
  return await asyncio.gather(*tasks)
1391
1564
 
1392
1565
 
1566
async def send_concurrent_generate_requests_with_custom_params(
    base_url: str,
    custom_params: List[dict[str, Any]],
) -> List[Tuple[int, Any]]:
    """Send one ``/generate`` request per entry of ``custom_params``, concurrently.

    Each request starts from a fixed base payload (a short prompt with
    ``temperature=0`` and ``max_new_tokens=50``) whose top-level keys are
    overridden by the corresponding ``custom_params`` entry. All requests
    are started at once, so the concurrency equals ``len(custom_params)``.

    Returns:
        A list of ``(status_code, response_json)`` tuples in the same order
        as ``custom_params``.
    """

    base_payload = {
        "text": """
        System: You are a helpful assistant.
        User: What is the capital of France?
        Assistant: The capital of France is
        """,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 50,
        },
    }

    async def async_generate_with_priority(req):
        # One session per request, so requests do not share a connection pool.
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{base_url}/generate",
                json=req,
            ) as response:
                resp_json = await response.json()
                return (response.status, resp_json)

    tasks = []
    for overrides in custom_params:
        # Shallow merge: an override replaces a whole top-level key, including
        # "sampling_params" when present.
        req = {**base_payload, **overrides}
        tasks.append(asyncio.create_task(async_generate_with_priority(req)))
    return await asyncio.gather(*tasks)
1599
+
1600
+
1393
1601
  class CustomTestCase(unittest.TestCase):
1394
1602
  def _callTestMethod(self, method):
1395
1603
  max_retry = int(
@@ -1431,3 +1639,146 @@ def dump_bench_raw_result(
1431
1639
  def _ensure_remove_suffix(text: str, suffix: str):
1432
1640
  assert text.endswith(suffix)
1433
1641
  return text.removesuffix(suffix)
1642
+
1643
+
1644
class ModelDeploySetup:
    """A model path together with the server arguments used to deploy it.

    ``--enable-multimodal`` and ``--trust-remote-code`` are appended to the
    arguments unless they are already present.

    Args:
        model_path: Path or identifier of the model.
        extra_args: Additional server CLI arguments. The list is copied, so
            the caller's list is left untouched.
    """

    def __init__(self, model_path: str, extra_args: Optional[List[str]] = None):
        self.model_path = model_path
        # Work on a private copy: a shared mutable default (or the caller's
        # own list) must not accumulate flags across instances.
        args = list(extra_args) if extra_args is not None else []
        for flag in ("--enable-multimodal", "--trust-remote-code"):
            if flag not in args:
                args.append(flag)

        self.extra_args = args
1653
+
1654
+
1655
class ModelEvalMetrics:
    """Plain holder for the outcome of one model evaluation run."""

    def __init__(self, accuracy: float, eval_time: float):
        # Stored as given; no validation or conversion is applied.
        self.accuracy, self.eval_time = accuracy, eval_time
1659
+
1660
+
1661
def extract_trace_link_from_bench_one_batch_server_output(output: str) -> Optional[str]:
    """Return the target of the first ``[Profile](...)`` markdown link in ``output``.

    Returns:
        The text between the parentheses of the first ``[Profile](...)``
        occurrence, or ``None`` when ``output`` contains no such link.
    """
    match = re.search(r"\[Profile\]\((.*?)\)", output)
    if match is None:
        return None
    return match.group(1)
1667
+
1668
+
1669
def parse_models(model_string: str):
    """Split a comma-separated model list into trimmed, non-empty names."""
    trimmed = (piece.strip() for piece in model_string.split(","))
    return [name for name in trimmed if name]
1671
+
1672
+
1673
def check_evaluation_test_results(
    results,
    test_name,
    model_accuracy_thresholds,
    model_latency_thresholds=None,
    model_count=None,
):
    """Check per-model evaluation results against thresholds and report them.

    Builds a markdown summary table, prints it, writes it to the GitHub step
    summary when running in CI, and raises if any model failed.

    Args:
        results: list of tuple of (model_path, accuracy, latency)
        test_name: Heading used for the GitHub step summary.
        model_accuracy_thresholds: Mapping of model -> minimum accepted score.
            Every model listed here must appear in ``results``.
        model_latency_thresholds: Optional mapping of model -> maximum
            accepted latency. A model missing from this mapping has no
            latency limit.
        model_count: Not used by this function.

    Raises:
        AssertionError: If a model is missing from ``results`` or violates
            its score or latency threshold.
    """
    no_latency_limit = 1e9
    with_latency = model_latency_thresholds is not None

    failed_models = []
    if with_latency:
        summary = " | model | status | score | score_threshold | latency | latency_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | ------- | ----------------- | \n"
    else:
        summary = " | model | status | score | score_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | \n"

    results_dict = {res[0]: (res[1], res[2]) for res in results}

    for model, accuracy_threshold in sorted(model_accuracy_thresholds.items()):
        latency_threshold = no_latency_limit
        if with_latency:
            configured = model_latency_thresholds.get(model)
            # A model without its own latency threshold is unconstrained;
            # comparing against None would raise TypeError.
            if configured is not None:
                latency_threshold = configured

        if model in results_dict:
            accuracy, latency = results_dict[model]
            is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
            status_emoji = "✅" if is_success else "❌"

            if not is_success:
                if accuracy < accuracy_threshold:
                    failed_models.append(
                        f"\nScore Check Failed: {model}\n"
                        f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
                    )
                if latency > latency_threshold:
                    failed_models.append(
                        f"\nLatency Check Failed: {model}\n"
                        f"Model {model} latency ({latency:.4f}) is above threshold ({latency_threshold:.4f})"
                    )

            if with_latency:
                line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
            else:
                line = (
                    f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
                )
        else:
            # No result recorded for a model that was expected to be evaluated.
            status_emoji = "❌"
            failed_models.append(f"Model failed to launch or be evaluated: {model}")
            if with_latency:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold} | N/A | {latency_threshold}\n"
            else:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold}\n"

        summary += line

    print(summary)

    if is_in_ci():
        write_github_step_summary(f"## {test_name}\n{summary}")

    if failed_models:
        print("Some models failed the evaluation.")
        raise AssertionError("\n".join(failed_models))
1741
+
1742
+
1743
+ # Bench knobs for bench_one_batch_server (override by env)
1744
+ def _parse_int_list_env(name: str, default_val: str):
1745
+ val = os.environ.get(name, default_val)
1746
+ return [int(x) for x in val.split(",") if x]
1747
+
1748
+
1749
+ # Return filenames
1750
def find_traces_under_path(path: str) -> List[str]:
    """Collect the names of all ``*.trace.json.gz`` files below ``path``.

    The directory tree is walked recursively. Only bare file names are
    returned, without the directory they were found in.
    """
    return [
        name
        for _root, _subdirs, names in os.walk(path)
        for name in names
        if name.endswith(".trace.json.gz")
    ]
1757
+
1758
+
1759
def write_results_to_json(model, metrics, mode="a"):
    """Record one evaluation result in ``results.json`` in the working directory.

    Args:
        model: Model identifier stored with the result.
        metrics: Mapping that must contain ``"score"`` and may contain
            ``"latency"``. It is stored whole under ``"metrics"``, and the
            two values are also copied to the top level of the entry.
        mode: ``"a"`` appends to the results already in the file; any other
            value starts a fresh list containing only this result.

    Raises:
        KeyError: If ``metrics`` has no ``"score"`` entry.
    """
    result = {
        "timestamp": datetime.now().isoformat(),
        "model": model,
        "metrics": metrics,
        "score": metrics["score"],
    }

    if "latency" in metrics:
        # Store the scalar itself. A trailing comma here would wrap it in a
        # 1-tuple, which serializes as a one-element JSON array.
        result["latency"] = metrics["latency"]

    existing_results = []
    if mode == "a" and os.path.exists("results.json"):
        try:
            with open("results.json", "r") as f:
                existing_results = json.load(f)
        except json.JSONDecodeError:
            # An unreadable file is treated as empty and gets overwritten.
            existing_results = []

    if isinstance(existing_results, list):
        existing_results.append(result)
    else:
        # The file held valid JSON that is not a list; start over.
        existing_results = [result]

    with open("results.json", "w") as f:
        json.dump(existing_results, f, indent=2)
sglang/utils.py CHANGED
@@ -6,6 +6,7 @@ import logging
6
6
  import os
7
7
  import random
8
8
  import socket
9
+ import ssl
9
10
  import subprocess
10
11
  import sys
11
12
  import time
@@ -155,7 +156,15 @@ def http_request(
155
156
  data = bytes(dumps(json), encoding="utf-8")
156
157
 
157
158
  try:
158
- resp = urllib.request.urlopen(req, data=data, cafile=verify)
159
+ if sys.version_info >= (3, 13):
160
+ # Python 3.13+: Use SSL context (cafile removed)
161
+ if verify and isinstance(verify, str):
162
+ context = ssl.create_default_context(cafile=verify)
163
+ else:
164
+ context = ssl.create_default_context()
165
+ resp = urllib.request.urlopen(req, data=data, context=context)
166
+ else:
167
+ resp = urllib.request.urlopen(req, data=data, cafile=verify)
159
168
  return HttpResponse(resp)
160
169
  except urllib.error.HTTPError as e:
161
170
  return HttpResponse(e)
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.3rc0"
1
+ __version__ = "0.5.3rc2"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.5.3rc0
3
+ Version: 0.5.3rc2
4
4
  Summary: SGLang is a fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -211,18 +211,17 @@ Classifier: License :: OSI Approved :: Apache Software License
211
211
  Requires-Python: >=3.10
212
212
  Description-Content-Type: text/markdown
213
213
  License-File: LICENSE
214
- Requires-Dist: aiohttp
215
- Requires-Dist: requests
216
- Requires-Dist: tqdm
217
- Requires-Dist: numpy
218
214
  Requires-Dist: IPython
219
- Requires-Dist: setproctitle
215
+ Requires-Dist: aiohttp
216
+ Requires-Dist: anthropic>=0.20.0
220
217
  Requires-Dist: blobfile==3.0.0
221
218
  Requires-Dist: build
222
219
  Requires-Dist: compressed-tensors
220
+ Requires-Dist: cuda-python
223
221
  Requires-Dist: datasets
224
222
  Requires-Dist: einops
225
223
  Requires-Dist: fastapi
224
+ Requires-Dist: flashinfer_python==0.4.0rc3
226
225
  Requires-Dist: hf_transfer
227
226
  Requires-Dist: huggingface_hub
228
227
  Requires-Dist: interegular
@@ -230,8 +229,10 @@ Requires-Dist: llguidance<0.8.0,>=0.7.11
230
229
  Requires-Dist: modelscope
231
230
  Requires-Dist: msgspec
232
231
  Requires-Dist: ninja
233
- Requires-Dist: openai==1.99.1
232
+ Requires-Dist: numpy
233
+ Requires-Dist: nvidia-cutlass-dsl==4.2.1
234
234
  Requires-Dist: openai-harmony==0.0.4
235
+ Requires-Dist: openai==1.99.1
235
236
  Requires-Dist: orjson
236
237
  Requires-Dist: outlines==0.1.11
237
238
  Requires-Dist: packaging
@@ -239,32 +240,34 @@ Requires-Dist: partial_json_parser
239
240
  Requires-Dist: pillow
240
241
  Requires-Dist: prometheus-client>=0.20.0
241
242
  Requires-Dist: psutil
243
+ Requires-Dist: py-spy
242
244
  Requires-Dist: pybase64
243
245
  Requires-Dist: pydantic
244
246
  Requires-Dist: pynvml
245
247
  Requires-Dist: python-multipart
246
248
  Requires-Dist: pyzmq>=25.1.2
249
+ Requires-Dist: requests
247
250
  Requires-Dist: scipy
248
251
  Requires-Dist: sentencepiece
252
+ Requires-Dist: setproctitle
253
+ Requires-Dist: sgl-kernel==0.3.14.post1
249
254
  Requires-Dist: soundfile==0.13.1
250
- Requires-Dist: timm==1.0.16
251
255
  Requires-Dist: tiktoken
256
+ Requires-Dist: timm==1.0.16
257
+ Requires-Dist: torch==2.8.0
258
+ Requires-Dist: torch_memory_saver==0.0.9rc2
252
259
  Requires-Dist: torchao==0.9.0
253
- Requires-Dist: transformers==4.56.1
260
+ Requires-Dist: torchaudio==2.8.0
261
+ Requires-Dist: torchvision
262
+ Requires-Dist: tqdm
263
+ Requires-Dist: transformers==4.57.0
254
264
  Requires-Dist: uvicorn
255
265
  Requires-Dist: uvloop
256
266
  Requires-Dist: xgrammar==0.1.24
257
- Requires-Dist: sgl-kernel==0.3.9.post2
258
- Requires-Dist: torch==2.8.0
259
- Requires-Dist: torchaudio==2.8.0
260
- Requires-Dist: torchvision
261
- Requires-Dist: cuda-python
262
- Requires-Dist: flashinfer_python==0.3.1
263
- Requires-Dist: openai==1.99.1
264
- Requires-Dist: tiktoken
265
- Requires-Dist: anthropic>=0.20.0
266
- Requires-Dist: torch_memory_saver==0.0.8
267
- Requires-Dist: decord
267
+ Requires-Dist: grpcio==1.75.1
268
+ Requires-Dist: grpcio-tools==1.75.1
269
+ Provides-Extra: decord
270
+ Requires-Dist: decord; extra == "decord"
268
271
  Provides-Extra: test
269
272
  Requires-Dist: accelerate; extra == "test"
270
273
  Requires-Dist: expecttest; extra == "test"
@@ -272,21 +275,27 @@ Requires-Dist: jsonlines; extra == "test"
272
275
  Requires-Dist: matplotlib; extra == "test"
273
276
  Requires-Dist: pandas; extra == "test"
274
277
  Requires-Dist: peft; extra == "test"
275
- Requires-Dist: sentence_transformers; extra == "test"
276
278
  Requires-Dist: pytest; extra == "test"
279
+ Requires-Dist: sentence_transformers; extra == "test"
277
280
  Requires-Dist: tabulate; extra == "test"
278
281
  Provides-Extra: tracing
279
- Requires-Dist: opentelemetry-sdk; extra == "tracing"
280
282
  Requires-Dist: opentelemetry-api; extra == "tracing"
281
283
  Requires-Dist: opentelemetry-exporter-otlp; extra == "tracing"
282
284
  Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "tracing"
285
+ Requires-Dist: opentelemetry-sdk; extra == "tracing"
283
286
  Provides-Extra: all
284
287
  Requires-Dist: sglang[test]; extra == "all"
285
- Provides-Extra: blackwell
286
- Requires-Dist: nvidia-cutlass-dsl==4.1.0; extra == "blackwell"
287
- Requires-Dist: sglang[test]; extra == "blackwell"
288
+ Requires-Dist: sglang[decord]; extra == "all"
289
+ Provides-Extra: all-aarch64
290
+ Requires-Dist: sglang[test]; extra == "all-aarch64"
288
291
  Provides-Extra: dev
289
292
  Requires-Dist: sglang[test]; extra == "dev"
293
+ Requires-Dist: sglang[decord]; extra == "dev"
294
+ Provides-Extra: blackwell
295
+ Requires-Dist: sglang[test]; extra == "blackwell"
296
+ Requires-Dist: sglang[decord]; extra == "blackwell"
297
+ Provides-Extra: blackwell-aarch64
298
+ Requires-Dist: sglang[test]; extra == "blackwell-aarch64"
290
299
  Dynamic: license-file
291
300
 
292
301
  <div align="center" id="sglangtop">