sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +321 -31
  3. sglang/bench_serving.py +10 -3
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +4 -0
  11. sglang/srt/configs/dots_ocr.py +64 -0
  12. sglang/srt/configs/falcon_h1.py +360 -0
  13. sglang/srt/configs/load_config.py +8 -0
  14. sglang/srt/configs/model_config.py +160 -105
  15. sglang/srt/configs/qwen3_vl.py +586 -0
  16. sglang/srt/constrained/base_grammar_backend.py +1 -0
  17. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  18. sglang/srt/constrained/xgrammar_backend.py +6 -4
  19. sglang/srt/debug_utils/dumper.py +10 -3
  20. sglang/srt/disaggregation/ascend/conn.py +2 -2
  21. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  22. sglang/srt/disaggregation/common/conn.py +266 -98
  23. sglang/srt/disaggregation/decode.py +50 -9
  24. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  26. sglang/srt/disaggregation/mooncake/conn.py +51 -541
  27. sglang/srt/disaggregation/nixl/conn.py +148 -39
  28. sglang/srt/disaggregation/prefill.py +31 -14
  29. sglang/srt/disaggregation/utils.py +36 -5
  30. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  31. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  32. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  33. sglang/srt/distributed/parallel_state.py +135 -80
  34. sglang/srt/entrypoints/engine.py +23 -3
  35. sglang/srt/entrypoints/grpc_request_manager.py +330 -55
  36. sglang/srt/entrypoints/grpc_server.py +232 -102
  37. sglang/srt/entrypoints/http_server.py +49 -9
  38. sglang/srt/entrypoints/openai/protocol.py +110 -5
  39. sglang/srt/entrypoints/openai/serving_base.py +25 -6
  40. sglang/srt/entrypoints/openai/serving_chat.py +178 -49
  41. sglang/srt/entrypoints/openai/serving_completions.py +5 -3
  42. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  43. sglang/srt/entrypoints/openai/serving_responses.py +42 -0
  44. sglang/srt/environ.py +285 -0
  45. sglang/srt/eplb/expert_location.py +30 -5
  46. sglang/srt/function_call/function_call_parser.py +3 -2
  47. sglang/srt/function_call/glm4_moe_detector.py +3 -3
  48. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  49. sglang/srt/function_call/json_array_parser.py +63 -0
  50. sglang/srt/function_call/kimik2_detector.py +17 -4
  51. sglang/srt/function_call/utils.py +96 -5
  52. sglang/srt/grpc/compile_proto.py +245 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
  56. sglang/srt/layers/activation.py +7 -6
  57. sglang/srt/layers/attention/aiter_backend.py +14 -15
  58. sglang/srt/layers/attention/ascend_backend.py +108 -9
  59. sglang/srt/layers/attention/attention_registry.py +206 -0
  60. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  61. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  62. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  63. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  66. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  67. sglang/srt/layers/attention/flashinfer_backend.py +112 -194
  68. sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
  69. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  70. sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
  71. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
  72. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
  73. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
  74. sglang/srt/layers/attention/mamba/mamba.py +566 -1
  75. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  76. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  77. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  78. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  79. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  80. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  81. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  82. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  83. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  84. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  85. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  86. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  87. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  88. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  89. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  90. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  91. sglang/srt/layers/attention/nsa/utils.py +24 -0
  92. sglang/srt/layers/attention/nsa_backend.py +887 -0
  93. sglang/srt/layers/attention/tbo_backend.py +6 -6
  94. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  95. sglang/srt/layers/attention/triton_backend.py +42 -9
  96. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  97. sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
  98. sglang/srt/layers/attention/vision.py +58 -0
  99. sglang/srt/layers/attention/wave_backend.py +4 -4
  100. sglang/srt/layers/communicator.py +8 -0
  101. sglang/srt/layers/dp_attention.py +11 -1
  102. sglang/srt/layers/elementwise.py +3 -1
  103. sglang/srt/layers/layernorm.py +2 -0
  104. sglang/srt/layers/linear.py +21 -4
  105. sglang/srt/layers/logits_processor.py +15 -2
  106. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  107. sglang/srt/layers/moe/ep_moe/layer.py +147 -74
  108. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
  113. sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
  114. sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
  115. sglang/srt/layers/moe/utils.py +10 -0
  116. sglang/srt/layers/parameter.py +23 -6
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  119. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  121. sglang/srt/layers/quantization/fp8.py +2 -2
  122. sglang/srt/layers/quantization/fp8_utils.py +1 -1
  123. sglang/srt/layers/quantization/modelopt_quant.py +44 -9
  124. sglang/srt/layers/quantization/mxfp4.py +12 -4
  125. sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
  126. sglang/srt/layers/quantization/w4afp8.py +0 -4
  127. sglang/srt/layers/quantization/w8a8_int8.py +15 -3
  128. sglang/srt/layers/rotary_embedding.py +78 -31
  129. sglang/srt/layers/sampler.py +52 -4
  130. sglang/srt/layers/utils.py +23 -0
  131. sglang/srt/lora/backend/base_backend.py +3 -3
  132. sglang/srt/lora/backend/chunked_backend.py +348 -0
  133. sglang/srt/lora/backend/triton_backend.py +10 -4
  134. sglang/srt/lora/lora.py +7 -5
  135. sglang/srt/lora/lora_manager.py +17 -6
  136. sglang/srt/lora/mem_pool.py +1 -1
  137. sglang/srt/lora/triton_ops/__init__.py +4 -0
  138. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  139. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  140. sglang/srt/lora/utils.py +7 -5
  141. sglang/srt/managers/cache_controller.py +42 -142
  142. sglang/srt/managers/data_parallel_controller.py +11 -46
  143. sglang/srt/managers/detokenizer_manager.py +11 -11
  144. sglang/srt/managers/io_struct.py +162 -118
  145. sglang/srt/managers/mm_utils.py +43 -6
  146. sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
  147. sglang/srt/managers/multimodal_processor.py +1 -2
  148. sglang/srt/managers/overlap_utils.py +53 -0
  149. sglang/srt/managers/schedule_batch.py +167 -86
  150. sglang/srt/managers/schedule_policy.py +143 -16
  151. sglang/srt/managers/scheduler.py +359 -214
  152. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  153. sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
  154. sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
  155. sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
  156. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  157. sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
  158. sglang/srt/managers/tokenizer_manager.py +84 -136
  159. sglang/srt/managers/tp_worker.py +39 -29
  160. sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
  161. sglang/srt/managers/utils.py +1 -45
  162. sglang/srt/mem_cache/allocator.py +14 -20
  163. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  164. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  165. sglang/srt/mem_cache/chunk_cache.py +8 -1
  166. sglang/srt/mem_cache/evict_policy.py +23 -0
  167. sglang/srt/mem_cache/hicache_storage.py +40 -1
  168. sglang/srt/mem_cache/hiradix_cache.py +119 -32
  169. sglang/srt/mem_cache/memory_pool.py +188 -10
  170. sglang/srt/mem_cache/memory_pool_host.py +134 -182
  171. sglang/srt/mem_cache/radix_cache.py +222 -71
  172. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  173. sglang/srt/mem_cache/storage/__init__.py +10 -0
  174. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  175. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  176. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  177. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  178. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  179. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
  180. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
  181. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
  182. sglang/srt/mem_cache/swa_radix_cache.py +25 -34
  183. sglang/srt/metrics/collector.py +82 -120
  184. sglang/srt/metrics/func_timer.py +2 -7
  185. sglang/srt/metrics/utils.py +8 -1
  186. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  187. sglang/srt/model_executor/cuda_graph_runner.py +39 -32
  188. sglang/srt/model_executor/forward_batch_info.py +23 -38
  189. sglang/srt/model_executor/model_runner.py +131 -183
  190. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  191. sglang/srt/model_loader/loader.py +14 -10
  192. sglang/srt/model_loader/weight_utils.py +156 -2
  193. sglang/srt/models/bailing_moe.py +27 -4
  194. sglang/srt/models/deepseek_nextn.py +6 -1
  195. sglang/srt/models/deepseek_v2.py +536 -153
  196. sglang/srt/models/dots_ocr.py +173 -0
  197. sglang/srt/models/falcon_h1.py +576 -0
  198. sglang/srt/models/gemma3_causal.py +0 -2
  199. sglang/srt/models/gemma3_mm.py +1 -1
  200. sglang/srt/models/gemma3n_mm.py +1 -1
  201. sglang/srt/models/glm4_moe.py +3 -3
  202. sglang/srt/models/glm4_moe_nextn.py +2 -2
  203. sglang/srt/models/glm4v.py +1 -1
  204. sglang/srt/models/glm4v_moe.py +1 -1
  205. sglang/srt/models/gpt_oss.py +7 -30
  206. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  207. sglang/srt/models/llama.py +4 -0
  208. sglang/srt/models/longcat_flash.py +1 -1
  209. sglang/srt/models/longcat_flash_nextn.py +1 -1
  210. sglang/srt/models/mllama4.py +15 -4
  211. sglang/srt/models/qwen2.py +0 -7
  212. sglang/srt/models/qwen2_5_vl.py +2 -2
  213. sglang/srt/models/qwen2_audio.py +1 -1
  214. sglang/srt/models/qwen2_moe.py +64 -1
  215. sglang/srt/models/qwen2_vl.py +1 -1
  216. sglang/srt/models/qwen3.py +18 -3
  217. sglang/srt/models/qwen3_moe.py +31 -3
  218. sglang/srt/models/qwen3_next.py +36 -9
  219. sglang/srt/models/qwen3_vl.py +787 -0
  220. sglang/srt/models/qwen3_vl_moe.py +471 -0
  221. sglang/srt/models/registry.py +15 -3
  222. sglang/srt/models/sarashina2_vision.py +269 -0
  223. sglang/srt/models/solar.py +505 -0
  224. sglang/srt/models/starcoder2.py +357 -0
  225. sglang/srt/models/torch_native_llama.py +9 -2
  226. sglang/srt/models/utils.py +51 -0
  227. sglang/srt/multimodal/processors/base_processor.py +15 -7
  228. sglang/srt/multimodal/processors/dots_vlm.py +2 -3
  229. sglang/srt/multimodal/processors/internvl.py +20 -8
  230. sglang/srt/multimodal/processors/qwen_vl.py +8 -1
  231. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  232. sglang/srt/parser/jinja_template_utils.py +6 -0
  233. sglang/srt/sampling/sampling_batch_info.py +20 -2
  234. sglang/srt/sampling/sampling_params.py +7 -0
  235. sglang/srt/server_args.py +753 -295
  236. sglang/srt/server_args_config_parser.py +146 -0
  237. sglang/srt/single_batch_overlap.py +151 -0
  238. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  239. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  240. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  241. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  242. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  243. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  244. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
  245. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
  246. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
  247. sglang/srt/speculative/eagle_worker.py +57 -25
  248. sglang/srt/speculative/ngram_utils.py +428 -0
  249. sglang/srt/speculative/ngram_worker.py +245 -0
  250. sglang/srt/speculative/spec_info.py +47 -0
  251. sglang/srt/speculative/spec_utils.py +606 -0
  252. sglang/srt/torch_memory_saver_adapter.py +5 -7
  253. sglang/srt/tracing/trace.py +32 -6
  254. sglang/srt/two_batch_overlap.py +8 -5
  255. sglang/srt/utils/__init__.py +2 -0
  256. sglang/srt/{utils.py → utils/common.py} +399 -74
  257. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
  258. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  259. sglang/srt/utils/rpd_utils.py +452 -0
  260. sglang/srt/utils/slow_rank_detector.py +71 -0
  261. sglang/srt/warmup.py +8 -4
  262. sglang/srt/weight_sync/utils.py +1 -1
  263. sglang/test/get_logits_ut.py +57 -0
  264. sglang/test/run_eval.py +79 -11
  265. sglang/test/runners.py +1 -1
  266. sglang/test/simple_eval_common.py +5 -2
  267. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  268. sglang/test/test_block_fp8.py +2 -2
  269. sglang/test/test_deterministic.py +297 -0
  270. sglang/test/test_disaggregation_utils.py +12 -1
  271. sglang/test/test_programs.py +1 -1
  272. sglang/test/test_utils.py +355 -4
  273. sglang/utils.py +10 -1
  274. sglang/version.py +1 -1
  275. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
  276. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
  277. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  278. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  279. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  280. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  281. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  282. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,297 @@
1
+ """
2
+ Batch the same prompt in random batch sizes, and test if the results are consistent across different trials.
3
+
4
+ Usage:
5
+ python3 -m sglang.test.test_deterministic --n-trials <number_of_trials> --test-mode <single|mixed|prefix> --profile
6
+ """
7
+
8
+ import argparse
9
+ import dataclasses
10
+ import json
11
+ import os
12
+ import random
13
+ from typing import List
14
+
15
+ import requests
16
+
17
+ from sglang.profiler import run_profile
18
+
19
# Short prompt sent by send_single (warmup and "single" mode) and send_mixed.
PROMPT_1 = "Tell me about Richard Feynman: "
# Second short prompt, batched together with PROMPT_1 by send_mixed.
PROMPT_2 = "Generate 1000 random numbers. Go directly into it, don't say Sure and don't say here are numbers. Just start with a number."
# long_prompt.txt is looked up in the same directory as this module.
dirpath = os.path.dirname(__file__)
# Decode explicitly as UTF-8 so the prompt text (and the character-based
# prefix slices taken from it) does not depend on the platform's locale.
with open(os.path.join(dirpath, "long_prompt.txt"), "r", encoding="utf-8") as f:
    LONG_PROMPT = f.read()
24
+
25
+
26
@dataclasses.dataclass
class BenchArgs:
    """Options for the determinism test client.

    Every field has a default, so the class can be built from a partial
    namespace. ``add_cli_args`` registers the command-line flags and
    ``from_cli_args`` converts the parsed namespace back into a ``BenchArgs``.
    """

    host: str = "localhost"
    port: int = 30000
    batch_size: int = 1
    temperature: float = 0.0
    sampling_seed: int = 42
    max_new_tokens: int = 100
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0
    return_logprob: bool = False
    stream: bool = False
    profile: bool = False
    profile_steps: int = 3
    profile_by_stage: bool = False
    test_mode: str = "single"
    # Appended last so positional construction of the earlier fields is
    # unaffected; mirrors the --n-trials flag so the value survives
    # from_cli_args.
    n_trials: int = 50

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        """Register this tool's command-line flags on ``parser``."""
        parser.add_argument("--host", type=str, default=BenchArgs.host)
        parser.add_argument("--port", type=int, default=BenchArgs.port)
        parser.add_argument("--n-trials", type=int, default=BenchArgs.n_trials)
        parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
        parser.add_argument(
            "--sampling-seed", type=int, default=BenchArgs.sampling_seed
        )
        parser.add_argument(
            "--max-new-tokens", type=int, default=BenchArgs.max_new_tokens
        )
        parser.add_argument(
            "--frequency-penalty", type=float, default=BenchArgs.frequency_penalty
        )
        parser.add_argument(
            "--presence-penalty", type=float, default=BenchArgs.presence_penalty
        )
        parser.add_argument("--return-logprob", action="store_true")
        parser.add_argument("--stream", action="store_true")
        parser.add_argument(
            "--test-mode",
            type=str,
            default=BenchArgs.test_mode,
            choices=["single", "mixed", "prefix"],
        )
        parser.add_argument("--profile", action="store_true")
        parser.add_argument(
            "--profile-steps", type=int, default=BenchArgs.profile_steps
        )
        parser.add_argument("--profile-by-stage", action="store_true")

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        """Build a ``BenchArgs`` from a parsed namespace.

        Only attributes actually present on ``args`` are copied; any field
        without a matching attribute keeps its dataclass default. This matters
        because not every field has a flag (``batch_size`` has none), and a
        plain ``getattr`` on such a field would raise ``AttributeError``.
        """
        present = {
            field.name: getattr(args, field.name)
            for field in dataclasses.fields(cls)
            if hasattr(args, field.name)
        }
        return cls(**present)
79
+
80
+
81
def send_single(
    args,
    batch_size: int,
    profile: bool = False,
    profile_steps: int = 3,
    profile_by_stage: bool = False,
):
    """Send ``batch_size`` copies of PROMPT_1 to ``/generate`` and return one text.

    Args:
        args: Parsed options; reads host, port, the sampling parameters,
            return_logprob and stream.
        batch_size: Number of identical prompts placed in the request.
        profile: When true, ``run_profile`` is called before the request.
        profile_steps: Forwarded to ``run_profile``.
        profile_by_stage: Forwarded to ``run_profile``.

    Returns:
        The ``"text"`` of the first result (non-streaming) or of the last data
        chunk received (streaming). Returns ``-1`` after printing the error
        body when the server does not answer with HTTP 200, or when a
        streaming response carries no data chunk.
    """
    base_url = f"http://{args.host}:{args.port}"

    json_data = {
        "text": [PROMPT_1] * batch_size,
        "sampling_params": {
            "temperature": args.temperature,
            "max_new_tokens": args.max_new_tokens,
            "frequency_penalty": args.frequency_penalty,
            "presence_penalty": args.presence_penalty,
        },
        "return_logprob": args.return_logprob,
        "stream": args.stream,
    }

    if args.sampling_seed is not None:
        # sglang server cannot parse None value for sampling_seed
        json_data["sampling_params"]["sampling_seed"] = args.sampling_seed

    if profile:
        run_profile(
            base_url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
        )

    response = requests.post(
        f"{base_url}/generate",
        json=json_data,
        stream=args.stream,
    )

    # Check the status before touching the payload: an error body is not a
    # list of results (so ``[0]`` would fail) and contains no "data:" lines
    # (so the streaming loop would never bind a result).
    if response.status_code != 200:
        try:
            error_body = response.json()
        except ValueError:
            error_body = response.text
        print(error_body)
        return -1

    if args.stream:
        ret = None
        for chunk in response.iter_lines(decode_unicode=False):
            chunk = chunk.decode("utf-8")
            if chunk and chunk.startswith("data:"):
                if chunk == "data: [DONE]":
                    break
                # Keep only the most recent chunk; its "text" is returned.
                ret = json.loads(chunk[5:].strip("\n"))
        if ret is None:
            print("No data chunk received from the streaming response")
            return -1
    else:
        ret = response.json()[0]

    return ret["text"]
135
+
136
+
137
def send_mixed(args, batch_size: int):
    """Send one batch mixing PROMPT_1, PROMPT_2 and LONG_PROMPT to ``/generate``.

    The batch is laid out as all PROMPT_1 copies, then PROMPT_2, then
    LONG_PROMPT. Long prompts are included only when ``batch_size`` exceeds 10
    (between 1 and 10 of them); at least one PROMPT_1 is always present.

    Args:
        args: Parsed options; reads host, port, the sampling parameters,
            return_logprob and stream.
        batch_size: Total number of prompts in the request (expected >= 1).

    Returns:
        A tuple ``(prompt_1_texts, prompt_2_texts, long_prompt_texts)`` of
        lists, or ``(-1, -1, -1)`` after printing the body when the server
        does not answer with HTTP 200.

    Raises:
        ValueError: If ``args.stream`` is set. The per-prompt outputs are read
            from a single JSON body, which a streamed (server-sent events)
            response does not provide.
    """
    if args.stream:
        # Fail before sending a potentially expensive request whose response
        # could not be parsed anyway.
        raise ValueError(
            "send_mixed does not support streaming; run mixed mode without --stream"
        )

    num_long_prompt = 0 if batch_size <= 10 else random.randint(1, 10)
    num_prompt_1 = random.randint(1, batch_size - num_long_prompt)
    num_prompt_2 = batch_size - num_prompt_1 - num_long_prompt

    json_data = {
        "text": [PROMPT_1] * num_prompt_1
        + [PROMPT_2] * num_prompt_2
        + [LONG_PROMPT] * num_long_prompt,
        "sampling_params": {
            "temperature": args.temperature,
            "max_new_tokens": args.max_new_tokens,
            "frequency_penalty": args.frequency_penalty,
            "presence_penalty": args.presence_penalty,
        },
        "return_logprob": args.return_logprob,
        "stream": False,
    }

    if args.sampling_seed is not None:
        json_data["sampling_params"]["sampling_seed"] = args.sampling_seed

    response = requests.post(
        f"http://{args.host}:{args.port}/generate",
        json=json_data,
    )
    ret = response.json()
    if response.status_code != 200:
        print(ret)
        return -1, -1, -1

    # Results come back in request order, so slice them with the same
    # boundaries used to build the batch. Indexing every position keeps an
    # IndexError if the server returns fewer results than were requested.
    texts = [ret[i]["text"] for i in range(batch_size)]
    first_cut = num_prompt_1
    second_cut = num_prompt_1 + num_prompt_2

    return texts[:first_cut], texts[first_cut:second_cut], texts[second_cut:]
181
+
182
+
183
def send_prefix(args, batch_size: int, prompts: List[str]):
    """Send a batch of prompts drawn at random (with replacement) from ``prompts``.

    A POST to ``/flush_cache`` is issued first (presumably so earlier requests
    leave no cached prefixes behind; confirm against the server).

    Args:
        args: Parsed options; reads host, port, the sampling parameters,
            return_logprob and stream.
        batch_size: Number of prompts to sample and send.
        prompts: Candidate prompts; must be non-empty.

    Returns:
        A dict mapping each index of ``prompts`` to the list of texts generated
        for that prompt in this batch. If the server does not answer with
        HTTP 200, the body is printed and every list is empty, so the return
        shape is the same on both paths.

    Raises:
        ValueError: If ``args.stream`` is set. The outputs are read from a
            single JSON body, which a streamed (server-sent events) response
            does not provide.
    """
    if args.stream:
        raise ValueError(
            "send_prefix does not support streaming; run prefix mode without --stream"
        )

    requests.post(f"http://{args.host}:{args.port}/flush_cache")

    sampled_indices = [
        random.randint(0, len(prompts) - 1) for _ in range(batch_size)
    ]
    batch_data = [prompts[index] for index in sampled_indices]

    json_data = {
        "text": batch_data,
        "sampling_params": {
            "temperature": args.temperature,
            "max_new_tokens": args.max_new_tokens,
            "frequency_penalty": args.frequency_penalty,
            "presence_penalty": args.presence_penalty,
        },
        "return_logprob": args.return_logprob,
        "stream": False,
    }

    if args.sampling_seed is not None:
        json_data["sampling_params"]["sampling_seed"] = args.sampling_seed

    response = requests.post(
        f"http://{args.host}:{args.port}/generate",
        json=json_data,
    )
    ret = response.json()

    ret_dict = {i: [] for i in range(len(prompts))}
    if response.status_code != 200:
        print(ret)
        return ret_dict

    # Group outputs by the prompt they were generated from; results are
    # matched to prompts by their position in the request.
    for position, prompt_index in enumerate(sampled_indices):
        ret_dict[prompt_index].append(ret[position]["text"])

    return ret_dict
223
+
224
+
225
def test_deterministic(args):
    """Run the selected determinism check and print sample/unique counts.

    Three warmup requests (batch size 16) are sent first. Then, for batch
    sizes 1..``args.n_trials``:

    * ``single``: the same prompt is batched via ``send_single``.
    * ``mixed``: two short prompts and a long prompt are batched via
      ``send_mixed``.
    * ``prefix``: prompts that are prefixes of LONG_PROMPT (lengths measured
      in characters) are batched via ``send_prefix``.

    Nothing is asserted; the function only prints how many distinct outputs
    were observed per prompt.

    Args:
        args: Parsed options; reads test_mode, n_trials, profile and, when
            present, profile_steps and profile_by_stage.

    Raises:
        RuntimeError: If a request helper reports a failure for a trial.
        ValueError: If ``args.test_mode`` is not one of the supported modes.
    """
    # Forward the profiling options so --profile-steps / --profile-by-stage
    # take effect; fall back to send_single's own defaults if the namespace
    # does not carry them.
    profile_steps = getattr(args, "profile_steps", 3)
    profile_by_stage = getattr(args, "profile_by_stage", False)

    # First do some warmups
    for _ in range(3):
        send_single(args, 16, args.profile, profile_steps, profile_by_stage)

    if args.test_mode == "single":
        # In single mode, we test the deterministic behavior by sending the same prompt in batch sizes ranging from 1 to n_trials.
        texts = []
        for i in range(1, args.n_trials + 1):
            batch_size = i
            text = send_single(
                args, batch_size, args.profile, profile_steps, profile_by_stage
            )
            # send_single signals failure with a non-string sentinel.
            if not isinstance(text, str):
                raise RuntimeError(
                    f"Trial {i} with batch size {batch_size} failed: no text returned"
                )
            text = text.replace("\n", " ")
            print(f"Trial {i} with batch size {batch_size}: {text}")
            texts.append(text)

        print(f"Total samples: {len(texts)}, Unique samples: {len(set(texts))}")
    elif args.test_mode == "mixed":
        # In mixed mode, we send a mixture of two short prompts and one long prompt in the same batch with batch size ranging from 1 to n_trials.
        output_prompt_1 = []
        output_prompt_2 = []
        output_long_prompt = []
        for i in range(1, args.n_trials + 1):
            batch_size = i
            ret_prompt_1, ret_prompt_2, ret_long_prompt = send_mixed(args, batch_size)
            # send_mixed signals failure with non-list sentinels.
            if not all(
                isinstance(part, list)
                for part in (ret_prompt_1, ret_prompt_2, ret_long_prompt)
            ):
                raise RuntimeError(
                    f"Trial {i} with batch size {batch_size} failed: no texts returned"
                )
            output_prompt_1.extend(ret_prompt_1)
            output_prompt_2.extend(ret_prompt_2)
            output_long_prompt.extend(ret_long_prompt)

            print(
                f"Testing Trial {i} with batch size {batch_size}, number of prompt 1: {len(ret_prompt_1)}, number of prompt 2: {len(ret_prompt_2)}, number of long prompt: {len(ret_long_prompt)}"
            )

        print(
            f"Prompt 1: total samples: {len(output_prompt_1)}, Unique samples: {len(set(output_prompt_1))}"
        )
        print(
            f"Prompt 2: total samples: {len(output_prompt_2)}, Unique samples: {len(set(output_prompt_2))}"
        )
        print(
            f"Long prompt: total samples: {len(output_long_prompt)}, Unique samples: {len(set(output_long_prompt))}"
        )

    elif args.test_mode == "prefix":
        # In prefix mode, we create prompts from the same long prompt, with different lengths of common prefix.
        len_prefix = [1, 511, 2048, 4097]
        num_prompts = len(len_prefix)
        outputs = {idx: [] for idx in range(num_prompts)}
        prompts = [LONG_PROMPT[:length] for length in len_prefix]
        for i in range(1, args.n_trials + 1):
            batch_size = i
            ret_dict = send_prefix(args, batch_size, prompts)
            # A failed request yields either a non-dict sentinel or fewer
            # outputs than prompts sent; treat both as a failed trial.
            if (
                not isinstance(ret_dict, dict)
                or sum(len(found) for found in ret_dict.values()) != batch_size
            ):
                raise RuntimeError(
                    f"Trial {i} with batch size {batch_size} failed: incomplete outputs"
                )
            msg = f"Testing Trial {i} with batch size {batch_size},"
            for idx, length in enumerate(len_prefix):
                msg += f" # prefix length {length}: {len(ret_dict[idx])},"
            print(msg)
            for idx in range(num_prompts):
                outputs[idx].extend(ret_dict[idx])

        for idx, length in enumerate(len_prefix):
            print(
                f"Prompt {idx} with prefix length {length}: total samples: {len(outputs[idx])}, Unique samples: {len(set(outputs[idx]))}"
            )

    else:
        raise ValueError(f"Invalid test mode: {args.test_mode}")
290
+
291
+
292
if __name__ == "__main__":
    # Script entry point: register the flags, parse the command line, and run
    # the determinism check with the resulting namespace.
    cli_parser = argparse.ArgumentParser()
    BenchArgs.add_cli_args(cli_parser)
    test_deterministic(cli_parser.parse_args())
@@ -1,10 +1,12 @@
1
1
  import time
2
+ from urllib.parse import urlparse
2
3
 
3
4
  import requests
4
5
 
5
6
  from sglang.srt.utils import kill_process_tree
6
7
  from sglang.test.test_utils import (
7
8
  DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
9
+ DEFAULT_URL_FOR_TEST,
8
10
  CustomTestCase,
9
11
  popen_with_error_check,
10
12
  )
@@ -13,8 +15,17 @@ from sglang.test.test_utils import (
13
15
  class TestDisaggregationBase(CustomTestCase):
14
16
  @classmethod
15
17
  def setUpClass(cls):
18
+ parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
19
+ cls.base_host = parsed_url.hostname
20
+ base_port = str(parsed_url.port)
21
+ cls.lb_port = base_port
22
+ cls.prefill_port = f"{int(base_port) + 100}"
23
+ cls.decode_port = f"{int(base_port) + 200}"
24
+ cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
25
+ cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
26
+ cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
27
+ print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
16
28
  cls.process_lb, cls.process_decode, cls.process_prefill = None, None, None
17
- pass
18
29
 
19
30
  @classmethod
20
31
  def launch_lb(cls):
@@ -551,7 +551,7 @@ def test_gen_min_new_tokens():
551
551
  We verify that the number of tokens in the answer is >= the min_tokens threshold.
552
552
  """
553
553
  import sglang as sgl
554
- from sglang.srt.hf_transformers_utils import get_tokenizer
554
+ from sglang.srt.utils.hf_transformers_utils import get_tokenizer
555
555
 
556
556
  model_path = sgl.global_config.default_backend.endpoint.get_model_name()
557
557
  MIN_TOKENS, MAX_TOKENS = 64, 128