sglang 0.4.1.post7.tar.gz → 0.4.2.post1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (361)
  1. {sglang-0.4.1.post7/sglang.egg-info → sglang-0.4.2.post1}/PKG-INFO +8 -8
  2. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/README.md +6 -6
  3. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/pyproject.toml +2 -2
  4. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/bench_offline_throughput.py +17 -11
  5. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/bench_one_batch.py +14 -6
  6. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/bench_serving.py +47 -44
  7. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/chat_template.py +31 -0
  8. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/load_config.py +1 -0
  9. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +5 -2
  10. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/entrypoints/engine.py +5 -2
  11. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/entrypoints/http_server.py +24 -0
  12. sglang-0.4.2.post1/sglang/srt/function_call_parser.py +494 -0
  13. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/activation.py +5 -5
  14. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
  15. sglang-0.4.2.post1/sglang/srt/layers/attention/vision.py +407 -0
  16. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/dp_attention.py +3 -1
  17. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/layernorm.py +5 -5
  18. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/linear.py +24 -9
  19. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/logits_processor.py +1 -1
  20. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/ep_moe/layer.py +20 -12
  21. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_native.py +17 -3
  22. sglang-0.4.2.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  23. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -1
  24. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +9 -0
  25. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/parameter.py +16 -7
  26. sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  27. sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  28. sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  29. sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  31. sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  32. sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  33. sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  34. sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  35. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/fp8.py +11 -1
  36. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/rotary_embedding.py +34 -13
  37. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/sampler.py +33 -10
  38. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/torchao_utils.py +12 -6
  39. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/detokenizer_manager.py +1 -0
  40. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/image_processor.py +77 -38
  41. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/io_struct.py +36 -5
  42. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/schedule_batch.py +31 -25
  43. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/scheduler.py +78 -38
  44. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/tokenizer_manager.py +4 -0
  45. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mem_cache/base_prefix_cache.py +4 -0
  46. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mem_cache/chunk_cache.py +3 -0
  47. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mem_cache/radix_cache.py +30 -1
  48. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_executor/cuda_graph_runner.py +23 -25
  49. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_executor/forward_batch_info.py +5 -7
  50. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_executor/model_runner.py +7 -4
  51. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_loader/loader.py +75 -0
  52. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_loader/weight_utils.py +91 -5
  53. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/commandr.py +14 -2
  54. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/dbrx.py +9 -1
  55. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/deepseek_v2.py +3 -3
  56. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/gemma2.py +9 -1
  57. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/grok.py +1 -0
  58. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/minicpm3.py +3 -3
  59. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/minicpmv.py +129 -76
  60. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/mllama.py +16 -56
  61. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/qwen2.py +4 -1
  62. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/qwen2_vl.py +18 -8
  63. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/torch_native_llama.py +17 -4
  64. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/openai_api/adapter.py +139 -37
  65. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/openai_api/protocol.py +5 -4
  66. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  67. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/sampling_batch_info.py +4 -14
  68. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/server.py +2 -2
  69. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/server_args.py +26 -1
  70. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/speculative/eagle_utils.py +37 -15
  71. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/speculative/eagle_worker.py +11 -13
  72. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/utils.py +62 -67
  73. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/test_programs.py +1 -0
  74. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/test_utils.py +81 -22
  75. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/utils.py +42 -0
  76. sglang-0.4.2.post1/sglang/version.py +1 -0
  77. {sglang-0.4.1.post7 → sglang-0.4.2.post1/sglang.egg-info}/PKG-INFO +8 -8
  78. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang.egg-info/SOURCES.txt +11 -0
  79. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang.egg-info/requires.txt +1 -1
  80. sglang-0.4.1.post7/sglang/srt/layers/attention/vision.py +0 -204
  81. sglang-0.4.1.post7/sglang/version.py +0 -1
  82. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/LICENSE +0 -0
  83. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/setup.cfg +0 -0
  84. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/__init__.py +0 -0
  85. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/api.py +0 -0
  86. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/bench_latency.py +0 -0
  87. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/bench_one_batch_server.py +0 -0
  88. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/check_env.py +0 -0
  89. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/global_config.py +0 -0
  90. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/__init__.py +0 -0
  91. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/__init__.py +0 -0
  92. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/anthropic.py +0 -0
  93. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/base_backend.py +0 -0
  94. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/litellm.py +0 -0
  95. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/openai.py +0 -0
  96. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  97. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/vertexai.py +0 -0
  98. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/choices.py +0 -0
  99. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/compiler.py +0 -0
  100. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/interpreter.py +0 -0
  101. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/ir.py +0 -0
  102. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/tracer.py +0 -0
  103. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/launch_server.py +0 -0
  104. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/llama3_eval.py +0 -0
  105. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/_custom_ops.py +0 -0
  106. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/aio_rwlock.py +0 -0
  107. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/__init__.py +0 -0
  108. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/chatglm.py +0 -0
  109. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/dbrx.py +0 -0
  110. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/device_config.py +0 -0
  111. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/exaone.py +0 -0
  112. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/model_config.py +0 -0
  113. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/qwen2vl.py +0 -0
  114. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  115. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
  116. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  117. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  118. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/conversation.py +0 -0
  119. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/__init__.py +0 -0
  120. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/communication_op.py +0 -0
  121. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  122. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  123. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  124. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  125. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  126. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  127. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  128. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/parallel_state.py +0 -0
  129. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/utils.py +0 -0
  130. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/hf_transformers_utils.py +0 -0
  131. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/__init__.py +0 -0
  132. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  133. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  134. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  135. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  136. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  137. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  138. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  139. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/custom_op_util.py +0 -0
  140. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  141. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  142. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  143. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  144. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  145. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  146. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  147. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  148. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  149. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  150. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  151. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  152. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  153. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  154. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  155. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  156. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  157. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  158. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  159. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  160. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  161. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  162. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  163. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  164. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  165. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  166. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  167. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  168. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  169. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  170. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  171. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  172. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  173. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  174. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  175. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  176. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  177. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  178. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  179. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  180. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  181. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  182. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  183. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  184. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  185. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  186. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  187. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  188. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  189. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  190. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  191. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  192. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  193. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  194. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  195. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  196. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  197. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  198. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  199. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  200. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  201. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  202. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  203. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  204. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  205. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  206. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  207. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  208. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  209. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  210. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  211. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  212. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  213. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  214. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  215. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  216. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  217. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  218. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  219. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  220. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  221. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  222. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  223. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  224. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  225. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  226. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  227. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  228. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  229. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  230. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  231. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  232. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  233. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  234. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/topk.py +0 -0
  235. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/pooler.py +0 -0
  236. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
  237. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  238. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  239. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  240. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  241. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  242. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  243. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  244. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  245. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  246. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  247. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  248. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  249. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  250. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  251. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  252. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  253. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  254. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  255. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  256. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  257. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  258. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  259. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  260. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  261. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  262. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  263. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  264. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  265. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  266. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  267. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  268. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  269. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  270. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  271. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  272. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  273. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  274. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  275. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  276. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  277. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  278. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  279. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
  280. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
  281. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/radix_attention.py +0 -0
  282. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  283. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/lora/lora.py +0 -0
  284. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/lora/lora_config.py +0 -0
  285. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/lora/lora_manager.py +0 -0
  286. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/cache_controller.py +0 -0
  287. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/configure_logging.py +0 -0
  288. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  289. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/schedule_policy.py +0 -0
  290. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/session_controller.py +0 -0
  291. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/tp_worker.py +0 -0
  292. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  293. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/utils.py +0 -0
  294. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  295. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
  296. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/metrics/collector.py +0 -0
  297. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/metrics/func_timer.py +0 -0
  298. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mm_utils.py +0 -0
  299. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_loader/__init__.py +0 -0
  300. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_loader/utils.py +0 -0
  301. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_parallel.py +0 -0
  302. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/baichuan.py +0 -0
  303. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/chatglm.py +0 -0
  304. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/deepseek.py +0 -0
  305. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/exaone.py +0 -0
  306. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/gemma.py +0 -0
  307. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/gemma2_reward.py +0 -0
  308. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/gpt2.py +0 -0
  309. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
  310. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/granite.py +0 -0
  311. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/internlm2.py +0 -0
  312. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/internlm2_reward.py +0 -0
  313. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llama.py +0 -0
  314. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llama_classification.py +0 -0
  315. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llama_eagle.py +0 -0
  316. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llama_embedding.py +0 -0
  317. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llama_reward.py +0 -0
  318. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llava.py +0 -0
  319. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llavavid.py +0 -0
  320. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/minicpm.py +0 -0
  321. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/mistral.py +0 -0
  322. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/mixtral.py +0 -0
  323. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  324. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/olmo.py +0 -0
  325. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/olmo2.py +0 -0
  326. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/olmoe.py +0 -0
  327. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/phi3_small.py +0 -0
  328. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/qwen.py +0 -0
  329. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
  330. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/qwen2_moe.py +0 -0
  331. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/registry.py +0 -0
  332. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/stablelm.py +0 -0
  333. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/xverse.py +0 -0
  334. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/xverse_moe.py +0 -0
  335. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/yivl.py +0 -0
  336. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  337. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  338. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  339. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  340. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  341. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  342. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/sampling_params.py +0 -0
  343. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  344. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/speculative/spec_info.py +0 -0
  345. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  346. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  347. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  348. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/run_eval.py +0 -0
  349. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/runners.py +0 -0
  350. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_common.py +0 -0
  351. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  352. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  353. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_math.py +0 -0
  354. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  355. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  356. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  357. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/test_activation.py +0 -0
  358. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/test_block_fp8.py +0 -0
  359. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/test_layernorm.py +0 -0
  360. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang.egg-info/dependency_links.txt +0 -0
  361. {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: sglang
3
- Version: 0.4.1.post7
3
+ Version: 0.4.2.post1
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -240,7 +240,7 @@ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
240
240
  Provides-Extra: srt
241
241
  Requires-Dist: sglang[runtime_common]; extra == "srt"
242
242
  Requires-Dist: cuda-python; extra == "srt"
243
- Requires-Dist: sgl-kernel>=0.0.2.post14; extra == "srt"
243
+ Requires-Dist: sgl-kernel>=0.0.3; extra == "srt"
244
244
  Requires-Dist: torch; extra == "srt"
245
245
  Requires-Dist: vllm==0.6.4.post1; extra == "srt"
246
246
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
@@ -333,16 +333,16 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
333
333
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
334
334
 
335
335
  ## News
336
- - [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
337
- - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
338
- - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
339
- - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
336
+ - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
337
+ - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
338
+ - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
339
+ - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
340
340
 
341
341
  <details>
342
342
  <summary>More</summary>
343
343
 
344
+ - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
344
345
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
345
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
346
346
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
347
347
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
348
348
 
@@ -372,7 +372,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
372
372
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
373
373
 
374
374
  ## Adoption and Sponsorship
375
- The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
375
+ The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
376
376
 
377
377
  ## Acknowledgment and Citation
378
378
  We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -19,16 +19,16 @@
19
19
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
20
20
 
21
21
  ## News
22
- - [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
23
- - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
24
- - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
25
- - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
22
+ - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
23
+ - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
24
+ - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
25
+ - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
26
26
 
27
27
  <details>
28
28
  <summary>More</summary>
29
29
 
30
+ - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
30
31
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
31
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
32
32
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
33
33
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
34
34
 
@@ -58,7 +58,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
58
58
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
59
59
 
60
60
  ## Adoption and Sponsorship
61
- The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
61
+ The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
62
62
 
63
63
  ## Acknowledgment and Citation
64
64
  We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.1.post7"
7
+ version = "0.4.2.post1"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -27,7 +27,7 @@ runtime_common = [
27
27
  ]
28
28
  srt = [
29
29
  "sglang[runtime_common]", "cuda-python",
30
- "sgl-kernel>=0.0.2.post14", "torch", "vllm==0.6.4.post1",
30
+ "sgl-kernel>=0.0.3", "torch", "vllm==0.6.4.post1",
31
31
  "flashinfer==0.1.6"
32
32
  ]
33
33
 
@@ -49,12 +49,13 @@ class BenchArgs:
49
49
  gsp_system_prompt_len: int = 2048
50
50
  gsp_question_len: int = 128
51
51
  gsp_output_len: int = 256
52
+ seed: int = 1
52
53
  disable_ignore_eos: bool = False
53
54
  extra_request_body: Optional[str] = None
54
- seed: int = 1
55
+ apply_chat_template: bool = False
56
+ profile: bool = False
55
57
  skip_warmup: bool = False
56
58
  do_not_exit: bool = False
57
- profile: bool = False
58
59
 
59
60
  @staticmethod
60
61
  def add_cli_args(parser: argparse.ArgumentParser):
@@ -141,20 +142,31 @@ class BenchArgs:
141
142
  default=BenchArgs.gsp_output_len,
142
143
  help="Target length in tokens for outputs in generated-shared-prefix dataset",
143
144
  )
145
+ parser.add_argument("--seed", type=int, default=1, help="The random seed.")
144
146
  parser.add_argument(
145
147
  "--disable-ignore-eos",
146
- type=bool,
147
- default=BenchArgs.disable_ignore_eos,
148
+ action="store_true",
148
149
  help="Disable ignore EOS token",
149
150
  )
150
151
  parser.add_argument(
151
152
  "--extra-request-body",
152
153
  metavar='{"key1": "value1", "key2": "value2"}',
153
154
  type=str,
155
+ default=BenchArgs.extra_request_body,
154
156
  help="Append given JSON object to the request payload. You can use this to specify"
155
157
  "additional generate params like sampling params.",
156
158
  )
157
- parser.add_argument("--seed", type=int, default=1, help="The random seed.")
159
+ parser.add_argument(
160
+ "--apply-chat-template",
161
+ action="store_true",
162
+ help="Apply chat template",
163
+ )
164
+ parser.add_argument(
165
+ "--profile",
166
+ action="store_true",
167
+ help="Use Torch Profiler. The endpoint must be launched with "
168
+ "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
169
+ )
158
170
  parser.add_argument(
159
171
  "--skip-warmup",
160
172
  action="store_true",
@@ -165,12 +177,6 @@ class BenchArgs:
165
177
  action="store_true",
166
178
  help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
167
179
  )
168
- parser.add_argument(
169
- "--profile",
170
- action="store_true",
171
- help="Use Torch Profiler. The endpoint must be launched with "
172
- "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
173
- )
174
180
 
175
181
  @classmethod
176
182
  def from_cli_args(cls, args: argparse.Namespace):
@@ -65,7 +65,13 @@ from sglang.srt.model_executor.model_runner import ModelRunner
65
65
  from sglang.srt.sampling.sampling_params import SamplingParams
66
66
  from sglang.srt.server_args import PortArgs, ServerArgs
67
67
  from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
68
- from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
68
+ from sglang.srt.utils import (
69
+ configure_logger,
70
+ get_bool_env_var,
71
+ kill_process_tree,
72
+ set_gpu_proc_affinity,
73
+ suppress_other_loggers,
74
+ )
69
75
 
70
76
 
71
77
  @dataclasses.dataclass
@@ -99,10 +105,7 @@ class BenchArgs:
99
105
  parser.add_argument("--correctness-test", action="store_true")
100
106
  parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
101
107
  parser.add_argument(
102
- "--profile",
103
- action="store_true",
104
- help="Use Torch Profiler. The endpoint must be launched with "
105
- "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
108
+ "--profile", action="store_true", help="Use Torch Profiler."
106
109
  )
107
110
  parser.add_argument(
108
111
  "--profile-filename-prefix",
@@ -381,6 +384,7 @@ def latency_test_run_once(
381
384
  parent_dir = os.path.dirname(os.path.abspath(profile_filename))
382
385
  os.makedirs(parent_dir, exist_ok=True)
383
386
  profiler.export_chrome_trace(profile_filename)
387
+ rank_print(f"torch profiler chrome trace saved to {profile_filename}")
384
388
 
385
389
  # Record decode timing from 2nd output
386
390
  if output_len > 1:
@@ -407,6 +411,10 @@ def latency_test(
407
411
  bench_args,
408
412
  tp_rank,
409
413
  ):
414
+ # Set CPU affinity
415
+ if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
416
+ set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
417
+
410
418
  # Configure the logger
411
419
  configure_logger(server_args, prefix=f" TP{tp_rank}")
412
420
  rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
@@ -451,7 +459,7 @@ def latency_test(
451
459
  il,
452
460
  ol,
453
461
  server_args.device,
454
- bench_args.profile,
462
+ bench_args.profile if tp_rank == 0 else None,
455
463
  bench_args.profile_filename_prefix,
456
464
  )
457
465
  if ret is not None:
@@ -453,6 +453,7 @@ def get_dataset(args, tokenizer):
453
453
  tokenizer=tokenizer,
454
454
  fixed_output_len=args.sharegpt_output_len,
455
455
  context_len=args.sharegpt_context_len,
456
+ apply_chat_template=args.apply_chat_template,
456
457
  )
457
458
  elif args.dataset_name == "random":
458
459
  input_requests = sample_random_requests(
@@ -517,6 +518,7 @@ class BenchmarkMetrics:
517
518
  median_e2e_latency_ms: float
518
519
  std_e2e_latency_ms: float
519
520
  p99_e2e_latency_ms: float
521
+ concurrency: float
520
522
 
521
523
 
522
524
  SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
@@ -562,6 +564,7 @@ def sample_sharegpt_requests(
562
564
  tokenizer: PreTrainedTokenizerBase,
563
565
  fixed_output_len: Optional[int] = None,
564
566
  context_len: Optional[int] = None,
567
+ apply_chat_template=False,
565
568
  ) -> List[Tuple[str, int, int]]:
566
569
  if fixed_output_len is not None and fixed_output_len < 4:
567
570
  raise ValueError("output_len too small")
@@ -592,6 +595,15 @@ def sample_sharegpt_requests(
592
595
 
593
596
  # Tokenize the prompts and completions.
594
597
  prompt = dataset[i][0]
598
+
599
+ if apply_chat_template:
600
+ prompt = tokenizer.apply_chat_template(
601
+ [{"role": "user", "content": prompt}],
602
+ add_generation_prompt=True,
603
+ tokenize=False,
604
+ )
605
+ prompt = prompt.replace(tokenizer.bos_token, "")
606
+
595
607
  prompt_token_ids = tokenizer.encode(prompt)
596
608
  completion = dataset[i][1]
597
609
  completion_token_ids = tokenizer.encode(completion)
@@ -600,7 +612,7 @@ def sample_sharegpt_requests(
600
612
  len(completion_token_ids) if fixed_output_len is None else fixed_output_len
601
613
  )
602
614
 
603
- if prompt_len < 1 or output_len < 1:
615
+ if prompt_len < 2 or output_len < 2:
604
616
  # Prune too short sequences.
605
617
  continue
606
618
 
@@ -880,6 +892,7 @@ def calculate_metrics(
880
892
  median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
881
893
  std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
882
894
  p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000,
895
+ concurrency=np.sum(e2e_latencies) / dur_s,
883
896
  )
884
897
 
885
898
  return metrics, output_lens
@@ -1031,6 +1044,7 @@ async def benchmark(
1031
1044
  "Total token throughput (tok/s):", metrics.total_throughput
1032
1045
  )
1033
1046
  )
1047
+ print("{:<40} {:<10.2f}".format("Concurrency:", metrics.concurrency))
1034
1048
  print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
1035
1049
  print(
1036
1050
  "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1062,13 +1076,24 @@ async def benchmark(
1062
1076
  and metrics.output_throughput is not None
1063
1077
  ):
1064
1078
  result = {
1079
+ # Arguments
1065
1080
  "backend": args.backend,
1066
1081
  "dataset_name": args.dataset_name,
1067
1082
  "request_rate": request_rate,
1068
1083
  "max_concurrency": max_concurrency,
1084
+ "sharegpt_output_len": args.sharegpt_output_len,
1085
+ "random_input_len": args.random_input_len,
1086
+ "random_output_len": args.random_output_len,
1087
+ "random_range_ratio": args.random_range_ratio,
1088
+ # Results
1089
+ "duration": benchmark_duration,
1090
+ "completed": metrics.completed,
1069
1091
  "total_input_tokens": metrics.total_input,
1070
1092
  "total_output_tokens": metrics.total_output,
1071
1093
  "total_output_tokens_retokenized": metrics.total_output_retokenized,
1094
+ "request_throughput": metrics.request_throughput,
1095
+ "input_throughput": metrics.input_throughput,
1096
+ "output_throughput": metrics.output_throughput,
1072
1097
  "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
1073
1098
  "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
1074
1099
  "std_e2e_latency_ms": metrics.std_e2e_latency_ms,
@@ -1085,14 +1110,7 @@ async def benchmark(
1085
1110
  "median_itl_ms": metrics.median_itl_ms,
1086
1111
  "std_itl_ms": metrics.std_itl_ms,
1087
1112
  "p99_itl_ms": metrics.p99_itl_ms,
1088
- "input_throughput": metrics.input_throughput,
1089
- "output_throughput": metrics.output_throughput,
1090
- "sharegpt_output_len": args.sharegpt_output_len,
1091
- "random_input_len": args.random_input_len,
1092
- "random_output_len": args.random_output_len,
1093
- "random_range_ratio": args.random_range_ratio,
1094
- "duration": benchmark_duration,
1095
- "completed": metrics.completed,
1113
+ "concurrency": metrics.concurrency,
1096
1114
  }
1097
1115
  else:
1098
1116
  print(f"Error running benchmark for request rate: {request_rate}")
@@ -1112,36 +1130,16 @@ async def benchmark(
1112
1130
  with open(output_file_name, "a") as file:
1113
1131
  file.write(json.dumps(result) + "\n")
1114
1132
 
1115
- result = {
1116
- "duration": benchmark_duration,
1117
- "completed": metrics.completed,
1118
- "total_input_tokens": metrics.total_input,
1119
- "total_output_tokens": metrics.total_output,
1120
- "total_output_tokens_retokenized": metrics.total_output_retokenized,
1121
- "request_throughput": metrics.request_throughput,
1122
- "input_throughput": metrics.input_throughput,
1123
- "output_throughput": metrics.output_throughput,
1124
- "mean_ttft_ms": metrics.mean_ttft_ms,
1125
- "median_ttft_ms": metrics.median_ttft_ms,
1126
- "std_ttft_ms": metrics.std_ttft_ms,
1127
- "p99_ttft_ms": metrics.p99_ttft_ms,
1128
- "mean_tpot_ms": metrics.mean_tpot_ms,
1129
- "median_tpot_ms": metrics.median_tpot_ms,
1130
- "std_tpot_ms": metrics.std_tpot_ms,
1131
- "p99_tpot_ms": metrics.p99_tpot_ms,
1132
- "mean_itl_ms": metrics.mean_itl_ms,
1133
- "median_itl_ms": metrics.median_itl_ms,
1134
- "std_itl_ms": metrics.std_itl_ms,
1135
- "p99_itl_ms": metrics.p99_itl_ms,
1136
- "input_lens": [output.prompt_len for output in outputs],
1137
- "output_lens": output_lens,
1138
- "ttfts": [output.ttft for output in outputs],
1139
- "itls": [output.itl for output in outputs],
1140
- "generated_texts": [output.generated_text for output in outputs],
1141
- "errors": [output.error for output in outputs],
1142
- "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
1143
- "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
1144
- }
1133
+ result.update(
1134
+ {
1135
+ "input_lens": [output.prompt_len for output in outputs],
1136
+ "output_lens": output_lens,
1137
+ "ttfts": [output.ttft for output in outputs],
1138
+ "itls": [output.itl for output in outputs],
1139
+ "generated_texts": [output.generated_text for output in outputs],
1140
+ "errors": [output.error for output in outputs],
1141
+ }
1142
+ )
1145
1143
  return result
1146
1144
 
1147
1145
 
@@ -1422,7 +1420,6 @@ if __name__ == "__main__":
1422
1420
  "actual request rate may be lower than specified with --request-rate, "
1423
1421
  "if the server is not processing requests fast enough to keep up.",
1424
1422
  )
1425
- parser.add_argument("--seed", type=int, default=1, help="The random seed.")
1426
1423
  parser.add_argument(
1427
1424
  "--multi",
1428
1425
  action="store_true",
@@ -1446,14 +1443,15 @@ if __name__ == "__main__":
1446
1443
  help="Disable streaming mode.",
1447
1444
  )
1448
1445
  parser.add_argument(
1449
- "--disable-ignore-eos",
1446
+ "--return-logprob",
1450
1447
  action="store_true",
1451
- help="Disable ignoring EOS.",
1448
+ help="Return logprob.",
1452
1449
  )
1450
+ parser.add_argument("--seed", type=int, default=1, help="The random seed.")
1453
1451
  parser.add_argument(
1454
- "--return-logprob",
1452
+ "--disable-ignore-eos",
1455
1453
  action="store_true",
1456
- help="Return logprob.",
1454
+ help="Disable ignoring EOS.",
1457
1455
  )
1458
1456
  parser.add_argument(
1459
1457
  "--extra-request-body",
@@ -1462,6 +1460,11 @@ if __name__ == "__main__":
1462
1460
  help="Append given JSON object to the request payload. You can use this to specify"
1463
1461
  "additional generate params like sampling params.",
1464
1462
  )
1463
+ parser.add_argument(
1464
+ "--apply-chat-template",
1465
+ action="store_true",
1466
+ help="Apply chat template",
1467
+ )
1465
1468
  parser.add_argument(
1466
1469
  "--profile",
1467
1470
  action="store_true",
@@ -354,6 +354,37 @@ register_chat_template(
354
354
  )
355
355
 
356
356
 
357
+ register_chat_template(
358
+ ChatTemplate(
359
+ name="deepseek-v3",
360
+ default_system_prompt=None,
361
+ role_prefix_and_suffix={
362
+ "system": (
363
+ "",
364
+ "",
365
+ ),
366
+ "user": (
367
+ "<|User|>",
368
+ "",
369
+ ),
370
+ "assistant": (
371
+ "<|Assistant|>",
372
+ "<|end▁of▁sentence|>",
373
+ ),
374
+ },
375
+ stop_str=("<|end▁of▁sentence|>",),
376
+ )
377
+ )
378
+
379
+
380
+ @register_chat_template_matching_function
381
+ def match_deepseek(model_path: str):
382
+ if (
383
+ "deepseek-v3" in model_path.lower() or "deepseek-r1" in model_path.lower()
384
+ ) and "base" not in model_path.lower():
385
+ return get_chat_template("deepseek-v3")
386
+
387
+
357
388
  @register_chat_template_matching_function
358
389
  def match_dbrx(model_path: str):
359
390
  if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
@@ -20,6 +20,7 @@ class LoadFormat(str, enum.Enum):
20
20
  GGUF = "gguf"
21
21
  BITSANDBYTES = "bitsandbytes"
22
22
  MISTRAL = "mistral"
23
+ LAYERED = "layered"
23
24
 
24
25
 
25
26
  @dataclass
@@ -185,9 +185,12 @@ class CustomAllreduce:
185
185
  # test nvlink first, this will filter out most of the cases
186
186
  # where custom allreduce is not supported
187
187
  # this checks hardware and driver support for NVLink
188
- assert is_cuda()
188
+ if is_cuda():
189
+ assert is_cuda()
189
190
 
190
- full_nvlink = is_full_nvlink(physical_device_ids)
191
+ full_nvlink = is_full_nvlink(physical_device_ids)
192
+ else:
193
+ full_nvlink = False
191
194
  if world_size > 2 and not full_nvlink:
192
195
  logger.warning(
193
196
  "Custom allreduce is disabled because it's not supported on"
@@ -57,6 +57,7 @@ from sglang.srt.utils import (
57
57
  assert_pkg_version,
58
58
  configure_logger,
59
59
  kill_process_tree,
60
+ launch_dummy_health_check_server,
60
61
  maybe_set_triton_cache_manager,
61
62
  prepare_model_and_tokenizer,
62
63
  set_prometheus_multiproc_dir,
@@ -400,14 +401,16 @@ def _launch_subprocesses(server_args: ServerArgs) -> Tuple[TokenizerManager, Dic
400
401
 
401
402
  if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0":
402
403
  # When using `Engine` as a Python API, we don't want to block here.
403
- return
404
+ return None, None
405
+
406
+ launch_dummy_health_check_server(server_args.host, server_args.port)
404
407
 
405
408
  for proc in scheduler_procs:
406
409
  proc.join()
407
410
  logger.error(
408
411
  f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
409
412
  )
410
- return
413
+ return None, None
411
414
 
412
415
  # Launch detokenizer process
413
416
  detoken_proc = mp.Process(
@@ -39,10 +39,12 @@ from fastapi.middleware.cors import CORSMiddleware
39
39
  from fastapi.responses import ORJSONResponse, Response, StreamingResponse
40
40
 
41
41
  from sglang.srt.entrypoints.engine import _launch_subprocesses
42
+ from sglang.srt.function_call_parser import FunctionCallParser
42
43
  from sglang.srt.managers.io_struct import (
43
44
  CloseSessionReqInput,
44
45
  ConfigureLoggingReq,
45
46
  EmbeddingReqInput,
47
+ FunctionCallReqInput,
46
48
  GenerateReqInput,
47
49
  GetWeightsByNameReqInput,
48
50
  InitWeightsUpdateGroupReqInput,
@@ -369,6 +371,28 @@ async def configure_logging(obj: ConfigureLoggingReq, request: Request):
369
371
  return Response(status_code=200)
370
372
 
371
373
 
374
+ @app.post("/function_call")
375
+ async def function_call_request(obj: FunctionCallReqInput, request: Request):
376
+ """
377
+ A native API endpoint to parse function calls from a text.
378
+ """
379
+ # 1) Initialize the parser based on the request body
380
+ parser = FunctionCallParser(tools=obj.tools, tool_call_parser=obj.tool_call_parser)
381
+
382
+ # 2) Call the non-stream parsing method (non-stream)
383
+ normal_text, calls = parser.parse_non_stream(obj.text)
384
+
385
+ # 3) Organize the response content
386
+ response_data = {
387
+ "normal_text": normal_text,
388
+ "calls": [
389
+ call.model_dump() for call in calls
390
+ ], # Convert pydantic objects to dictionaries
391
+ }
392
+
393
+ return ORJSONResponse(content=response_data, status_code=200)
394
+
395
+
372
396
  ##### OpenAI-compatible API endpoints #####
373
397
 
374
398