sglang 0.4.1.post6__tar.gz → 0.4.1.post7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (356)
  1. {sglang-0.4.1.post6/sglang.egg-info → sglang-0.4.1.post7}/PKG-INFO +16 -5
  2. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/README.md +1 -1
  3. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/pyproject.toml +9 -3
  4. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/__init__.py +21 -23
  5. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/api.py +2 -7
  6. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_offline_throughput.py +24 -16
  7. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_one_batch.py +51 -3
  8. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_one_batch_server.py +1 -1
  9. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_serving.py +37 -28
  10. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/runtime_endpoint.py +183 -4
  11. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/chat_template.py +15 -4
  12. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/launch_server.py +1 -1
  13. sglang-0.4.1.post7/sglang/srt/_custom_ops.py +156 -0
  14. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/device_config.py +1 -1
  15. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/model_config.py +1 -0
  16. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/constrained/base_grammar_backend.py +21 -0
  17. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/constrained/xgrammar_backend.py +8 -4
  18. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/conversation.py +14 -1
  19. sglang-0.4.1.post7/sglang/srt/distributed/__init__.py +3 -0
  20. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/communication_op.py +2 -1
  21. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  22. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
  23. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  24. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  25. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  26. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  27. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  28. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  29. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/parallel_state.py +1 -1
  30. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/utils.py +2 -1
  31. sglang-0.4.1.post7/sglang/srt/entrypoints/engine.py +449 -0
  32. sglang-0.4.1.post7/sglang/srt/entrypoints/http_server.py +579 -0
  33. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/activation.py +3 -3
  34. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/flashinfer_backend.py +10 -9
  35. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/triton_backend.py +4 -6
  36. sglang-0.4.1.post7/sglang/srt/layers/attention/vision.py +204 -0
  37. sglang-0.4.1.post7/sglang/srt/layers/dp_attention.py +69 -0
  38. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/linear.py +41 -5
  39. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/logits_processor.py +48 -63
  40. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/ep_moe/layer.py +4 -4
  41. sglang-0.4.1.post7/sglang/srt/layers/moe/fused_moe_native.py +115 -0
  42. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
  43. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/layer.py +29 -5
  44. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/parameter.py +2 -1
  45. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/__init__.py +20 -23
  46. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/fp8.py +6 -3
  47. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/modelopt_quant.py +1 -2
  48. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/w8a8_int8.py +1 -1
  49. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/radix_attention.py +2 -2
  50. sglang-0.4.1.post7/sglang/srt/layers/rotary_embedding.py +1260 -0
  51. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/sampler.py +39 -1
  52. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/vocab_parallel_embedding.py +2 -2
  53. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/lora/lora.py +1 -9
  54. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/configure_logging.py +3 -0
  55. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/data_parallel_controller.py +79 -72
  56. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/detokenizer_manager.py +23 -6
  57. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/image_processor.py +158 -2
  58. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/io_struct.py +25 -2
  59. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/schedule_batch.py +49 -22
  60. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/schedule_policy.py +26 -12
  61. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/scheduler.py +277 -178
  62. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/session_controller.py +1 -0
  63. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/tokenizer_manager.py +206 -121
  64. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/tp_worker.py +6 -4
  65. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  66. sglang-0.4.1.post7/sglang/srt/managers/utils.py +44 -0
  67. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mem_cache/memory_pool.py +10 -32
  68. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/metrics/collector.py +15 -6
  69. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_executor/cuda_graph_runner.py +4 -6
  70. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_executor/model_runner.py +37 -15
  71. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_loader/loader.py +8 -6
  72. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_loader/weight_utils.py +55 -2
  73. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/baichuan.py +6 -6
  74. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/chatglm.py +2 -2
  75. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/commandr.py +3 -3
  76. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/dbrx.py +4 -4
  77. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/deepseek.py +3 -3
  78. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/deepseek_v2.py +8 -8
  79. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/exaone.py +2 -2
  80. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/gemma.py +2 -2
  81. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/gemma2.py +6 -24
  82. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/gpt2.py +3 -5
  83. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/gpt_bigcode.py +1 -1
  84. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/granite.py +2 -2
  85. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/grok.py +3 -3
  86. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/internlm2.py +2 -2
  87. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llama.py +7 -5
  88. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/minicpm.py +2 -2
  89. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/minicpm3.py +6 -6
  90. sglang-0.4.1.post7/sglang/srt/models/minicpmv.py +1238 -0
  91. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/mixtral.py +3 -3
  92. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/mixtral_quant.py +3 -3
  93. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/mllama.py +2 -2
  94. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/olmo.py +3 -3
  95. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/olmo2.py +4 -4
  96. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/olmoe.py +7 -13
  97. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/phi3_small.py +2 -2
  98. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/qwen.py +2 -2
  99. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/qwen2.py +41 -4
  100. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/qwen2_moe.py +3 -3
  101. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/qwen2_vl.py +22 -122
  102. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/stablelm.py +2 -2
  103. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/torch_native_llama.py +3 -3
  104. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/xverse.py +6 -6
  105. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/xverse_moe.py +6 -6
  106. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/openai_api/protocol.py +2 -0
  107. sglang-0.4.1.post7/sglang/srt/sampling/custom_logit_processor.py +38 -0
  108. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/sampling_batch_info.py +139 -4
  109. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/sampling_params.py +3 -1
  110. sglang-0.4.1.post6/sglang/srt/constrained/__init__.py → sglang-0.4.1.post7/sglang/srt/server.py +4 -2
  111. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/server_args.py +57 -14
  112. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/utils.py +103 -65
  113. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/runners.py +8 -13
  114. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/test_programs.py +1 -1
  115. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/test_utils.py +3 -1
  116. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/utils.py +12 -2
  117. sglang-0.4.1.post7/sglang/version.py +1 -0
  118. {sglang-0.4.1.post6 → sglang-0.4.1.post7/sglang.egg-info}/PKG-INFO +16 -5
  119. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang.egg-info/SOURCES.txt +7 -3
  120. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang.egg-info/requires.txt +17 -3
  121. sglang-0.4.1.post6/sglang/launch_server_llavavid.py +0 -25
  122. sglang-0.4.1.post6/sglang/srt/_custom_ops.py +0 -118
  123. sglang-0.4.1.post6/sglang/srt/distributed/__init__.py +0 -3
  124. sglang-0.4.1.post6/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  125. sglang-0.4.1.post6/sglang/srt/layers/moe/fused_moe_native.py +0 -46
  126. sglang-0.4.1.post6/sglang/srt/layers/rotary_embedding.py +0 -112
  127. sglang-0.4.1.post6/sglang/srt/server.py +0 -1104
  128. sglang-0.4.1.post6/sglang/version.py +0 -1
  129. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/LICENSE +0 -0
  130. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/setup.cfg +0 -0
  131. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_latency.py +0 -0
  132. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/check_env.py +0 -0
  133. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/global_config.py +0 -0
  134. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/__init__.py +0 -0
  135. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/__init__.py +0 -0
  136. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/anthropic.py +0 -0
  137. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/base_backend.py +0 -0
  138. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/litellm.py +0 -0
  139. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/openai.py +0 -0
  140. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/vertexai.py +0 -0
  141. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/choices.py +0 -0
  142. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/compiler.py +0 -0
  143. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/interpreter.py +0 -0
  144. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/ir.py +0 -0
  145. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/tracer.py +0 -0
  146. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/llama3_eval.py +0 -0
  147. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/aio_rwlock.py +0 -0
  148. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/__init__.py +0 -0
  149. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/chatglm.py +0 -0
  150. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/dbrx.py +0 -0
  151. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/exaone.py +0 -0
  152. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/load_config.py +0 -0
  153. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/qwen2vl.py +0 -0
  154. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/constrained/outlines_backend.py +0 -0
  155. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  156. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/hf_transformers_utils.py +0 -0
  157. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/__init__.py +0 -0
  158. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  159. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  160. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  161. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  162. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  163. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  164. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/custom_op_util.py +0 -0
  165. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/layernorm.py +0 -0
  166. {sglang-0.4.1.post6/sglang/srt/distributed/device_communicators → sglang-0.4.1.post7/sglang/srt/layers/moe/ep_moe}/__init__.py +0 -0
  167. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  168. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  169. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  170. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  171. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  172. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  173. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  174. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  175. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  176. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  177. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  178. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  179. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  180. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  181. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  182. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  183. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  184. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  185. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  186. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  187. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  188. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  189. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  190. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  191. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  192. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  193. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  194. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  195. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  196. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  197. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  198. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  199. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  200. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  201. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  202. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  203. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  204. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  205. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  206. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  207. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  208. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  209. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  210. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  211. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  212. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  213. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  214. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  215. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  216. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  217. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  218. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  219. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  220. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  221. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  222. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  223. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  224. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  225. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  226. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  227. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  228. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  229. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  230. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  231. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  232. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  233. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  234. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  235. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  236. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  237. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  238. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  239. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  240. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  241. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  242. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  243. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  244. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  245. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  246. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  247. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  248. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  249. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  250. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  251. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  252. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  253. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  254. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  255. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  256. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  257. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  258. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  259. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  260. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/topk.py +0 -0
  261. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/pooler.py +0 -0
  262. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/base_config.py +0 -0
  263. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  264. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  265. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  266. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  267. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  268. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  269. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  270. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  271. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  272. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  273. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  274. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  275. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  276. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  277. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  278. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  279. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  280. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  281. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  282. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  283. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  284. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  285. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  286. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  287. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  288. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  289. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  290. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  291. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  292. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  293. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  294. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  295. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  296. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  297. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  298. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  299. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  300. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  301. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  302. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  303. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  304. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/torchao_utils.py +0 -0
  305. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/lora/lora_config.py +0 -0
  306. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/lora/lora_manager.py +0 -0
  307. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/cache_controller.py +0 -0
  308. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  309. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  310. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mem_cache/flush_cache.py +0 -0
  311. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mem_cache/radix_cache.py +0 -0
  312. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/metrics/func_timer.py +0 -0
  313. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mm_utils.py +0 -0
  314. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  315. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_loader/__init__.py +0 -0
  316. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_loader/utils.py +0 -0
  317. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_parallel.py +0 -0
  318. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/gemma2_reward.py +0 -0
  319. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/internlm2_reward.py +0 -0
  320. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llama_classification.py +0 -0
  321. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llama_eagle.py +0 -0
  322. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llama_embedding.py +0 -0
  323. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llama_reward.py +0 -0
  324. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llava.py +0 -0
  325. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llavavid.py +0 -0
  326. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/mistral.py +0 -0
  327. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/qwen2_eagle.py +0 -0
  328. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/registry.py +0 -0
  329. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/yivl.py +0 -0
  330. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/openai_api/adapter.py +0 -0
  331. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  332. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  333. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  334. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  335. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  336. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  337. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  338. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/speculative/eagle_utils.py +0 -0
  339. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/speculative/eagle_worker.py +0 -0
  340. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/speculative/spec_info.py +0 -0
  341. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  342. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/few_shot_gsm8k.py +0 -0
  343. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  344. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/run_eval.py +0 -0
  345. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_common.py +0 -0
  346. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_gpqa.py +0 -0
  347. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_humaneval.py +0 -0
  348. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_math.py +0 -0
  349. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_mgsm.py +0 -0
  350. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_mmlu.py +0 -0
  351. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  352. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/test_activation.py +0 -0
  353. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/test_block_fp8.py +0 -0
  354. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/test_layernorm.py +0 -0
  355. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang.egg-info/dependency_links.txt +0 -0
  356. {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: sglang
3
- Version: 0.4.1.post6
3
+ Version: 0.4.1.post7
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -236,13 +236,13 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
236
236
  Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
237
237
  Requires-Dist: uvicorn; extra == "runtime-common"
238
238
  Requires-Dist: uvloop; extra == "runtime-common"
239
- Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
239
+ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
240
240
  Provides-Extra: srt
241
241
  Requires-Dist: sglang[runtime_common]; extra == "srt"
242
242
  Requires-Dist: cuda-python; extra == "srt"
243
- Requires-Dist: sgl-kernel>=0.0.2.post12; extra == "srt"
243
+ Requires-Dist: sgl-kernel>=0.0.2.post14; extra == "srt"
244
244
  Requires-Dist: torch; extra == "srt"
245
- Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
245
+ Requires-Dist: vllm==0.6.4.post1; extra == "srt"
246
246
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
247
247
  Provides-Extra: srt-hip
248
248
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
@@ -252,6 +252,9 @@ Provides-Extra: srt-xpu
252
252
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
253
253
  Provides-Extra: srt-hpu
254
254
  Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
255
+ Provides-Extra: srt-cpu
256
+ Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
257
+ Requires-Dist: torch; extra == "srt-cpu"
255
258
  Provides-Extra: openai
256
259
  Requires-Dist: openai>=1.0; extra == "openai"
257
260
  Requires-Dist: tiktoken; extra == "openai"
@@ -288,6 +291,11 @@ Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
288
291
  Requires-Dist: sglang[openai]; extra == "all-hpu"
289
292
  Requires-Dist: sglang[anthropic]; extra == "all-hpu"
290
293
  Requires-Dist: sglang[litellm]; extra == "all-hpu"
294
+ Provides-Extra: all-cpu
295
+ Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
296
+ Requires-Dist: sglang[openai]; extra == "all-cpu"
297
+ Requires-Dist: sglang[anthropic]; extra == "all-cpu"
298
+ Requires-Dist: sglang[litellm]; extra == "all-cpu"
291
299
  Provides-Extra: dev
292
300
  Requires-Dist: sglang[all]; extra == "dev"
293
301
  Requires-Dist: sglang[test]; extra == "dev"
@@ -300,6 +308,9 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
300
308
  Provides-Extra: dev-hpu
301
309
  Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
302
310
  Requires-Dist: sglang[test]; extra == "dev-hpu"
311
+ Provides-Extra: dev-cpu
312
+ Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
313
+ Requires-Dist: sglang[test]; extra == "dev-cpu"
303
314
 
304
315
  <div align="center" id="sglangtop">
305
316
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -361,7 +372,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
361
372
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
362
373
 
363
374
  ## Adoption and Sponsorship
364
- The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
375
+ The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
365
376
 
366
377
  ## Acknowledgment and Citation
367
378
  We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -58,7 +58,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
58
58
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
59
59
 
60
60
  ## Adoption and Sponsorship
61
- The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
61
+ The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
62
62
 
63
63
  ## Acknowledgment and Citation
64
64
  We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.1.post6"
7
+ version = "0.4.1.post7"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -23,11 +23,11 @@ runtime_common = [
23
23
  "packaging", "pillow", "prometheus-client>=0.20.0",
24
24
  "psutil", "pydantic", "python-multipart",
25
25
  "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
26
- "xgrammar>=0.1.6"
26
+ "xgrammar>=0.1.10"
27
27
  ]
28
28
  srt = [
29
29
  "sglang[runtime_common]", "cuda-python",
30
- "sgl-kernel>=0.0.2.post12", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1",
30
+ "sgl-kernel>=0.0.2.post14", "torch", "vllm==0.6.4.post1",
31
31
  "flashinfer==0.1.6"
32
32
  ]
33
33
 
@@ -40,6 +40,10 @@ srt_xpu = ["sglang[runtime_common]"]
40
40
  #For Intel Gaudi(device : hpu) follow the installation guide
41
41
  #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
42
42
  srt_hpu = ["sglang[runtime_common]"]
43
+ # CPU: currently, there are no pre-built vllm wheels for CPU.
44
+ # To install vllm for CPU, please follow the instruction here:
45
+ # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
46
+ srt_cpu = ["sglang[runtime_common]", "torch"]
43
47
 
44
48
  openai = ["openai>=1.0", "tiktoken"]
45
49
  anthropic = ["anthropic>=0.20.0"]
@@ -57,11 +61,13 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
57
61
  all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
58
62
  all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
59
63
  all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
64
+ all_cpu = ["sglang[srt_cpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
60
65
 
61
66
  dev = ["sglang[all]", "sglang[test]"]
62
67
  dev_hip = ["sglang[all_hip]", "sglang[test]"]
63
68
  dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
64
69
  dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
70
+ dev_cpu = ["sglang[all_cpu]", "sglang[test]"]
65
71
 
66
72
  [project.urls]
67
73
  "Homepage" = "https://github.com/sgl-project/sglang"
@@ -1,5 +1,6 @@
1
- # SGL API Components
1
+ # SGLang public APIs
2
2
 
3
+ # Frontend Language APIs
3
4
  from sglang.api import (
4
5
  Engine,
5
6
  Runtime,
@@ -23,16 +24,26 @@ from sglang.api import (
23
24
  user_end,
24
25
  video,
25
26
  )
27
+ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
26
28
  from sglang.lang.choices import (
27
29
  greedy_token_selection,
28
30
  token_length_normalized,
29
31
  unconditional_likelihood_normalized,
30
32
  )
33
+ from sglang.utils import LazyImport
34
+
35
+ Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
36
+ LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
37
+ OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
38
+ VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
39
+
40
+ # Other configs
41
+ from sglang.global_config import global_config
42
+ from sglang.version import __version__
31
43
 
32
- # SGLang DSL APIs
33
44
  __all__ = [
34
- "Runtime",
35
45
  "Engine",
46
+ "Runtime",
36
47
  "assistant",
37
48
  "assistant_begin",
38
49
  "assistant_end",
@@ -52,27 +63,14 @@ __all__ = [
52
63
  "user_begin",
53
64
  "user_end",
54
65
  "video",
66
+ "RuntimeEndpoint",
55
67
  "greedy_token_selection",
56
68
  "token_length_normalized",
57
69
  "unconditional_likelihood_normalized",
70
+ "Anthropic",
71
+ "LiteLLM",
72
+ "OpenAI",
73
+ "VertexAI",
74
+ "global_config",
75
+ "__version__",
58
76
  ]
59
-
60
- # Global Configurations
61
- from sglang.global_config import global_config
62
-
63
- __all__ += ["global_config"]
64
-
65
- from sglang.version import __version__
66
-
67
- __all__ += ["__version__"]
68
-
69
- # SGLang Backends
70
- from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
71
- from sglang.utils import LazyImport
72
-
73
- Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
74
- LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
75
- OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
76
- VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
77
-
78
- __all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
@@ -1,6 +1,5 @@
1
1
  """Public APIs of the language."""
2
2
 
3
- import os
4
3
  import re
5
4
  from typing import Callable, List, Optional, Union
6
5
 
@@ -33,19 +32,15 @@ def function(
33
32
 
34
33
 
35
34
  def Runtime(*args, **kwargs):
36
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
37
-
38
35
  # Avoid importing unnecessary dependency
39
- from sglang.srt.server import Runtime
36
+ from sglang.lang.backend.runtime_endpoint import Runtime
40
37
 
41
38
  return Runtime(*args, **kwargs)
42
39
 
43
40
 
44
41
  def Engine(*args, **kwargs):
45
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
46
-
47
42
  # Avoid importing unnecessary dependency
48
- from sglang.srt.server import Engine
43
+ from sglang.srt.entrypoints.engine import Engine
49
44
 
50
45
  return Engine(*args, **kwargs)
51
46
 
@@ -27,7 +27,8 @@ from sglang.bench_serving import (
27
27
  sample_random_requests,
28
28
  set_ulimit,
29
29
  )
30
- from sglang.srt.server import Engine, Runtime
30
+ from sglang.lang.backend.runtime_endpoint import Runtime
31
+ from sglang.srt.entrypoints.engine import Engine
31
32
  from sglang.srt.server_args import ServerArgs
32
33
 
33
34
 
@@ -39,14 +40,15 @@ class BenchArgs:
39
40
  dataset_path: str = ""
40
41
  num_prompts: int = 1000
41
42
  sharegpt_output_len: Optional[int] = None
43
+ sharegpt_context_len: Optional[int] = None
42
44
  random_input_len: int = 1024
43
45
  random_output_len: int = 1024
44
46
  random_range_ratio: float = 0.0
45
- gen_num_groups: int = 64
46
- gen_prompts_per_group: int = 16
47
- gen_system_prompt_len: int = 2048
48
- gen_question_len: int = 128
49
- gen_output_len: int = 256
47
+ gsp_num_groups: int = 64
48
+ gsp_prompts_per_group: int = 16
49
+ gsp_system_prompt_len: int = 2048
50
+ gsp_question_len: int = 128
51
+ gsp_output_len: int = 256
50
52
  disable_ignore_eos: bool = False
51
53
  extra_request_body: Optional[str] = None
52
54
  seed: int = 1
@@ -82,6 +84,12 @@ class BenchArgs:
82
84
  default=BenchArgs.sharegpt_output_len,
83
85
  help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
84
86
  )
87
+ parser.add_argument(
88
+ "--sharegpt-context-len",
89
+ type=int,
90
+ default=BenchArgs.sharegpt_context_len,
91
+ help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
92
+ )
85
93
  parser.add_argument(
86
94
  "--random-input-len",
87
95
  type=int,
@@ -102,35 +110,35 @@ class BenchArgs:
102
110
  "used only for random dataset.",
103
111
  )
104
112
  parser.add_argument(
105
- "--gen-num-groups",
113
+ "--gsp-num-groups",
106
114
  type=int,
107
- default=BenchArgs.gen_num_groups,
115
+ default=BenchArgs.gsp_num_groups,
108
116
  help="Number of groups with shared prefix, used"
109
117
  "only for generate-shared-prefix",
110
118
  )
111
119
  parser.add_argument(
112
- "--gen-prompts-per-group",
120
+ "--gsp-prompts-per-group",
113
121
  type=int,
114
- default=BenchArgs.gen_prompts_per_group,
122
+ default=BenchArgs.gsp_prompts_per_group,
115
123
  help="Number of prompts per group of shared prefix, used"
116
124
  "only for generate-shared-prefix",
117
125
  )
118
126
  parser.add_argument(
119
- "--gen-system-prompt-len",
127
+ "--gsp-system-prompt-len",
120
128
  type=int,
121
- default=BenchArgs.gen_system_prompt_len,
129
+ default=BenchArgs.gsp_system_prompt_len,
122
130
  help="System prompt length, used" "only for generate-shared-prefix",
123
131
  )
124
132
  parser.add_argument(
125
- "--gen-question-len",
133
+ "--gsp-question-len",
126
134
  type=int,
127
- default=BenchArgs.gen_question_len,
135
+ default=BenchArgs.gsp_question_len,
128
136
  help="Question length, used" "only for generate-shared-prefix",
129
137
  )
130
138
  parser.add_argument(
131
- "--gen-output-len",
139
+ "--gsp-output-len",
132
140
  type=int,
133
- default=BenchArgs.gen_output_len,
141
+ default=BenchArgs.gsp_output_len,
134
142
  help="Target length in tokens for outputs in generated-shared-prefix dataset",
135
143
  )
136
144
  parser.add_argument(
@@ -9,7 +9,8 @@ It accepts server arguments (the same as launch_server.py) and benchmark argumen
9
9
  python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
10
10
  ## sweep through multiple data points and store (append) the results in a jsonl file:
11
11
  python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
12
-
12
+ ## run with profiling:
13
+ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
13
14
  # Usage (correctness test):
14
15
  python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
15
16
 
@@ -56,12 +57,12 @@ import torch
56
57
  import torch.distributed as dist
57
58
 
58
59
  from sglang.srt.configs.model_config import ModelConfig
60
+ from sglang.srt.entrypoints.engine import _set_envs_and_config
59
61
  from sglang.srt.hf_transformers_utils import get_tokenizer
60
62
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
61
63
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
62
64
  from sglang.srt.model_executor.model_runner import ModelRunner
63
65
  from sglang.srt.sampling.sampling_params import SamplingParams
64
- from sglang.srt.server import _set_envs_and_config
65
66
  from sglang.srt.server_args import PortArgs, ServerArgs
66
67
  from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
67
68
  from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
@@ -77,6 +78,8 @@ class BenchArgs:
77
78
  correctness_test: bool = False
78
79
  # This is only used for correctness test
79
80
  cut_len: int = 4
81
+ profile: bool = False
82
+ profile_filename_prefix: str = "profile"
80
83
 
81
84
  @staticmethod
82
85
  def add_cli_args(parser: argparse.ArgumentParser):
@@ -95,6 +98,19 @@ class BenchArgs:
95
98
  )
96
99
  parser.add_argument("--correctness-test", action="store_true")
97
100
  parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
101
+ parser.add_argument(
102
+ "--profile",
103
+ action="store_true",
104
+ help="Use Torch Profiler. The endpoint must be launched with "
105
+ "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
106
+ )
107
+ parser.add_argument(
108
+ "--profile-filename-prefix",
109
+ type=str,
110
+ default=BenchArgs.profile_filename_prefix,
111
+ help="Prefix of the profiling file names. The full profiling result file(s) be "
112
+ '"[profile_filename_prefix]_batch[batch_size]_input[input_len]_output[output_len].trace.json.gz"',
113
+ )
98
114
 
99
115
  @classmethod
100
116
  def from_cli_args(cls, args: argparse.Namespace):
@@ -216,6 +232,7 @@ def extend(reqs, model_runner):
216
232
  model_config=model_runner.model_config,
217
233
  enable_overlap=False,
218
234
  spec_algorithm=SpeculativeAlgorithm.NONE,
235
+ enable_custom_logit_processor=False,
219
236
  )
220
237
  batch.prepare_for_extend()
221
238
  model_worker_batch = batch.get_model_worker_batch()
@@ -286,7 +303,16 @@ def synchronize(device):
286
303
 
287
304
 
288
305
  def latency_test_run_once(
289
- run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
306
+ run_name,
307
+ model_runner,
308
+ rank_print,
309
+ reqs,
310
+ batch_size,
311
+ input_len,
312
+ output_len,
313
+ device,
314
+ profile,
315
+ profile_filename_prefix,
290
316
  ):
291
317
  max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
292
318
  if batch_size > max_batch_size:
@@ -308,6 +334,17 @@ def latency_test_run_once(
308
334
 
309
335
  tot_latency = 0
310
336
 
337
+ profiler = None
338
+ if profile:
339
+ profiler = torch.profiler.profile(
340
+ activities=[
341
+ torch.profiler.ProfilerActivity.CPU,
342
+ torch.profiler.ProfilerActivity.CUDA,
343
+ ],
344
+ with_stack=True,
345
+ )
346
+ profiler.start()
347
+
311
348
  # Prefill
312
349
  synchronize(device)
313
350
  tic = time.time()
@@ -338,6 +375,13 @@ def latency_test_run_once(
338
375
  f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
339
376
  )
340
377
 
378
+ if profile:
379
+ profiler.stop()
380
+ profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}.trace.json.gz"
381
+ parent_dir = os.path.dirname(os.path.abspath(profile_filename))
382
+ os.makedirs(parent_dir, exist_ok=True)
383
+ profiler.export_chrome_trace(profile_filename)
384
+
341
385
  # Record decode timing from 2nd output
342
386
  if output_len > 1:
343
387
  med_decode_latency = np.median(decode_latencies)
@@ -386,6 +430,8 @@ def latency_test(
386
430
  bench_args.input_len[0],
387
431
  8, # shorter decoding to speed up the warmup
388
432
  server_args.device,
433
+ profile=False,
434
+ profile_filename_prefix="", # not used
389
435
  )
390
436
 
391
437
  rank_print("Benchmark ...")
@@ -405,6 +451,8 @@ def latency_test(
405
451
  il,
406
452
  ol,
407
453
  server_args.device,
454
+ bench_args.profile,
455
+ bench_args.profile_filename_prefix,
408
456
  )
409
457
  if ret is not None:
410
458
  result_list.append(ret)
@@ -22,7 +22,7 @@ from typing import Tuple
22
22
  import numpy as np
23
23
  import requests
24
24
 
25
- from sglang.srt.server import launch_server
25
+ from sglang.srt.entrypoints.http_server import launch_server
26
26
  from sglang.srt.server_args import ServerArgs
27
27
  from sglang.srt.utils import kill_process_tree
28
28
 
@@ -452,6 +452,7 @@ def get_dataset(args, tokenizer):
452
452
  num_requests=args.num_prompts,
453
453
  tokenizer=tokenizer,
454
454
  fixed_output_len=args.sharegpt_output_len,
455
+ context_len=args.sharegpt_context_len,
455
456
  )
456
457
  elif args.dataset_name == "random":
457
458
  input_requests = sample_random_requests(
@@ -464,11 +465,11 @@ def get_dataset(args, tokenizer):
464
465
  )
465
466
  elif args.dataset_name == "generated-shared-prefix":
466
467
  input_requests = sample_generated_shared_prefix_requests(
467
- num_groups=args.gen_num_groups,
468
- prompts_per_group=args.gen_prompts_per_group,
469
- system_prompt_len=args.gen_system_prompt_len,
470
- question_len=args.gen_question_len,
471
- output_len=args.gen_output_len,
468
+ num_groups=args.gsp_num_groups,
469
+ prompts_per_group=args.gsp_prompts_per_group,
470
+ system_prompt_len=args.gsp_system_prompt_len,
471
+ question_len=args.gsp_question_len,
472
+ output_len=args.gsp_output_len,
472
473
  tokenizer=tokenizer,
473
474
  )
474
475
  else:
@@ -560,6 +561,7 @@ def sample_sharegpt_requests(
560
561
  num_requests: int,
561
562
  tokenizer: PreTrainedTokenizerBase,
562
563
  fixed_output_len: Optional[int] = None,
564
+ context_len: Optional[int] = None,
563
565
  ) -> List[Tuple[str, int, int]]:
564
566
  if fixed_output_len is not None and fixed_output_len < 4:
565
567
  raise ValueError("output_len too small")
@@ -597,14 +599,15 @@ def sample_sharegpt_requests(
597
599
  output_len = (
598
600
  len(completion_token_ids) if fixed_output_len is None else fixed_output_len
599
601
  )
600
- if prompt_len < 4 or output_len < 4:
602
+
603
+ if prompt_len < 1 or output_len < 1:
601
604
  # Prune too short sequences.
602
605
  continue
603
- if prompt_len > 1024 or (
604
- prompt_len + output_len > 2048 and fixed_output_len is None
605
- ):
606
+
607
+ if context_len and prompt_len + output_len > context_len:
606
608
  # Prune too long sequences.
607
609
  continue
610
+
608
611
  filtered_dataset.append((prompt, prompt_len, output_len))
609
612
 
610
613
  print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")
@@ -706,8 +709,8 @@ def get_gen_prefix_cache_path(args, tokenizer):
706
709
 
707
710
  # Create a unique cache filename based on the generation parameters
708
711
  cache_key = (
709
- f"gen_prefix_{args.gen_num_groups}_{args.gen_prompts_per_group}_"
710
- f"{args.gen_system_prompt_len}_{args.gen_question_len}_{args.gen_output_len}_"
712
+ f"gen_shared_prefix_{args.gsp_num_groups}_{args.gsp_prompts_per_group}_"
713
+ f"{args.gsp_system_prompt_len}_{args.gsp_question_len}_{args.gsp_output_len}_"
711
714
  f"{tokenizer.__class__.__name__}.pkl"
712
715
  )
713
716
  return cache_dir / cache_key
@@ -1374,6 +1377,12 @@ if __name__ == "__main__":
1374
1377
  default=None,
1375
1378
  help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
1376
1379
  )
1380
+ parser.add_argument(
1381
+ "--sharegpt-context-len",
1382
+ type=int,
1383
+ default=None,
1384
+ help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
1385
+ )
1377
1386
  parser.add_argument(
1378
1387
  "--random-input-len",
1379
1388
  type=int,
@@ -1453,49 +1462,49 @@ if __name__ == "__main__":
1453
1462
  help="Append given JSON object to the request payload. You can use this to specify"
1454
1463
  "additional generate params like sampling params.",
1455
1464
  )
1465
+ parser.add_argument(
1466
+ "--profile",
1467
+ action="store_true",
1468
+ help="Use Torch Profiler. The endpoint must be launched with "
1469
+ "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
1470
+ )
1471
+ parser.add_argument(
1472
+ "--lora-name",
1473
+ type=str,
1474
+ default=None,
1475
+ help="The name of LoRA adapter",
1476
+ )
1456
1477
 
1457
1478
  group = parser.add_argument_group("generated-shared-prefix dataset arguments")
1458
1479
  group.add_argument(
1459
- "--gen-num-groups",
1480
+ "--gsp-num-groups",
1460
1481
  type=int,
1461
1482
  default=64,
1462
1483
  help="Number of system prompt groups for generated-shared-prefix dataset",
1463
1484
  )
1464
1485
  group.add_argument(
1465
- "--gen-prompts-per-group",
1486
+ "--gsp-prompts-per-group",
1466
1487
  type=int,
1467
1488
  default=16,
1468
1489
  help="Number of prompts per system prompt group for generated-shared-prefix dataset",
1469
1490
  )
1470
1491
  group.add_argument(
1471
- "--gen-system-prompt-len",
1492
+ "--gsp-system-prompt-len",
1472
1493
  type=int,
1473
1494
  default=2048,
1474
1495
  help="Target length in tokens for system prompts in generated-shared-prefix dataset",
1475
1496
  )
1476
1497
  group.add_argument(
1477
- "--gen-question-len",
1498
+ "--gsp-question-len",
1478
1499
  type=int,
1479
1500
  default=128,
1480
1501
  help="Target length in tokens for questions in generated-shared-prefix dataset",
1481
1502
  )
1482
1503
  group.add_argument(
1483
- "--gen-output-len",
1504
+ "--gsp-output-len",
1484
1505
  type=int,
1485
1506
  default=256,
1486
1507
  help="Target length in tokens for outputs in generated-shared-prefix dataset",
1487
1508
  )
1488
- parser.add_argument(
1489
- "--profile",
1490
- action="store_true",
1491
- help="Use Torch Profiler. The endpoint must be launched with "
1492
- "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
1493
- )
1494
- parser.add_argument(
1495
- "--lora-name",
1496
- type=str,
1497
- default=None,
1498
- help="The name of LoRA adapter",
1499
- )
1500
1509
  args = parser.parse_args()
1501
1510
  run_benchmark(args)