sglang 0.4.1.post4__tar.gz → 0.4.1.post6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (346)
  1. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/PKG-INFO +16 -15
  2. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/README.md +10 -11
  3. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/pyproject.toml +11 -4
  4. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/bench_serving.py +18 -1
  5. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/interpreter.py +71 -1
  6. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/ir.py +2 -0
  7. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/configs/__init__.py +4 -0
  8. sglang-0.4.1.post6/sglang/srt/configs/chatglm.py +78 -0
  9. sglang-0.4.1.post6/sglang/srt/configs/dbrx.py +279 -0
  10. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/configs/model_config.py +16 -7
  11. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/hf_transformers_utils.py +9 -14
  12. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/attention/__init__.py +8 -1
  13. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/attention/flashinfer_backend.py +21 -5
  14. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/linear.py +89 -47
  15. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/logits_processor.py +6 -6
  16. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +16 -5
  17. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/layer.py +39 -12
  18. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/topk.py +4 -2
  19. sglang-0.4.1.post6/sglang/srt/layers/parameter.py +439 -0
  20. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/__init__.py +5 -2
  21. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/fp8.py +107 -53
  22. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/fp8_utils.py +1 -1
  23. sglang-0.4.1.post6/sglang/srt/layers/quantization/int8_kernel.py +54 -0
  24. sglang-0.4.1.post6/sglang/srt/layers/quantization/modelopt_quant.py +174 -0
  25. sglang-0.4.1.post6/sglang/srt/layers/quantization/w8a8_int8.py +117 -0
  26. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/radix_attention.py +2 -0
  27. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/vocab_parallel_embedding.py +16 -3
  28. sglang-0.4.1.post6/sglang/srt/managers/cache_controller.py +307 -0
  29. sglang-0.4.1.post6/sglang/srt/managers/configure_logging.py +43 -0
  30. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/managers/data_parallel_controller.py +2 -0
  31. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/managers/detokenizer_manager.py +0 -2
  32. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/managers/io_struct.py +29 -13
  33. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/managers/schedule_batch.py +7 -1
  34. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/managers/scheduler.py +58 -15
  35. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/managers/session_controller.py +1 -1
  36. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/managers/tokenizer_manager.py +109 -45
  37. sglang-0.4.1.post6/sglang/srt/mem_cache/memory_pool.py +623 -0
  38. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/metrics/collector.py +32 -35
  39. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/model_executor/cuda_graph_runner.py +14 -7
  40. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/model_executor/forward_batch_info.py +20 -15
  41. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/model_executor/model_runner.py +53 -10
  42. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/chatglm.py +1 -1
  43. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/dbrx.py +1 -1
  44. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/grok.py +25 -16
  45. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/llama.py +46 -4
  46. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/qwen2.py +11 -0
  47. sglang-0.4.1.post6/sglang/srt/models/qwen2_eagle.py +131 -0
  48. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +15 -5
  49. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/sampling/sampling_batch_info.py +15 -5
  50. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/sampling/sampling_params.py +1 -1
  51. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/server.py +125 -69
  52. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/server_args.py +39 -19
  53. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/speculative/eagle_utils.py +93 -85
  54. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/speculative/eagle_worker.py +48 -33
  55. sglang-0.4.1.post6/sglang/srt/torch_memory_saver_adapter.py +59 -0
  56. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/utils.py +61 -5
  57. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/test_programs.py +23 -1
  58. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/test_utils.py +36 -7
  59. sglang-0.4.1.post6/sglang/version.py +1 -0
  60. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang.egg-info/PKG-INFO +16 -15
  61. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang.egg-info/SOURCES.txt +10 -0
  62. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang.egg-info/requires.txt +5 -2
  63. sglang-0.4.1.post4/sglang/srt/mem_cache/memory_pool.py +0 -363
  64. sglang-0.4.1.post4/sglang/version.py +0 -1
  65. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/LICENSE +0 -0
  66. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/setup.cfg +0 -0
  67. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/__init__.py +0 -0
  68. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/api.py +0 -0
  69. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/bench_latency.py +0 -0
  70. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/bench_offline_throughput.py +0 -0
  71. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/bench_one_batch.py +0 -0
  72. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/bench_one_batch_server.py +0 -0
  73. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/check_env.py +0 -0
  74. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/global_config.py +0 -0
  75. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/__init__.py +0 -0
  76. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/backend/__init__.py +0 -0
  77. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/backend/anthropic.py +0 -0
  78. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/backend/base_backend.py +0 -0
  79. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/backend/litellm.py +0 -0
  80. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/backend/openai.py +0 -0
  81. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/backend/runtime_endpoint.py +0 -0
  82. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/backend/vertexai.py +0 -0
  83. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/chat_template.py +0 -0
  84. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/choices.py +0 -0
  85. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/compiler.py +0 -0
  86. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/lang/tracer.py +0 -0
  87. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/launch_server.py +0 -0
  88. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/launch_server_llavavid.py +0 -0
  89. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/llama3_eval.py +0 -0
  90. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/_custom_ops.py +0 -0
  91. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/aio_rwlock.py +0 -0
  92. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/configs/device_config.py +0 -0
  93. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/configs/exaone.py +0 -0
  94. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/configs/load_config.py +0 -0
  95. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/configs/qwen2vl.py +0 -0
  96. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/constrained/__init__.py +0 -0
  97. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  98. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/constrained/outlines_backend.py +0 -0
  99. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  100. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  101. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/conversation.py +0 -0
  102. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/__init__.py +0 -0
  103. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/communication_op.py +0 -0
  104. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
  105. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  106. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  107. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  108. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  109. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  110. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  111. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  112. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  113. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/parallel_state.py +0 -0
  114. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/distributed/utils.py +0 -0
  115. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/activation.py +0 -0
  116. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  117. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  118. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/attention/triton_backend.py +0 -0
  119. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  120. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  121. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  122. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  123. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/custom_op_util.py +0 -0
  124. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/layernorm.py +0 -0
  125. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  126. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  127. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  128. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  129. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  130. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  131. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  132. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  133. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  134. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  135. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  136. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  137. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  138. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  139. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  140. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  141. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  142. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  143. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  144. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  145. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  146. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  147. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  148. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  149. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  150. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  151. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  152. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  153. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  154. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  155. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  156. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  157. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  158. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  159. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  160. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  161. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  162. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  163. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  164. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  165. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  166. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  167. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  168. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  169. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  170. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  171. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  172. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  173. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  174. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  175. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  176. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  177. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  178. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  179. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  180. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  181. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  182. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  183. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  184. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  185. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  186. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  187. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  188. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  189. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  190. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  191. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  192. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  193. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  194. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  195. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  196. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  197. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  198. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  199. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  200. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  201. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  202. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  203. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  204. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  205. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  206. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  207. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  208. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  209. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  210. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  211. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  212. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  213. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  214. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  215. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  216. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  217. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  218. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  219. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  220. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  221. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/pooler.py +0 -0
  222. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/base_config.py +0 -0
  223. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  224. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  225. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  226. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  227. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  228. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  229. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  230. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  231. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  232. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  233. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  234. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  235. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  236. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  237. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  238. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  239. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  240. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  241. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  242. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  243. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  244. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  245. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  246. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  247. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  248. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  249. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  250. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  251. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  252. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  253. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  254. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  255. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  256. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  257. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  258. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  259. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  260. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  261. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  262. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/rotary_embedding.py +0 -0
  263. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/sampler.py +0 -0
  264. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/layers/torchao_utils.py +0 -0
  265. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/lora/lora.py +0 -0
  266. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/lora/lora_config.py +0 -0
  267. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/lora/lora_manager.py +0 -0
  268. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/managers/image_processor.py +0 -0
  269. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/managers/schedule_policy.py +0 -0
  270. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/managers/tp_worker.py +0 -0
  271. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  272. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  273. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  274. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/mem_cache/flush_cache.py +0 -0
  275. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/mem_cache/radix_cache.py +0 -0
  276. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/metrics/func_timer.py +0 -0
  277. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/mm_utils.py +0 -0
  278. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/model_loader/__init__.py +0 -0
  279. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/model_loader/loader.py +0 -0
  280. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/model_loader/utils.py +0 -0
  281. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/model_loader/weight_utils.py +0 -0
  282. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/model_parallel.py +0 -0
  283. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/baichuan.py +0 -0
  284. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/commandr.py +0 -0
  285. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/deepseek.py +0 -0
  286. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/deepseek_v2.py +0 -0
  287. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/exaone.py +0 -0
  288. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/gemma.py +0 -0
  289. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/gemma2.py +0 -0
  290. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/gemma2_reward.py +0 -0
  291. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/gpt2.py +0 -0
  292. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/gpt_bigcode.py +0 -0
  293. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/granite.py +0 -0
  294. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/internlm2.py +0 -0
  295. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/internlm2_reward.py +0 -0
  296. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/llama_classification.py +0 -0
  297. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/llama_eagle.py +0 -0
  298. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/llama_embedding.py +0 -0
  299. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/llama_reward.py +0 -0
  300. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/llava.py +0 -0
  301. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/llavavid.py +0 -0
  302. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/minicpm.py +0 -0
  303. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/minicpm3.py +0 -0
  304. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/mistral.py +0 -0
  305. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/mixtral.py +0 -0
  306. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/mixtral_quant.py +0 -0
  307. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/mllama.py +0 -0
  308. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/olmo.py +0 -0
  309. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/olmo2.py +0 -0
  310. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/olmoe.py +0 -0
  311. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/phi3_small.py +0 -0
  312. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/qwen.py +0 -0
  313. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/qwen2_moe.py +0 -0
  314. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/qwen2_vl.py +0 -0
  315. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/registry.py +0 -0
  316. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/stablelm.py +0 -0
  317. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/torch_native_llama.py +0 -0
  318. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/xverse.py +0 -0
  319. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/xverse_moe.py +0 -0
  320. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/models/yivl.py +0 -0
  321. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/openai_api/adapter.py +0 -0
  322. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/openai_api/protocol.py +0 -0
  323. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  324. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  325. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  326. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  327. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  328. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  329. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/srt/speculative/spec_info.py +0 -0
  330. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/few_shot_gsm8k.py +0 -0
  331. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  332. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/run_eval.py +0 -0
  333. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/runners.py +0 -0
  334. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/simple_eval_common.py +0 -0
  335. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/simple_eval_gpqa.py +0 -0
  336. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/simple_eval_humaneval.py +0 -0
  337. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/simple_eval_math.py +0 -0
  338. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/simple_eval_mgsm.py +0 -0
  339. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/simple_eval_mmlu.py +0 -0
  340. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  341. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/test_activation.py +0 -0
  342. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/test_block_fp8.py +0 -0
  343. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/test/test_layernorm.py +0 -0
  344. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang/utils.py +0 -0
  345. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang.egg-info/dependency_links.txt +0 -0
  346. {sglang-0.4.1.post4 → sglang-0.4.1.post6}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: sglang
3
- Version: 0.4.1.post4
3
+ Version: 0.4.1.post6
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -239,11 +239,11 @@ Requires-Dist: uvloop; extra == "runtime-common"
239
239
  Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
240
240
  Provides-Extra: srt
241
241
  Requires-Dist: sglang[runtime_common]; extra == "srt"
242
+ Requires-Dist: cuda-python; extra == "srt"
243
+ Requires-Dist: sgl-kernel>=0.0.2.post12; extra == "srt"
242
244
  Requires-Dist: torch; extra == "srt"
243
245
  Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
244
- Requires-Dist: cuda-python; extra == "srt"
245
246
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
246
- Requires-Dist: sgl-kernel>=0.0.2.post11; extra == "srt"
247
247
  Provides-Extra: srt-hip
248
248
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
249
249
  Requires-Dist: torch; extra == "srt-hip"
@@ -259,6 +259,8 @@ Provides-Extra: anthropic
259
259
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
260
260
  Provides-Extra: litellm
261
261
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
262
+ Provides-Extra: torch-memory-saver
263
+ Requires-Dist: torch_memory_saver; extra == "torch-memory-saver"
262
264
  Provides-Extra: test
263
265
  Requires-Dist: jsonlines; extra == "test"
264
266
  Requires-Dist: matplotlib; extra == "test"
@@ -314,9 +316,9 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
314
316
  --------------------------------------------------------------------------------
315
317
 
316
318
  | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
317
- | [**Documentation**](https://sgl-project.github.io/)
318
- | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw)
319
- | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
319
+ | [**Documentation**](https://docs.sglang.ai/)
320
+ | [**Join Slack**](https://slack.sglang.ai/)
321
+ | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
320
322
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
321
323
 
322
324
  ## News
@@ -346,14 +348,14 @@ The core features include:
346
348
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
347
349
 
348
350
  ## Getting Started
349
- - [Install SGLang](https://sgl-project.github.io/start/install.html)
350
- - [Quick Start](https://sgl-project.github.io/start/send_request.html)
351
- - [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html)
352
- - [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html)
353
- - [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html)
351
+ - [Install SGLang](https://docs.sglang.ai/start/install.html)
352
+ - [Quick Start](https://docs.sglang.ai/start/send_request.html)
353
+ - [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html)
354
+ - [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html)
355
+ - [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)
354
356
 
355
357
  ## Benchmark and Performance
356
- Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
358
+ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
357
359
 
358
360
  ## Roadmap
359
361
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -362,5 +364,4 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
362
364
  The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
363
365
 
364
366
  ## Acknowledgment and Citation
365
- We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
366
- Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
367
+ We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -13,9 +13,9 @@
13
13
  --------------------------------------------------------------------------------
14
14
 
15
15
  | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
16
- | [**Documentation**](https://sgl-project.github.io/)
17
- | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw)
18
- | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
16
+ | [**Documentation**](https://docs.sglang.ai/)
17
+ | [**Join Slack**](https://slack.sglang.ai/)
18
+ | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
19
19
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
20
20
 
21
21
  ## News
@@ -45,14 +45,14 @@ The core features include:
45
45
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
46
46
 
47
47
  ## Getting Started
48
- - [Install SGLang](https://sgl-project.github.io/start/install.html)
49
- - [Quick Start](https://sgl-project.github.io/start/send_request.html)
50
- - [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html)
51
- - [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html)
52
- - [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html)
48
+ - [Install SGLang](https://docs.sglang.ai/start/install.html)
49
+ - [Quick Start](https://docs.sglang.ai/start/send_request.html)
50
+ - [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html)
51
+ - [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html)
52
+ - [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)
53
53
 
54
54
  ## Benchmark and Performance
55
- Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
55
+ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
56
56
 
57
57
  ## Roadmap
58
58
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -61,5 +61,4 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
61
61
  The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
62
62
 
63
63
  ## Acknowledgment and Citation
64
- We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
65
- Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
64
+ We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.1.post4"
7
+ version = "0.4.1.post6"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -16,14 +16,20 @@ classifiers = [
16
16
  dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
17
17
 
18
18
  [project.optional-dependencies]
19
- runtime_common = ["aiohttp", "decord", "fastapi",
19
+ runtime_common = [
20
+ "aiohttp", "decord", "fastapi",
20
21
  "hf_transfer", "huggingface_hub", "interegular", "modelscope",
21
22
  "orjson", "outlines>=0.0.44,<0.1.0",
22
23
  "packaging", "pillow", "prometheus-client>=0.20.0",
23
24
  "psutil", "pydantic", "python-multipart",
24
25
  "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
25
- "xgrammar>=0.1.6"]
26
- srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post11"]
26
+ "xgrammar>=0.1.6"
27
+ ]
28
+ srt = [
29
+ "sglang[runtime_common]", "cuda-python",
30
+ "sgl-kernel>=0.0.2.post12", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1",
31
+ "flashinfer==0.1.6"
32
+ ]
27
33
 
28
34
  # HIP (Heterogeneous-computing Interface for Portability) for AMD
29
35
  # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -38,6 +44,7 @@ srt_hpu = ["sglang[runtime_common]"]
38
44
  openai = ["openai>=1.0", "tiktoken"]
39
45
  anthropic = ["anthropic>=0.20.0"]
40
46
  litellm = ["litellm>=1.0.0"]
47
+ torch_memory_saver = ["torch_memory_saver"]
41
48
  test = [
42
49
  "jsonlines",
43
50
  "matplotlib",
@@ -514,6 +514,8 @@ class BenchmarkMetrics:
514
514
  p99_itl_ms: float
515
515
  mean_e2e_latency_ms: float
516
516
  median_e2e_latency_ms: float
517
+ std_e2e_latency_ms: float
518
+ p99_e2e_latency_ms: float
517
519
 
518
520
 
519
521
  SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
@@ -563,7 +565,7 @@ def sample_sharegpt_requests(
563
565
  raise ValueError("output_len too small")
564
566
 
565
567
  # Download sharegpt if necessary
566
- if not os.path.isfile(dataset_path):
568
+ if not os.path.isfile(dataset_path) and dataset_path == "":
567
569
  dataset_path = download_and_cache_file(SHAREGPT_URL)
568
570
 
569
571
  # Load the dataset.
@@ -873,6 +875,8 @@ def calculate_metrics(
873
875
  p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
874
876
  mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
875
877
  median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
878
+ std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
879
+ p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000,
876
880
  )
877
881
 
878
882
  return metrics, output_lens
@@ -1064,8 +1068,21 @@ async def benchmark(
1064
1068
  "total_output_tokens_retokenized": metrics.total_output_retokenized,
1065
1069
  "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
1066
1070
  "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
1071
+ "std_e2e_latency_ms": metrics.std_e2e_latency_ms,
1072
+ "p99_e2e_latency_ms": metrics.p99_e2e_latency_ms,
1073
+ "mean_ttft_ms": metrics.mean_ttft_ms,
1067
1074
  "median_ttft_ms": metrics.median_ttft_ms,
1075
+ "std_ttft_ms": metrics.std_ttft_ms,
1076
+ "p99_ttft_ms": metrics.p99_ttft_ms,
1077
+ "mean_tpot_ms": metrics.mean_tpot_ms,
1078
+ "median_tpot_ms": metrics.median_tpot_ms,
1079
+ "std_tpot_ms": metrics.std_tpot_ms,
1080
+ "p99_tpot_ms": metrics.p99_tpot_ms,
1081
+ "mean_itl_ms": metrics.mean_itl_ms,
1068
1082
  "median_itl_ms": metrics.median_itl_ms,
1083
+ "std_itl_ms": metrics.std_itl_ms,
1084
+ "p99_itl_ms": metrics.p99_itl_ms,
1085
+ "input_throughput": metrics.input_throughput,
1069
1086
  "output_throughput": metrics.output_throughput,
1070
1087
  "sharegpt_output_len": args.sharegpt_output_len,
1071
1088
  "random_input_len": args.random_input_len,
@@ -96,6 +96,7 @@ def run_program_batch(
96
96
  default_sampling_para,
97
97
  num_threads,
98
98
  progress_bar,
99
+ generator_style=False,
99
100
  ):
100
101
  if hasattr(backend, "endpoint"):
101
102
  backend = backend.endpoint
@@ -109,6 +110,17 @@ def run_program_batch(
109
110
  num_threads = max(96, multiprocessing.cpu_count() * 16)
110
111
  num_threads = min(num_threads, len(batch_arguments))
111
112
 
113
+ if generator_style:
114
+ return _run_program_batch_generator(
115
+ program,
116
+ backend,
117
+ batch_arguments,
118
+ default_sampling_para,
119
+ num_threads,
120
+ progress_bar,
121
+ )
122
+
123
+ # Original code path when generator_style=False
112
124
  if num_threads == 1:
113
125
  rets = []
114
126
  if progress_bar:
@@ -168,6 +180,64 @@ def run_program_batch(
168
180
  return rets
169
181
 
170
182
 
183
+ def _run_program_batch_generator(
184
+ program,
185
+ backend,
186
+ batch_arguments,
187
+ default_sampling_para,
188
+ num_threads,
189
+ progress_bar,
190
+ ):
191
+ """Helper function that yields results one by one using chunking to avoid overwhelming ThreadPoolExecutor."""
192
+ if num_threads == 1:
193
+ iterator = tqdm.tqdm(batch_arguments) if progress_bar else batch_arguments
194
+ for arguments in iterator:
195
+ yield run_program(
196
+ program,
197
+ backend,
198
+ (),
199
+ arguments,
200
+ default_sampling_para,
201
+ False,
202
+ True,
203
+ )
204
+ else:
205
+ pbar = tqdm.tqdm(total=len(batch_arguments)) if progress_bar else None
206
+
207
+ # Process in chunks to avoid overwhelming ThreadPoolExecutor
208
+ # Otherwise, ThreadPoolExecutor.submit will block after adding certain number of tasks
209
+ # so we will never reach "yield" until all tasks are done
210
+ chunk_size = 200
211
+
212
+ with ThreadPoolExecutor(num_threads) as executor:
213
+ for chunk_start in range(0, len(batch_arguments), chunk_size):
214
+ chunk_end = min(chunk_start + chunk_size, len(batch_arguments))
215
+ chunk_futures = []
216
+
217
+ # Submit chunk of tasks
218
+ for i in range(chunk_start, chunk_end):
219
+ future = executor.submit(
220
+ run_program,
221
+ program,
222
+ backend,
223
+ (),
224
+ batch_arguments[i],
225
+ default_sampling_para,
226
+ False,
227
+ True,
228
+ )
229
+ if pbar:
230
+ future.add_done_callback(lambda _: pbar.update())
231
+ chunk_futures.append(future)
232
+
233
+ # Yield results from this chunk as they complete
234
+ for future in chunk_futures:
235
+ yield future.result()
236
+
237
+ if pbar:
238
+ pbar.close()
239
+
240
+
171
241
  def cache_program(program, backend):
172
242
  from sglang.lang.tracer import extract_prefix_by_tracing
173
243
 
@@ -277,7 +347,7 @@ class StreamExecutor:
277
347
  size: int = 1,
278
348
  position_ids_offset: Optional[List[int]] = None,
279
349
  ):
280
- if size > 1:
350
+ if size > 1 and str(self.text_):
281
351
  self.submit(SglCommitLazy())
282
352
 
283
353
  self.sync()
@@ -227,6 +227,7 @@ class SglFunction:
227
227
  backend=None,
228
228
  num_threads: Union[str, int] = "auto",
229
229
  progress_bar: bool = False,
230
+ generator_style: bool = False,
230
231
  ):
231
232
  from sglang.lang.interpreter import run_program_batch
232
233
 
@@ -277,6 +278,7 @@ class SglFunction:
277
278
  default_sampling_para,
278
279
  num_threads,
279
280
  progress_bar,
281
+ generator_style=generator_style,
280
282
  )
281
283
 
282
284
  def trace(self, *, backend=None, **kwargs):
@@ -1,3 +1,5 @@
1
+ from sglang.srt.configs.chatglm import ChatGLMConfig
2
+ from sglang.srt.configs.dbrx import DbrxConfig
1
3
  from sglang.srt.configs.exaone import ExaoneConfig
2
4
  from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig
3
5
 
@@ -5,4 +7,6 @@ __all__ = [
5
7
  "ExaoneConfig",
6
8
  "Qwen2VLConfig",
7
9
  "Qwen2VLVisionConfig",
10
+ "ChatGLMConfig",
11
+ "DbrxConfig",
8
12
  ]
@@ -0,0 +1,78 @@
1
+ # Adapted from
2
+ # https://github.com/THUDM/ChatGLM2-6B
3
+ # https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/chatglm.py
4
+
5
+ # ChatGLM2 and ChatGLM3 share the same config.
6
+ # ChatGLM4 is officially supported by Huggingface
7
+ # transformers >= 4.46.0 is required
8
+ # https://huggingface.co/docs/transformers/en/model_doc/glm
9
+ from transformers import PretrainedConfig
10
+
11
+
12
+ class ChatGLMConfig(PretrainedConfig):
13
+ model_type = "chatglm"
14
+ attribute_map = {
15
+ "num_hidden_layers": "num_layers",
16
+ "n_head_kv": "multi_query_group_num",
17
+ }
18
+
19
+ def __init__(
20
+ self,
21
+ num_layers=28,
22
+ padded_vocab_size=65024,
23
+ hidden_size=4096,
24
+ ffn_hidden_size=13696,
25
+ kv_channels=128,
26
+ num_attention_heads=32,
27
+ seq_length=2048,
28
+ hidden_dropout=0.0,
29
+ attention_dropout=0.0,
30
+ layernorm_epsilon=1e-5,
31
+ rmsnorm=True,
32
+ apply_residual_connection_post_layernorm=False,
33
+ post_layer_norm=True,
34
+ add_bias_linear=False,
35
+ add_qkv_bias=False,
36
+ interleaved_qkv=False,
37
+ bias_dropout_fusion=True,
38
+ multi_query_attention=False,
39
+ multi_query_group_num=1,
40
+ apply_query_key_layer_scaling=True,
41
+ attention_softmax_in_fp32=True,
42
+ fp32_residual_connection=False,
43
+ quantization_bit=0,
44
+ pre_seq_len=None,
45
+ prefix_projection=False,
46
+ **kwargs
47
+ ):
48
+ self.num_layers = num_layers
49
+ self.vocab_size = padded_vocab_size
50
+ self.padded_vocab_size = padded_vocab_size
51
+ self.hidden_size = hidden_size
52
+ self.ffn_hidden_size = ffn_hidden_size
53
+ self.kv_channels = kv_channels
54
+ self.num_attention_heads = num_attention_heads
55
+ self.seq_length = seq_length
56
+ # It is to be compatible with long lora.
57
+ self.max_position_embeddings = seq_length
58
+ self.hidden_dropout = hidden_dropout
59
+ self.attention_dropout = attention_dropout
60
+ self.layernorm_epsilon = layernorm_epsilon
61
+ self.rmsnorm = rmsnorm
62
+ self.apply_residual_connection_post_layernorm = (
63
+ apply_residual_connection_post_layernorm
64
+ )
65
+ self.post_layer_norm = post_layer_norm
66
+ self.add_bias_linear = add_bias_linear
67
+ self.add_qkv_bias = add_qkv_bias
68
+ self.bias_dropout_fusion = bias_dropout_fusion
69
+ self.multi_query_attention = multi_query_attention
70
+ self.multi_query_group_num = multi_query_group_num
71
+ self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
72
+ self.attention_softmax_in_fp32 = attention_softmax_in_fp32
73
+ self.fp32_residual_connection = fp32_residual_connection
74
+ self.quantization_bit = quantization_bit
75
+ self.pre_seq_len = pre_seq_len
76
+ self.prefix_projection = prefix_projection
77
+ self.interleaved_qkv = interleaved_qkv
78
+ super().__init__(**kwargs)