sglang 0.4.1.post3__tar.gz → 0.4.1.post5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (341)
  1. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/PKG-INFO +12 -12
  2. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/README.md +7 -7
  3. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/pyproject.toml +11 -5
  4. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/bench_one_batch.py +2 -0
  5. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/bench_serving.py +18 -1
  6. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/interpreter.py +71 -1
  7. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/ir.py +2 -0
  8. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/configs/__init__.py +4 -0
  9. sglang-0.4.1.post5/sglang/srt/configs/chatglm.py +78 -0
  10. sglang-0.4.1.post5/sglang/srt/configs/dbrx.py +279 -0
  11. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/configs/model_config.py +1 -1
  12. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/hf_transformers_utils.py +9 -14
  13. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/attention/__init__.py +22 -6
  14. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
  15. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/attention/flashinfer_backend.py +215 -83
  16. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/attention/torch_native_backend.py +1 -38
  17. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/attention/triton_backend.py +20 -11
  18. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
  19. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/linear.py +159 -55
  20. sglang-0.4.1.post5/sglang/srt/layers/logits_processor.py +346 -0
  21. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  22. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  23. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  24. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  25. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  26. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  27. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  28. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  29. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  30. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  31. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  32. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  33. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  34. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  35. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  36. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  37. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  38. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  39. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  40. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  41. sglang-0.4.1.post5/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  42. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +198 -29
  43. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -7
  44. sglang-0.4.1.post5/sglang/srt/layers/parameter.py +431 -0
  45. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/__init__.py +3 -2
  46. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/fp8.py +3 -3
  47. sglang-0.4.1.post5/sglang/srt/layers/quantization/modelopt_quant.py +174 -0
  48. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/sampler.py +57 -21
  49. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/torchao_utils.py +17 -3
  50. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/vocab_parallel_embedding.py +1 -1
  51. sglang-0.4.1.post5/sglang/srt/managers/cache_controller.py +307 -0
  52. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/managers/data_parallel_controller.py +2 -0
  53. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/managers/io_struct.py +1 -2
  54. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/managers/schedule_batch.py +33 -3
  55. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/managers/schedule_policy.py +159 -90
  56. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/managers/scheduler.py +68 -28
  57. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/managers/session_controller.py +1 -1
  58. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/managers/tokenizer_manager.py +27 -21
  59. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/managers/tp_worker.py +16 -4
  60. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  61. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/mem_cache/memory_pool.py +206 -1
  62. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/metrics/collector.py +22 -30
  63. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/model_executor/cuda_graph_runner.py +129 -77
  64. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/model_executor/forward_batch_info.py +51 -21
  65. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/model_executor/model_runner.py +72 -64
  66. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/chatglm.py +1 -1
  67. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/dbrx.py +1 -1
  68. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/deepseek_v2.py +34 -7
  69. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/grok.py +109 -29
  70. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/llama.py +9 -2
  71. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/openai_api/adapter.py +0 -17
  72. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/sampling/sampling_batch_info.py +22 -0
  73. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/sampling/sampling_params.py +9 -1
  74. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/server.py +20 -13
  75. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/server_args.py +120 -58
  76. sglang-0.4.1.post5/sglang/srt/speculative/build_eagle_tree.py +347 -0
  77. sglang-0.4.1.post5/sglang/srt/speculative/eagle_utils.py +626 -0
  78. sglang-0.4.1.post5/sglang/srt/speculative/eagle_worker.py +184 -0
  79. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/speculative/spec_info.py +5 -0
  80. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/utils.py +47 -7
  81. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/test_programs.py +23 -1
  82. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/test_utils.py +36 -7
  83. sglang-0.4.1.post5/sglang/version.py +1 -0
  84. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang.egg-info/PKG-INFO +12 -12
  85. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang.egg-info/SOURCES.txt +29 -0
  86. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang.egg-info/requires.txt +3 -3
  87. sglang-0.4.1.post3/sglang/srt/layers/logits_processor.py +0 -391
  88. sglang-0.4.1.post3/sglang/version.py +0 -1
  89. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/LICENSE +0 -0
  90. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/setup.cfg +0 -0
  91. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/__init__.py +0 -0
  92. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/api.py +0 -0
  93. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/bench_latency.py +0 -0
  94. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/bench_offline_throughput.py +0 -0
  95. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/bench_one_batch_server.py +0 -0
  96. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/check_env.py +0 -0
  97. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/global_config.py +0 -0
  98. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/__init__.py +0 -0
  99. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/backend/__init__.py +0 -0
  100. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/backend/anthropic.py +0 -0
  101. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/backend/base_backend.py +0 -0
  102. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/backend/litellm.py +0 -0
  103. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/backend/openai.py +0 -0
  104. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/backend/runtime_endpoint.py +0 -0
  105. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/backend/vertexai.py +0 -0
  106. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/chat_template.py +0 -0
  107. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/choices.py +0 -0
  108. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/compiler.py +0 -0
  109. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/lang/tracer.py +0 -0
  110. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/launch_server.py +0 -0
  111. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/launch_server_llavavid.py +0 -0
  112. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/llama3_eval.py +0 -0
  113. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/_custom_ops.py +0 -0
  114. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/aio_rwlock.py +0 -0
  115. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/configs/device_config.py +0 -0
  116. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/configs/exaone.py +0 -0
  117. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/configs/load_config.py +0 -0
  118. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/configs/qwen2vl.py +0 -0
  119. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/constrained/__init__.py +0 -0
  120. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  121. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/constrained/outlines_backend.py +0 -0
  122. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  123. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  124. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/conversation.py +0 -0
  125. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/__init__.py +0 -0
  126. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/communication_op.py +0 -0
  127. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
  128. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  129. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  130. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  131. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  132. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  133. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  134. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  135. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  136. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/parallel_state.py +0 -0
  137. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/distributed/utils.py +0 -0
  138. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/activation.py +0 -0
  139. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  140. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  141. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  142. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/custom_op_util.py +0 -0
  143. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/layernorm.py +0 -0
  144. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  145. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  146. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  147. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  148. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  149. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  150. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  151. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  152. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  153. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  154. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  155. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  156. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  157. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  158. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  159. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  160. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  161. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  162. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  163. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  164. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  165. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  166. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  167. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  168. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  169. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  170. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  171. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  172. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  173. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  174. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  175. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  176. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  177. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  178. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  179. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  180. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  181. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  182. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  183. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  184. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  185. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  186. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  187. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  188. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  189. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  190. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  191. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  192. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  193. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  194. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  195. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  196. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  197. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  198. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  199. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  200. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  201. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  202. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  203. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  204. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  205. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  206. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  207. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  208. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  209. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  210. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  211. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  212. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  213. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  214. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  215. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  216. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  217. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  218. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  219. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/moe/topk.py +0 -0
  220. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/pooler.py +0 -0
  221. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/base_config.py +0 -0
  222. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  223. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  224. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  225. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  226. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  227. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  228. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  229. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  230. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  231. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  232. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  233. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  234. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  235. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  236. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  237. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  238. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  239. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  240. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  241. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  242. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  243. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  244. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  245. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  246. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  247. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  248. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  249. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  250. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  251. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  252. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  253. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  254. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  255. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  256. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  257. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  258. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  259. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  260. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  261. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  262. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/radix_attention.py +0 -0
  263. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/layers/rotary_embedding.py +0 -0
  264. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/lora/lora.py +0 -0
  265. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/lora/lora_config.py +0 -0
  266. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/lora/lora_manager.py +0 -0
  267. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/managers/detokenizer_manager.py +0 -0
  268. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/managers/image_processor.py +0 -0
  269. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  270. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  271. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/mem_cache/flush_cache.py +0 -0
  272. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/mem_cache/radix_cache.py +0 -0
  273. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/metrics/func_timer.py +0 -0
  274. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/mm_utils.py +0 -0
  275. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/model_loader/__init__.py +0 -0
  276. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/model_loader/loader.py +0 -0
  277. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/model_loader/utils.py +0 -0
  278. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/model_loader/weight_utils.py +0 -0
  279. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/model_parallel.py +0 -0
  280. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/baichuan.py +0 -0
  281. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/commandr.py +0 -0
  282. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/deepseek.py +0 -0
  283. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/exaone.py +0 -0
  284. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/gemma.py +0 -0
  285. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/gemma2.py +0 -0
  286. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/gemma2_reward.py +0 -0
  287. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/gpt2.py +0 -0
  288. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/gpt_bigcode.py +0 -0
  289. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/granite.py +0 -0
  290. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/internlm2.py +0 -0
  291. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/internlm2_reward.py +0 -0
  292. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/llama_classification.py +0 -0
  293. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/llama_eagle.py +0 -0
  294. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/llama_embedding.py +0 -0
  295. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/llama_reward.py +0 -0
  296. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/llava.py +0 -0
  297. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/llavavid.py +0 -0
  298. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/minicpm.py +0 -0
  299. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/minicpm3.py +0 -0
  300. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/mistral.py +0 -0
  301. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/mixtral.py +0 -0
  302. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/mixtral_quant.py +0 -0
  303. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/mllama.py +0 -0
  304. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/olmo.py +0 -0
  305. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/olmo2.py +0 -0
  306. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/olmoe.py +0 -0
  307. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/phi3_small.py +0 -0
  308. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/qwen.py +0 -0
  309. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/qwen2.py +0 -0
  310. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/qwen2_moe.py +0 -0
  311. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/qwen2_vl.py +0 -0
  312. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/registry.py +0 -0
  313. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/stablelm.py +0 -0
  314. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/torch_native_llama.py +0 -0
  315. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/xverse.py +0 -0
  316. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/xverse_moe.py +0 -0
  317. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/models/yivl.py +0 -0
  318. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/openai_api/protocol.py +3 -3
  319. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  320. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  321. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  322. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  323. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  324. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  325. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/few_shot_gsm8k.py +0 -0
  326. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  327. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/run_eval.py +0 -0
  328. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/runners.py +0 -0
  329. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/simple_eval_common.py +0 -0
  330. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/simple_eval_gpqa.py +0 -0
  331. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/simple_eval_humaneval.py +0 -0
  332. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/simple_eval_math.py +0 -0
  333. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/simple_eval_mgsm.py +0 -0
  334. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/simple_eval_mmlu.py +0 -0
  335. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  336. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/test_activation.py +0 -0
  337. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/test_block_fp8.py +0 -0
  338. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/test/test_layernorm.py +0 -0
  339. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang/utils.py +0 -0
  340. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang.egg-info/dependency_links.txt +0 -0
  341. {sglang-0.4.1.post3 → sglang-0.4.1.post5}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: sglang
3
- Version: 0.4.1.post3
3
+ Version: 0.4.1.post5
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -239,15 +239,15 @@ Requires-Dist: uvloop; extra == "runtime-common"
239
239
  Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
240
240
  Provides-Extra: srt
241
241
  Requires-Dist: sglang[runtime_common]; extra == "srt"
242
+ Requires-Dist: cuda-python; extra == "srt"
243
+ Requires-Dist: sgl-kernel>=0.0.2.post11; extra == "srt"
242
244
  Requires-Dist: torch; extra == "srt"
243
245
  Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
244
- Requires-Dist: cuda-python; extra == "srt"
245
246
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
246
- Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
247
247
  Provides-Extra: srt-hip
248
248
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
249
249
  Requires-Dist: torch; extra == "srt-hip"
250
- Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
250
+ Requires-Dist: vllm==0.6.3.post2.dev1; extra == "srt-hip"
251
251
  Provides-Extra: srt-xpu
252
252
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
253
253
  Provides-Extra: srt-hpu
@@ -315,7 +315,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
315
315
 
316
316
  | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
317
317
  | [**Documentation**](https://sgl-project.github.io/)
318
- | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
318
+ | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw)
319
319
  | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
320
320
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
321
321
 
@@ -347,12 +347,13 @@ The core features include:
347
347
 
348
348
  ## Getting Started
349
349
  - [Install SGLang](https://sgl-project.github.io/start/install.html)
350
- - [Send requests](https://sgl-project.github.io/start/send_request.html)
351
- - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
352
- - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
350
+ - [Quick Start](https://sgl-project.github.io/start/send_request.html)
351
+ - [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html)
352
+ - [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html)
353
+ - [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html)
353
354
 
354
355
  ## Benchmark and Performance
355
- Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
356
+ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
356
357
 
357
358
  ## Roadmap
358
359
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -361,5 +362,4 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
361
362
  The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
362
363
 
363
364
  ## Acknowledgment and Citation
364
- We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
365
- Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
365
+ We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -14,7 +14,7 @@
14
14
 
15
15
  | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
16
16
  | [**Documentation**](https://sgl-project.github.io/)
17
- | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
17
+ | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw)
18
18
  | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
19
19
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
20
20
 
@@ -46,12 +46,13 @@ The core features include:
46
46
 
47
47
  ## Getting Started
48
48
  - [Install SGLang](https://sgl-project.github.io/start/install.html)
49
- - [Send requests](https://sgl-project.github.io/start/send_request.html)
50
- - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
51
- - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
49
+ - [Quick Start](https://sgl-project.github.io/start/send_request.html)
50
+ - [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html)
51
+ - [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html)
52
+ - [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html)
52
53
 
53
54
  ## Benchmark and Performance
54
- Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
55
+ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
55
56
 
56
57
  ## Roadmap
57
58
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -60,5 +61,4 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
60
61
  The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
61
62
 
62
63
  ## Acknowledgment and Citation
63
- We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
64
- Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
64
+ We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.1.post3"
7
+ version = "0.4.1.post5"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -16,18 +16,24 @@ classifiers = [
16
16
  dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
17
17
 
18
18
  [project.optional-dependencies]
19
- runtime_common = ["aiohttp", "decord", "fastapi",
19
+ runtime_common = [
20
+ "aiohttp", "decord", "fastapi",
20
21
  "hf_transfer", "huggingface_hub", "interegular", "modelscope",
21
22
  "orjson", "outlines>=0.0.44,<0.1.0",
22
23
  "packaging", "pillow", "prometheus-client>=0.20.0",
23
24
  "psutil", "pydantic", "python-multipart",
24
25
  "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
25
- "xgrammar>=0.1.6"]
26
- srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post10"]
26
+ "xgrammar>=0.1.6"
27
+ ]
28
+ srt = [
29
+ "sglang[runtime_common]", "cuda-python",
30
+ "sgl-kernel>=0.0.2.post11", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1",
31
+ "flashinfer==0.1.6"
32
+ ]
27
33
 
28
34
  # HIP (Heterogeneous-computing Interface for Portability) for AMD
29
35
  # => base docker rocm/vllm-dev:20241022, not from public vllm whl
30
- srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
36
+ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post2.dev1"]
31
37
  # xpu is not enabled in public vllm and torch whl,
32
38
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
33
39
  srt_xpu = ["sglang[runtime_common]"]
@@ -63,6 +63,7 @@ from sglang.srt.model_executor.model_runner import ModelRunner
63
63
  from sglang.srt.sampling.sampling_params import SamplingParams
64
64
  from sglang.srt.server import _set_envs_and_config
65
65
  from sglang.srt.server_args import PortArgs, ServerArgs
66
+ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
66
67
  from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
67
68
 
68
69
 
@@ -214,6 +215,7 @@ def extend(reqs, model_runner):
214
215
  tree_cache=None,
215
216
  model_config=model_runner.model_config,
216
217
  enable_overlap=False,
218
+ spec_algorithm=SpeculativeAlgorithm.NONE,
217
219
  )
218
220
  batch.prepare_for_extend()
219
221
  model_worker_batch = batch.get_model_worker_batch()
@@ -514,6 +514,8 @@ class BenchmarkMetrics:
514
514
  p99_itl_ms: float
515
515
  mean_e2e_latency_ms: float
516
516
  median_e2e_latency_ms: float
517
+ std_e2e_latency_ms: float
518
+ p99_e2e_latency_ms: float
517
519
 
518
520
 
519
521
  SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
@@ -563,7 +565,7 @@ def sample_sharegpt_requests(
563
565
  raise ValueError("output_len too small")
564
566
 
565
567
  # Download sharegpt if necessary
566
- if not os.path.isfile(dataset_path):
568
+ if not os.path.isfile(dataset_path) and dataset_path == "":
567
569
  dataset_path = download_and_cache_file(SHAREGPT_URL)
568
570
 
569
571
  # Load the dataset.
@@ -873,6 +875,8 @@ def calculate_metrics(
873
875
  p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
874
876
  mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
875
877
  median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
878
+ std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
879
+ p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000,
876
880
  )
877
881
 
878
882
  return metrics, output_lens
@@ -1064,8 +1068,21 @@ async def benchmark(
1064
1068
  "total_output_tokens_retokenized": metrics.total_output_retokenized,
1065
1069
  "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
1066
1070
  "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
1071
+ "std_e2e_latency_ms": metrics.std_e2e_latency_ms,
1072
+ "p99_e2e_latency_ms": metrics.p99_e2e_latency_ms,
1073
+ "mean_ttft_ms": metrics.mean_ttft_ms,
1067
1074
  "median_ttft_ms": metrics.median_ttft_ms,
1075
+ "std_ttft_ms": metrics.std_ttft_ms,
1076
+ "p99_ttft_ms": metrics.p99_ttft_ms,
1077
+ "mean_tpot_ms": metrics.mean_tpot_ms,
1078
+ "median_tpot_ms": metrics.median_tpot_ms,
1079
+ "std_tpot_ms": metrics.std_tpot_ms,
1080
+ "p99_tpot_ms": metrics.p99_tpot_ms,
1081
+ "mean_itl_ms": metrics.mean_itl_ms,
1068
1082
  "median_itl_ms": metrics.median_itl_ms,
1083
+ "std_itl_ms": metrics.std_itl_ms,
1084
+ "p99_itl_ms": metrics.p99_itl_ms,
1085
+ "input_throughput": metrics.input_throughput,
1069
1086
  "output_throughput": metrics.output_throughput,
1070
1087
  "sharegpt_output_len": args.sharegpt_output_len,
1071
1088
  "random_input_len": args.random_input_len,
@@ -96,6 +96,7 @@ def run_program_batch(
96
96
  default_sampling_para,
97
97
  num_threads,
98
98
  progress_bar,
99
+ generator_style=False,
99
100
  ):
100
101
  if hasattr(backend, "endpoint"):
101
102
  backend = backend.endpoint
@@ -109,6 +110,17 @@ def run_program_batch(
109
110
  num_threads = max(96, multiprocessing.cpu_count() * 16)
110
111
  num_threads = min(num_threads, len(batch_arguments))
111
112
 
113
+ if generator_style:
114
+ return _run_program_batch_generator(
115
+ program,
116
+ backend,
117
+ batch_arguments,
118
+ default_sampling_para,
119
+ num_threads,
120
+ progress_bar,
121
+ )
122
+
123
+ # Original code path when generator_style=False
112
124
  if num_threads == 1:
113
125
  rets = []
114
126
  if progress_bar:
@@ -168,6 +180,64 @@ def run_program_batch(
168
180
  return rets
169
181
 
170
182
 
183
+ def _run_program_batch_generator(
184
+ program,
185
+ backend,
186
+ batch_arguments,
187
+ default_sampling_para,
188
+ num_threads,
189
+ progress_bar,
190
+ ):
191
+ """Helper function that yields results one by one using chunking to avoid overwhelming ThreadPoolExecutor."""
192
+ if num_threads == 1:
193
+ iterator = tqdm.tqdm(batch_arguments) if progress_bar else batch_arguments
194
+ for arguments in iterator:
195
+ yield run_program(
196
+ program,
197
+ backend,
198
+ (),
199
+ arguments,
200
+ default_sampling_para,
201
+ False,
202
+ True,
203
+ )
204
+ else:
205
+ pbar = tqdm.tqdm(total=len(batch_arguments)) if progress_bar else None
206
+
207
+ # Process in chunks to avoid overwhelming ThreadPoolExecutor
208
+ # Otherwise, ThreadPoolExecutor.submit will block after adding certain number of tasks
209
+ # so we will never reach "yield" until all tasks are done
210
+ chunk_size = 200
211
+
212
+ with ThreadPoolExecutor(num_threads) as executor:
213
+ for chunk_start in range(0, len(batch_arguments), chunk_size):
214
+ chunk_end = min(chunk_start + chunk_size, len(batch_arguments))
215
+ chunk_futures = []
216
+
217
+ # Submit chunk of tasks
218
+ for i in range(chunk_start, chunk_end):
219
+ future = executor.submit(
220
+ run_program,
221
+ program,
222
+ backend,
223
+ (),
224
+ batch_arguments[i],
225
+ default_sampling_para,
226
+ False,
227
+ True,
228
+ )
229
+ if pbar:
230
+ future.add_done_callback(lambda _: pbar.update())
231
+ chunk_futures.append(future)
232
+
233
+ # Yield results from this chunk as they complete
234
+ for future in chunk_futures:
235
+ yield future.result()
236
+
237
+ if pbar:
238
+ pbar.close()
239
+
240
+
171
241
  def cache_program(program, backend):
172
242
  from sglang.lang.tracer import extract_prefix_by_tracing
173
243
 
@@ -277,7 +347,7 @@ class StreamExecutor:
277
347
  size: int = 1,
278
348
  position_ids_offset: Optional[List[int]] = None,
279
349
  ):
280
- if size > 1:
350
+ if size > 1 and str(self.text_):
281
351
  self.submit(SglCommitLazy())
282
352
 
283
353
  self.sync()
@@ -227,6 +227,7 @@ class SglFunction:
227
227
  backend=None,
228
228
  num_threads: Union[str, int] = "auto",
229
229
  progress_bar: bool = False,
230
+ generator_style: bool = False,
230
231
  ):
231
232
  from sglang.lang.interpreter import run_program_batch
232
233
 
@@ -277,6 +278,7 @@ class SglFunction:
277
278
  default_sampling_para,
278
279
  num_threads,
279
280
  progress_bar,
281
+ generator_style=generator_style,
280
282
  )
281
283
 
282
284
  def trace(self, *, backend=None, **kwargs):
@@ -1,3 +1,5 @@
1
+ from sglang.srt.configs.chatglm import ChatGLMConfig
2
+ from sglang.srt.configs.dbrx import DbrxConfig
1
3
  from sglang.srt.configs.exaone import ExaoneConfig
2
4
  from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig
3
5
 
@@ -5,4 +7,6 @@ __all__ = [
5
7
  "ExaoneConfig",
6
8
  "Qwen2VLConfig",
7
9
  "Qwen2VLVisionConfig",
10
+ "ChatGLMConfig",
11
+ "DbrxConfig",
8
12
  ]
@@ -0,0 +1,78 @@
1
+ # Adapted from
2
+ # https://github.com/THUDM/ChatGLM2-6B
3
+ # https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/chatglm.py
4
+
5
+ # ChatGLM2 and ChatGLM3 share the same config.
6
+ # ChatGLM4 is officially supported by Huggingface
7
+ # transformers >= 4.46.0 is required
8
+ # https://huggingface.co/docs/transformers/en/model_doc/glm
9
+ from transformers import PretrainedConfig
10
+
11
+
12
+ class ChatGLMConfig(PretrainedConfig):
13
+ model_type = "chatglm"
14
+ attribute_map = {
15
+ "num_hidden_layers": "num_layers",
16
+ "n_head_kv": "multi_query_group_num",
17
+ }
18
+
19
+ def __init__(
20
+ self,
21
+ num_layers=28,
22
+ padded_vocab_size=65024,
23
+ hidden_size=4096,
24
+ ffn_hidden_size=13696,
25
+ kv_channels=128,
26
+ num_attention_heads=32,
27
+ seq_length=2048,
28
+ hidden_dropout=0.0,
29
+ attention_dropout=0.0,
30
+ layernorm_epsilon=1e-5,
31
+ rmsnorm=True,
32
+ apply_residual_connection_post_layernorm=False,
33
+ post_layer_norm=True,
34
+ add_bias_linear=False,
35
+ add_qkv_bias=False,
36
+ interleaved_qkv=False,
37
+ bias_dropout_fusion=True,
38
+ multi_query_attention=False,
39
+ multi_query_group_num=1,
40
+ apply_query_key_layer_scaling=True,
41
+ attention_softmax_in_fp32=True,
42
+ fp32_residual_connection=False,
43
+ quantization_bit=0,
44
+ pre_seq_len=None,
45
+ prefix_projection=False,
46
+ **kwargs
47
+ ):
48
+ self.num_layers = num_layers
49
+ self.vocab_size = padded_vocab_size
50
+ self.padded_vocab_size = padded_vocab_size
51
+ self.hidden_size = hidden_size
52
+ self.ffn_hidden_size = ffn_hidden_size
53
+ self.kv_channels = kv_channels
54
+ self.num_attention_heads = num_attention_heads
55
+ self.seq_length = seq_length
56
+ # It is to be compatible with long lora.
57
+ self.max_position_embeddings = seq_length
58
+ self.hidden_dropout = hidden_dropout
59
+ self.attention_dropout = attention_dropout
60
+ self.layernorm_epsilon = layernorm_epsilon
61
+ self.rmsnorm = rmsnorm
62
+ self.apply_residual_connection_post_layernorm = (
63
+ apply_residual_connection_post_layernorm
64
+ )
65
+ self.post_layer_norm = post_layer_norm
66
+ self.add_bias_linear = add_bias_linear
67
+ self.add_qkv_bias = add_qkv_bias
68
+ self.bias_dropout_fusion = bias_dropout_fusion
69
+ self.multi_query_attention = multi_query_attention
70
+ self.multi_query_group_num = multi_query_group_num
71
+ self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
72
+ self.attention_softmax_in_fp32 = attention_softmax_in_fp32
73
+ self.fp32_residual_connection = fp32_residual_connection
74
+ self.quantization_bit = quantization_bit
75
+ self.pre_seq_len = pre_seq_len
76
+ self.prefix_projection = prefix_projection
77
+ self.interleaved_qkv = interleaved_qkv
78
+ super().__init__(**kwargs)