sglang 0.4.4.post1__tar.gz → 0.4.4.post3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (586)
  1. {sglang-0.4.4.post1/sglang.egg-info → sglang-0.4.4.post3}/PKG-INFO +16 -8
  2. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/README.md +4 -1
  3. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/pyproject.toml +16 -7
  4. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/__init__.py +2 -0
  5. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/api.py +6 -0
  6. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/bench_one_batch.py +1 -1
  7. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/bench_one_batch_server.py +1 -1
  8. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/bench_serving.py +26 -4
  9. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/check_env.py +3 -4
  10. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/backend/openai.py +18 -5
  11. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/chat_template.py +28 -7
  12. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/interpreter.py +7 -3
  13. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/ir.py +10 -0
  14. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/_custom_ops.py +1 -1
  15. sglang-0.4.4.post3/sglang/srt/code_completion_parser.py +174 -0
  16. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/configs/__init__.py +2 -6
  17. sglang-0.4.4.post3/sglang/srt/configs/deepseekvl2.py +676 -0
  18. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/configs/janus_pro.py +3 -4
  19. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/configs/load_config.py +1 -0
  20. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/configs/model_config.py +49 -8
  21. sglang-0.4.4.post3/sglang/srt/configs/utils.py +25 -0
  22. sglang-0.4.4.post3/sglang/srt/connector/__init__.py +51 -0
  23. sglang-0.4.4.post3/sglang/srt/connector/base_connector.py +112 -0
  24. sglang-0.4.4.post3/sglang/srt/connector/redis.py +85 -0
  25. sglang-0.4.4.post3/sglang/srt/connector/s3.py +122 -0
  26. sglang-0.4.4.post3/sglang/srt/connector/serde/__init__.py +31 -0
  27. sglang-0.4.4.post3/sglang/srt/connector/serde/safe_serde.py +29 -0
  28. sglang-0.4.4.post3/sglang/srt/connector/serde/serde.py +43 -0
  29. sglang-0.4.4.post3/sglang/srt/connector/utils.py +35 -0
  30. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/conversation.py +88 -0
  31. sglang-0.4.4.post3/sglang/srt/disaggregation/conn.py +81 -0
  32. sglang-0.4.4.post3/sglang/srt/disaggregation/decode.py +495 -0
  33. sglang-0.4.4.post3/sglang/srt/disaggregation/mini_lb.py +285 -0
  34. sglang-0.4.4.post3/sglang/srt/disaggregation/prefill.py +249 -0
  35. sglang-0.4.4.post3/sglang/srt/disaggregation/utils.py +44 -0
  36. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  37. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/parallel_state.py +42 -8
  38. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/entrypoints/engine.py +55 -5
  39. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/entrypoints/http_server.py +78 -13
  40. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/entrypoints/verl_engine.py +2 -0
  41. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/function_call_parser.py +133 -55
  42. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/hf_transformers_utils.py +28 -3
  43. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/activation.py +4 -2
  44. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/base_attn_backend.py +1 -1
  45. sglang-0.4.4.post3/sglang/srt/layers/attention/flashattention_backend.py +434 -0
  46. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  47. sglang-0.4.4.post3/sglang/srt/layers/attention/flashmla_backend.py +284 -0
  48. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/triton_backend.py +171 -38
  49. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  50. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  51. sglang-0.4.4.post3/sglang/srt/layers/attention/utils.py +92 -0
  52. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/vision.py +9 -28
  53. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/dp_attention.py +41 -19
  54. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/layernorm.py +24 -2
  55. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/linear.py +17 -5
  56. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/logits_processor.py +25 -7
  57. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  58. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  59. sglang-0.4.4.post3/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  60. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_native.py +2 -1
  61. sglang-0.4.4.post3/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  62. sglang-0.4.4.post3/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  63. sglang-0.4.4.post3/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang-0.4.4.post3/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  66. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  67. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/topk.py +60 -20
  68. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/parameter.py +1 -1
  69. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/__init__.py +80 -53
  70. sglang-0.4.4.post3/sglang/srt/layers/quantization/awq.py +200 -0
  71. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/base_config.py +5 -0
  72. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  73. sglang-0.4.4.post3/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  74. sglang-0.4.4.post3/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  75. sglang-0.4.4.post3/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  76. sglang-0.4.4.post3/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  77. sglang-0.4.4.post3/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  78. sglang-0.4.4.post3/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  79. sglang-0.4.4.post3/sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  80. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/fp8.py +76 -34
  81. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/fp8_kernel.py +25 -8
  82. sglang-0.4.4.post3/sglang/srt/layers/quantization/fp8_utils.py +564 -0
  83. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/gptq.py +36 -19
  84. sglang-0.4.4.post3/sglang/srt/layers/quantization/kv_cache.py +98 -0
  85. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  86. sglang-0.4.4.post3/sglang/srt/layers/quantization/utils.py +153 -0
  87. sglang-0.4.4.post3/sglang/srt/layers/quantization/w8a8_fp8.py +179 -0
  88. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/rotary_embedding.py +78 -87
  89. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/sampler.py +1 -1
  90. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/backend/base_backend.py +4 -4
  91. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  92. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/backend/triton_backend.py +5 -8
  93. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/layers.py +87 -33
  94. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/lora.py +2 -22
  95. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/lora_manager.py +67 -30
  96. sglang-0.4.4.post3/sglang/srt/lora/mem_pool.py +239 -0
  97. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  98. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  99. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  100. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  101. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/utils.py +18 -1
  102. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/cache_controller.py +2 -5
  103. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/data_parallel_controller.py +30 -8
  104. sglang-0.4.4.post3/sglang/srt/managers/expert_distribution.py +81 -0
  105. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/io_struct.py +43 -5
  106. sglang-0.4.4.post3/sglang/srt/managers/mm_utils.py +373 -0
  107. sglang-0.4.4.post3/sglang/srt/managers/multimodal_processor.py +68 -0
  108. sglang-0.4.4.post3/sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  109. sglang-0.4.4.post3/sglang/srt/managers/multimodal_processors/clip.py +63 -0
  110. sglang-0.4.4.post3/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  111. sglang-0.4.4.post3/sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  112. {sglang-0.4.4.post1/sglang/srt/managers/image_processors → sglang-0.4.4.post3/sglang/srt/managers/multimodal_processors}/janus_pro.py +20 -15
  113. {sglang-0.4.4.post1/sglang/srt/managers/image_processors → sglang-0.4.4.post3/sglang/srt/managers/multimodal_processors}/llava.py +10 -15
  114. sglang-0.4.4.post3/sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  115. {sglang-0.4.4.post1/sglang/srt/managers/image_processors → sglang-0.4.4.post3/sglang/srt/managers/multimodal_processors}/mlama.py +7 -8
  116. {sglang-0.4.4.post1/sglang/srt/managers/image_processors → sglang-0.4.4.post3/sglang/srt/managers/multimodal_processors}/qwen_vl.py +28 -22
  117. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/schedule_batch.py +134 -30
  118. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/scheduler.py +290 -31
  119. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/session_controller.py +1 -1
  120. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/tokenizer_manager.py +59 -24
  121. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/tp_worker.py +4 -1
  122. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  123. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/utils.py +6 -1
  124. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/mem_cache/hiradix_cache.py +18 -7
  125. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/mem_cache/memory_pool.py +255 -98
  126. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/mem_cache/paged_allocator.py +2 -2
  127. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/mem_cache/radix_cache.py +4 -4
  128. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/model_executor/cuda_graph_runner.py +36 -21
  129. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/model_executor/forward_batch_info.py +68 -11
  130. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/model_executor/model_runner.py +75 -8
  131. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/model_loader/loader.py +171 -3
  132. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/model_loader/weight_utils.py +51 -3
  133. sglang-0.4.4.post3/sglang/srt/models/clip.py +563 -0
  134. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/deepseek_janus_pro.py +31 -88
  135. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/deepseek_nextn.py +22 -10
  136. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/deepseek_v2.py +329 -73
  137. sglang-0.4.4.post3/sglang/srt/models/deepseek_vl2.py +358 -0
  138. sglang-0.4.4.post3/sglang/srt/models/gemma3_causal.py +694 -0
  139. sglang-0.4.4.post3/sglang/srt/models/gemma3_mm.py +468 -0
  140. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/llama.py +47 -7
  141. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/llama_eagle.py +1 -0
  142. sglang-0.4.4.post3/sglang/srt/models/llama_eagle3.py +196 -0
  143. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/llava.py +3 -3
  144. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/llavavid.py +3 -3
  145. sglang-0.4.4.post3/sglang/srt/models/minicpmo.py +1995 -0
  146. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/minicpmv.py +62 -137
  147. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/mllama.py +4 -4
  148. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/phi3_small.py +1 -1
  149. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/qwen2.py +3 -0
  150. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/qwen2_5_vl.py +68 -146
  151. sglang-0.4.4.post3/sglang/srt/models/qwen2_classification.py +75 -0
  152. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/qwen2_moe.py +9 -1
  153. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/qwen2_vl.py +25 -63
  154. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/openai_api/adapter.py +201 -104
  155. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/openai_api/protocol.py +33 -7
  156. sglang-0.4.4.post3/sglang/srt/patch_torch.py +71 -0
  157. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/sampling/sampling_batch_info.py +1 -1
  158. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/sampling/sampling_params.py +6 -6
  159. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/server_args.py +114 -14
  160. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/speculative/build_eagle_tree.py +7 -347
  161. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  162. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/speculative/eagle_utils.py +208 -252
  163. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/speculative/eagle_worker.py +140 -54
  164. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/speculative/spec_info.py +6 -1
  165. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/torch_memory_saver_adapter.py +22 -0
  166. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/utils.py +215 -21
  167. sglang-0.4.4.post3/sglang/test/__init__.py +0 -0
  168. sglang-0.4.4.post3/sglang/test/attention/__init__.py +0 -0
  169. sglang-0.4.4.post3/sglang/test/attention/test_flashattn_backend.py +312 -0
  170. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/runners.py +29 -2
  171. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/test_activation.py +2 -1
  172. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/test_block_fp8.py +5 -4
  173. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/test_block_fp8_ep.py +2 -1
  174. sglang-0.4.4.post3/sglang/test/test_dynamic_grad_mode.py +58 -0
  175. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/test_layernorm.py +3 -2
  176. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/test_utils.py +56 -5
  177. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/utils.py +31 -0
  178. sglang-0.4.4.post3/sglang/version.py +1 -0
  179. {sglang-0.4.4.post1 → sglang-0.4.4.post3/sglang.egg-info}/PKG-INFO +16 -8
  180. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang.egg-info/SOURCES.txt +58 -10
  181. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang.egg-info/requires.txt +8 -5
  182. sglang-0.4.4.post1/sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  183. sglang-0.4.4.post1/sglang/srt/layers/attention/utils.py +0 -39
  184. sglang-0.4.4.post1/sglang/srt/layers/quantization/fp8_utils.py +0 -308
  185. sglang-0.4.4.post1/sglang/srt/layers/quantization/w8a8_fp8.py +0 -128
  186. sglang-0.4.4.post1/sglang/srt/lora/mem_pool.py +0 -174
  187. sglang-0.4.4.post1/sglang/srt/managers/image_processor.py +0 -55
  188. sglang-0.4.4.post1/sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  189. sglang-0.4.4.post1/sglang/srt/managers/image_processors/minicpmv.py +0 -86
  190. sglang-0.4.4.post1/sglang/srt/managers/multi_modality_padding.py +0 -134
  191. sglang-0.4.4.post1/sglang/version.py +0 -1
  192. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/LICENSE +0 -0
  193. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/setup.cfg +0 -0
  194. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/bench_offline_throughput.py +0 -0
  195. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/global_config.py +0 -0
  196. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/__init__.py +0 -0
  197. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/backend/__init__.py +0 -0
  198. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/backend/anthropic.py +0 -0
  199. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/backend/base_backend.py +0 -0
  200. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/backend/litellm.py +0 -0
  201. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/backend/runtime_endpoint.py +0 -0
  202. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/backend/vertexai.py +0 -0
  203. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/choices.py +0 -0
  204. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/compiler.py +0 -0
  205. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/tracer.py +0 -0
  206. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/launch_server.py +0 -0
  207. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/llama3_eval.py +0 -0
  208. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/aio_rwlock.py +0 -0
  209. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/configs/chatglm.py +0 -0
  210. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/configs/dbrx.py +0 -0
  211. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/configs/device_config.py +0 -0
  212. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/configs/exaone.py +0 -0
  213. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  214. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/constrained/llguidance_backend.py +0 -0
  215. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/constrained/outlines_backend.py +0 -0
  216. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  217. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  218. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/custom_op.py +0 -0
  219. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/__init__.py +0 -0
  220. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/communication_op.py +0 -0
  221. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  222. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  223. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  224. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  225. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  226. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  227. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  228. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/distributed/utils.py +0 -0
  229. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  230. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
  231. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  232. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  233. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  234. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  235. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/elementwise.py +0 -0
  236. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  237. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  238. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  239. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  240. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  241. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  242. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  243. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  244. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  245. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  246. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  247. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  248. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  249. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  250. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  251. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  252. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  253. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  254. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  255. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  256. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  257. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  258. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  259. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  260. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  261. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  262. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  263. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  264. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  265. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  266. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  267. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  268. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  269. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  270. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  271. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  272. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  273. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  274. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  275. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  276. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  277. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  278. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  279. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  280. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  281. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  282. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  283. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  284. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  285. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  286. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  287. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  288. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  289. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  290. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  291. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  292. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  293. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  294. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  295. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  296. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  297. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  298. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  299. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  300. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  301. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  302. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  303. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  304. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  305. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  306. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  307. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  308. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  309. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  310. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  311. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  312. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  313. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  314. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  315. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  316. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  317. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  318. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  319. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  320. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  321. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  322. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  323. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  324. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  325. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  326. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  327. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  328. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  329. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  330. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  331. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  332. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  333. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  334. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  335. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  336. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  337. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  338. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  339. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  340. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  341. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  342. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  343. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  344. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  345. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  346. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  347. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  348. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  349. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  350. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  351. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  352. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  353. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/moe/router.py +0 -0
  354. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/pooler.py +0 -0
  355. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  356. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  357. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  358. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  359. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  360. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  361. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  362. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  363. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  364. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  365. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  366. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  367. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  368. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  369. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  370. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  371. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  372. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  373. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  374. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  375. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  376. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  377. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  378. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  379. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  380. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  381. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  382. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  383. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  384. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  385. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  386. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  387. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  388. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  389. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  390. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  392. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  395. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  396. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  397. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  401. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  402. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  408. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  412. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  415. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  417. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  424. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  428. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  431. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  432. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  433. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  436. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  437. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  438. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  439. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  440. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  441. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  442. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  443. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  444. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  445. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  446. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  447. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  448. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  449. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  459. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  460. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  461. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  462. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  463. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  464. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  465. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  466. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  467. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  468. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  469. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  470. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  471. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  472. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  473. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  474. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  475. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  476. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  477. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  478. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  479. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  480. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  481. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  482. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  483. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  484. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  485. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  486. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  487. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  488. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  489. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  490. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  491. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  492. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  493. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  494. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  495. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  496. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  497. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  498. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  499. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  500. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  501. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  502. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  503. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  504. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  505. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  506. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  507. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  508. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  509. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
  510. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/radix_attention.py +0 -0
  511. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/torchao_utils.py +0 -0
  512. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  513. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/backend/__init__.py +0 -0
  514. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/lora_config.py +0 -0
  515. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  516. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/configure_logging.py +0 -0
  517. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/detokenizer_manager.py +0 -0
  518. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/schedule_policy.py +0 -0
  519. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
  520. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  521. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  522. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/mem_cache/flush_cache.py +0 -0
  523. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/metrics/collector.py +0 -0
  524. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/metrics/func_timer.py +0 -0
  525. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/mm_utils.py +0 -0
  526. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/model_loader/__init__.py +0 -0
  527. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/model_loader/utils.py +0 -0
  528. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/model_parallel.py +0 -0
  529. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/baichuan.py +0 -0
  530. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/chatglm.py +0 -0
  531. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/commandr.py +0 -0
  532. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/dbrx.py +0 -0
  533. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/deepseek.py +0 -0
  534. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/exaone.py +0 -0
  535. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/gemma.py +0 -0
  536. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/gemma2.py +0 -0
  537. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/gemma2_reward.py +0 -0
  538. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/gpt2.py +0 -0
  539. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/gpt_bigcode.py +0 -0
  540. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/granite.py +0 -0
  541. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/grok.py +0 -0
  542. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/internlm2.py +0 -0
  543. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/internlm2_reward.py +0 -0
  544. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/llama_classification.py +0 -0
  545. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/llama_embedding.py +0 -0
  546. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/llama_reward.py +0 -0
  547. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/minicpm.py +0 -0
  548. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/minicpm3.py +0 -0
  549. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/mistral.py +0 -0
  550. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/mixtral.py +0 -0
  551. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/mixtral_quant.py +0 -0
  552. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/olmo.py +0 -0
  553. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/olmo2.py +0 -0
  554. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/olmoe.py +0 -0
  555. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/qwen.py +0 -0
  556. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/qwen2_eagle.py +0 -0
  557. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/qwen2_rm.py +0 -0
  558. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/registry.py +0 -0
  559. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/stablelm.py +0 -0
  560. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/torch_native_llama.py +0 -0
  561. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/xverse.py +0 -0
  562. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/xverse_moe.py +0 -0
  563. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/models/yivl.py +0 -0
  564. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/reasoning_parser.py +0 -0
  565. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  566. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  567. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  568. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  569. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  570. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  571. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/server.py +0 -0
  572. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/warmup.py +0 -0
  573. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/few_shot_gsm8k.py +0 -0
  574. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  575. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/run_eval.py +0 -0
  576. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/send_one.py +0 -0
  577. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/simple_eval_common.py +0 -0
  578. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/simple_eval_gpqa.py +0 -0
  579. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/simple_eval_humaneval.py +0 -0
  580. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/simple_eval_math.py +0 -0
  581. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/simple_eval_mgsm.py +0 -0
  582. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/simple_eval_mmlu.py +0 -0
  583. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/test_custom_ops.py +0 -0
  584. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/test/test_programs.py +0 -0
  585. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang.egg-info/dependency_links.txt +0 -0
  586. {sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.4.post1
3
+ Version: 0.4.4.post3
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -218,6 +218,7 @@ Requires-Dist: numpy
218
218
  Requires-Dist: IPython
219
219
  Requires-Dist: setproctitle
220
220
  Provides-Extra: runtime-common
221
+ Requires-Dist: compressed-tensors; extra == "runtime-common"
221
222
  Requires-Dist: datasets; extra == "runtime-common"
222
223
  Requires-Dist: decord; extra == "runtime-common"
223
224
  Requires-Dist: fastapi; extra == "runtime-common"
@@ -235,19 +236,22 @@ Requires-Dist: psutil; extra == "runtime-common"
235
236
  Requires-Dist: pydantic; extra == "runtime-common"
236
237
  Requires-Dist: python-multipart; extra == "runtime-common"
237
238
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
239
+ Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
238
240
  Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
239
- Requires-Dist: transformers==4.48.3; extra == "runtime-common"
241
+ Requires-Dist: transformers==4.50.0; extra == "runtime-common"
240
242
  Requires-Dist: uvicorn; extra == "runtime-common"
241
243
  Requires-Dist: uvloop; extra == "runtime-common"
242
- Requires-Dist: xgrammar==0.1.15; extra == "runtime-common"
244
+ Requires-Dist: compressed-tensors; extra == "runtime-common"
245
+ Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
243
246
  Provides-Extra: srt
244
247
  Requires-Dist: sglang[runtime_common]; extra == "srt"
245
- Requires-Dist: sgl-kernel==0.0.5; extra == "srt"
248
+ Requires-Dist: sgl-kernel==0.0.5.post4; extra == "srt"
246
249
  Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
247
250
  Requires-Dist: torch==2.5.1; extra == "srt"
248
- Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
249
251
  Requires-Dist: cuda-python; extra == "srt"
250
252
  Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
253
+ Requires-Dist: partial_json_parser; extra == "srt"
254
+ Requires-Dist: einops; extra == "srt"
251
255
  Provides-Extra: srt-hip
252
256
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
253
257
  Requires-Dist: torch; extra == "srt-hip"
@@ -271,7 +275,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
271
275
  Provides-Extra: litellm
272
276
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
273
277
  Provides-Extra: torch-memory-saver
274
- Requires-Dist: torch_memory_saver; extra == "torch-memory-saver"
278
+ Requires-Dist: torch_memory_saver>=0.0.4; extra == "torch-memory-saver"
275
279
  Provides-Extra: test
276
280
  Requires-Dist: jsonlines; extra == "test"
277
281
  Requires-Dist: matplotlib; extra == "test"
@@ -319,6 +323,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
319
323
  Provides-Extra: dev-cpu
320
324
  Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
321
325
  Requires-Dist: sglang[test]; extra == "dev-cpu"
326
+ Dynamic: license-file
322
327
 
323
328
  <div align="center" id="sglangtop">
324
329
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -342,6 +347,9 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
342
347
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
343
348
 
344
349
  ## News
350
+ - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
351
+ - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
352
+ - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
345
353
  - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
346
354
  - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
347
355
  - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -362,7 +370,7 @@ SGLang is a fast serving framework for large language models and vision language
362
370
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
363
371
  The core features include:
364
372
 
365
- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
373
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
366
374
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
367
375
  - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
368
376
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -20,6 +20,9 @@
20
20
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
21
21
 
22
22
  ## News
23
+ - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
24
+ - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
25
+ - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
23
26
  - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
24
27
  - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
25
28
  - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -40,7 +43,7 @@ SGLang is a fast serving framework for large language models and vision language
40
43
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
41
44
  The core features include:
42
45
 
43
- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
46
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
44
47
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
45
48
  - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
46
49
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.4.post1"
7
+ version = "0.4.4.post3"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -17,6 +17,7 @@ dependencies = ["aiohttp", "requests", "tqdm", "numpy", "IPython", "setproctitle
17
17
 
18
18
  [project.optional-dependencies]
19
19
  runtime_common = [
20
+ "compressed-tensors",
20
21
  "datasets",
21
22
  "decord",
22
23
  "fastapi",
@@ -34,26 +35,34 @@ runtime_common = [
34
35
  "pydantic",
35
36
  "python-multipart",
36
37
  "pyzmq>=25.1.2",
38
+ "soundfile==0.13.1",
37
39
  "torchao>=0.7.0",
38
- "transformers==4.48.3",
40
+ "transformers==4.50.0",
39
41
  "uvicorn",
40
42
  "uvloop",
41
- "xgrammar==0.1.15",
43
+ "compressed-tensors",
44
+ "xgrammar==0.1.17",
42
45
  ]
43
46
 
44
47
  srt = [
45
48
  "sglang[runtime_common]",
46
- "sgl-kernel==0.0.5",
49
+ "sgl-kernel==0.0.5.post4",
47
50
  "flashinfer_python==0.2.3",
48
51
  "torch==2.5.1",
49
- "vllm>=0.6.4.post1,<=0.7.2",
50
52
  "cuda-python",
51
53
  "outlines>=0.0.44,<=0.1.11",
54
+ "partial_json_parser",
55
+ "einops",
52
56
  ]
53
57
 
54
58
  # HIP (Heterogeneous-computing Interface for Portability) for AMD
55
59
  # => base docker rocm/vllm-dev:20250114, not from public vllm whl
56
- srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
60
+ srt_hip = [
61
+ "sglang[runtime_common]",
62
+ "torch",
63
+ "vllm==0.6.7.dev2",
64
+ "outlines==0.1.11"
65
+ ]
57
66
 
58
67
  # xpu is not enabled in public vllm and torch whl,
59
68
  # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
@@ -71,7 +80,7 @@ srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
71
80
  openai = ["openai>=1.0", "tiktoken"]
72
81
  anthropic = ["anthropic>=0.20.0"]
73
82
  litellm = ["litellm>=1.0.0"]
74
- torch_memory_saver = ["torch_memory_saver"]
83
+ torch_memory_saver = ["torch_memory_saver>=0.0.4"]
75
84
  test = [
76
85
  "jsonlines",
77
86
  "matplotlib",
@@ -32,6 +32,7 @@ from sglang.lang.choices import (
32
32
  )
33
33
  from sglang.utils import LazyImport
34
34
 
35
+ ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
35
36
  Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
36
37
  LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
37
38
  OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
@@ -67,6 +68,7 @@ __all__ = [
67
68
  "greedy_token_selection",
68
69
  "token_length_normalized",
69
70
  "unconditional_likelihood_normalized",
71
+ "ServerArgs",
70
72
  "Anthropic",
71
73
  "LiteLLM",
72
74
  "OpenAI",
@@ -75,6 +75,7 @@ def gen(
75
75
  name: Optional[str] = None,
76
76
  max_tokens: Optional[int] = None,
77
77
  min_tokens: Optional[int] = None,
78
+ n: Optional[int] = None,
78
79
  stop: Optional[Union[str, List[str]]] = None,
79
80
  stop_token_ids: Optional[List[int]] = None,
80
81
  temperature: Optional[float] = None,
@@ -115,6 +116,7 @@ def gen(
115
116
  name,
116
117
  max_tokens,
117
118
  min_tokens,
119
+ n,
118
120
  stop,
119
121
  stop_token_ids,
120
122
  temperature,
@@ -137,6 +139,7 @@ def gen(
137
139
  def gen_int(
138
140
  name: Optional[str] = None,
139
141
  max_tokens: Optional[int] = None,
142
+ n: Optional[int] = None,
140
143
  stop: Optional[Union[str, List[str]]] = None,
141
144
  stop_token_ids: Optional[List[int]] = None,
142
145
  temperature: Optional[float] = None,
@@ -155,6 +158,7 @@ def gen_int(
155
158
  name,
156
159
  max_tokens,
157
160
  None,
161
+ n,
158
162
  stop,
159
163
  stop_token_ids,
160
164
  temperature,
@@ -176,6 +180,7 @@ def gen_int(
176
180
  def gen_string(
177
181
  name: Optional[str] = None,
178
182
  max_tokens: Optional[int] = None,
183
+ n: Optional[int] = None,
179
184
  stop: Optional[Union[str, List[str]]] = None,
180
185
  stop_token_ids: Optional[List[int]] = None,
181
186
  temperature: Optional[float] = None,
@@ -194,6 +199,7 @@ def gen_string(
194
199
  name,
195
200
  max_tokens,
196
201
  None,
202
+ n,
197
203
  stop,
198
204
  stop_token_ids,
199
205
  temperature,
@@ -117,7 +117,7 @@ class BenchArgs:
117
117
 
118
118
  @classmethod
119
119
  def from_cli_args(cls, args: argparse.Namespace):
120
- # use the default value's type to case the args into correct types.
120
+ # use the default value's type to cast the args into correct types.
121
121
  attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
122
122
  return cls(
123
123
  **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
@@ -57,7 +57,7 @@ class BenchArgs:
57
57
 
58
58
  @classmethod
59
59
  def from_cli_args(cls, args: argparse.Namespace):
60
- # use the default value's type to case the args into correct types.
60
+ # use the default value's type to cast the args into correct types.
61
61
  attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
62
62
  return cls(
63
63
  **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
@@ -128,7 +128,7 @@ async def async_request_trt_llm(
128
128
  timestamp = time.perf_counter()
129
129
  # First token
130
130
  if ttft == 0.0:
131
- ttft = time.perf_counter() - st
131
+ ttft = timestamp - st
132
132
  output.ttft = ttft
133
133
 
134
134
  # Decoding phase
@@ -501,6 +501,7 @@ def get_dataset(args, tokenizer):
501
501
  question_len=args.gsp_question_len,
502
502
  output_len=args.gsp_output_len,
503
503
  tokenizer=tokenizer,
504
+ args=args,
504
505
  )
505
506
  else:
506
507
  raise ValueError(f"Unknown dataset: {args.dataset_name}")
@@ -788,6 +789,7 @@ def sample_generated_shared_prefix_requests(
788
789
  question_len: int,
789
790
  output_len: int,
790
791
  tokenizer: PreTrainedTokenizerBase,
792
+ args: argparse.Namespace,
791
793
  ) -> List[Tuple[str, int, int]]:
792
794
  """Generate benchmark requests with shared system prompts using random tokens and caching."""
793
795
  cache_path = get_gen_prefix_cache_path(args, tokenizer)
@@ -963,7 +965,7 @@ async def benchmark(
963
965
  request_rate: float,
964
966
  max_concurrency: Optional[int],
965
967
  disable_tqdm: bool,
966
- lora_name: str,
968
+ lora_names: List[str],
967
969
  extra_request_body: Dict[str, Any],
968
970
  profile: bool,
969
971
  pd_seperated: bool = False,
@@ -986,6 +988,11 @@ async def benchmark(
986
988
  # Warmup
987
989
  print("Starting initial single prompt test run...")
988
990
  test_prompt, test_prompt_len, test_output_len = input_requests[0]
991
+ if lora_names != None and len(lora_names) != 0:
992
+ lora_name = lora_names[0]
993
+ else:
994
+ lora_name = None
995
+
989
996
  test_input = RequestFuncInput(
990
997
  model=model_id,
991
998
  prompt=test_prompt,
@@ -1026,6 +1033,12 @@ async def benchmark(
1026
1033
  tasks: List[asyncio.Task] = []
1027
1034
  async for request in get_request(input_requests, request_rate):
1028
1035
  prompt, prompt_len, output_len = request
1036
+ if lora_names != None and len(lora_names) != 0:
1037
+ idx = random.randint(0, len(lora_names) - 1)
1038
+ lora_name = lora_names[idx]
1039
+ else:
1040
+ lora_name = None
1041
+
1029
1042
  request_func_input = RequestFuncInput(
1030
1043
  model=model_id,
1031
1044
  prompt=prompt,
@@ -1345,7 +1358,7 @@ def run_benchmark(args_: argparse.Namespace):
1345
1358
  request_rate=args.request_rate,
1346
1359
  max_concurrency=args.max_concurrency,
1347
1360
  disable_tqdm=args.disable_tqdm,
1348
- lora_name=args.lora_name,
1361
+ lora_names=args.lora_name,
1349
1362
  extra_request_body=extra_request_body,
1350
1363
  profile=args.profile,
1351
1364
  pd_seperated=args.pd_seperated,
@@ -1364,6 +1377,13 @@ def set_ulimit(target_soft_limit=65535):
1364
1377
  print(f"Fail to set RLIMIT_NOFILE: {e}")
1365
1378
 
1366
1379
 
1380
+ class LoRAPathAction(argparse.Action):
1381
+ def __call__(self, parser, namespace, values, option_string=None):
1382
+ setattr(namespace, self.dest, [])
1383
+ for lora_name in values:
1384
+ getattr(namespace, self.dest).append(lora_name)
1385
+
1386
+
1367
1387
  if __name__ == "__main__":
1368
1388
  parser = ArgumentParser(description="Benchmark the online serving throughput.")
1369
1389
  parser.add_argument(
@@ -1507,8 +1527,10 @@ if __name__ == "__main__":
1507
1527
  parser.add_argument(
1508
1528
  "--lora-name",
1509
1529
  type=str,
1530
+ nargs="*",
1510
1531
  default=None,
1511
- help="The name of LoRA adapter",
1532
+ action=LoRAPathAction,
1533
+ help="The names of LoRA adapters. You can provide a list of names in the format {name} {name} {name}...",
1512
1534
  )
1513
1535
  parser.add_argument(
1514
1536
  "--prompt-suffix",
@@ -1,6 +1,6 @@
1
1
  """Check environment configurations and dependency versions."""
2
2
 
3
- import importlib
3
+ import importlib.metadata
4
4
  import os
5
5
  import resource
6
6
  import subprocess
@@ -59,9 +59,8 @@ def get_package_versions(packages):
59
59
  for package in packages:
60
60
  package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
61
61
  try:
62
- module = importlib.import_module(package_name)
63
- if hasattr(module, "__version__"):
64
- versions[package_name] = module.__version__
62
+ version = importlib.metadata.version(package_name)
63
+ versions[package_name] = version
65
64
  except ModuleNotFoundError:
66
65
  versions[package_name] = "Module Not Found"
67
66
  return versions
@@ -165,6 +165,7 @@ class OpenAI(BaseBackend):
165
165
  kwargs.pop("max_tokens", None)
166
166
  else:
167
167
  kwargs.pop("max_completion_tokens", None)
168
+
168
169
  comp = openai_completion(
169
170
  client=self.client,
170
171
  token_usage=self.token_usage,
@@ -173,13 +174,13 @@ class OpenAI(BaseBackend):
173
174
  prompt=prompt,
174
175
  **kwargs,
175
176
  )
177
+ # Keep the returned list (or string) as is.
176
178
  elif sampling_params.dtype in [str, "str", "string"]:
177
179
  assert (
178
180
  not self.is_chat_model
179
181
  ), "constrained type not supported on chat model"
180
182
  kwargs = sampling_params.to_openai_kwargs()
181
183
  kwargs.pop("stop")
182
-
183
184
  comp = openai_completion(
184
185
  client=self.client,
185
186
  token_usage=self.token_usage,
@@ -189,7 +190,11 @@ class OpenAI(BaseBackend):
189
190
  stop='"',
190
191
  **kwargs,
191
192
  )
192
- comp = '"' + comp + '"'
193
+ # Wrap each element in quotes if we have a list.
194
+ if isinstance(comp, list):
195
+ comp = ['"' + x + '"' for x in comp]
196
+ else:
197
+ comp = '"' + comp + '"'
193
198
  elif sampling_params.dtype in [int, "int"]:
194
199
  assert (
195
200
  not self.is_chat_model
@@ -206,6 +211,7 @@ class OpenAI(BaseBackend):
206
211
  stop=[" "],
207
212
  **kwargs,
208
213
  )
214
+ # Leave as a list if that's what is returned.
209
215
  else:
210
216
  raise ValueError(f"Unknown dtype: {sampling_params.dtype}")
211
217
 
@@ -254,7 +260,9 @@ class OpenAI(BaseBackend):
254
260
  prompt=s.messages_,
255
261
  **self.spec_kwargs,
256
262
  )
257
- if self.spec_pattern_match(comp):
263
+ # Use a string for pattern matching.
264
+ comp_for_match = comp[0] if isinstance(comp, list) else comp
265
+ if self.spec_pattern_match(comp_for_match):
258
266
  break
259
267
 
260
268
  for term in self.spec_format:
@@ -370,7 +378,7 @@ class OpenAI(BaseBackend):
370
378
 
371
379
  def openai_completion(
372
380
  client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
373
- ):
381
+ ) -> Union[str, List[str]]:
374
382
  # if "ebnf" is in kwargs, warn and remove
375
383
  if "ebnf" in kwargs:
376
384
  warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
@@ -382,13 +390,18 @@ def openai_completion(
382
390
  if "stop" in kwargs and kwargs["stop"] is None:
383
391
  kwargs.pop("stop")
384
392
  ret = client.chat.completions.create(messages=prompt, **kwargs)
385
- comp = ret.choices[0].message.content
393
+ if len(ret.choices) == 1:
394
+ comp = ret.choices[0].message.content
395
+ else:
396
+ comp = [c.message.content for c in ret.choices]
386
397
  else:
387
398
  ret = client.completions.create(prompt=prompt, **kwargs)
388
399
  if isinstance(prompt, (list, tuple)):
389
400
  comp = [c.text for c in ret.choices]
390
401
  else:
391
402
  comp = ret.choices[0].text
403
+ if len(ret.choices) > 1:
404
+ comp = [c.text for c in ret.choices]
392
405
 
393
406
  token_usage.prompt_tokens += ret.usage.prompt_tokens
394
407
  token_usage.completion_tokens += ret.usage.completion_tokens
@@ -15,6 +15,7 @@ class ChatTemplate:
15
15
  role_prefix_and_suffix: Dict[str, Tuple[str, str]]
16
16
  stop_str: List[str] = ()
17
17
  image_token: str = "<image>"
18
+ audio_token: str = "<audio>"
18
19
  style: ChatTemplateStyle = ChatTemplateStyle.PLAIN
19
20
 
20
21
  def get_prefix_and_suffix(
@@ -253,6 +254,22 @@ register_chat_template(
253
254
  )
254
255
  )
255
256
 
257
+ # https://huggingface.co/openbmb/MiniCPM-o-2_6
258
+ register_chat_template(
259
+ ChatTemplate(
260
+ name="minicpmo",
261
+ default_system_prompt=None,
262
+ role_prefix_and_suffix={
263
+ "system": ("", " "),
264
+ "user": ("user:", " "),
265
+ "assistant": ("assistant:", "</s>"),
266
+ },
267
+ stop_str=("<|im_end|>", "<|endoftext|>"),
268
+ image_token="(<image>./</image>)",
269
+ audio_token="(<audio>./</audio>)",
270
+ )
271
+ )
272
+
256
273
  # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
257
274
  register_chat_template(
258
275
  ChatTemplate(
@@ -474,12 +491,6 @@ def match_chat_ml(model_path: str):
474
491
  return get_chat_template("chatml-llava")
475
492
 
476
493
 
477
- @register_chat_template_matching_function
478
- def match_chat_minicpm(model_path: str):
479
- if "minicpm" in model_path:
480
- return get_chat_template("minicpmv")
481
-
482
-
483
494
  @register_chat_template_matching_function
484
495
  def match_chat_yi(model_path: str):
485
496
  model_path = model_path.lower()
@@ -499,8 +510,10 @@ def match_gemma_it(model_path: str):
499
510
  @register_chat_template_matching_function
500
511
  def match_openbmb_minicpm(model_path: str):
501
512
  model_path = model_path.lower()
502
- if "minicpm" in model_path:
513
+ if "minicpm-v" in model_path:
503
514
  return get_chat_template("minicpmv")
515
+ elif "minicpm-o" in model_path:
516
+ return get_chat_template("minicpmo")
504
517
 
505
518
 
506
519
  @register_chat_template_matching_function
@@ -520,6 +533,14 @@ def match_granite_instruct(model_path: str):
520
533
  return get_chat_template("granite-3-instruct")
521
534
 
522
535
 
536
+ @register_chat_template_matching_function
537
+ def match_gemma3_instruct(model_path: str):
538
+ model_path = model_path.lower()
539
+ if "gemma-3" in model_path and "1b" not in model_path:
540
+ # gemma-3-1b-it is completion model
541
+ return get_chat_template("gemma-it")
542
+
543
+
523
544
  if __name__ == "__main__":
524
545
  messages = [
525
546
  {"role": "system", "content": None}, # None means default
@@ -566,13 +566,13 @@ class StreamExecutor:
566
566
  def _execute_gen(self, expr: SglGen):
567
567
  sampling_params = self._resolve_sampling_params(expr.sampling_params)
568
568
  name = expr.name
569
-
570
569
  if not self.stream:
571
570
  if self.num_api_spec_tokens is None:
572
571
  comp, meta_info = self.backend.generate(
573
572
  self,
574
573
  sampling_params=sampling_params,
575
574
  )
575
+
576
576
  else:
577
577
  if self.backend.is_chat_model:
578
578
  # Speculative execution on models with only chat interface.
@@ -587,8 +587,11 @@ class StreamExecutor:
587
587
 
588
588
  else: # Speculative execution on models with completion interface
589
589
  comp, meta_info = self._spec_gen(sampling_params)
590
-
591
- self.text_ += comp
590
+ if isinstance(comp, list):
591
+ self.text_ += comp[0]
592
+ else:
593
+ assert isinstance(comp, str)
594
+ self.text_ += comp
592
595
 
593
596
  self.variables[name] = comp
594
597
  self.meta_info[name] = meta_info
@@ -747,6 +750,7 @@ class StreamExecutor:
747
750
  for item in [
748
751
  "max_new_tokens",
749
752
  "min_new_tokens",
753
+ "n",
750
754
  "stop",
751
755
  "stop_token_ids",
752
756
  "temperature",
@@ -18,6 +18,7 @@ REGEX_STR = r"\"[\w\d\s]*\"" # bugs with regex r"\".*\"" in interegular pkg
18
18
  class SglSamplingParams:
19
19
  max_new_tokens: int = 128
20
20
  min_new_tokens: int = 0
21
+ n: int = 1
21
22
  stop: Union[str, List[str]] = ()
22
23
  stop_token_ids: Optional[List[int]] = ()
23
24
  temperature: float = 1.0
@@ -41,6 +42,7 @@ class SglSamplingParams:
41
42
  return SglSamplingParams(
42
43
  self.max_new_tokens,
43
44
  self.min_new_tokens,
45
+ self.n,
44
46
  self.stop,
45
47
  self.stop_token_ids,
46
48
  self.temperature,
@@ -64,6 +66,7 @@ class SglSamplingParams:
64
66
  return {
65
67
  "max_tokens": self.max_new_tokens,
66
68
  "max_completion_tokens": self.max_new_tokens,
69
+ "n": self.n,
67
70
  "stop": self.stop or None,
68
71
  "temperature": self.temperature,
69
72
  "top_p": self.top_p,
@@ -117,6 +120,7 @@ class SglSamplingParams:
117
120
  return {
118
121
  "max_new_tokens": self.max_new_tokens,
119
122
  "min_new_tokens": self.min_new_tokens,
123
+ "n": self.n,
120
124
  "stop": self.stop,
121
125
  "stop_token_ids": self.stop_token_ids,
122
126
  "temperature": self.temperature,
@@ -154,6 +158,7 @@ class SglFunction:
154
158
  self,
155
159
  *args,
156
160
  max_new_tokens: int = 128,
161
+ n: int = 1,
157
162
  stop: Optional[Union[str, List[str]]] = None,
158
163
  stop_token_ids: Optional[List[int]] = None,
159
164
  temperature: float = 1.0,
@@ -182,6 +187,7 @@ class SglFunction:
182
187
 
183
188
  default_sampling_para = SglSamplingParams(
184
189
  max_new_tokens=max_new_tokens,
190
+ n=n,
185
191
  stop=stop,
186
192
  stop_token_ids=stop_token_ids,
187
193
  temperature=temperature,
@@ -212,6 +218,7 @@ class SglFunction:
212
218
  batch_kwargs,
213
219
  *,
214
220
  max_new_tokens: int = 128,
221
+ n: int = 1,
215
222
  stop: Optional[Union[str, List[str]]] = None,
216
223
  stop_token_ids: Optional[List[int]] = None,
217
224
  temperature: float = 1.0,
@@ -257,6 +264,7 @@ class SglFunction:
257
264
 
258
265
  default_sampling_para = SglSamplingParams(
259
266
  max_new_tokens=max_new_tokens,
267
+ n=n,
260
268
  stop=stop,
261
269
  stop_token_ids=stop_token_ids,
262
270
  temperature=temperature,
@@ -440,6 +448,7 @@ class SglGen(SglExpr):
440
448
  name: Optional[str] = None,
441
449
  max_new_tokens: Optional[int] = None,
442
450
  min_new_tokens: Optional[int] = None,
451
+ n: Optional[int] = None,
443
452
  stop: Optional[Union[str, List[str]]] = None,
444
453
  stop_token_ids: Optional[List[int]] = None,
445
454
  temperature: Optional[float] = None,
@@ -463,6 +472,7 @@ class SglGen(SglExpr):
463
472
  self.sampling_params = SglSamplingParams(
464
473
  max_new_tokens=max_new_tokens,
465
474
  min_new_tokens=min_new_tokens,
475
+ n=n,
466
476
  stop=stop,
467
477
  stop_token_ids=stop_token_ids,
468
478
  temperature=temperature,
@@ -10,7 +10,7 @@ from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
10
10
 
11
11
  logger = logging.getLogger(__name__)
12
12
  use_vllm_custom_allreduce = get_bool_env_var(
13
- "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
13
+ "USE_VLLM_CUSTOM_ALLREDUCE", default="false"
14
14
  )
15
15
 
16
16
  if not is_hpu():