sglang 0.4.3.post4__tar.gz → 0.4.4.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
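As a minimal sketch only (not part of the release or the registry's tooling), a per-file "+added -removed" summary like the listing below could be reproduced locally by downloading both sdists and diffing them with difflib. The PyPI source URLs here are assumptions; substitute the actual download links from pypi.org/project/sglang/#files if they differ.

```python
# Hypothetical sketch: download the two sglang sdists, extract them in memory,
# and print added/removed line counts per file, similar to the listing below.
import difflib
import io
import tarfile
import urllib.request

# Assumed source-distribution URLs; verify against the project's "Files" page.
URLS = {
    "old": "https://pypi.io/packages/source/s/sglang/sglang-0.4.3.post4.tar.gz",
    "new": "https://pypi.io/packages/source/s/sglang/sglang-0.4.4.post1.tar.gz",
}

def read_sdist(url):
    """Return {path inside the sdist: list of text lines (or None if binary)}."""
    data = urllib.request.urlopen(url).read()
    files = {}
    with tarfile.open(fileobj=io.BytesIO(data), mode="r:gz") as tar:
        for member in tar.getmembers():
            if not member.isfile():
                continue
            # Drop the leading "sglang-x.y.z/" component so paths line up.
            path = member.name.split("/", 1)[1] if "/" in member.name else member.name
            raw = tar.extractfile(member).read()
            try:
                files[path] = raw.decode("utf-8").splitlines()
            except UnicodeDecodeError:
                files[path] = None  # binary file; skip line diffing
    return files

old, new = read_sdist(URLS["old"]), read_sdist(URLS["new"])
for path in sorted(set(old) | set(new)):
    a = old.get(path) or []
    b = new.get(path) or []
    added = removed = 0
    for line in difflib.unified_diff(a, b, lineterm=""):
        if line.startswith("+") and not line.startswith("+++"):
            added += 1
        elif line.startswith("-") and not line.startswith("---"):
            removed += 1
    if added or removed:
        print(f"{path} +{added} -{removed}")
```

Note that counts from a plain line diff may not match the registry's numbers exactly, since renamed files (shown below as `old → new` path pairs) are tracked as moves rather than as a delete plus an add.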
Files changed (534)
  1. {sglang-0.4.3.post4/sglang.egg-info → sglang-0.4.4.post1}/PKG-INFO +9 -10
  2. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/pyproject.toml +10 -11
  3. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/bench_serving.py +1 -1
  4. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/chat_template.py +29 -0
  5. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/_custom_ops.py +19 -17
  6. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/configs/__init__.py +2 -0
  7. sglang-0.4.4.post1/sglang/srt/configs/janus_pro.py +629 -0
  8. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/configs/model_config.py +24 -14
  9. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/conversation.py +80 -2
  10. sglang-0.4.4.post1/sglang/srt/custom_op.py +101 -0
  11. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  12. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/parallel_state.py +10 -1
  13. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/entrypoints/engine.py +5 -3
  14. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/entrypoints/http_server.py +1 -1
  15. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/function_call_parser.py +33 -2
  16. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/hf_transformers_utils.py +16 -1
  17. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  18. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  19. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/triton_backend.py +1 -3
  20. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  21. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  22. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  23. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  24. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/vision.py +43 -62
  25. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/dp_attention.py +30 -2
  26. sglang-0.4.4.post1/sglang/srt/layers/elementwise.py +411 -0
  27. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/linear.py +1 -1
  28. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/logits_processor.py +1 -0
  29. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  30. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  31. sglang-0.4.4.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  32. sglang-0.4.4.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang-0.4.4.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang-0.4.4.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  35. sglang-0.4.4.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang-0.4.4.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  37. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  38. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  39. sglang-0.4.4.post1/sglang/srt/layers/moe/router.py +342 -0
  40. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/parameter.py +10 -0
  41. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/__init__.py +90 -68
  42. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  43. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang-0.4.4.post1/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  69. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/fp8.py +174 -106
  70. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  71. sglang-0.4.4.post1/sglang/srt/layers/quantization/fp8_utils.py +308 -0
  72. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  73. sglang-0.4.3.post4/sglang/srt/layers/quantization/w8a8_int8.py → sglang-0.4.4.post1/sglang/srt/layers/quantization/w8a8_fp8.py +34 -23
  74. sglang-0.4.4.post1/sglang/srt/layers/quantization/w8a8_int8.py +266 -0
  75. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/rotary_embedding.py +5 -3
  76. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/sampler.py +29 -35
  77. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  78. sglang-0.4.4.post1/sglang/srt/lora/backend/__init__.py +25 -0
  79. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/cache_controller.py +74 -8
  80. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/data_parallel_controller.py +1 -1
  81. sglang-0.4.4.post1/sglang/srt/managers/image_processor.py +55 -0
  82. sglang-0.4.4.post1/sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  83. sglang-0.4.4.post1/sglang/srt/managers/image_processors/janus_pro.py +79 -0
  84. sglang-0.4.4.post1/sglang/srt/managers/image_processors/llava.py +152 -0
  85. sglang-0.4.4.post1/sglang/srt/managers/image_processors/minicpmv.py +86 -0
  86. sglang-0.4.4.post1/sglang/srt/managers/image_processors/mlama.py +60 -0
  87. sglang-0.4.4.post1/sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  88. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/io_struct.py +32 -15
  89. sglang-0.4.4.post1/sglang/srt/managers/multi_modality_padding.py +134 -0
  90. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/schedule_batch.py +213 -118
  91. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/schedule_policy.py +40 -8
  92. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/scheduler.py +176 -683
  93. sglang-0.4.4.post1/sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
  94. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/tokenizer_manager.py +6 -6
  95. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  96. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  97. sglang-0.4.4.post1/sglang/srt/mem_cache/chunk_cache.py +65 -0
  98. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/mem_cache/hiradix_cache.py +71 -34
  99. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/mem_cache/memory_pool.py +81 -17
  100. sglang-0.4.4.post1/sglang/srt/mem_cache/paged_allocator.py +283 -0
  101. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/mem_cache/radix_cache.py +117 -36
  102. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/model_executor/cuda_graph_runner.py +68 -20
  103. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/model_executor/forward_batch_info.py +23 -10
  104. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/model_executor/model_runner.py +63 -63
  105. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/model_loader/loader.py +2 -1
  106. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/model_loader/weight_utils.py +1 -1
  107. sglang-0.4.4.post1/sglang/srt/models/deepseek_janus_pro.py +2127 -0
  108. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/deepseek_nextn.py +23 -3
  109. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/deepseek_v2.py +200 -191
  110. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/grok.py +374 -119
  111. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/minicpmv.py +28 -89
  112. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/mllama.py +1 -1
  113. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/qwen2.py +0 -1
  114. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/qwen2_5_vl.py +25 -50
  115. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/qwen2_vl.py +33 -49
  116. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/openai_api/adapter.py +59 -35
  117. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/openai_api/protocol.py +8 -1
  118. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  119. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  120. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/server_args.py +24 -16
  121. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/speculative/eagle_worker.py +75 -39
  122. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/utils.py +104 -9
  123. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/runners.py +104 -10
  124. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/test_block_fp8.py +106 -16
  125. sglang-0.4.4.post1/sglang/test/test_custom_ops.py +88 -0
  126. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/test_utils.py +20 -4
  127. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/utils.py +0 -4
  128. sglang-0.4.4.post1/sglang/version.py +1 -0
  129. {sglang-0.4.3.post4 → sglang-0.4.4.post1/sglang.egg-info}/PKG-INFO +9 -10
  130. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang.egg-info/SOURCES.txt +47 -0
  131. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang.egg-info/requires.txt +8 -9
  132. sglang-0.4.3.post4/sglang/srt/custom_op.py +0 -40
  133. sglang-0.4.3.post4/sglang/srt/layers/quantization/fp8_utils.py +0 -167
  134. sglang-0.4.3.post4/sglang/srt/lora/backend/__init__.py +0 -28
  135. sglang-0.4.3.post4/sglang/srt/managers/image_processor.py +0 -649
  136. sglang-0.4.3.post4/sglang/srt/mem_cache/chunk_cache.py +0 -97
  137. sglang-0.4.3.post4/sglang/version.py +0 -1
  138. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/LICENSE +0 -0
  139. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/README.md +0 -0
  140. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/setup.cfg +0 -0
  141. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/__init__.py +0 -0
  142. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/api.py +0 -0
  143. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/bench_offline_throughput.py +0 -0
  144. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/bench_one_batch.py +0 -0
  145. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/bench_one_batch_server.py +0 -0
  146. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/check_env.py +0 -0
  147. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/global_config.py +0 -0
  148. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/__init__.py +0 -0
  149. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/backend/__init__.py +0 -0
  150. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/backend/anthropic.py +0 -0
  151. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/backend/base_backend.py +0 -0
  152. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/backend/litellm.py +0 -0
  153. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/backend/openai.py +0 -0
  154. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  155. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/backend/vertexai.py +0 -0
  156. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/choices.py +0 -0
  157. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/compiler.py +0 -0
  158. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/interpreter.py +0 -0
  159. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/ir.py +0 -0
  160. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/lang/tracer.py +0 -0
  161. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/launch_server.py +0 -0
  162. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/llama3_eval.py +0 -0
  163. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/aio_rwlock.py +0 -0
  164. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/configs/chatglm.py +0 -0
  165. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/configs/dbrx.py +0 -0
  166. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/configs/device_config.py +0 -0
  167. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/configs/exaone.py +0 -0
  168. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/configs/load_config.py +0 -0
  169. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/configs/qwen2_5_vl_config.py +0 -0
  170. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  171. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/constrained/llguidance_backend.py +0 -0
  172. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
  173. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  174. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  175. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/__init__.py +0 -0
  176. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/communication_op.py +0 -0
  177. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  178. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  179. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  180. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  181. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  182. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  183. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  184. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/distributed/utils.py +0 -0
  185. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/entrypoints/verl_engine.py +0 -0
  186. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/activation.py +0 -0
  187. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  188. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  189. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  190. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  191. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/attention/utils.py +0 -0
  192. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/layernorm.py +0 -0
  193. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  194. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  195. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  196. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  197. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  198. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  199. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  200. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  201. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  202. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  203. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  204. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  205. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  206. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  207. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  208. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  209. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  210. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  211. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  212. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  213. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  214. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  215. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  216. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  217. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  218. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  219. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  220. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  221. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  222. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  223. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  224. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  225. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  226. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  227. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  228. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  229. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  230. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  231. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  232. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  233. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  234. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  235. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  236. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  237. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  238. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  239. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  240. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  241. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  242. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  243. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  244. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  245. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  246. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  247. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  248. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  249. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  250. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  251. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  252. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  253. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  254. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  255. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  256. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  257. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  258. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  259. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  260. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  261. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  262. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  263. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  264. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  265. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  266. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  267. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  268. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  269. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  270. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  271. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  272. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  273. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  274. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  275. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  276. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  277. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  278. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  279. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  280. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  281. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  282. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  283. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  284. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  285. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  286. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  287. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  288. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  289. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  290. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  291. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  292. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  293. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  294. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  295. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  296. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  297. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  298. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  299. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  300. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  301. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  302. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  303. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  304. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  305. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/moe/topk.py +0 -0
  306. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/pooler.py +0 -0
  307. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  308. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  309. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  310. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  311. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  312. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  313. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  314. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  315. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  316. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  317. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  318. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  319. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  320. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  321. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  322. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  323. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  324. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  325. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  326. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  327. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  328. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  329. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  330. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  331. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  332. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  333. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  334. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  335. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  336. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  337. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  338. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  339. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  340. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  341. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  342. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  343. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  344. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  345. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  346. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  347. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  348. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  349. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  350. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  351. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  352. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  353. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  354. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  355. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  356. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  357. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  358. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  359. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  360. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  361. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  362. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  363. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  364. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  365. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  366. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  367. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  368. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  369. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  370. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  371. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  372. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  373. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  374. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  375. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  376. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  377. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  378. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  379. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  380. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  381. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  382. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  383. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  384. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  385. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  386. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  387. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  388. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  389. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  390. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  392. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  395. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  396. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  397. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  401. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  402. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  408. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  412. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  415. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  417. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  424. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  428. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  431. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  432. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  433. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/gptq.py +0 -0
  435. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  436. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  437. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/radix_attention.py +0 -0
  438. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  439. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/backend/base_backend.py +0 -0
  440. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  441. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/backend/triton_backend.py +0 -0
  442. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/layers.py +0 -0
  443. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/lora.py +0 -0
  444. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/lora_config.py +0 -0
  445. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/lora_manager.py +0 -0
  446. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/mem_pool.py +0 -0
  447. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  448. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
  449. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
  450. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
  451. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
  452. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/lora/utils.py +0 -0
  453. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/configure_logging.py +0 -0
  454. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  455. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/session_controller.py +0 -0
  456. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/tp_worker.py +0 -0
  457. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/managers/utils.py +0 -0
  458. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  459. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/metrics/collector.py +0 -0
  460. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/metrics/func_timer.py +0 -0
  461. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/mm_utils.py +0 -0
  462. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/model_loader/__init__.py +0 -0
  463. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/model_loader/utils.py +0 -0
  464. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/model_parallel.py +0 -0
  465. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/baichuan.py +0 -0
  466. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/chatglm.py +0 -0
  467. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/commandr.py +0 -0
  468. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/dbrx.py +0 -0
  469. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/deepseek.py +0 -0
  470. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/exaone.py +0 -0
  471. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/gemma.py +0 -0
  472. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/gemma2.py +0 -0
  473. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/gemma2_reward.py +0 -0
  474. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/gpt2.py +0 -0
  475. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
  476. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/granite.py +0 -0
  477. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/internlm2.py +0 -0
  478. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/internlm2_reward.py +0 -0
  479. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/llama.py +0 -0
  480. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/llama_classification.py +0 -0
  481. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/llama_eagle.py +0 -0
  482. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/llama_embedding.py +0 -0
  483. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/llama_reward.py +0 -0
  484. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/llava.py +0 -0
  485. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/llavavid.py +0 -0
  486. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/minicpm.py +0 -0
  487. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/minicpm3.py +0 -0
  488. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/mistral.py +0 -0
  489. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/mixtral.py +0 -0
  490. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  491. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/olmo.py +0 -0
  492. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/olmo2.py +0 -0
  493. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/olmoe.py +0 -0
  494. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/phi3_small.py +0 -0
  495. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/qwen.py +0 -0
  496. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
  497. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/qwen2_moe.py +0 -0
  498. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/qwen2_rm.py +0 -0
  499. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/registry.py +0 -0
  500. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/stablelm.py +0 -0
  501. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/torch_native_llama.py +0 -0
  502. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/xverse.py +0 -0
  503. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/xverse_moe.py +0 -0
  504. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/models/yivl.py +0 -0
  505. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/reasoning_parser.py +0 -0
  506. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  507. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  508. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  509. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  510. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  511. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/sampling/sampling_params.py +0 -0
  512. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/server.py +0 -0
  513. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  514. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
  515. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/speculative/eagle_utils.py +0 -0
  516. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/speculative/spec_info.py +0 -0
  517. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  518. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/srt/warmup.py +0 -0
  519. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  520. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  521. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/run_eval.py +0 -0
  522. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/send_one.py +0 -0
  523. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/simple_eval_common.py +0 -0
  524. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  525. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  526. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/simple_eval_math.py +0 -0
  527. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  528. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  529. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/test_activation.py +0 -0
  530. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/test_block_fp8_ep.py +0 -0
  531. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/test_layernorm.py +0 -0
  532. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang/test/test_programs.py +0 -0
  533. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang.egg-info/dependency_links.txt +0 -0
  534. {sglang-0.4.3.post4 → sglang-0.4.4.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.3.post4
+Version: 0.4.4.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -211,19 +211,22 @@ Classifier: License :: OSI Approved :: Apache Software License
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: aiohttp
 Requires-Dist: requests
 Requires-Dist: tqdm
 Requires-Dist: numpy
 Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
-Requires-Dist: aiohttp; extra == "runtime-common"
+Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
+Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
@@ -233,24 +236,20 @@ Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
+Requires-Dist: transformers==4.48.3; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.14; extra == "runtime-common"
-Requires-Dist: ninja; extra == "runtime-common"
-Requires-Dist: transformers==4.48.3; extra == "runtime-common"
-Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
-Requires-Dist: datasets; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.15; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.3.post6; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.2.post1; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.5; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
 Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
-Requires-Dist: sgl-kernel==0.0.3.post6; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
 Requires-Dist: outlines==0.1.11; extra == "srt-hip"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.3.post4"
+version = "0.4.4.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -13,17 +13,19 @@ classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: Apache Software License",
 ]
-dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
+dependencies = ["aiohttp", "requests", "tqdm", "numpy", "IPython", "setproctitle"]
 
 [project.optional-dependencies]
 runtime_common = [
-    "aiohttp",
+    "datasets",
     "decord",
     "fastapi",
     "hf_transfer",
     "huggingface_hub",
     "interegular",
+    "llguidance>=0.6.15",
     "modelscope",
+    "ninja",
     "orjson",
     "packaging",
     "pillow",
@@ -33,19 +35,16 @@ runtime_common = [
     "python-multipart",
     "pyzmq>=25.1.2",
     "torchao>=0.7.0",
+    "transformers==4.48.3",
     "uvicorn",
     "uvloop",
-    "xgrammar==0.1.14",
-    "ninja",
-    "transformers==4.48.3",
-    "llguidance>=0.6.15",
-    "datasets"
+    "xgrammar==0.1.15",
 ]
 
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.3.post6",
-    "flashinfer_python==0.2.2.post1",
+    "sgl-kernel==0.0.5",
+    "flashinfer_python==0.2.3",
     "torch==2.5.1",
     "vllm>=0.6.4.post1,<=0.7.2",
     "cuda-python",
@@ -54,7 +53,7 @@ srt = [
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20250114, not from public vllm whl
-srt_hip = ["sglang[runtime_common]", "sgl-kernel==0.0.3.post6", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
+srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
 
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
@@ -1006,7 +1006,7 @@ async def benchmark(
 
     # Flush cache
    if "sglang" in backend:
-        requests.post(base_url + "/flush_cache")
+        requests.post(base_url + "/flush_cache", headers=get_auth_headers())
 
    time.sleep(1.0)
 
@@ -230,6 +230,29 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="janus-pro",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "User": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+        image_token="<image_placeholder>\n",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
@@ -384,6 +407,12 @@ def match_deepseek(model_path: str):
         return get_chat_template("deepseek-v3")
 
 
+@register_chat_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if "janus" in model_path.lower():
+        return get_chat_template("janus-pro")
+
+
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
     if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
@@ -6,10 +6,12 @@ from typing import List, Tuple
 import torch
 import torch.library
 
-from sglang.srt.utils import is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
 
 logger = logging.getLogger(__name__)
-use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)
+use_vllm_custom_allreduce = get_bool_env_var(
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+)
 
 if not is_hpu():
     # ROCm does not use vllm custom allreduce
@@ -75,42 +77,42 @@ else:
         rank: int,
         full_nvlink: bool,
     ) -> int:
-        return sgl_kernel.ops.allreduce.init_custom_ar(
+        return sgl_kernel.allreduce.init_custom_ar(
             meta, rank_data, handles, offsets, rank, full_nvlink
         )
 
     def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-        sgl_kernel.ops.allreduce.all_reduce_reg(fa, inp, out)
+        sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
 
     def all_reduce_unreg(
         fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
     ) -> None:
-        sgl_kernel.ops.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
+        sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
 
     def dispose(fa: int) -> None:
-        sgl_kernel.ops.allreduce.dispose(fa)
+        sgl_kernel.allreduce.dispose(fa)
 
     def meta_size() -> int:
-        return sgl_kernel.ops.allreduce.meta_size()
+        return sgl_kernel.allreduce.meta_size()
 
     def register_buffer(
         fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
     ) -> None:
-        return sgl_kernel.ops.allreduce.register_buffer(fa, t, handles, offsets)
+        return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
 
     def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
-        return sgl_kernel.ops.allreduce.get_graph_buffer_ipc_meta(fa)
+        return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
 
     def register_graph_buffers(
         fa: int, handles: List[str], offsets: List[List[int]]
     ) -> None:
-        sgl_kernel.ops.allreduce.register_graph_buffers(fa, handles, offsets)
+        sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
 
     def allocate_meta_buffer(size: int) -> torch.Tensor:
-        return sgl_kernel.ops.allreduce.allocate_meta_buffer(size)
+        return sgl_kernel.allreduce.allocate_meta_buffer(size)
 
     def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
-        return sgl_kernel.ops.allreduce.get_meta_buffer_ipc_handle(inp)
+        return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
 
 else:
     # TRTLLM custom allreduce
@@ -123,7 +125,7 @@ else:
         barrier_in: List[int],
         barrier_out: List[int],
     ) -> int:
-        return sgl_kernel.ops.init_custom_reduce(
+        return sgl_kernel.init_custom_reduce(
             rank_id,
             world_size,
             rank_data_base,
@@ -134,15 +136,15 @@ else:
         )
 
     def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-        sgl_kernel.ops.custom_reduce(fa, inp, out)
+        sgl_kernel.custom_reduce(fa, inp, out)
 
     def dispose(fa: int) -> None:
-        sgl_kernel.ops.custom_dispose(fa)
+        sgl_kernel.custom_dispose(fa)
 
     def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-        return sgl_kernel.ops.get_graph_buffer_ipc_meta(fa)
+        return sgl_kernel.get_graph_buffer_ipc_meta(fa)
 
     def register_graph_buffers(
         fa: int, handles: List[List[int]], offsets: List[List[int]]
     ) -> None:
-        sgl_kernel.ops.register_graph_buffers(fa, handles, offsets)
+        sgl_kernel.register_graph_buffers(fa, handles, offsets)
@@ -1,6 +1,7 @@
 from sglang.srt.configs.chatglm import ChatGLMConfig
 from sglang.srt.configs.dbrx import DbrxConfig
 from sglang.srt.configs.exaone import ExaoneConfig
+from sglang.srt.configs.janus_pro import MultiModalityConfig
 from sglang.srt.configs.qwen2_5_vl_config import (
     Qwen2_5_VLConfig,
     Qwen2_5_VLVisionConfig,
@@ -12,4 +13,5 @@ __all__ = [
     "DbrxConfig",
     "Qwen2_5_VLConfig",
     "Qwen2_5_VLVisionConfig",
+    "MultiModalityConfig",
 ]