sglang 0.4.3.post3.tar.gz → 0.4.4.tar.gz
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- {sglang-0.4.3.post3/sglang.egg-info → sglang-0.4.4}/PKG-INFO +9 -9
- {sglang-0.4.3.post3 → sglang-0.4.4}/pyproject.toml +10 -10
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/bench_serving.py +2 -2
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/chat_template.py +29 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/_custom_ops.py +19 -17
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/configs/__init__.py +2 -0
- sglang-0.4.4/sglang/srt/configs/janus_pro.py +629 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/configs/model_config.py +24 -14
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/conversation.py +80 -2
- sglang-0.4.4/sglang/srt/custom_op.py +101 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/parallel_state.py +10 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/entrypoints/engine.py +5 -3
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/entrypoints/http_server.py +1 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/hf_transformers_utils.py +16 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/flashinfer_backend.py +95 -49
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/triton_backend.py +5 -5
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/vision.py +43 -62
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/linear.py +1 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/ep_moe/layer.py +25 -9
- sglang-0.4.4/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang-0.4.4/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang-0.4.4/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang-0.4.4/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/parameter.py +10 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/__init__.py +90 -68
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/blockwise_int8.py +1 -2
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.4/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/fp8.py +174 -106
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/fp8_kernel.py +210 -38
- sglang-0.4.4/sglang/srt/layers/quantization/fp8_utils.py +308 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/modelopt_quant.py +5 -1
- sglang-0.4.3.post3/sglang/srt/layers/quantization/w8a8_int8.py → sglang-0.4.4/sglang/srt/layers/quantization/w8a8_fp8.py +34 -23
- sglang-0.4.4/sglang/srt/layers/quantization/w8a8_int8.py +266 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/rotary_embedding.py +5 -3
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/sampler.py +29 -35
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/vocab_parallel_embedding.py +0 -1
- sglang-0.4.4/sglang/srt/lora/backend/__init__.py +25 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/cache_controller.py +72 -8
- sglang-0.4.4/sglang/srt/managers/image_processor.py +55 -0
- sglang-0.4.4/sglang/srt/managers/image_processors/base_image_processor.py +219 -0
- sglang-0.4.4/sglang/srt/managers/image_processors/janus_pro.py +79 -0
- sglang-0.4.4/sglang/srt/managers/image_processors/llava.py +152 -0
- sglang-0.4.4/sglang/srt/managers/image_processors/minicpmv.py +86 -0
- sglang-0.4.4/sglang/srt/managers/image_processors/mlama.py +60 -0
- sglang-0.4.4/sglang/srt/managers/image_processors/qwen_vl.py +161 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/io_struct.py +33 -15
- sglang-0.4.4/sglang/srt/managers/multi_modality_padding.py +134 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/schedule_batch.py +212 -117
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/schedule_policy.py +40 -8
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/scheduler.py +258 -782
- sglang-0.4.4/sglang/srt/managers/scheduler_output_processor_mixin.py +611 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/tokenizer_manager.py +7 -6
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/mem_cache/base_prefix_cache.py +6 -8
- sglang-0.4.4/sglang/srt/mem_cache/chunk_cache.py +65 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/mem_cache/hiradix_cache.py +63 -34
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/mem_cache/memory_pool.py +112 -46
- sglang-0.4.4/sglang/srt/mem_cache/paged_allocator.py +283 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/mem_cache/radix_cache.py +117 -36
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/metrics/collector.py +8 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/model_executor/forward_batch_info.py +12 -8
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/model_executor/model_runner.py +153 -134
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/model_loader/loader.py +2 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/model_loader/weight_utils.py +1 -1
- sglang-0.4.4/sglang/srt/models/deepseek_janus_pro.py +2127 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/deepseek_nextn.py +23 -3
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/deepseek_v2.py +25 -19
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/minicpmv.py +28 -89
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/mllama.py +1 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/qwen2.py +0 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/qwen2_5_vl.py +25 -50
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/qwen2_vl.py +33 -49
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/openai_api/adapter.py +37 -15
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/openai_api/protocol.py +8 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/server_args.py +19 -20
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/speculative/build_eagle_tree.py +6 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -11
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/speculative/eagle_utils.py +2 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/speculative/eagle_worker.py +109 -38
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/utils.py +104 -9
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/runners.py +104 -10
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/test_block_fp8.py +106 -16
- sglang-0.4.4/sglang/test/test_custom_ops.py +88 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/test_utils.py +20 -4
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/utils.py +0 -4
- sglang-0.4.4/sglang/version.py +1 -0
- {sglang-0.4.3.post3 → sglang-0.4.4/sglang.egg-info}/PKG-INFO +9 -9
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang.egg-info/SOURCES.txt +45 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang.egg-info/requires.txt +8 -8
- sglang-0.4.3.post3/sglang/srt/custom_op.py +0 -40
- sglang-0.4.3.post3/sglang/srt/layers/quantization/fp8_utils.py +0 -167
- sglang-0.4.3.post3/sglang/srt/lora/backend/__init__.py +0 -28
- sglang-0.4.3.post3/sglang/srt/managers/image_processor.py +0 -649
- sglang-0.4.3.post3/sglang/srt/mem_cache/chunk_cache.py +0 -97
- sglang-0.4.3.post3/sglang/version.py +0 -1
- {sglang-0.4.3.post3 → sglang-0.4.4}/LICENSE +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/README.md +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/setup.cfg +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/__init__.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/api.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/bench_one_batch.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/check_env.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/global_config.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/__init__.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/choices.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/ir.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/launch_server.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/llama3_eval.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/configs/qwen2_5_vl_config.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/constrained/llguidance_backend.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/constrained/xgrammar_backend.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/entrypoints/verl_engine.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/function_call_parser.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/attention/utils.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/dp_attention.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/moe/topk.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/gptq.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/quantization/int8_utils.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/backend/base_backend.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/backend/triton_backend.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/layers.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/mem_pool.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/triton_ops/__init__.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/lora/utils.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/configure_logging.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/managers/utils.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/granite.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/grok.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/llama.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/qwen2_rm.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/reasoning_parser.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/server.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/torch_memory_saver_adapter.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/warmup.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/send_one.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/test_block_fp8_ep.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.3.post3 → sglang-0.4.4}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.3.post3/sglang.egg-info → sglang-0.4.4}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.3.post3
+Version: 0.4.4
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -211,19 +211,22 @@ Classifier: License :: OSI Approved :: Apache Software License
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: aiohttp
 Requires-Dist: requests
 Requires-Dist: tqdm
 Requires-Dist: numpy
 Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
-Requires-Dist: aiohttp; extra == "runtime-common"
+Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
+Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
@@ -233,23 +236,20 @@ Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
+Requires-Dist: transformers==4.48.3; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
-Requires-Dist: ninja; extra == "runtime-common"
-Requires-Dist: transformers==4.48.3; extra == "runtime-common"
-Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.15; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: sgl-kernel==0.0.5; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
 Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
-Requires-Dist: sgl-kernel==0.0.3.post6; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
 Requires-Dist: outlines==0.1.11; extra == "srt-hip"
```
{sglang-0.4.3.post3 → sglang-0.4.4}/pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.3.post3"
+version = "0.4.4"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -13,17 +13,19 @@ classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: Apache Software License",
 ]
-dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
+dependencies = ["aiohttp", "requests", "tqdm", "numpy", "IPython", "setproctitle"]
 
 [project.optional-dependencies]
 runtime_common = [
-    "aiohttp",
+    "datasets",
     "decord",
     "fastapi",
     "hf_transfer",
     "huggingface_hub",
     "interegular",
+    "llguidance>=0.6.15",
     "modelscope",
+    "ninja",
     "orjson",
     "packaging",
     "pillow",
@@ -33,18 +35,16 @@ runtime_common = [
     "python-multipart",
     "pyzmq>=25.1.2",
     "torchao>=0.7.0",
+    "transformers==4.48.3",
     "uvicorn",
     "uvloop",
-    "xgrammar==0.1.
-    "ninja",
-    "transformers==4.48.3",
-    "llguidance>=0.6.15"
+    "xgrammar==0.1.15",
 ]
 
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.
-    "flashinfer_python==0.2.
+    "sgl-kernel==0.0.5",
+    "flashinfer_python==0.2.3",
     "torch==2.5.1",
     "vllm>=0.6.4.post1,<=0.7.2",
     "cuda-python",
@@ -53,7 +53,7 @@ srt = [
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20250114, not from public vllm whl
-srt_hip = ["sglang[runtime_common]", "sgl-kernel==0.0.3.post6", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
+srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
 
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
```
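The pin bumps above (xgrammar, sgl-kernel, flashinfer_python) plus the aiohttp move into base dependencies are the substance of this release's packaging change. A quick post-upgrade sanity check, sketched with stdlib tooling only — the distribution names are taken from the pyproject entries above:

```python
# Confirm the pins from this diff are what actually got installed.
from importlib.metadata import PackageNotFoundError, version

for pkg, expected in [
    ("sglang", "0.4.4"),
    ("xgrammar", "0.1.15"),
    ("transformers", "4.48.3"),
]:
    try:
        print(f"{pkg}: installed {version(pkg)}, expected {expected}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```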
{sglang-0.4.3.post3 → sglang-0.4.4}/sglang/bench_serving.py

```diff
@@ -220,7 +220,7 @@ async def async_request_openai_completions(
 
                     most_recent_timestamp = timestamp
                     generated_text += data["choices"][0]["text"]
-                    output_len = data.get("usage", {}).get(
+                    output_len = (data.get("usage") or {}).get(
                         "completion_tokens", output_len
                     )
 
```
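The `(data.get("usage") or {})` form matters because `dict.get`'s default is only used when the key is absent, not when it is present with a `null`/`None` value — as streaming completion chunks often are. A minimal standalone illustration:

```python
# Why the "or {}" guard added above is needed: with an explicit "usage": null
# in the payload, dict.get's default is ignored and the old chained lookup
# dereferences None.
data = {"choices": [{"text": "hi"}], "usage": None}

safe = (data.get("usage") or {}).get("completion_tokens", 0)
print(safe)  # 0 — falls back cleanly

try:
    data.get("usage", {}).get("completion_tokens", 0)
except AttributeError as e:
    print(f"old-style lookup fails: {e}")  # 'NoneType' object has no attribute 'get'
```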
{sglang-0.4.3.post3 → sglang-0.4.4}/sglang/bench_serving.py

```diff
@@ -1006,7 +1006,7 @@ async def benchmark(
 
     # Flush cache
     if "sglang" in backend:
-        requests.post(base_url + "/flush_cache")
+        requests.post(base_url + "/flush_cache", headers=get_auth_headers())
 
     time.sleep(1.0)
 
```
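The benchmark's cache flush now attaches auth headers, so it keeps working against servers launched with an API key. `get_auth_headers` is defined elsewhere in bench_serving.py and is not shown in this diff; the sketch below is only a plausible reading of it, not the actual implementation:

```python
import os

# Hypothetical stand-in for bench_serving.get_auth_headers (the real helper
# may differ): turn an API key from the environment into a Bearer header,
# or send no extra headers when no key is set.
def get_auth_headers() -> dict:
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        return {"Authorization": f"Bearer {api_key}"}
    return {}
```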
{sglang-0.4.3.post3 → sglang-0.4.4}/sglang/lang/chat_template.py

```diff
@@ -230,6 +230,29 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="janus-pro",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "User": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+        image_token="<image_placeholder>\n",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
@@ -384,6 +407,12 @@ def match_deepseek(model_path: str):
         return get_chat_template("deepseek-v3")
 
 
+@register_chat_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if "janus" in model_path.lower():
+        return get_chat_template("janus-pro")
+
+
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
     if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
```
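Together these two hunks register a Janus-Pro chat template and auto-select it whenever the model path contains "janus" (e.g. a deepseek-ai Janus-Pro checkpoint). A small sketch of how the registered prefixes and suffixes compose into a prompt, using only the literals from the diff above — `render_turn` is a hypothetical helper for illustration, not sglang API:

```python
# Manual composition of one user/assistant exchange from the janus-pro
# role_prefix_and_suffix table registered above.
role_prefix_and_suffix = {
    "system": ("", ""),
    "User": ("<|User|>", ""),
    "assistant": ("<|Assistant|>", "<|end▁of▁sentence|>"),
}

def render_turn(role: str, text: str) -> str:
    prefix, suffix = role_prefix_and_suffix[role]
    return f"{prefix}{text}{suffix}"

prompt = render_turn("User", "Describe this image.") + render_turn("assistant", "A cat.")
print(prompt)
# <|User|>Describe this image.<|Assistant|>A cat.<|end▁of▁sentence|>
```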
{sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/_custom_ops.py

```diff
@@ -6,10 +6,12 @@ from typing import List, Tuple
 import torch
 import torch.library
 
-from sglang.srt.utils import is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
 
 logger = logging.getLogger(__name__)
-use_vllm_custom_allreduce =
+use_vllm_custom_allreduce = get_bool_env_var(
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+)
 
 if not is_hpu():
     # ROCm does not use vllm custom allreduce
```
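The hard-coded flag becomes an environment switch that defaults to on. `get_bool_env_var` lives in sglang/srt/utils.py and is not shown in this diff; the stand-in below only illustrates the `default="true"` semantics the call above relies on:

```python
import os

# Hypothetical stand-in for sglang.srt.utils.get_bool_env_var, shown only to
# illustrate the defaulting behavior; the real helper may differ in detail.
def get_bool_env_var(name: str, default: str = "false") -> bool:
    return os.getenv(name, default).lower() in ("true", "1")

# With default="true", custom allreduce stays enabled unless explicitly off:
os.environ.pop("USE_VLLM_CUSTOM_ALLREDUCE", None)
assert get_bool_env_var("USE_VLLM_CUSTOM_ALLREDUCE", default="true") is True
os.environ["USE_VLLM_CUSTOM_ALLREDUCE"] = "0"
assert get_bool_env_var("USE_VLLM_CUSTOM_ALLREDUCE", default="true") is False
```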
{sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/_custom_ops.py

```diff
@@ -75,42 +77,42 @@ else:
         rank: int,
         full_nvlink: bool,
     ) -> int:
-        return sgl_kernel.
+        return sgl_kernel.allreduce.init_custom_ar(
             meta, rank_data, handles, offsets, rank, full_nvlink
         )
 
     def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-        sgl_kernel.
+        sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
 
     def all_reduce_unreg(
         fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
     ) -> None:
-        sgl_kernel.
+        sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
 
     def dispose(fa: int) -> None:
-        sgl_kernel.
+        sgl_kernel.allreduce.dispose(fa)
 
     def meta_size() -> int:
-        return sgl_kernel.
+        return sgl_kernel.allreduce.meta_size()
 
     def register_buffer(
         fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
     ) -> None:
-        return sgl_kernel.
+        return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
 
     def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
-        return sgl_kernel.
+        return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
 
     def register_graph_buffers(
         fa: int, handles: List[str], offsets: List[List[int]]
     ) -> None:
-        sgl_kernel.
+        sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
 
     def allocate_meta_buffer(size: int) -> torch.Tensor:
-        return sgl_kernel.
+        return sgl_kernel.allreduce.allocate_meta_buffer(size)
 
     def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
-        return sgl_kernel.
+        return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
 
 else:
     # TRTLLM custom allreduce
@@ -123,7 +125,7 @@ else:
         barrier_in: List[int],
         barrier_out: List[int],
     ) -> int:
-        return sgl_kernel.
+        return sgl_kernel.init_custom_reduce(
             rank_id,
             world_size,
             rank_data_base,
@@ -134,15 +136,15 @@ else:
         )
 
     def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-        sgl_kernel.
+        sgl_kernel.custom_reduce(fa, inp, out)
 
     def dispose(fa: int) -> None:
-        sgl_kernel.
+        sgl_kernel.custom_dispose(fa)
 
     def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-        return sgl_kernel.
+        return sgl_kernel.get_graph_buffer_ipc_meta(fa)
 
     def register_graph_buffers(
         fa: int, handles: List[List[int]], offsets: List[List[int]]
     ) -> None:
-        sgl_kernel.
+        sgl_kernel.register_graph_buffers(fa, handles, offsets)
```
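Both branches define the same wrapper names (`dispose`, `get_graph_buffer_ipc_meta`, `register_graph_buffers`, ...), so callers of sglang.srt._custom_ops never branch on the backend — the module-level flag decides once at import time which kernel family backs the stable interface. The pattern in miniature, with placeholder bodies standing in for the sgl_kernel calls:

```python
# One flag, two implementations, one stable name seen by every call site.
USE_VLLM = True  # stands in for use_vllm_custom_allreduce

if USE_VLLM:
    def dispose(fa: int) -> None:
        print("vllm-style dispose", fa)    # would call sgl_kernel.allreduce.dispose
else:
    def dispose(fa: int) -> None:
        print("trtllm-style dispose", fa)  # would call sgl_kernel.custom_dispose

dispose(0)  # caller is oblivious to the backend chosen at import time
```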
{sglang-0.4.3.post3 → sglang-0.4.4}/sglang/srt/configs/__init__.py

```diff
@@ -1,6 +1,7 @@
 from sglang.srt.configs.chatglm import ChatGLMConfig
 from sglang.srt.configs.dbrx import DbrxConfig
 from sglang.srt.configs.exaone import ExaoneConfig
+from sglang.srt.configs.janus_pro import MultiModalityConfig
 from sglang.srt.configs.qwen2_5_vl_config import (
     Qwen2_5_VLConfig,
     Qwen2_5_VLVisionConfig,
@@ -12,4 +13,5 @@ __all__ = [
     "DbrxConfig",
     "Qwen2_5_VLConfig",
     "Qwen2_5_VLVisionConfig",
+    "MultiModalityConfig",
 ]
```
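With the re-export in place, downstream code can import the new Janus-Pro config from the package root rather than reaching into the janus_pro module; a one-line check, assuming sglang 0.4.4 is installed:

```python
# The config is now part of the package's public surface (__all__ above).
from sglang.srt.configs import MultiModalityConfig

print(MultiModalityConfig.__module__)  # sglang.srt.configs.janus_pro
```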