sglang-0.4.10.tar.gz → sglang-0.4.10.post2.tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {sglang-0.4.10/sglang.egg-info → sglang-0.4.10.post2}/PKG-INFO +3 -2
- {sglang-0.4.10 → sglang-0.4.10.post2}/pyproject.toml +4 -2
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/bench_offline_throughput.py +20 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/compile_deep_gemm.py +8 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/global_config.py +5 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/model_config.py +1 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/conversation.py +0 -112
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/decode_schedule_batch_mixin.py +1 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/launch_lb.py +5 -20
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/mooncake/conn.py +33 -15
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/prefill.py +1 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang-0.4.10.post2/sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/parallel_state.py +11 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/engine.py +4 -2
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/http_server.py +35 -15
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/eplb/expert_distribution.py +4 -2
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/hf_transformers_utils.py +25 -10
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang-0.4.10.post2/sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/utils.py +6 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/vision.py +27 -10
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/communicator.py +14 -4
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/linear.py +7 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/logits_processor.py +9 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/ep_moe/layer.py +29 -68
- sglang-0.4.10.post2/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/layer.py +82 -25
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -31
- sglang-0.4.10.post2/sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang-0.4.10/sglang/srt/layers/moe/ep_moe/token_dispatcher.py → sglang-0.4.10.post2/sglang/srt/layers/moe/token_dispatcher/deepep.py +8 -15
- sglang-0.4.10.post2/sglang/srt/layers/moe/utils.py +43 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/fp8.py +57 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/w8a8_int8.py +4 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/vocab_parallel_embedding.py +7 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/lora_registry.py +7 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/cache_controller.py +43 -39
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/data_parallel_controller.py +52 -2
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/io_struct.py +6 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/schedule_batch.py +3 -2
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/schedule_policy.py +3 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/scheduler.py +145 -6
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/template_manager.py +25 -22
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/tokenizer_manager.py +114 -62
- sglang-0.4.10.post2/sglang/srt/managers/utils.py +84 -0
- sglang-0.4.10.post2/sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/hicache_storage.py +13 -12
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/hiradix_cache.py +21 -4
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/memory_pool.py +15 -118
- sglang-0.4.10.post2/sglang/srt/mem_cache/memory_pool_host.py +708 -0
- sglang-0.4.10.post2/sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +8 -2
- sglang-0.4.10.post2/sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang-0.4.10.post2/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +163 -0
- sglang-0.4.10.post2/sglang/srt/mem_cache/storage/nixl/nixl_utils.py +238 -0
- sglang-0.4.10.post2/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +216 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/model_executor/cuda_graph_runner.py +42 -4
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/model_executor/forward_batch_info.py +13 -3
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/model_executor/model_runner.py +13 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/model_loader/weight_utils.py +2 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/deepseek_v2.py +28 -23
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/glm4_moe.py +85 -22
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/grok.py +3 -3
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/llama4.py +13 -2
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/mixtral.py +3 -3
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/mllama4.py +428 -19
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/qwen2_moe.py +1 -4
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/qwen3_moe.py +7 -8
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/step3_vl.py +1 -4
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/base_processor.py +4 -3
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/gemma3n.py +0 -7
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/operations_strategy.py +1 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/server_args.py +115 -21
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/two_batch_overlap.py +6 -4
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/utils.py +4 -24
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/weight_sync/utils.py +1 -1
- sglang-0.4.10.post2/sglang/test/attention/test_trtllm_mla_backend.py +945 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/runners.py +2 -2
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_utils.py +3 -3
- sglang-0.4.10.post2/sglang/version.py +1 -0
- {sglang-0.4.10 → sglang-0.4.10.post2/sglang.egg-info}/PKG-INFO +3 -2
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang.egg-info/SOURCES.txt +15 -4
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang.egg-info/requires.txt +2 -1
- sglang-0.4.10/sglang/srt/managers/utils.py +0 -40
- sglang-0.4.10/sglang/srt/mem_cache/memory_pool_host.py +0 -391
- sglang-0.4.10/sglang/test/attention/__init__.py +0 -0
- sglang-0.4.10/sglang/version.py +0 -1
- {sglang-0.4.10 → sglang-0.4.10.post2}/LICENSE +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/README.md +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/setup.cfg +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/api.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/bench_one_batch.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/bench_serving.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/check_env.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/eval/llama3_eval.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/eval/loogle_eval.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/chat_template.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/choices.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/ir.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/launch_server.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/profiler.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/code_completion_parser.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/deepseekvl2.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/internvl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/janus_pro.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/kimi_vl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/kimi_vl_moonvit.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/step3_vl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/update_config.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/connector/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/connector/base_connector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/connector/redis.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/connector/s3.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/connector/serde/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/connector/serde/safe_serde.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/connector/serde/serde.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/connector/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/constants.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/constrained/llguidance_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/constrained/xgrammar_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/custom_op.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/debug_utils/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/debug_utils/dump_comparator.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/debug_utils/dumper.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/debug_utils/text_comparator.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/ascend/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/ascend/conn.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/ascend/transfer_engine.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/base/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/base/conn.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/common/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/common/conn.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/common/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/decode.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/fake/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/fake/conn.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/kv_events.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/mini_lb.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/nixl/conn.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/device_communicators/npu_communicator.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/device_communicators/pymscclpp.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/device_communicators/quick_all_reduce.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/EngineBase.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/http_server_engine.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/openai/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/openai/protocol.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/openai/serving_base.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/openai/serving_chat.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/openai/serving_completions.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/openai/serving_embedding.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/openai/serving_rerank.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/openai/serving_score.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/openai/usage_processor.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/entrypoints/openai/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/eplb/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/eplb/eplb_algorithms/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/eplb/eplb_manager.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/eplb/eplb_simulator/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/eplb/eplb_simulator/reader.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/eplb/expert_location.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/eplb/expert_location_dispatch.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/eplb/expert_location_updater.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/base_format_detector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/core_types.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/deepseekv3_detector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/ebnf_composer.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/function_call_parser.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/glm4_moe_detector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/kimik2_detector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/llama32_detector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/mistral_detector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/pythonic_detector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/qwen25_detector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/qwen3_coder_detector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/step3_detector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/function_call/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/jinja_template_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/amx_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/aiter_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/ascend_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/hybrid_attn_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/intel_amx_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/merge_state.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/tbo_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/triton_ops/merge_state.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/dp_attention.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/elementwise.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/flashinfer_comm_fusion.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/cutlass_moe.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/cutlass_moe_params.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/cutlass_w4a8_moe.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/router.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/token_dispatcher/standard.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/moe/topk.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/multimodal.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/parameter.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/awq.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/awq_triton.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
- {sglang-0.4.10/sglang/srt/layers/moe/token_dispatcher → sglang-0.4.10.post2/sglang/srt/layers/quantization/compressed_tensors}/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/gptq.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/int8_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/kv_cache.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/marlin_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/petit.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/petit_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/qoq.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/scalar_type.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/unquant.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/w4afp8.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/layers/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/backend/base_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/backend/triton_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/layers.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/mem_pool.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/triton_ops/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/lora/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/configure_logging.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/mm_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/multimodal_processor.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/scheduler_input_blocker.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/scheduler_metrics_mixin.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/scheduler_profiler_mixin.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/scheduler_update_weights_mixin.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/allocator.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/multimodal_cache.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +0 -0
- {sglang-0.4.10/sglang/srt/mem_cache → sglang-0.4.10.post2/sglang/srt/mem_cache/storage}/mooncake_store/mooncake_store.py +0 -0
- {sglang-0.4.10/sglang/srt/mem_cache → sglang-0.4.10.post2/sglang/srt/mem_cache/storage}/mooncake_store/unit_test.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/mem_cache/swa_radix_cache.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/model_loader/loader.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/arcee.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/bert.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/clip.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/deepseek_janus_pro.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/deepseek_nextn.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/deepseek_vl2.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/gemma3_causal.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/gemma3_mm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/gemma3n_audio.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/gemma3n_causal.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/gemma3n_mm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/glm4.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/glm4_moe_nextn.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/granite.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/granitemoe.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/hunyuan.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/idefics2.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/interns1.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/internvl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/kimi_vl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/kimi_vl_moonvit.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/llama.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/llama_eagle3.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/mimo.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/mimo_mtp.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/minicpmo.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/minicpmv.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/persimmon.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/phi.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/phi4mm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/phi4mm_audio.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/phi4mm_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/phimoe.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/pixtral.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/qwen2_5_vl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/qwen2_audio.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/qwen2_classification.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/qwen2_rm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/qwen3.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/roberta.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/siglip.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/transformers.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/vila.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/mm_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/clip.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/gemma3.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/internvl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/janus_pro.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/kimi_vl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/llava.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/minicpm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/mlama.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/mllama4.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/phi4mm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/pixtral.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/qwen_audio.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/qwen_vl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/step3_vl.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/multimodal/processors/vila.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/operations.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/patch_torch.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/poll_based_barrier.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/reasoning_parser.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/speculative/build_eagle_tree.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/speculative/eagle_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/speculative/eagle_worker.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/torch_memory_saver_adapter.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/warmup.py +0 -0
- {sglang-0.4.10/sglang/srt/layers/quantization/compressed_tensors → sglang-0.4.10.post2/sglang/test}/__init__.py +0 -0
- {sglang-0.4.10/sglang/test → sglang-0.4.10.post2/sglang/test/attention}/__init__.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/attention/test_flashattn_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/send_one.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_block_fp8.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_block_fp8_ep.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_custom_ops.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_cutlass_moe.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_cutlass_w4a8_moe.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_deepep_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_dynamic_grad_mode.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_fp4_moe.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_marlin_moe.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_marlin_utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang/utils.py +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.10 → sglang-0.4.10.post2}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.10/sglang.egg-info → sglang-0.4.10.post2}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.10
+Version: 0.4.10.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -250,7 +250,7 @@ Requires-Dist: transformers==4.54.1; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.22; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: sgl-kernel==0.2.8; extra == "srt"
@@ -301,6 +301,7 @@ Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
 Requires-Dist: peft; extra == "test"
 Requires-Dist: sentence_transformers; extra == "test"
+Requires-Dist: pytest; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
{sglang-0.4.10 → sglang-0.4.10.post2}/pyproject.toml
RENAMED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.4.10"
+version = "0.4.10.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -49,7 +49,7 @@ runtime_common = [
   "timm==1.0.16",
   "uvicorn",
   "uvloop",
-  "xgrammar==0.1.
+  "xgrammar==0.1.22",
 ]

 srt = [
@@ -108,6 +108,7 @@ test = [
   "pandas",
   "peft",
   "sentence_transformers",
+  "pytest",
 ]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[torch_memory_saver]", "sglang[decord]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[decord]"]
@@ -130,6 +131,7 @@ dev_cpu = ["sglang[all_cpu]", "sglang[test]"]
 "sglang" = [
   "srt/layers/moe/fused_moe_triton/configs/*/*.json",
   "srt/layers/quantization/configs/*.json",
+  "srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp",
 ]

 [tool.setuptools.packages.find]
{sglang-0.4.10 → sglang-0.4.10.post2}/sglang/bench_offline_throughput.py
RENAMED
@@ -418,6 +418,26 @@ if __name__ == "__main__":
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
     args = parser.parse_args()
+
+    # handling ModelScope model downloads
+    if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() in ("true", "1"):
+        if os.path.exists(args.model_path):
+            print(f"Using local model path: {args.model_path}")
+        else:
+            try:
+                from modelscope import snapshot_download
+
+                print(f"Using ModelScope to download model: {args.model_path}")
+
+                # download the model and replace args.model_path
+                args.model_path = snapshot_download(
+                    args.model_path,
+                )
+                print(f"Model downloaded to: {args.model_path}")
+            except Exception as e:
+                print(f"ModelScope download failed: {str(e)}")
+                raise e
+
     server_args = ServerArgs.from_cli_args(args)
     bench_args = BenchArgs.from_cli_args(args)
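Note: the block above gates ModelScope downloads behind the SGLANG_USE_MODELSCOPE environment variable and prefers an existing local path over a download. A minimal standalone sketch of the same resolution logic (resolve_model_path is a hypothetical helper for illustration, not part of the diff):

    import os

    def resolve_model_path(model_path: str) -> str:
        # Only consult ModelScope when SGLANG_USE_MODELSCOPE is truthy.
        if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() not in ("true", "1"):
            return model_path
        if os.path.exists(model_path):
            return model_path  # an existing local path wins over a download
        from modelscope import snapshot_download
        return snapshot_download(model_path)  # returns the local snapshot directory

In practice this means the benchmark can be pointed at a ModelScope model ID simply by exporting SGLANG_USE_MODELSCOPE=true before launching.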
{sglang-0.4.10 → sglang-0.4.10.post2}/sglang/compile_deep_gemm.py
RENAMED
@@ -17,6 +17,7 @@ import time

 import requests

+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
 from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -52,7 +53,9 @@ class CompileArgs:


 @warmup("compile-deep-gemm")
-async def warm_up_compile(tokenizer_manager: TokenizerManager):
+async def warm_up_compile(
+    disaggregation_mode: str, tokenizer_manager: TokenizerManager
+):
     print("\nGenerate warm up request for compiling DeepGEMM...\n")
     generate_req_input = GenerateReqInput(
         input_ids=[0, 1, 2, 3],
@@ -62,6 +65,10 @@ async def warm_up_compile(tokenizer_manager: TokenizerManager):
             "ignore_eos": True,
         },
     )
+    if disaggregation_mode != "null":
+        generate_req_input.bootstrap_room = 0
+        generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
+
     await tokenizer_manager.generate_request(generate_req_input, None).__anext__()

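The signature change threads the disaggregation mode into the warm-up so prefill/decode deployments can compile DeepGEMM without a live peer: when the mode is not "null", the request is stamped with a fake bootstrap endpoint. A condensed sketch of that flow, assuming the fields shown in the hunks above (the mode value is a placeholder normally passed in by the caller):

    from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
    from sglang.srt.managers.io_struct import GenerateReqInput

    disaggregation_mode = "prefill"  # placeholder; supplied by the server config
    generate_req_input = GenerateReqInput(
        input_ids=[0, 1, 2, 3],
        sampling_params={"ignore_eos": True},  # other sampling params elided here
    )
    if disaggregation_mode != "null":
        # A fake bootstrap endpoint lets the warm-up run without a real peer.
        generate_req_input.bootstrap_room = 0
        generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST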
{sglang-0.4.10 → sglang-0.4.10.post2}/sglang/global_config.py
RENAMED
@@ -30,7 +30,11 @@ class GlobalConfig:
         self.default_new_token_ratio_decay_steps = float(
             os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
         )
-
+        self.torch_empty_cache_interval = float(
+            os.environ.get(
+                "SGLANG_EMPTY_CACHE_INTERVAL", -1
+            )  # in seconds. Set if you observe high memory accumulation over a long serving period.
+        )
         # Runtime constants: others
         self.retract_decode_steps = 20
         self.flashinfer_workspace_size = os.environ.get(
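The new torch_empty_cache_interval knob is read from SGLANG_EMPTY_CACHE_INTERVAL (seconds; -1 disables it). A minimal sketch of the kind of periodic flush such a setting can drive; the loop below is illustrative only, not the scheduler's actual code:

    import os
    import time

    import torch

    interval = float(os.environ.get("SGLANG_EMPTY_CACHE_INTERVAL", -1))
    last_flush = time.monotonic()

    def maybe_empty_cache():
        # Release cached CUDA blocks back to the driver at most once per interval.
        global last_flush
        if interval > 0 and time.monotonic() - last_flush >= interval:
            torch.cuda.empty_cache()
            last_flush = time.monotonic()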
{sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/configs/model_config.py
RENAMED
@@ -112,6 +112,7 @@ class ModelConfig:
         mm_disabled_models = [
             "Gemma3ForConditionalGeneration",
             "Llama4ForConditionalGeneration",
+            "Step3VLForConditionalGeneration",
         ]
         if self.hf_config.architectures[0] in mm_disabled_models:
             enable_multimodal = False
{sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/conversation.py
RENAMED
@@ -954,20 +954,6 @@ register_conv_template(
     )
 )

-register_conv_template(
-    Conversation(
-        name="mimo-vl",
-        system_message="You are MiMo, an AI assistant developed by Xiaomi.",
-        system_template="<|im_start|>system\n{system_message}",
-        roles=("<|im_start|>user", "<|im_start|>assistant"),
-        sep="<|im_end|>\n",
-        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
-        stop_str=["<|im_end|>"],
-        image_token="<|vision_start|><|image_pad|><|vision_end|>",
-    )
-)
-
-
 register_conv_template(
     Conversation(
         name="qwen2-audio",
@@ -981,51 +967,11 @@ register_conv_template(
     )
 )

-register_conv_template(
-    Conversation(
-        name="llama_4_vision",
-        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
-        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
-        roles=("user", "assistant"),
-        sep_style=SeparatorStyle.LLAMA4,
-        sep="",
-        stop_str="<|eot|>",
-        image_token="<|image|>",
-    )
-)
-
-register_conv_template(
-    Conversation(
-        name="step3-vl",
-        system_message="<|begin▁of▁sentence|>You are a helpful assistant",
-        system_template="{system_message}\n",
-        roles=(
-            "<|BOT|>user\n",
-            "<|BOT|>assistant\n<think>\n",
-        ),
-        sep="<|EOT|>",
-        sep_style=SeparatorStyle.NO_COLON_SINGLE,
-        stop_str="<|EOT|>",
-        image_token="<im_patch>",
-        # add_bos=True,
-    )
-)
-

 @register_conv_template_matching_function
 def match_internvl(model_path: str):
     if re.search(r"internvl", model_path, re.IGNORECASE):
         return "internvl-2-5"
-    if re.search(r"intern.*s1", model_path, re.IGNORECASE):
-        return "interns1"
-
-
-@register_conv_template_matching_function
-def match_llama_vision(model_path: str):
-    if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
-        return "llama_3_vision"
-    if re.search(r"llama.*4.*", model_path, re.IGNORECASE):
-        return "llama_4_vision"


 @register_conv_template_matching_function
@@ -1040,22 +986,6 @@ def match_vicuna(model_path: str):
     return "vicuna_v1.1"


-@register_conv_template_matching_function
-def match_llama2_chat(model_path: str):
-    if re.search(
-        r"llama-2.*chat|codellama.*instruct",
-        model_path,
-        re.IGNORECASE,
-    ):
-        return "llama-2"
-
-
-@register_conv_template_matching_function
-def match_mistral(model_path: str):
-    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
-        return "mistral"
-
-
 @register_conv_template_matching_function
 def match_deepseek_vl(model_path: str):
     if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):
@@ -1064,12 +994,6 @@ def match_deepseek_vl(model_path: str):

 @register_conv_template_matching_function
 def match_qwen_chat_ml(model_path: str):
-    if re.search(r"gme.*qwen.*vl", model_path, re.IGNORECASE):
-        return "gme-qwen2-vl"
-    if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
-        return "qwen2-vl"
-    if re.search(r"qwen.*audio", model_path, re.IGNORECASE):
-        return "qwen2-audio"
     if re.search(
         r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
         model_path,
@@ -1078,12 +1002,6 @@ def match_qwen_chat_ml(model_path: str):
         return "chatml-llava"


-@register_conv_template_matching_function
-def match_gemma3_instruct(model_path: str):
-    if re.search(r"gemma-3.*it", model_path, re.IGNORECASE):
-        return "gemma-it"
-
-
 @register_conv_template_matching_function
 def match_openbmb_minicpm(model_path: str):
     if re.search(r"minicpm-v", model_path, re.IGNORECASE):
@@ -1092,37 +1010,7 @@ def match_openbmb_minicpm(model_path: str):
         return "minicpmo"


-@register_conv_template_matching_function
-def match_moonshot_kimivl(model_path: str):
-    if re.search(r"kimi.*vl", model_path, re.IGNORECASE):
-        return "kimi-vl"
-
-
-@register_conv_template_matching_function
-def match_devstral(model_path: str):
-    if re.search(r"devstral", model_path, re.IGNORECASE):
-        return "devstral"
-
-
 @register_conv_template_matching_function
 def match_phi_4_mm(model_path: str):
     if "phi-4-multimodal" in model_path.lower():
         return "phi-4-mm"
-
-
-@register_conv_template_matching_function
-def match_vila(model_path: str):
-    if re.search(r"vila", model_path, re.IGNORECASE):
-        return "chatml"
-
-
-@register_conv_template_matching_function
-def match_mimo_vl(model_path: str):
-    if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
-        return "mimo-vl"
-
-
-# @register_conv_template_matching_function
-# def match_step3(model_path: str):
-#     if re.search(r"step3", model_path, re.IGNORECASE):
-#         return "step3-vl"
{sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/decode_schedule_batch_mixin.py
RENAMED
@@ -88,6 +88,7 @@ class ScheduleBatchDisaggregationDecodeMixin:
         self.extend_lens = [r.extend_input_len for r in reqs]
         self.extend_logprob_start_lens = [r.extend_logprob_start_len for r in reqs]
         self.extend_input_logprob_token_ids = extend_input_logprob_token_ids
+        self.multimodal_inputs = [r.multimodal_inputs for r in reqs]

         # Build sampling info
         self.sampling_info = SamplingBatchInfo.from_schedule_batch(
{sglang-0.4.10 → sglang-0.4.10.post2}/sglang/srt/disaggregation/launch_lb.py
RENAMED
@@ -1,6 +1,8 @@
 import argparse
 import dataclasses

+from sglang.srt.disaggregation.mini_lb import PrefillConfig, run
+

 @dataclasses.dataclass
 class LBArgs:
@@ -18,7 +20,7 @@ class LBArgs:
         parser.add_argument(
             "--rust-lb",
             action="store_true",
-            help="
+            help="Deprecated, please use SGLang Router instead, this argument will have no effect.",
         )
         parser.add_argument(
             "--host",
@@ -115,25 +117,8 @@ def main():
     args = parser.parse_args()
     lb_args = LBArgs.from_cli_args(args)

-
-
-
-        RustLB(
-            host=lb_args.host,
-            port=lb_args.port,
-            policy=lb_args.policy,
-            prefill_infos=lb_args.prefill_infos,
-            decode_infos=lb_args.decode_infos,
-            log_interval=lb_args.log_interval,
-            timeout=lb_args.timeout,
-        ).start()
-    else:
-        from sglang.srt.disaggregation.mini_lb import PrefillConfig, run
-
-        prefill_configs = [
-            PrefillConfig(url, port) for url, port in lb_args.prefill_infos
-        ]
-        run(prefill_configs, lb_args.decode_infos, lb_args.host, lb_args.port)
+    prefill_configs = [PrefillConfig(url, port) for url, port in lb_args.prefill_infos]
+    run(prefill_configs, lb_args.decode_infos, lb_args.host, lb_args.port)


 if __name__ == "__main__":
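After this simplification, main() always routes through the pure-Python mini load balancer; the Rust path is removed and --rust-lb is kept only as a deprecated no-op. A sketch of the resulting call, assuming two prefill workers and one decode worker (all URLs and ports below are placeholders, not values from the diff):

    from sglang.srt.disaggregation.mini_lb import PrefillConfig, run

    # Placeholder endpoints; in practice these come from the CLI arguments.
    prefill_configs = [
        PrefillConfig("http://10.0.0.1:30000", 9000),
        PrefillConfig("http://10.0.0.2:30000", 9001),
    ]
    decode_infos = ["http://10.0.0.3:30000"]

    # Same signature main() now uses: (prefill_configs, decode_infos, host, port).
    run(prefill_configs, decode_infos, "0.0.0.0", 8000)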
@@ -37,6 +37,7 @@ from sglang.srt.disaggregation.utils import DisaggregationMode
|
|
37
37
|
from sglang.srt.server_args import ServerArgs
|
38
38
|
from sglang.srt.utils import (
|
39
39
|
format_tcp_address,
|
40
|
+
get_bool_env_var,
|
40
41
|
get_free_port,
|
41
42
|
get_int_env_var,
|
42
43
|
get_ip,
|
@@ -198,6 +199,10 @@ class MooncakeKVManager(BaseKVManager):
|
|
198
199
|
self.bootstrap_timeout = get_int_env_var(
|
199
200
|
"SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 300
|
200
201
|
)
|
202
|
+
|
203
|
+
self.enable_custom_mem_pool = get_bool_env_var(
|
204
|
+
"SGLANG_MOONCAKE_CUSTOM_MEM_POOL", "false"
|
205
|
+
)
|
201
206
|
elif self.disaggregation_mode == DisaggregationMode.DECODE:
|
202
207
|
self.heartbeat_failures = {}
|
203
208
|
self.session_pool = defaultdict(requests.Session)
|
@@ -258,6 +263,26 @@ class MooncakeKVManager(BaseKVManager):
|
|
258
263
|
socket.connect(endpoint)
|
259
264
|
return socket
|
260
265
|
|
266
|
+
def _transfer_data(self, mooncake_session_id, transfer_blocks):
|
267
|
+
if not transfer_blocks:
|
268
|
+
return 0
|
269
|
+
|
270
|
+
# TODO(shangming): Fix me when nvlink_transport of Mooncake is bug-free
|
271
|
+
if self.enable_custom_mem_pool:
|
272
|
+
# batch_transfer_sync has a higher chance to trigger an accuracy drop for MNNVL, fallback to transfer_sync temporarily
|
273
|
+
for src_addr, dst_addr, length in transfer_blocks:
|
274
|
+
status = self.engine.transfer_sync(
|
275
|
+
mooncake_session_id, src_addr, dst_addr, length
|
276
|
+
)
|
277
|
+
if status != 0:
|
278
|
+
return status
|
279
|
+
return 0
|
280
|
+
else:
|
281
|
+
src_addrs, dst_addrs, lengths = zip(*transfer_blocks)
|
282
|
+
return self.engine.batch_transfer_sync(
|
283
|
+
mooncake_session_id, list(src_addrs), list(dst_addrs), list(lengths)
|
284
|
+
)
|
285
|
+
|
261
286
|
def send_kvcache(
|
262
287
|
self,
|
263
288
|
mooncake_session_id: str,
|
@@ -283,17 +308,14 @@ class MooncakeKVManager(BaseKVManager):
|
|
283
308
|
|
284
309
|
# Worker function for processing a single layer
|
285
310
|
def process_layer(src_ptr: int, dst_ptr: int, item_len: int) -> int:
|
311
|
+
transfer_blocks = []
|
286
312
|
for prefill_index, decode_index in zip(prefill_kv_blocks, dst_kv_blocks):
|
287
313
|
src_addr = src_ptr + int(prefill_index[0]) * item_len
|
288
314
|
dst_addr = dst_ptr + int(decode_index[0]) * item_len
|
289
315
|
length = item_len * len(prefill_index)
|
316
|
+
transfer_blocks.append((src_addr, dst_addr, length))
|
290
317
|
|
291
|
-
|
292
|
-
mooncake_session_id, src_addr, dst_addr, length
|
293
|
-
)
|
294
|
-
if status != 0:
|
295
|
-
return status
|
296
|
-
return 0
|
318
|
+
return self._transfer_data(mooncake_session_id, transfer_blocks)
|
297
319
|
|
298
320
|
futures = [
|
299
321
|
executor.submit(
|
@@ -465,21 +487,17 @@ class MooncakeKVManager(BaseKVManager):
         dst_aux_ptrs: list[int],
         dst_aux_index: int,
     ):
-        src_addr_list = []
-        dst_addr_list = []
-        length_list = []
+        transfer_blocks = []
         prefill_aux_ptrs = self.kv_args.aux_data_ptrs
         prefill_aux_item_lens = self.kv_args.aux_item_lens
+
         for i, dst_aux_ptr in enumerate(dst_aux_ptrs):
             length = prefill_aux_item_lens[i]
             src_addr = prefill_aux_ptrs[i] + length * prefill_aux_index
             dst_addr = dst_aux_ptrs[i] + length * dst_aux_index
-            src_addr_list.append(src_addr)
-            dst_addr_list.append(dst_addr)
-            length_list.append(length)
-        return self.engine.batch_transfer_sync(
-            mooncake_session_id, src_addr_list, dst_addr_list, length_list
-        )
+            transfer_blocks.append((src_addr, dst_addr, length))
+
+        return self._transfer_data(mooncake_session_id, transfer_blocks)
 
     def sync_status_to_decode_endpoint(
         self, remote: str, dst_port: int, room: int, status: int, prefill_rank: int
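With this hunk, the aux-data path builds the same (src_addr, dst_addr, length) tuples as the KV-cache path and funnels them through _transfer_data, so the SGLANG_MOONCAKE_CUSTOM_MEM_POOL toggle governs both kinds of transfer, and an empty transfer list short-circuits to success in one place.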
@@ -460,6 +460,7 @@ class SchedulerDisaggregationPrefillMixin:
 
         # We need to remove the sync in the following function for overlap schedule.
         self.set_next_batch_sampling_info_done(batch)
+        self.maybe_send_health_check_signal()
 
     def process_disagg_prefill_inflight_queue(
         self: Scheduler, rids_to_check: Optional[List[str]] = None
@@ -75,6 +75,7 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False
 
+        self.nccl_version = self.nccl.ncclGetRawVersion()
         if self.rank == 0:
             logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
 
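The cached raw version is an integer such as 22703, which the new allocator below compares against directly. Assuming NCCL's usual version encoding for recent releases (major * 10000 + minor * 100 + patch), it can be decoded as follows; the helper is illustrative, not part of sglang:

    def decode_nccl_version(raw: int) -> str:
        # Assumed encoding: major * 10000 + minor * 100 + patch.
        major, rest = divmod(raw, 10000)
        minor, patch = divmod(rest, 100)
        return f"{major}.{minor}.{patch}"

    print(decode_nccl_version(22703))  # -> "2.27.3", the minimum for symmetric memory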
@@ -259,6 +260,12 @@ class PyNcclCommunicator:
             cudaStream_t(stream.cuda_stream),
         )
 
+    def register_comm_window_raw(self, ptr: int, size: int):
+        return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr), size, 1)
+
+    def deregister_comm_window(self, window):
+        return self.nccl.ncclCommWindowDeregister(self.comm, window)
+
     @contextmanager
     def change_state(
         self, enable: Optional[bool] = None, stream: Optional[torch.cuda.Stream] = None
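A hedged sketch of how the new helpers might be used: register_tensor_window is hypothetical, comm is assumed to be an initialized PyNcclCommunicator backed by NCCL >= 2.27, and the flag value 1 passed inside register_comm_window_raw is assumed to request symmetric registration.

    import torch

    def register_tensor_window(comm, t: torch.Tensor):
        """Register a pool-backed tensor's storage as an NCCL communication
        window; keep the returned handle for deregister_comm_window later."""
        nbytes = t.numel() * t.element_size()
        return comm.register_comm_window_raw(t.data_ptr(), nbytes)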
@@ -0,0 +1,133 @@
+import tempfile
+
+import torch
+from packaging import version
+from torch.cuda.memory import CUDAPluggableAllocator
+
+from sglang.srt.distributed.parallel_state import GroupCoordinator
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+
+nccl_allocator_source = """
+#include <nccl.h>
+extern "C" {
+
+void* nccl_alloc_plug(size_t size, int device, void* stream) {
+  void* ptr;
+  ncclResult_t err = ncclMemAlloc(&ptr, size);
+  return ptr;
+
+}
+
+void nccl_free_plug(void* ptr, size_t size, int device, void* stream) {
+  ncclResult_t err = ncclMemFree(ptr);
+}
+
+}
+"""
+
+_allocator = None
+_mem_pool = None
+_registered_base_addrs = set()
+_graph_pool_id = None
+
+
+def is_symmetric_memory_enabled():
+    return global_server_args_dict["enable_symm_mem"]
+
+
+def set_graph_pool_id(graph_pool_id):
+    global _graph_pool_id
+    _graph_pool_id = graph_pool_id
+
+
+def get_nccl_mem_pool():
+    global _allocator, _mem_pool
+    if _mem_pool is None:
+        out_dir = tempfile.gettempdir()
+        nccl_allocator_libname = "nccl_allocator"
+        torch.utils.cpp_extension.load_inline(
+            name=nccl_allocator_libname,
+            cpp_sources=nccl_allocator_source,
+            with_cuda=True,
+            extra_ldflags=["-lnccl"],
+            verbose=True,
+            is_python_module=False,
+            build_directory=out_dir,
+        )
+        _allocator = CUDAPluggableAllocator(
+            f"{out_dir}/{nccl_allocator_libname}.so",
+            "nccl_alloc_plug",
+            "nccl_free_plug",
+        ).allocator()
+        _mem_pool = torch.cuda.MemPool(_allocator)
+    return _mem_pool
+
+
+class use_symmetric_memory:
+    def __init__(self, group_coordinator: GroupCoordinator):
+        if not is_symmetric_memory_enabled():
+            self.group_coordinator = None
+            self._mem_pool_ctx = None
+            self.is_graph_capture = None
+            self.device = None
+            self.pre_2_8_0 = None
+        else:
+            self.group_coordinator = group_coordinator
+            self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool())
+            self.is_graph_capture = torch.cuda.is_current_stream_capturing()
+            self.device = torch.cuda.current_device()
+            self.pre_2_8_0 = version.parse(torch.__version__) < version.parse("2.8.0")
+
+    def __enter__(self):
+        if not is_symmetric_memory_enabled():
+            return self
+        assert (
+            self.group_coordinator.pynccl_comm is not None
+        ), f"Symmetric memory requires pynccl to be enabled in group '{self.group_coordinator.group_name}'"
+        assert (
+            self.group_coordinator.pynccl_comm.nccl_version >= 22703
+        ), "NCCL version 2.27.3 or higher is required for NCCL symmetric memory"
+        if self.is_graph_capture:
+            assert (
+                _graph_pool_id is not None
+            ), "graph_pool_id is not set under graph capture"
+            # Pause graph memory pool to use symmetric memory with cuda graph
+            if self.pre_2_8_0:
+                torch._C._cuda_endAllocateCurrentStreamToPool(
+                    self.device, _graph_pool_id
+                )
+            else:
+                torch._C._cuda_endAllocateToPool(self.device, _graph_pool_id)
+        self._mem_pool_ctx.__enter__()
+        return self
+
+    def tag(self, tensor: torch.Tensor):
+        if not is_symmetric_memory_enabled():
+            return
+        tensor.symmetric_memory = True
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not is_symmetric_memory_enabled():
+            return
+        global _registered_base_addrs
+        self._mem_pool_ctx.__exit__(exc_type, exc_val, exc_tb)
+        for segment in get_nccl_mem_pool().snapshot():
+            if segment["address"] not in _registered_base_addrs:
+                if segment["stream"] == 0 and self.pre_2_8_0:
+                    # PyTorch version < 2.8.0 has a multi-thread MemPool bug
+                    # See https://github.com/pytorch/pytorch/issues/152861
+                    # Fixed at https://github.com/pytorch/pytorch/commit/f01e628e3b31852983ab30b25bf251f557ba9c0b
+                    # WAR is to skip allocations on the default stream since the forward_pass thread always runs on a custom stream
+                    continue
+                self.group_coordinator.pynccl_comm.register_comm_window_raw(
+                    segment["address"], segment["total_size"]
+                )
+                _registered_base_addrs.add(segment["address"])
+
+        if self.is_graph_capture:
+            if self.pre_2_8_0:
+                torch._C._cuda_beginAllocateToPool(self.device, _graph_pool_id)
+            else:
+                torch._C._cuda_beginAllocateCurrentThreadToPool(
+                    self.device, _graph_pool_id
+                )