sglang 0.4.9.post6.tar.gz → 0.4.10.tar.gz
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- {sglang-0.4.9.post6/sglang.egg-info → sglang-0.4.10}/PKG-INFO +3 -4
- {sglang-0.4.9.post6 → sglang-0.4.10}/README.md +0 -1
- {sglang-0.4.9.post6 → sglang-0.4.10}/pyproject.toml +3 -3
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/bench_one_batch.py +3 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/__init__.py +8 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/model_config.py +3 -0
- sglang-0.4.10/sglang/srt/configs/step3_vl.py +172 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/conversation.py +23 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/decode.py +2 -8
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/prefill.py +2 -6
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/parallel_state.py +86 -1
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/engine.py +14 -18
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/http_server.py +10 -2
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/eplb/expert_distribution.py +5 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/eplb/expert_location.py +17 -6
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/eplb/expert_location_dispatch.py +1 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/eplb/expert_location_updater.py +2 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/function_call_parser.py +2 -0
- sglang-0.4.10/sglang/srt/function_call/step3_detector.py +436 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/hf_transformers_utils.py +2 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/jinja_template_utils.py +4 -1
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/cutlass_moe.py +2 -1
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/ep_moe/layer.py +20 -640
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/fp8.py +0 -18
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/unquant.py +0 -8
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/w4afp8.py +1 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/cache_controller.py +143 -45
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/data_parallel_controller.py +2 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/io_struct.py +0 -2
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/scheduler.py +89 -671
- sglang-0.4.10/sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang-0.4.10/sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang-0.4.10/sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/template_manager.py +62 -19
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/tokenizer_manager.py +123 -74
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/tp_worker.py +4 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/mem_cache/hicache_storage.py +45 -11
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/mem_cache/hiradix_cache.py +15 -4
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/mem_cache/memory_pool_host.py +73 -1
- sglang-0.4.10/sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang-0.4.10/sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang-0.4.10/sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
- sglang-0.4.10/sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang-0.4.10/sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/model_executor/model_runner.py +5 -0
- sglang-0.4.10/sglang/srt/models/arcee.py +532 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/deepseek_v2.py +2 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/glm4_moe.py +3 -1
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/granitemoe.py +3 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/grok.py +3 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/hunyuan.py +1 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/llama4.py +3 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/mixtral.py +3 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/olmoe.py +3 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/phimoe.py +1 -0
- sglang-0.4.10/sglang/srt/models/step3_vl.py +994 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang-0.4.10/sglang/srt/multimodal/processors/step3_vl.py +515 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/reasoning_parser.py +2 -1
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/server_args.py +10 -13
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/speculative/eagle_worker.py +2 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/utils.py +0 -11
- sglang-0.4.10/sglang/version.py +1 -0
- {sglang-0.4.9.post6 → sglang-0.4.10/sglang.egg-info}/PKG-INFO +3 -4
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang.egg-info/SOURCES.txt +13 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang.egg-info/requires.txt +2 -2
- sglang-0.4.9.post6/sglang/version.py +0 -1
- {sglang-0.4.9.post6 → sglang-0.4.10}/LICENSE +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/setup.cfg +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/api.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/bench_serving.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/check_env.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/compile_deep_gemm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/eval/llama3_eval.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/eval/loogle_eval.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/global_config.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/chat_template.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/choices.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/ir.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/launch_server.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/profiler.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/code_completion_parser.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/deepseekvl2.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/internvl.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/janus_pro.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/kimi_vl.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/kimi_vl_moonvit.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/update_config.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/configs/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/connector/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/connector/base_connector.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/connector/redis.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/connector/s3.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/connector/serde/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/connector/serde/safe_serde.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/connector/serde/serde.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/connector/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/constants.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/constrained/llguidance_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/constrained/xgrammar_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/custom_op.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/debug_utils/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/debug_utils/dump_comparator.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/debug_utils/dumper.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/debug_utils/text_comparator.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/ascend/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/ascend/conn.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/ascend/transfer_engine.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/base/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/base/conn.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/common/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/common/conn.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/common/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/decode_schedule_batch_mixin.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/fake/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/fake/conn.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/kv_events.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/launch_lb.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/mini_lb.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/mooncake/conn.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/nixl/conn.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/disaggregation/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/device_communicators/npu_communicator.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/device_communicators/pymscclpp.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/device_communicators/quick_all_reduce.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/EngineBase.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/http_server_engine.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/openai/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/openai/protocol.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/openai/serving_base.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/openai/serving_completions.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/openai/serving_embedding.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/openai/serving_rerank.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/openai/serving_score.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/openai/usage_processor.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/entrypoints/openai/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/eplb/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/eplb/eplb_algorithms/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/eplb/eplb_manager.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/eplb/eplb_simulator/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/eplb/eplb_simulator/reader.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/base_format_detector.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/core_types.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/deepseekv3_detector.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/ebnf_composer.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/glm4_moe_detector.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/kimik2_detector.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/llama32_detector.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/mistral_detector.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/pythonic_detector.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/qwen25_detector.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/qwen3_coder_detector.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/function_call/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/amx_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/aiter_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/ascend_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/cutlass_mla_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/flashattention_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/hybrid_attn_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/intel_amx_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/merge_state.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/tbo_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/triton_ops/merge_state.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/attention/vision.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/communicator.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/dp_attention.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/elementwise.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/flashinfer_comm_fusion.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/cutlass_moe_params.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/cutlass_w4a8_moe.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/router.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/token_dispatcher/standard.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/moe/topk.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/multimodal.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/parameter.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/awq.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/awq_triton.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/gptq.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/int8_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/kv_cache.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/marlin_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/petit.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/petit_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/qoq.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/scalar_type.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/backend/base_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/backend/triton_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/layers.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/lora_registry.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/mem_pool.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/triton_ops/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/lora/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/configure_logging.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/mm_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/multimodal_processor.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/schedule_batch.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/scheduler_input_blocker.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/managers/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/mem_cache/allocator.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/mem_cache/multimodal_cache.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/mem_cache/swa_radix_cache.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/model_loader/loader.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/model_loader/weight_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/bert.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/clip.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/deepseek_janus_pro.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/deepseek_nextn.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/deepseek_vl2.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/gemma3_causal.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/gemma3_mm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/gemma3n_audio.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/gemma3n_causal.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/gemma3n_mm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/glm4.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/glm4_moe_nextn.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/granite.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/idefics2.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/interns1.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/internvl.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/kimi_vl.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/kimi_vl_moonvit.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/llama.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/llama_eagle3.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/mimo.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/mimo_mtp.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/minicpmo.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/minicpmv.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/mllama4.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/persimmon.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/phi.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/phi4mm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/phi4mm_audio.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/phi4mm_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/pixtral.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/qwen2_5_vl.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/qwen2_audio.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/qwen2_classification.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/qwen2_rm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/qwen3.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/qwen3_moe.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/roberta.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/siglip.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/transformers.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/vila.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/mm_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/clip.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/gemma3.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/gemma3n.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/internvl.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/janus_pro.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/kimi_vl.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/llava.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/minicpm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/mlama.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/mllama4.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/phi4mm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/pixtral.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/qwen_audio.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/qwen_vl.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/multimodal/processors/vila.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/operations.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/operations_strategy.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/patch_torch.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/poll_based_barrier.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/speculative/build_eagle_tree.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/speculative/eagle_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/torch_memory_saver_adapter.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/two_batch_overlap.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/warmup.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/srt/weight_sync/utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/attention/__init__.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/attention/test_flashattn_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/runners.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/send_one.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_block_fp8.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_block_fp8_ep.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_custom_ops.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_cutlass_moe.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_cutlass_w4a8_moe.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_deepep_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_dynamic_grad_mode.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_fp4_moe.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_marlin_moe.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_marlin_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang/test/test_utils.py +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.9.post6 → sglang-0.4.10}/sglang.egg-info/top_level.txt +0 -0
--- sglang-0.4.9.post6/sglang.egg-info/PKG-INFO
+++ sglang-0.4.10/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.9.post6
+Version: 0.4.10
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                        Version 2.0, January 2004
@@ -246,14 +246,14 @@ Requires-Dist: sentencepiece; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.54.
+Requires-Dist: transformers==4.54.1; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.21; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.2.
+Requires-Dist: sgl-kernel==0.2.8; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"
@@ -427,7 +427,6 @@ SGLang has been deployed at large scale, generating trillions of tokens in produ
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
 
 ## Contact Us
-
 For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
 
 ## Acknowledgment
--- sglang-0.4.9.post6/README.md
+++ sglang-0.4.10/README.md
@@ -70,7 +70,6 @@ SGLang has been deployed at large scale, generating trillions of tokens in produ
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
 
 ## Contact Us
-
 For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
 
 ## Acknowledgment
--- sglang-0.4.9.post6/pyproject.toml
+++ sglang-0.4.10/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.9.post6"
+version = "0.4.10"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -45,7 +45,7 @@ runtime_common = [
     "soundfile==0.13.1",
     "scipy",
     "torchao==0.9.0",
-    "transformers==4.54.
+    "transformers==4.54.1",
     "timm==1.0.16",
     "uvicorn",
     "uvloop",
@@ -54,7 +54,7 @@ runtime_common = [
 
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.2.
+    "sgl-kernel==0.2.8",
     "torch==2.7.1",
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
--- sglang-0.4.9.post6/sglang/bench_one_batch.py
+++ sglang-0.4.10/sglang/bench_one_batch.py
@@ -138,6 +138,7 @@ class BenchArgs:
 def load_model(server_args, port_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
+    moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
 
     model_config = ModelConfig.from_server_args(server_args)
     model_runner = ModelRunner(
@@ -146,6 +147,8 @@ def load_model(server_args, port_args, tp_rank):
         gpu_id=tp_rank,
         tp_rank=tp_rank,
         tp_size=server_args.tp_size,
+        moe_ep_rank=moe_ep_rank,
+        moe_ep_size=server_args.ep_size,
         pp_rank=0,
         pp_size=1,
         nccl_port=port_args.nccl_port,
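
The derived moe_ep_rank maps each tensor-parallel rank onto its expert-parallel group by integer division. A quick illustration of the arithmetic (the sizes are example values, not from this diff):

    # Illustration of the moe_ep_rank formula added above.
    # With tp_size GPUs split into ep_size expert-parallel groups, each group
    # spans tp_size // ep_size consecutive tp ranks, so integer division
    # recovers the group id.
    tp_size, ep_size = 8, 2
    for tp_rank in range(tp_size):
        moe_ep_rank = tp_rank // (tp_size // ep_size)
        print(f"tp_rank={tp_rank} -> moe_ep_rank={moe_ep_rank}")
    # tp ranks 0-3 -> moe_ep_rank 0; tp ranks 4-7 -> moe_ep_rank 1
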
--- sglang-0.4.9.post6/sglang/srt/configs/__init__.py
+++ sglang-0.4.10/sglang/srt/configs/__init__.py
@@ -5,6 +5,11 @@ from sglang.srt.configs.exaone import ExaoneConfig
 from sglang.srt.configs.janus_pro import MultiModalityConfig
 from sglang.srt.configs.kimi_vl import KimiVLConfig
 from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+from sglang.srt.configs.step3_vl import (
+    Step3TextConfig,
+    Step3VisionEncoderConfig,
+    Step3VLConfig,
+)
 
 __all__ = [
     "ExaoneConfig",
@@ -14,4 +19,7 @@ __all__ = [
     "MultiModalityConfig",
     "KimiVLConfig",
     "MoonViTConfig",
+    "Step3VLConfig",
+    "Step3TextConfig",
+    "Step3VisionEncoderConfig",
 ]
--- sglang-0.4.9.post6/sglang/srt/configs/model_config.py
+++ sglang-0.4.10/sglang/srt/configs/model_config.py
@@ -335,6 +335,8 @@ class ModelConfig:
             "num_key_value_heads",
             # For ChatGLM:
             "multi_query_group_num",
+            # For Step3
+            "num_attention_groups",
         ]
         for attr in attributes:
             num_kv_heads = getattr(self.hf_text_config, attr, None)
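
The loop above takes the first attribute present on the HF text config, so the new entry lets Step3 checkpoints report their kv-head count. A hedged sketch of the resolution order (the config class below is a simplified stand-in, not the real one):

    # Sketch of the attribute-probing pattern; FakeStep3TextConfig is hypothetical.
    class FakeStep3TextConfig:
        num_attention_groups = 1  # Step3 exposes its kv-head count under this name

    attributes = [
        "num_key_value_heads",    # most models
        "multi_query_group_num",  # ChatGLM
        "num_attention_groups",   # Step3 (newly added)
    ]
    cfg = FakeStep3TextConfig()
    # First attribute that exists and is not None wins.
    num_kv_heads = next(
        (getattr(cfg, attr) for attr in attributes if getattr(cfg, attr, None) is not None),
        None,
    )
    print(num_kv_heads)  # -> 1
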
@@ -644,6 +646,7 @@ multimodal_model_archs = [
     "InternS1ForConditionalGeneration",
     "Phi4MMForCausalLM",
     "VILAForConditionalGeneration",
+    "Step3VLForConditionalGeneration",
 ]
--- /dev/null
+++ sglang-0.4.10/sglang/srt/configs/step3_vl.py
@@ -0,0 +1,172 @@
+from typing import Any, Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Step3VisionEncoderConfig(PretrainedConfig):
+    model_type = "step3_vision_encoder"
+
+    def __init__(
+        self,
+        hidden_size=1792,
+        intermediate_size=3072,
+        output_hidden_size=4096,
+        num_hidden_layers=63,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=728,
+        patch_size=14,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.output_hidden_size = output_hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        super().__init__(**kwargs)
+
+
+class Step3TextConfig(PretrainedConfig):
+    model_type = "step3_text"
+    architectures = ["Step3TextForCausalLM"]
+
+    def __init__(
+        self,
+        hidden_size: int = 7168,
+        intermediate_size: int = 18432,
+        num_attention_heads: int = 64,
+        num_attention_groups: int = 1,
+        num_hidden_layers: int = 61,
+        max_seq_len: int = 65536,
+        vocab_size: int = 128815,
+        rms_norm_eps: float = 1e-5,
+        moe_intermediate_size: int = 5120,
+        moe_num_experts: int = 48,
+        moe_top_k: int = 3,
+        rope_theta: float = 500000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embedding: int = 65536,
+        share_expert_dim: int = 5120,
+        share_q_dim: int = 2048,
+        head_dim: int = 256,
+        norm_expert_weight: bool = False,
+        moe_layers_enum: tuple[int] = (
+            4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+            20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+            36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+            52, 53, 54, 55, 56, 57, 58, 59,
+        ),
+        **kwargs,
+    ) -> None:
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.max_position_embedding = max_position_embedding
+        self.share_expert_dim = share_expert_dim
+        self.share_q_dim = share_q_dim
+        self.head_dim = head_dim
+        self.norm_expert_weight = norm_expert_weight
+        self.moe_layers_enum = moe_layers_enum
+
+        super().__init__(**kwargs)
+
+
+class Step3VLConfig(PretrainedConfig):
+    model_type = "step3_vl"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
+        text_config: Optional[Union[dict, Step3TextConfig]] = None,
+        understand_projector_stride: int = 1,
+        projector_bias: bool = True,
+        image_token_id: int = 128001,
+        **kwargs,
+    ) -> None:
+        if vision_config is None:
+            vision_config = Step3VisionEncoderConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = Step3VisionEncoderConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = Step3TextConfig()
+        elif isinstance(text_config, dict):
+            text_config = Step3TextConfig(**text_config)
+        self.text_config = text_config
+
+        self.understand_projector_stride = understand_projector_stride
+        self.projector_bias = projector_bias
+        self.hidden_size = text_config.hidden_size
+        self.image_token_id = image_token_id
+
+        super().__init__(**kwargs)
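
Based only on the constructors above, nested dicts are promoted to typed sub-configs and None falls back to defaults. A usage sketch (constructed directly, not loaded from the HF hub):

    # Usage sketch for the new Step3 config classes.
    from sglang.srt.configs.step3_vl import Step3VLConfig

    cfg = Step3VLConfig(
        vision_config={"hidden_size": 1792, "patch_size": 14},  # dict is promoted
        text_config=None,  # None falls back to Step3TextConfig() defaults
    )
    assert cfg.vision_config.patch_size == 14
    assert cfg.hidden_size == cfg.text_config.hidden_size == 7168
    print(cfg.model_type)  # -> "step3_vl"
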
--- sglang-0.4.9.post6/sglang/srt/conversation.py
+++ sglang-0.4.10/sglang/srt/conversation.py
@@ -994,6 +994,23 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="step3-vl",
+        system_message="<|begin▁of▁sentence|>You are a helpful assistant",
+        system_template="{system_message}\n",
+        roles=(
+            "<|BOT|>user\n",
+            "<|BOT|>assistant\n<think>\n",
+        ),
+        sep="<|EOT|>",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str="<|EOT|>",
+        image_token="<im_patch>",
+        # add_bos=True,
+    )
+)
+
 
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
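
Given the fields above, a single user turn should render roughly as below. This is an assumption about how SeparatorStyle.NO_COLON_SINGLE concatenates role tags, messages, and the separator, not a verified output of conversation.py:

    # Hedged illustration of the step3-vl prompt shape for one user turn.
    system = "<|begin▁of▁sentence|>You are a helpful assistant\n"
    prompt = (
        system
        + "<|BOT|>user\n" + "What is in this image? <im_patch>" + "<|EOT|>"
        + "<|BOT|>assistant\n<think>\n"  # generation starts inside the think block
    )
    print(prompt)
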
@@ -1103,3 +1120,9 @@ def match_vila(model_path: str):
 def match_mimo_vl(model_path: str):
     if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
         return "mimo-vl"
+
+
+# @register_conv_template_matching_function
+# def match_step3(model_path: str):
+#     if re.search(r"step3", model_path, re.IGNORECASE):
+#         return "step3-vl"
--- sglang-0.4.9.post6/sglang/srt/disaggregation/decode.py
+++ sglang-0.4.10/sglang/srt/disaggregation/decode.py
@@ -694,10 +694,7 @@ class SchedulerDisaggregationDecodeMixin:
             + len(self.disagg_decode_prealloc_queue.queue)
             == 0
         ):
-
-            self.check_memory()
-            self.new_token_ratio = self.init_new_token_ratio
-            self.maybe_sleep_on_idle()
+            self.self_check_during_idle()
 
         self.last_batch = batch
@@ -771,10 +768,7 @@ class SchedulerDisaggregationDecodeMixin:
             + len(self.disagg_decode_prealloc_queue.queue)
             == 0
         ):
-
-            self.check_memory()
-            self.new_token_ratio = self.init_new_token_ratio
-            self.maybe_sleep_on_idle()
+            self.self_check_during_idle()
 
         self.last_batch = batch
         self.last_batch_in_queue = last_batch_in_queue
--- sglang-0.4.9.post6/sglang/srt/disaggregation/prefill.py
+++ sglang-0.4.10/sglang/srt/disaggregation/prefill.py
@@ -287,9 +287,7 @@ class SchedulerDisaggregationPrefillMixin:
         self.process_disagg_prefill_inflight_queue()
 
         if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
-            self.check_memory()
-            self.new_token_ratio = self.init_new_token_ratio
-            self.maybe_sleep_on_idle()
+            self.self_check_during_idle()
 
         self.last_batch = batch
         # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -337,9 +335,7 @@ class SchedulerDisaggregationPrefillMixin:
        self.process_disagg_prefill_inflight_queue()
 
        if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
-            self.check_memory()
-            self.new_token_ratio = self.init_new_token_ratio
-            self.maybe_sleep_on_idle()
+            self.self_check_during_idle()
 
        self.last_batch = batch
        # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
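
Both decode and prefill mixins now fold the same three idle-time statements into one scheduler helper. This diff does not show its body; a hypothetical reconstruction from the removed call sites (the real method presumably lives in the refactored scheduler, likely among the new scheduler mixin files listed above):

    # Hypothetical body of the consolidated helper, inferred from the lines it
    # replaces; a method-body sketch, not the actual sglang implementation.
    def self_check_during_idle(self):
        self.check_memory()
        self.new_token_ratio = self.init_new_token_ratio
        self.maybe_sleep_on_idle()
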
--- sglang-0.4.9.post6/sglang/srt/distributed/parallel_state.py
+++ sglang-0.4.10/sglang/srt/distributed/parallel_state.py
@@ -354,6 +354,13 @@ class GroupCoordinator:
             self.cpu_group, 1 << 22, 6
         )
 
+    def __repr__(self):
+        return (
+            f"ranks={self.ranks} rank={self.rank} local_rank={self.local_rank} use_pynccl={self.use_pynccl} "
+            f"device_group={self.device_group} cpu_group={self.cpu_group} unique_name={self.unique_name} "
+            f"world_size={self.world_size} rank_in_group={self.rank_in_group}"
+        )
+
     @property
     def first_rank(self):
         """Return the global rank of the first process in the group"""
@@ -1141,6 +1148,20 @@ def get_tp_group() -> GroupCoordinator:
     return _TP
 
 
+_MOE_EP: Optional[GroupCoordinator] = None
+_MOE_TP: Optional[GroupCoordinator] = None
+
+
+def get_moe_ep_group() -> GroupCoordinator:
+    assert _MOE_EP is not None, "expert model parallel group is not initialized"
+    return _MOE_EP
+
+
+def get_moe_tp_group() -> GroupCoordinator:
+    assert _MOE_TP is not None, "expert model parallel group is not initialized"
+    return _MOE_TP
+
+
 # kept for backward compatibility
 get_tensor_model_parallel_group = get_tp_group
@@ -1250,6 +1271,7 @@ def init_distributed_environment(
 
 def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,
+    expert_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
     backend: Optional[str] = None,
     duplicate_tp_group: bool = False,
@@ -1327,6 +1349,45 @@ def initialize_model_parallel(
         _TP.pynccl_comm.disabled = False
         _PDMUX_PREFILL_TP_GROUP.pynccl_comm.disabled = False
 
+    moe_ep_size = expert_model_parallel_size
+
+    moe_tp_size = tensor_model_parallel_size // moe_ep_size
+    global _MOE_EP
+    assert _MOE_EP is None, "expert model parallel group is already initialized"
+    group_ranks = []
+    for i in range(num_tensor_model_parallel_groups):
+        for j in range(moe_tp_size):
+            st = i * tensor_model_parallel_size + j
+            en = (i + 1) * tensor_model_parallel_size + j
+            ranks = list(range(st, en, moe_tp_size))
+            group_ranks.append(ranks)
+
+    _MOE_EP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_custom_allreduce=False,
+        group_name="moe_ep",
+    )
+
+    global _MOE_TP
+    assert _MOE_TP is None, "expert model parallel group is already initialized"
+    group_ranks = []
+    for i in range(num_tensor_model_parallel_groups):
+        for j in range(moe_ep_size):
+            st = i * tensor_model_parallel_size + j * moe_tp_size
+            en = i * tensor_model_parallel_size + (j + 1) * moe_tp_size
+            ranks = list(range(st, en))
+            group_ranks.append(ranks)
+
+    _MOE_TP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_custom_allreduce=False,
+        group_name="moe_tp",
+    )
+
     # Build the pipeline model-parallel groups.
     num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
     global _PP
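
The two loops above partition each tensor-parallel group into strided expert-parallel subgroups and contiguous MoE tensor-parallel subgroups. A worked example of the rank math for one 8-GPU TP group with expert_model_parallel_size=4 (so moe_tp_size=2), using the same range expressions as the diff with i=0:

    # Worked example of the group construction for tp=8, ep=4 (moe_tp_size=2).
    tensor_model_parallel_size, moe_ep_size = 8, 4
    moe_tp_size = tensor_model_parallel_size // moe_ep_size

    # EP groups stride across the TP group: same formula as the first loop (i=0).
    ep_groups = [
        list(range(j, tensor_model_parallel_size + j, moe_tp_size))
        for j in range(moe_tp_size)
    ]
    # MoE-TP groups are contiguous slices: same formula as the second loop (i=0).
    tp_groups = [
        list(range(j * moe_tp_size, (j + 1) * moe_tp_size))
        for j in range(moe_ep_size)
    ]
    print(ep_groups)  # [[0, 2, 4, 6], [1, 3, 5, 7]]  -- strided
    print(tp_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]  -- contiguous
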
@@ -1347,6 +1408,7 @@ def initialize_model_parallel(
 
 def ensure_model_parallel_initialized(
     tensor_model_parallel_size: int,
+    expert_model_parallel_size: int,
     pipeline_model_parallel_size: int,
     backend: Optional[str] = None,
 ) -> None:
@@ -1357,7 +1419,10 @@ def ensure_model_parallel_initialized(
     backend = backend or torch.distributed.get_backend(get_world_group().device_group)
     if not model_parallel_is_initialized():
         initialize_model_parallel(
-            tensor_model_parallel_size,
+            tensor_model_parallel_size,
+            expert_model_parallel_size,
+            pipeline_model_parallel_size,
+            backend,
         )
         return
@@ -1417,6 +1482,26 @@ def get_tensor_model_parallel_rank():
     return get_tp_group().rank_in_group
 
 
+def get_moe_expert_parallel_world_size():
+    """Return world size for the moe expert parallel group."""
+    return get_moe_ep_group().world_size
+
+
+def get_moe_expert_parallel_rank():
+    """Return my rank for the moe expert parallel group."""
+    return get_moe_ep_group().rank_in_group
+
+
+def get_moe_tensor_parallel_world_size():
+    """Return world size for the moe tensor parallel group."""
+    return get_moe_tp_group().world_size
+
+
+def get_moe_tensor_parallel_rank():
+    """Return my rank for the moe tensor parallel group."""
+    return get_moe_tp_group().rank_in_group
+
+
 def destroy_model_parallel():
     """Set the groups to none and destroy them."""
     global _TP
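
Downstream MoE code can then locate itself in both groups. A usage sketch; these getters assert their group exists, so they are only valid after initialize_model_parallel has run with expert_model_parallel_size set:

    # Usage sketch for the new helpers (valid only after group initialization).
    from sglang.srt.distributed.parallel_state import (
        get_moe_expert_parallel_rank,
        get_moe_expert_parallel_world_size,
        get_moe_tensor_parallel_rank,
        get_moe_tensor_parallel_world_size,
    )

    ep_rank, ep_size = get_moe_expert_parallel_rank(), get_moe_expert_parallel_world_size()
    tp_rank, tp_size = get_moe_tensor_parallel_rank(), get_moe_tensor_parallel_world_size()
    print(f"expert shard {ep_rank}/{ep_size}, intra-expert shard {tp_rank}/{tp_size}")
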
@@ -648,29 +648,23 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.2.
+            "0.2.8",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
-
-
-
-
-
+    if True:  # Keep this check for internal code compatibility
+        # Register the signal handler.
+        # The child processes will send SIGQUIT to this process when any error happens
+        # This process then clean up the whole process tree
+        # Note: This sigquit handler is used in the launch phase, and may be replaced by
+        # the running_phase_sigquit_handler in the tokenizer manager after the grpc server is launched.
+        def launch_phase_sigquit_handler(signum, frame):
+            logger.error(
+                "Received sigquit from a child process. It usually means the child failed."
             )
+            kill_process_tree(os.getpid())
 
-
-
-    # Register the signal handler.
-    # The child processes will send SIGQUIT to this process when any error happens
-    # This process then clean up the whole process tree
-    def sigquit_handler(signum, frame):
-        logger.error(
-            "Received sigquit from a child process. It usually means the child failed."
-        )
-        kill_process_tree(os.getpid())
-
-    signal.signal(signal.SIGQUIT, sigquit_handler)
+    signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler)
 
     # Set mp start method
     mp.set_start_method("spawn", force=True)
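The renamed launch_phase_sigquit_handler is ordinary POSIX signal plumbing. A self-contained sketch of the same pattern, with os._exit standing in for sglang's kill_process_tree:

```python
# Minimal reproduction of the launch-phase SIGQUIT pattern above (POSIX only).
import os
import signal

def launch_phase_sigquit_handler(signum, frame):
    # A child process signaled a fatal error; tear the whole tree down.
    print("Received SIGQUIT from a child; exiting.")
    os._exit(1)  # stand-in for sglang's kill_process_tree(os.getpid())

signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler)
```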
@@ -725,6 +719,7 @@ def _launch_subprocesses(
                     + ((pp_rank % pp_size_per_node) * tp_size_per_node)
                     + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
                 )
+                moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
                 proc = mp.Process(
                     target=run_scheduler_process,
                     args=(
@@ -732,6 +727,7 @@ def _launch_subprocesses(
                         port_args,
                         gpu_id,
                         tp_rank,
+                        moe_ep_rank,
                         pp_rank,
                         None,
                         writer,
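The new moe_ep_rank collapses TP ranks onto expert-parallel ranks by integer division. Replaying the formula with assumed sizes:

```python
# How tp ranks map to MoE EP ranks under the formula above.
tp_size, ep_size = 8, 2  # assumed values; ep_size must divide tp_size
for tp_rank in range(tp_size):
    moe_ep_rank = tp_rank // (tp_size // ep_size)
    print(f"tp_rank={tp_rank} -> moe_ep_rank={moe_ep_rank}")
# tp ranks 0-3 map to EP rank 0; tp ranks 4-7 map to EP rank 1.
```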
@@ -238,6 +238,9 @@ async def health() -> Response:
 @app.get("/health_generate")
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""
+    if _global_state.tokenizer_manager.gracefully_exit:
+        logger.info("Health check request received during shutdown. Returning 503.")
+        return Response(status_code=503)
 
     sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
     rid = f"HEALTH_CHECK_{time.time()}"
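The early 503 lets load balancers detect a draining server instead of timing out. A hedged client-side probe; the host and port are assumptions for illustration:

```python
# Probe the /health_generate endpoint above; adjust host/port for your deployment.
import requests

resp = requests.get("http://localhost:30000/health_generate", timeout=30)
if resp.status_code == 503:
    print("Server is draining; route traffic elsewhere.")
elif resp.status_code == 200:
    print("Server generated a token; healthy.")
```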
@@ -260,9 +263,14 @@ async def health_generate(request: Request) -> Response:
         async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
             break
 
-
+    # This request is a special request.
+    # If the server already has something running, this request will be ignored, so it creates zero overhead.
+    # If the server is not running, this request will be run, so we know whether the server is healthy.
     task = asyncio.create_task(gen())
-
+
+    # As long as we receive any response from the detokenizer/scheduler, we consider the server is healthy.
+    tic = time.time()
+    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
         await asyncio.sleep(1)
         if _global_state.tokenizer_manager.last_receive_tstamp > tic:
             task.cancel()
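The loop above is a poll-until-deadline pattern. A self-contained asyncio sketch; the 20-second HEALTH_CHECK_TIMEOUT is an assumed value for illustration:

```python
# Standalone version of the deadline-poll above; the timeout is an assumption.
import asyncio
import time

HEALTH_CHECK_TIMEOUT = 20  # seconds

async def wait_for_response(get_last_receive_tstamp) -> bool:
    """Poll once per second until a response arrives or the deadline passes."""
    tic = time.time()
    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
        await asyncio.sleep(1)
        if get_last_receive_tstamp() > tic:
            return True  # something answered after we started probing
    return False

# Demo: a fake timestamp source that starts "responding" after ~2 seconds.
print(asyncio.run(wait_for_response(lambda: time.time() - 2)))
```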
@@ -127,12 +127,12 @@ class OpenAIServingChat(OpenAIServingBase):
             request.skip_special_tokens = False
             if not isinstance(request.tool_choice, str):
                 tools = [
-                    item.model_dump()
+                    item.function.model_dump()
                     for item in request.tools
                     if item.function.name == request.tool_choice.function.name
                 ]
             else:
-                tools = [item.model_dump() for item in request.tools]
+                tools = [item.function.model_dump() for item in request.tools]
 
             tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
             parser = FunctionCallParser(request.tools, tool_call_parser)
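The one-call change above alters the payload shape: model_dump() on the tool wraps the schema in a {'type', 'function'} envelope, while model_dump() on its function field yields the bare schema. A sketch with simplified pydantic models (the field shapes are assumptions):

```python
# Illustrates tool.model_dump() vs tool.function.model_dump() (pydantic v2).
from pydantic import BaseModel

class Function(BaseModel):
    name: str
    parameters: dict = {}

class Tool(BaseModel):
    type: str = "function"
    function: Function

tool = Tool(function=Function(name="get_weather"))
print(tool.model_dump())
# {'type': 'function', 'function': {'name': 'get_weather', 'parameters': {}}}
print(tool.function.model_dump())
# {'name': 'get_weather', 'parameters': {}}
```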
@@ -178,25 +178,6 @@ class OpenAIServingChat(OpenAIServingBase):
                     audio_data,
                     modalities,
                 )
-
-                if "tool_calls" in processed_msg and isinstance(
-                    processed_msg.get("tool_calls"), list
-                ):
-                    for call in processed_msg["tool_calls"]:
-                        try:
-                            if "arguments" in call["function"] and isinstance(
-                                call["function"]["arguments"], str
-                            ):
-                                call["function"]["arguments"] = json.loads(
-                                    call["function"]["arguments"]
-                                )
-                        except json.JSONDecodeError as e:
-                            # Log a warning or error if JSON parsing fails for arguments
-                            logger.warning(
-                                f"Failed to parse tool call arguments as JSON: {e}"
-                            )
-                            # Decide whether to continue or raise the exception based on desired behavior
-                            continue  # Or raise e if strict parsing is required
                 openai_compatible_messages.append(processed_msg)
 
                 # Handle assistant prefix for continue_final_message
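With that block removed, assistant tool-call arguments are no longer eagerly json.loads-ed here; they pass through as the JSON string the client sent. An illustrative message shape (not sglang code):

```python
# After this deletion, "arguments" stays a string like this all the way through:
msg = {
    "role": "assistant",
    "tool_calls": [
        {"function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}}
    ],
}
assert isinstance(msg["tool_calls"][0]["function"]["arguments"], str)
```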
@@ -47,6 +47,11 @@ class ExpertDistributionRecorder(ABC):
         rank: int,
     ):
         if server_args.expert_distribution_recorder_mode is not None:
+            assert (
+                expert_location_metadata is not None
+            ), "ExpertLocationMetadata is required for expert distribution recording. One possible"
+            "reason is that you are using a model that does not support expert distribution"
+            "recording. Try setting `get_model_config_for_expert_location` in your model."
             return _ExpertDistributionRecorderReal(
                 server_args, expert_location_metadata, rank
             )
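Note that only the first string literal in the added assert reaches the AssertionError; the two trailing strings sit on their own statement lines and are discarded at runtime. A sketch of a form that carries the full message, shown for illustration rather than as a patch:

```python
# Full message attached via implicit string concatenation inside parentheses.
assert expert_location_metadata is not None, (
    "ExpertLocationMetadata is required for expert distribution recording. "
    "One possible reason is that you are using a model that does not support "
    "expert distribution recording. Try setting "
    "`get_model_config_for_expert_location` in your model."
)
```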