sglang-0.4.0.tar.gz → sglang-0.4.0.post1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.4.0 → sglang-0.4.0.post1}/PKG-INFO +5 -4
- {sglang-0.4.0 → sglang-0.4.0.post1}/README.md +3 -2
- {sglang-0.4.0 → sglang-0.4.0.post1}/pyproject.toml +2 -2
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/__init__.py +1 -1
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/outlines_backend.py +5 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/xgrammar_backend.py +5 -5
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/__init__.py +5 -2
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/flashinfer_backend.py +20 -5
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/torch_native_backend.py +22 -8
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_backend.py +22 -8
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
- sglang-0.4.0.post1/sglang/srt/layers/ep_moe/__init__.py +0 -0
- sglang-0.4.0.post1/sglang/srt/layers/ep_moe/kernels.py +349 -0
- sglang-0.4.0.post1/sglang/srt/layers/ep_moe/layer.py +661 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/quantization/__init__.py +2 -2
- sglang-0.4.0.post1/sglang/srt/layers/quantization/fp8.py +559 -0
- sglang-0.4.0.post1/sglang/srt/layers/quantization/fp8_utils.py +27 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/radix_attention.py +4 -2
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/sampler.py +2 -0
- sglang-0.4.0.post1/sglang/srt/layers/torchao_utils.py +73 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/schedule_batch.py +1 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/scheduler.py +69 -65
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +7 -5
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mem_cache/memory_pool.py +5 -1
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_executor/cuda_graph_runner.py +15 -1
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_executor/model_runner.py +11 -4
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_parallel.py +1 -5
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/commandr.py +2 -2
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/deepseek_v2.py +87 -7
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/grok.py +0 -5
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llama.py +0 -5
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/mixtral.py +12 -9
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/phi3_small.py +0 -5
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/qwen2_moe.py +0 -5
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/torch_native_llama.py +0 -5
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/sampling_batch_info.py +9 -8
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/server.py +3 -3
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/server_args.py +43 -4
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/utils.py +50 -0
- sglang-0.4.0.post1/sglang/version.py +1 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang.egg-info/PKG-INFO +5 -4
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang.egg-info/SOURCES.txt +5 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang.egg-info/requires.txt +1 -1
- sglang-0.4.0/sglang/srt/layers/torchao_utils.py +0 -95
- sglang-0.4.0/sglang/version.py +0 -1
- {sglang-0.4.0 → sglang-0.4.0.post1}/LICENSE +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/setup.cfg +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/api.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/bench_latency.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/bench_one_batch.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/bench_serving.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/check_env.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/global_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/chat_template.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/launch_server.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/conversation.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/parallel_state.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/custom_op_util.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/fused_moe_patch.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/fused_moe_triton/fused_moe.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/fused_moe_triton/layer.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/tokenizer_manager.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_loader/loader.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_loader/weight_utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/openai_api/adapter.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/runners.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/test_utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.0 → sglang-0.4.0.post1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.0
+Version: 0.4.0.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -239,7 +239,7 @@ Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
+Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: flashinfer>=0.1.6; extra == "srt"
 Provides-Extra: srt-hip
@@ -315,6 +315,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
 [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -346,13 +347,13 @@ The core features include:
 - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
-Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
+Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.4.0 → sglang-0.4.0.post1}/README.md
@@ -16,6 +16,7 @@
 [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -47,13 +48,13 @@ The core features include:
 - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
-Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
+Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.4.0 → sglang-0.4.0.post1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.0"
+version = "0.4.0.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,7 +23,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
     "xgrammar>=0.1.4"]
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python", "flashinfer>=0.1.6"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer>=0.1.6"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
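The only dependency change is the new upper bound on vllm for the `srt` extra. Before upgrading an existing environment, the installed vllm can be checked against the new specifier. A minimal sketch using the standard `importlib.metadata` module and the third-party `packaging` library (neither is part of this diff; the specifier string comes from the pyproject.toml hunk above):

```python
# Sketch: verify an installed vllm against the new sglang[srt] pin.
from importlib.metadata import PackageNotFoundError, version

from packaging.specifiers import SpecifierSet
from packaging.version import Version

SRT_VLLM_SPEC = SpecifierSet(">=0.6.3.post1,<=0.6.4.post1")

try:
    installed = Version(version("vllm"))
except PackageNotFoundError:
    print("vllm is not installed")
else:
    ok = SRT_VLLM_SPEC.contains(installed)
    print(f"vllm {installed} satisfies the sglang[srt] pin: {ok}")
```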
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/outlines_backend.py
@@ -42,6 +42,7 @@ class OutlinesGrammar(BaseGrammarObject):
         self.guide = guide
         self.jump_forward_map = jump_forward_map
         self.state = 0
+        self.finished = False
 
     def accept_token(self, token: int):
         self.state = self.guide.get_next_state(self.state, token)
@@ -84,6 +85,10 @@ class OutlinesGrammar(BaseGrammarObject):
     ) -> torch.Tensor:
         return torch.zeros(batch_size, vocab_size, dtype=torch.bool, device=device)
 
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask
+
     def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
         tokens = torch.tensor(
             self.guide.get_next_instruction(self.state).tokens, dtype=torch.int64
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/xgrammar_backend.py
@@ -45,6 +45,7 @@ class XGrammarGrammar(BaseGrammarObject):
         self.matcher = matcher
         self.vocab_size = vocab_size
         self.ctx = ctx
+        self.finished = False
 
     def accept_token(self, token: int):
         assert self.matcher.accept_token(token)
@@ -85,12 +86,11 @@ class XGrammarGrammar(BaseGrammarObject):
         self.matcher.fill_next_token_bitmask(vocab_mask, idx)
 
     @staticmethod
-    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
-        if vocab_mask.device.type != logits.device.type:
-            # vocab_mask must then be on the same device as logits
-            # when applying the token bitmask, so we check and move if needed
-            vocab_mask = vocab_mask.to(logits.device)
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask.to(device, non_blocking=True)
 
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
         apply_token_bitmask_inplace(logits, vocab_mask)
 
     def copy(self):
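Both grammar backends gain a `finished` flag and a separate `move_vocab_mask` step, splitting the old apply path into "move the mask to the logits device" (a no-op for Outlines, a non-blocking copy for XGrammar) and "apply it in place". The standalone toy below mirrors that fill → move → apply flow; `ToyGrammar`, its blocking semantics, and the `allocate_vocab_mask` name are illustrative assumptions, not sglang's actual classes:

```python
# Toy illustration of the fill -> move -> apply mask flow introduced above.
# Only the method names and their ordering mirror the diff; everything else is a stand-in.
import torch


class ToyGrammar:
    def __init__(self, allowed_tokens):
        self.allowed = allowed_tokens

    @staticmethod
    def allocate_vocab_mask(vocab_size, batch_size, device):
        # In this toy, True means "blocked".
        return torch.zeros(batch_size, vocab_size, dtype=torch.bool, device=device)

    def fill_vocab_mask(self, vocab_mask, idx):
        vocab_mask[idx] = True
        vocab_mask[idx, self.allowed] = False

    @staticmethod
    def move_vocab_mask(vocab_mask, device):
        # XGrammar moves the CPU-filled mask to the logits device in one hop;
        # the Outlines backend returns the mask unchanged.
        return vocab_mask.to(device, non_blocking=True)

    @staticmethod
    def apply_vocab_mask(logits, vocab_mask):
        logits.masked_fill_(vocab_mask, float("-inf"))


grammar = ToyGrammar(allowed_tokens=[1, 3])
logits = torch.randn(1, 8)
mask = grammar.allocate_vocab_mask(vocab_size=8, batch_size=1, device="cpu")
grammar.fill_vocab_mask(mask, idx=0)
mask = grammar.move_vocab_mask(mask, logits.device)
grammar.apply_vocab_mask(logits, mask)
print(logits)  # only positions 1 and 3 remain finite
```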
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/__init__.py
@@ -52,12 +52,13 @@ class AttentionBackend(ABC):
         v: torch.Tensor,
         layer: RadixAttention,
         forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
     ):
         """Run forward on an attention layer."""
         if forward_batch.forward_mode.is_decode():
-            return self.forward_decode(q, k, v, layer, forward_batch)
+            return self.forward_decode(q, k, v, layer, forward_batch, save_kv_cache)
         else:
-            return self.forward_extend(q, k, v, layer, forward_batch)
+            return self.forward_extend(q, k, v, layer, forward_batch, save_kv_cache)
 
     def forward_decode(
         self,
@@ -66,6 +67,7 @@ class AttentionBackend(ABC):
         v: torch.Tensor,
         layer: RadixAttention,
         forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
     ):
         """Run a forward for decode."""
         raise NotImplementedError()
@@ -77,6 +79,7 @@ class AttentionBackend(ABC):
         v: torch.Tensor,
         layer: RadixAttention,
         forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
     ):
         """Run a forward for extend."""
         raise NotImplementedError()
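Every concrete backend below adds the same trailing `save_kv_cache=True` parameter and wraps its KV-cache write in `if save_kv_cache:`. A self-contained sketch of that dispatch pattern, with toy stand-ins for the KV pool and batch (not sglang's real RadixAttention/ForwardBatch types):

```python
# Simplified stand-in for the save_kv_cache dispatch added to the backends.
# Only the control flow mirrors the diff; the KV pool and call sites are toys.
import torch


class ToyKVPool:
    def __init__(self):
        self.writes = 0

    def set_kv_buffer(self, loc, k, v):
        self.writes += 1  # a real pool would store k/v at `loc`


class ToyBackend:
    def forward(self, q, k, v, pool, loc, is_decode, save_kv_cache=True):
        if is_decode:
            return self.forward_decode(q, k, v, pool, loc, save_kv_cache)
        return self.forward_extend(q, k, v, pool, loc, save_kv_cache)

    def forward_extend(self, q, k, v, pool, loc, save_kv_cache=True):
        if save_kv_cache:  # new: skip the KV write when asked to
            pool.set_kv_buffer(loc, k, v)
        return torch.zeros_like(q)

    def forward_decode(self, q, k, v, pool, loc, save_kv_cache=True):
        if save_kv_cache:
            pool.set_kv_buffer(loc, k, v)
        return torch.zeros_like(q)


pool, backend = ToyKVPool(), ToyBackend()
q = k = v = torch.randn(4, 8)
backend.forward(q, k, v, pool, loc=None, is_decode=False)                      # writes
backend.forward(q, k, v, pool, loc=None, is_decode=True, save_kv_cache=False)  # skips
print(pool.writes)  # 1
```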
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/double_sparsity_backend.py
@@ -165,7 +165,13 @@ class DoubleSparseAttnBackend(AttentionBackend):
         return 1
 
     def forward_extend(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         # TODO: reuse the buffer across layers
         if layer.qk_head_dim != layer.v_head_dim:
@@ -181,9 +187,10 @@ class DoubleSparseAttnBackend(AttentionBackend):
             .expand(k.shape[0], -1, -1),
         )
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v, k_label
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v, k_label
+            )
 
         (
             start_loc,
@@ -212,7 +219,13 @@ class DoubleSparseAttnBackend(AttentionBackend):
         return o
 
     def forward_decode(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         # During torch.compile, there is a bug in rotary_emb that causes the
         # output value to have a 3D tensor shape. This reshapes the output correctly.
@@ -242,9 +255,10 @@ class DoubleSparseAttnBackend(AttentionBackend):
             .expand(k.shape[0], -1, -1),
         )
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v, k_label
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v, k_label
+            )
 
         # NOTE(Andy) shouldn't be used when max_len_in_batch < heavy_token_num
         # and set a minimum value for sparse_decode
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/flashinfer_backend.py
@@ -221,7 +221,13 @@ class FlashInferAttnBackend(AttentionBackend):
         return 0
 
     def forward_extend(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         prefill_wrapper_paged = self.prefill_wrappers_paged[
             self._get_wrapper_idx(layer)
@@ -237,7 +243,8 @@ class FlashInferAttnBackend(AttentionBackend):
         if not use_ragged:
             if k is not None:
                 assert v is not None
-                forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+                if save_kv_cache:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
 
             o = prefill_wrapper_paged.forward(
                 q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
@@ -270,12 +277,19 @@ class FlashInferAttnBackend(AttentionBackend):
 
             o, _ = merge_state(o1, s1, o2, s2)
 
-            forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
 
         return o.view(-1, layer.tp_q_head_num * layer.head_dim)
 
     def forward_decode(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         decode_wrapper = self.forward_metadata[0][self._get_wrapper_idx(layer)]
         cache_loc = (
@@ -286,7 +300,8 @@ class FlashInferAttnBackend(AttentionBackend):
 
         if k is not None:
             assert v is not None
-            forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
 
         o = decode_wrapper.forward(
             q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/torch_native_backend.py
@@ -216,16 +216,23 @@ class TorchNativeAttnBackend(AttentionBackend):
         return output
 
     def forward_extend(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         if layer.qk_head_dim != layer.v_head_dim:
             o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
         else:
             o = torch.empty_like(q)
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
 
         use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
 
@@ -249,7 +256,13 @@ class TorchNativeAttnBackend(AttentionBackend):
         return o
 
     def forward_decode(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         # During torch.compile, there is a bug in rotary_emb that causes the
         # output value to have a 3D tensor shape. This reshapes the output correctly.
@@ -260,9 +273,10 @@ class TorchNativeAttnBackend(AttentionBackend):
         else:
             o = torch.empty_like(q)
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
 
         use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
 
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_backend.py
@@ -114,7 +114,13 @@ class TritonAttnBackend(AttentionBackend):
         return 1
 
     def forward_extend(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         # TODO: reuse the buffer across layers
         if layer.qk_head_dim != layer.v_head_dim:
@@ -122,9 +128,10 @@ class TritonAttnBackend(AttentionBackend):
         else:
             o = torch.empty_like(q)
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
 
         start_loc, attn_logits, max_seq_len, max_extend_len = self.forward_metadata
         self.extend_attention_fwd(
@@ -146,7 +153,13 @@ class TritonAttnBackend(AttentionBackend):
         return o
 
     def forward_decode(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         # During torch.compile, there is a bug in rotary_emb that causes the
         # output value to have a 3D tensor shape. This reshapes the output correctly.
@@ -160,9 +173,10 @@ class TritonAttnBackend(AttentionBackend):
 
         start_loc, attn_logits, max_seq_len, max_extend_len = self.forward_metadata
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
 
         self.decode_attention_fwd(
             q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),