sglang 0.4.1__tar.gz → 0.4.1.post1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.4.1 → sglang-0.4.1.post1}/PKG-INFO +4 -4
- {sglang-0.4.1 → sglang-0.4.1.post1}/README.md +2 -2
- {sglang-0.4.1 → sglang-0.4.1.post1}/pyproject.toml +5 -2
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_serving.py +11 -3
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/openai.py +10 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/xgrammar_backend.py +6 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -14
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +17 -4
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/topk.py +14 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8_kernel.py +14 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/schedule_policy.py +1 -1
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/scheduler.py +11 -14
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tokenizer_manager.py +54 -45
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/model_runner.py +0 -6
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/loader.py +22 -11
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gemma2.py +19 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama.py +2 -2
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/openai_api/adapter.py +19 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/openai_api/protocol.py +2 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/sampling_params.py +9 -2
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/server.py +20 -37
- sglang-0.4.1.post1/sglang/version.py +1 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/PKG-INFO +4 -4
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/requires.txt +1 -1
- sglang-0.4.1/sglang/version.py +0 -1
- {sglang-0.4.1 → sglang-0.4.1.post1}/LICENSE +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/setup.cfg +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/api.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_latency.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_one_batch.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/check_env.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/global_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/chat_template.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/launch_server.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/llama3_eval.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/conversation.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/parallel_state.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/custom_op_util.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/schedule_batch.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/weight_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/deepseek_v2.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/granite.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/grok.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/server_args.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/runners.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_block_fp8.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/SOURCES.txt +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.1 → sglang-0.4.1.post1}/PKG-INFO
RENAMED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.1
+Version: 0.4.1.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                        Version 2.0, January 2004
@@ -243,7 +243,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.2.
+Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -358,8 +358,8 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI
+The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
-Please cite
+Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
```
{sglang-0.4.1 → sglang-0.4.1.post1}/README.md
RENAMED
```diff
@@ -57,8 +57,8 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI
+The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
-Please cite
+Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
```
{sglang-0.4.1 → sglang-0.4.1.post1}/pyproject.toml
RENAMED
```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.1"
+version = "0.4.1.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,7 +23,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
     "xgrammar>=0.1.6"]
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post10"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -60,6 +60,9 @@ dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 "Homepage" = "https://github.com/sgl-project/sglang"
 "Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
 
+[tool.setuptools.package-data]
+"sglang" = ["srt/layers/fused_moe_triton/configs/*.json"]
+
 [tool.setuptools.packages.find]
 exclude = [
     "assets*",
```
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_serving.py
RENAMED
```diff
@@ -897,6 +897,7 @@ async def benchmark(
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
+    # Limit concurrency
     # From https://github.com/vllm-project/vllm/pull/9390
     semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
 
@@ -906,6 +907,7 @@ async def benchmark(
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
+    # Warmup
    print("Starting initial single prompt test run...")
    test_prompt, test_prompt_len, test_output_len = input_requests[0]
    test_input = RequestFuncInput(
@@ -924,11 +926,15 @@ async def benchmark(
            f"are correctly specified. Error: {test_output.error}"
        )
    else:
-        requests.post(base_url + "/flush_cache")
        print("Initial test run completed. Starting main benchmark run...")
 
-
+    # Flush cache
+    if "sglang" in backend:
+        requests.post(base_url + "/flush_cache")
+
+    time.sleep(1.0)
 
+    # Start profiler
    if profile:
        print("Starting profiler...")
        profile_output = await async_request_profile(
@@ -939,6 +945,7 @@ async def benchmark(
 
    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
+    # Run all requests
    benchmark_start_time = time.perf_counter()
    tasks: List[asyncio.Task] = []
    async for request in get_request(input_requests, request_rate):
@@ -959,6 +966,7 @@ async def benchmark(
        )
    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
 
+    # Stop profiler
    if profile:
        print("Stopping profiler...")
        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
@@ -968,8 +976,8 @@ async def benchmark(
    if pbar is not None:
        pbar.close()
 
+    # Compute metrics and print results
    benchmark_duration = time.perf_counter() - benchmark_start_time
-
    metrics, output_lens = calculate_metrics(
        input_requests=input_requests,
        outputs=outputs,
```
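The warmup request would otherwise leave its prompt in the server's prefix (radix) cache and flatter the measured run, so the benchmark now flushes the cache before starting, and only for SGLang backends since the endpoint is SGLang-specific. A minimal sketch of the same call, assuming a local server on the default port:

```python
import requests

base_url = "http://127.0.0.1:30000"  # assumed default address of launch_server
# /flush_cache is the endpoint bench_serving.py posts to above; it empties
# the radix cache so the measured run starts from a cold prefix cache.
requests.post(base_url + "/flush_cache")
```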
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/openai.py
RENAMED
```diff
@@ -366,6 +366,11 @@ class OpenAI(BaseBackend):
 def openai_completion(
     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
 ):
+    # if "ebnf" is in kwargs, warn and remove
+    if "ebnf" in kwargs:
+        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+        del kwargs["ebnf"]
+
     for attempt in range(retries):
         try:
             if is_chat:
@@ -398,6 +403,11 @@ def openai_completion(
 def openai_completion_stream(
     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
 ):
+    # if "ebnf" is in kwargs, warn and remove
+    if "ebnf" in kwargs:
+        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+        del kwargs["ebnf"]
+
     for attempt in range(retries):
         try:
             if is_chat:
```
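Both completion helpers now strip the `ebnf` kwarg instead of forwarding it, so a grammar-constrained program degrades to a warning on OpenAI endpoints rather than a request error. An illustrative caller, assuming `sgl.gen` forwards an `ebnf=` keyword the way the rest of this release suggests:

```python
import sglang as sgl

@sgl.function
def yes_no(s, question):
    s += question
    # On the SGLang runtime this would constrain decoding with the grammar;
    # on the OpenAI backend the kwarg is warned about and dropped above.
    s += sgl.gen("answer", max_tokens=4, ebnf='root ::= "yes" | "no"')
```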
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/xgrammar_backend.py
RENAMED
```diff
@@ -126,6 +126,12 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
                     f"Skip invalid json_schema: json_schema={key_string}, {e=}"
                 )
                 return None
+        elif key_type == "ebnf":
+            try:
+                ctx = self.grammar_compiler.compile_grammar(key_string)
+            except RuntimeError as e:
+                logging.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
+                return None
         elif key_type == "regex":
             logger.warning(
                 "regex hasn't been supported by xgrammar yet. This is skipped."
```
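The new branch reuses the compiler that already handles JSON schemas, keyed by an `("ebnf", grammar)` tuple. For illustration only, a grammar string of the kind such compilers accept, with a `root` rule restricting output to two literals:

```python
# Illustrative EBNF text; the exact dialect is whatever
# grammar_compiler.compile_grammar accepts.
ebnf = 'root ::= "yes" | "no"'
key = ("ebnf", ebnf)  # cache-key shape used by the grammar backend above
```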
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py
RENAMED
```diff
@@ -292,27 +292,33 @@ def extend_attention_fwd(
     BLOCK_DPE = 0
     BLOCK_DV = triton.next_power_of_2(Lv)
 
-    if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
-        if Lq <= 256:
-            BLOCK_M, BLOCK_N = (128, 64)
-        else:
-            BLOCK_M, BLOCK_N = (32, 64)
-    elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
-        if Lq <= 128:
-            BLOCK_M, BLOCK_N = (128, 128)
-        elif Lq <= 256:
-            BLOCK_M, BLOCK_N = (64, 64)
-        else:
-            BLOCK_M, BLOCK_N = (32, 64)
+    if is_hip_:
+        BLOCK_M, BLOCK_N = (64, 64)
+        num_warps = 4
+
     else:
-        BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+        if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
+            if Lq <= 256:
+                BLOCK_M, BLOCK_N = (128, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
+            if Lq <= 128:
+                BLOCK_M, BLOCK_N = (128, 128)
+            elif Lq <= 256:
+                BLOCK_M, BLOCK_N = (64, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        else:
+            BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+
+        num_warps = 4 if Lk <= 64 else 8
 
     sm_scale = sm_scale or 1.0 / (Lq**0.5)
     batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
     kv_group_num = q_extend.shape[1] // k_extend.shape[1]
 
     grid = (batch_size, head_num, triton.cdiv(max_len_extend, BLOCK_M))
-    num_warps = 4 if Lk <= 64 else 8
     num_stages = 1
 
     extra_kargs = {}
```
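Tile sizes are now chosen per GPU generation, with a fixed fallback on HIP. A sketch of how a compute-capability check like `CUDA_CAPABILITY` is typically obtained; the tile values are the ones from the diff:

```python
import torch

# (major, minor) compute capability: (9, 0) on H100, (8, 0) on A100, etc.
major, _minor = torch.cuda.get_device_capability()
if major >= 9:
    BLOCK_M, BLOCK_N = (128, 64)   # Hopper: larger query tiles for Lq <= 256
elif major >= 8:
    BLOCK_M, BLOCK_N = (128, 128)  # Ampere, for head dim Lq <= 128
else:
    BLOCK_M, BLOCK_N = (64, 64)
```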
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
RENAMED
```diff
@@ -11,12 +11,17 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 import torch
 import triton
 import triton.language as tl
-from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
 from vllm import _custom_ops as ops
 
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
-from sglang.srt.utils import direct_register_custom_op, get_device_name
+from sglang.srt.utils import direct_register_custom_op, get_device_name, is_hip
+
+not_hip = False
+if not is_hip():
+    from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+    not_hip = True
 
 logger = logging.getLogger(__name__)
 padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0
@@ -267,8 +272,14 @@ def moe_align_block_size(
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
     )
     num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
-
-    if num_experts >= 224:
+    if not_hip and num_experts >= 224:
+        token_cnts_buffer = torch.empty(
+            (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
+        )
+        cumsum_buffer = torch.empty(
+            num_experts + 1, dtype=torch.int32, device=topk_ids.device
+        )
+
         sgl_moe_align_block_size(
             topk_ids,
             num_experts,
@@ -276,6 +287,8 @@ def moe_align_block_size(
             sorted_ids,
             expert_ids,
             num_tokens_post_pad,
+            token_cnts_buffer,
+            cumsum_buffer,
         )
     else:
         ops.moe_align_block_size(
```
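The `sgl_moe_align_block_size` path now receives two preallocated scratch buffers: per-expert token counters and a prefix-sum buffer. A worked size check for the expert count this branch targets; the layout comment is an assumption:

```python
import torch

num_experts = 256  # example; the sgl-kernel branch is taken when >= 224
# (num_experts + 1) * num_experts int32 counters for token counting
token_cnts_buffer = torch.empty(
    (num_experts + 1) * num_experts, dtype=torch.int32, device="cuda"
)  # 257 * 256 = 65,792 entries
# one cumulative-sum slot per expert, plus one leading slot
cumsum_buffer = torch.empty(num_experts + 1, dtype=torch.int32, device="cuda")
```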
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/topk.py
RENAMED
```diff
@@ -1,3 +1,17 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 from typing import Callable, Optional
 
 import torch
```
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8_kernel.py
RENAMED
```diff
@@ -1,3 +1,17 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 from typing import List, Tuple
 
 import torch
```
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/scheduler.py
RENAMED
```diff
@@ -468,9 +468,6 @@ class Scheduler:
             self.send_to_tokenizer.send_pyobj(
                 UpdateWeightFromDiskReqOutput(success, message)
             )
-        elif isinstance(recv_req, GetWeightsByNameReqInput):
-            parameter = self.get_weights_by_name(recv_req)
-            self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
         elif isinstance(recv_req, InitWeightsUpdateGroupReqInput):
             success, message = self.init_weights_update_group(recv_req)
             self.send_to_tokenizer.send_pyobj(
@@ -565,7 +562,7 @@ class Scheduler:
 
         if req.logprob_start_len == -1:
             # By default, only return the logprobs for output tokens
-            req.logprob_start_len = len(
+            req.logprob_start_len = len(req.origin_input_ids) - 1
 
         # Truncate prompts that are too long
         if len(req.origin_input_ids) > self.max_req_input_len:
@@ -589,12 +586,15 @@ class Scheduler:
         if (
             req.sampling_params.json_schema is not None
             or req.sampling_params.regex is not None
+            or req.sampling_params.ebnf is not None
         ):
             assert self.grammar_backend is not None
             if req.sampling_params.json_schema is not None:
                 key = ("json", req.sampling_params.json_schema)
             elif req.sampling_params.regex is not None:
                 key = ("regex", req.sampling_params.regex)
+            elif req.sampling_params.ebnf is not None:
+                key = ("ebnf", req.sampling_params.ebnf)
 
             req.grammar = self.grammar_backend.get_cached_value(key)
             if not req.grammar:
@@ -629,16 +629,13 @@ class Scheduler:
         self.waiting_queue.append(req)
 
     def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
-
-            self.tree_cache_metrics["total"] += (
-                adder.log_input_tokens + adder.log_hit_tokens
-            ) / 10**9
-            self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
-            tree_cache_hit_rate = (
-                self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
-            )
-        else:
-            tree_cache_hit_rate = 0.0
+        self.tree_cache_metrics["total"] += (
+            adder.log_input_tokens + adder.log_hit_tokens
+        ) / 10**9
+        self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
+        tree_cache_hit_rate = (
+            self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
+        )
 
         num_used = self.max_total_num_tokens - (
             self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
```
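With `ebnf` now accepted alongside `json_schema` and `regex`, a request can ship a grammar in its sampling parameters. A hypothetical call against the native /generate endpoint (host and port are assumptions):

```python
import requests

resp = requests.post(
    "http://127.0.0.1:30000/generate",
    json={
        "text": "Is the sky blue? Answer:",
        "sampling_params": {
            "max_new_tokens": 4,
            # routed through the ("ebnf", ...) grammar-cache key shown above
            "ebnf": 'root ::= "yes" | "no"',
        },
    },
)
print(resp.json())
```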
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tokenizer_manager.py
RENAMED
```diff
@@ -22,7 +22,7 @@ import signal
 import sys
 import time
 import uuid
-from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
+from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union
 
 import fastapi
 import uvloop
@@ -173,6 +173,15 @@ class TokenizerManager:
 
         # Others
         self.gracefully_exit = False
+        self.init_weights_update_group_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_distributed_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_weights_by_name_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
 
         # Metrics
         if self.enable_metrics:
@@ -190,8 +199,7 @@ class TokenizerManager:
     ):
         created_time = time.time()
 
-
-        self.create_handle_loop()
+        self.auto_create_handle_loop()
 
         if isinstance(obj, EmbeddingReqInput) and self.is_generation:
             raise ValueError(
@@ -440,8 +448,7 @@ class TokenizerManager:
         obj: UpdateWeightFromDiskReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-
-        self.create_handle_loop()
+        self.auto_create_handle_loop()
 
         # default the load format to the server_args
         if obj.load_format is None:
@@ -456,7 +463,7 @@ class TokenizerManager:
 
     async def _wait_for_model_update_from_disk(
         self, obj: UpdateWeightFromDiskReqInput
-    ) -> Tuple[bool, str
+    ) -> Tuple[bool, str]:
         self.send_to_scheduler.send_pyobj(obj)
         self.model_update_result = asyncio.Future()
         if self.server_args.dp_size == 1:
@@ -485,15 +492,11 @@ class TokenizerManager:
         obj: InitWeightsUpdateGroupReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-
-        self.create_handle_loop()
-        self.send_to_scheduler.send_pyobj(obj)
-
-        self.init_weights_update_group_result = asyncio.Future()
+        self.auto_create_handle_loop()
         assert (
             self.server_args.dp_size == 1
         ), "dp_size must be 1 for init parameter update group"
-        result = await self.init_weights_update_group_result
+        result = (await self.init_weights_update_group_communicator(obj))[0]
         return result.success, result.message
 
     async def update_weights_from_distributed(
@@ -501,44 +504,32 @@ class TokenizerManager:
         obj: UpdateWeightsFromDistributedReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-
-
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be for update weights from distributed"
 
         # This means that weight sync
         # cannot run while requests are in progress.
         async with self.model_update_lock.writer_lock:
-            self.send_to_scheduler.send_pyobj(obj)
-            self.parameter_update_result: Awaitable[
-                UpdateWeightsFromDistributedReqOutput
-            ] = asyncio.Future()
-            assert (
-                self.server_args.dp_size == 1
-            ), "dp_size must be for update weights from distributed"
-            result = await self.parameter_update_result
+            result = (await self.update_weights_from_distributed_communicator(obj))[0]
             return result.success, result.message
 
     async def get_weights_by_name(
         self, obj: GetWeightsByNameReqInput, request: Optional[fastapi.Request] = None
     ):
-
-
-
-        self.send_to_scheduler.send_pyobj(obj)
-        self.get_weights_by_name_result = asyncio.Future()
+        self.auto_create_handle_loop()
+        results = await self.get_weights_by_name_communicator(obj)
+        all_parameters = [r.parameter for r in results]
         if self.server_args.dp_size == 1:
-
-            return result.parameter
+            return all_parameters[0]
         else:
-            self.get_weights_by_name_tmp = []
-            result = await self.get_weights_by_name_result
-            all_parameters = [r.parameter for r in result]
             return all_parameters
 
     async def open_session(
         self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
     ):
-
-        self.create_handle_loop()
+        self.auto_create_handle_loop()
 
         session_id = uuid.uuid4().hex
         obj.session_id = session_id
@@ -568,7 +559,7 @@ class TokenizerManager:
         background_tasks.add_task(abort_request)
         return background_tasks
 
-    def create_handle_loop(self):
+    def auto_create_handle_loop(self):
         if not self.to_create_loop:
             return
 
@@ -711,21 +702,14 @@ class TokenizerManager:
             assert (
                 self.server_args.dp_size == 1
             ), "dp_size must be 1 for init parameter update group"
-            self.init_weights_update_group_result.set_result(recv_obj)
+            self.init_weights_update_group_communicator.handle_recv(recv_obj)
         elif isinstance(recv_obj, UpdateWeightsFromDistributedReqOutput):
             assert (
                 self.server_args.dp_size == 1
             ), "dp_size must be 1 for update weights from distributed"
-            self.parameter_update_result.set_result(recv_obj)
+            self.update_weights_from_distributed_communicator.handle_recv(recv_obj)
         elif isinstance(recv_obj, GetWeightsByNameReqOutput):
-            if self.server_args.dp_size == 1:
-                self.get_weights_by_name_result.set_result(recv_obj)
-            else:
-                self.get_weights_by_name_tmp.append(recv_obj)
-                if len(self.get_weights_by_name_tmp) == self.server_args.dp_size:
-                    self.get_weights_by_name_result.set_result(
-                        self.get_weights_by_name_tmp
-                    )
+            self.get_weights_by_name_communicator.handle_recv(recv_obj)
         else:
             raise ValueError(f"Invalid object: {recv_obj=}")
 
@@ -809,3 +793,28 @@ class SignalHandler:
             f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..."
         )
         self.tokenizer_manager.gracefully_exit = True
+
+
+T = TypeVar("T")
+
+
+class _Communicator(Generic[T]):
+    def __init__(self, sender, fan_out: int):
+        self._sender = sender
+        self._fan_out = fan_out
+        self._result_future: Optional[asyncio.Future] = None
+        self._result_values: Optional[List[T]] = None
+
+    async def __call__(self, obj):
+        self._sender.send_pyobj(obj)
+        self._result_future = asyncio.Future()
+        self._result_values = []
+        await self._result_future
+        result_values = self._result_values
+        self._result_future = self._result_values = None
+        return result_values
+
+    def handle_recv(self, recv_obj: T):
+        self._result_values.append(recv_obj)
+        if len(self._result_values) == self._fan_out:
+            self._result_future.set_result(None)
```
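`_Communicator` replaces the scattered ad-hoc futures with one reusable fan-in primitive: send a request once, then await until `fan_out` replies (one per data-parallel rank) have been collected. A usage sketch assuming the class above is in scope; `FakeSender` is hypothetical and only mimics the ZMQ socket's `send_pyobj`:

```python
import asyncio

class FakeSender:
    def send_pyobj(self, obj):  # stand-in for the ZMQ socket, illustration only
        print(f"sent {obj!r}")

async def demo():
    comm = _Communicator(FakeSender(), fan_out=2)  # e.g. dp_size == 2
    task = asyncio.create_task(comm("get_weights"))
    await asyncio.sleep(0)               # let the send happen and the await park
    comm.handle_recv("reply-from-dp0")   # first rank answers
    comm.handle_recv("reply-from-dp1")   # fan_out reached -> future resolves
    print(await task)                    # ['reply-from-dp0', 'reply-from-dp1']

asyncio.run(demo())
```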
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/model_runner.py
RENAMED
```diff
@@ -95,12 +95,6 @@ class ModelRunner:
         ):
             logger.info("MLA optimization is turned on. Use triton backend.")
             self.server_args.attention_backend = "triton"
-            # FIXME(HandH1998)
-            if (
-                "DeepseekV3ForCausalLM" in self.model_config.hf_config.architectures
-                and not self.server_args.disable_cuda_graph
-            ):
-                self.server_args.disable_cuda_graph = True
 
         if self.server_args.enable_double_sparsity:
             logger.info(
```
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/loader.py
RENAMED
```diff
@@ -770,6 +770,21 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             quant_state_dict,
         )
 
+    def _is_8bit_weight_name(self, weight_name: str):
+        quantized_suffix = {".scb", ".weight_format"}
+        return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)
+
+    def _is_4bit_weight_name(self, weight_name: str):
+        quantized_suffix = {
+            "absmax",
+            "quant_map",
+            "nested_absmax",
+            "nested_quant_map",
+            "bitsandbytes",
+        }
+        suffix = weight_name.split(".")[-1]
+        return any(q_suffix in suffix for q_suffix in quantized_suffix)
+
     def _quantized_8bit_generator(
         self, hf_weights_files, use_safetensors, quant_state_dict
     ) -> Generator:
@@ -779,21 +794,18 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             if not weight_name.lower().endswith(".scb"):
                 continue
 
-            weight_key = weight_name.lower().replace(".scb", ".qweight")
+            weight_key = weight_name.lower().replace(".scb", ".weight")
             quant_state_dict[weight_key] = weight_tensor
 
         for weight_name, weight_tensor in self._hf_weight_iter(
             hf_weights_files, use_safetensors
         ):
-
-            if not weight_name.endswith((".weight", ".bias")):
+            if self._is_8bit_weight_name(weight_name):
                 continue
 
-            qweight_name = weight_name.replace(".weight", ".qweight")
-
-            if qweight_name in quant_state_dict:
+            if weight_name in quant_state_dict:
                 set_weight_attrs(weight_tensor, {"load_in_8bit": True})
-                yield qweight_name, weight_tensor
+                yield weight_name, weight_tensor
             else:
                 yield weight_name, weight_tensor
@@ -806,7 +818,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors)
         temp_state_dict = {}
         for weight_name, weight_tensor in weight_iterator:
-            if
+            if not self._is_4bit_weight_name(weight_name):
                 continue
             # bitsandbytes library requires
             # weight.quant_state.bitsandbytes__* in CPU
@@ -830,16 +842,15 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             hf_weights_files, use_safetensors
         ):
 
-            if
+            if self._is_4bit_weight_name(weight_name):
                 continue
 
             if (f"{weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict) or (
                 f"{weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict
             ):
                 quant_state = _parse_quant_state(weight_name, temp_state_dict)
-                weight_name = weight_name.replace(".weight", ".qweight")
                 quant_state_dict[weight_name] = quant_state
-                yield weight_name
+                yield weight_name, weight_tensor
             else:
                 yield weight_name, weight_tensor
 
```
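The two new predicates centralize bitsandbytes' checkpoint naming instead of ad-hoc `endswith` checks inside each generator. A quick illustration with made-up tensor names that follow those conventions:

```python
# Hypothetical names; suffixes follow bitsandbytes checkpoint conventions.
names = [
    "model.layers.0.mlp.down_proj.weight",         # plain weight -> neither
    "model.layers.0.mlp.down_proj.SCB",            # 8-bit scale  -> _is_8bit_weight_name
    "model.layers.0.mlp.down_proj.weight.absmax",  # 4-bit metadata -> _is_4bit_weight_name
    "model.layers.0.mlp.down_proj.weight.quant_state.bitsandbytes__nf4",  # 4-bit
]
```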