sglang 0.3.6.post2.tar.gz → 0.4.0.tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- {sglang-0.3.6.post2 → sglang-0.4.0}/PKG-INFO +2 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/pyproject.toml +2 -2
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_offline_throughput.py +55 -2
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_one_batch.py +7 -6
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_one_batch_server.py +4 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_serving.py +13 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/check_env.py +1 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/launch_server.py +3 -2
- sglang-0.4.0/sglang/srt/_custom_ops.py +118 -0
- sglang-0.4.0/sglang/srt/configs/device_config.py +17 -0
- sglang-0.4.0/sglang/srt/configs/load_config.py +84 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/model_config.py +161 -4
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/qwen2vl.py +5 -8
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/outlines_backend.py +6 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/outlines_jump_forward.py +8 -1
- sglang-0.4.0/sglang/srt/distributed/__init__.py +3 -0
- sglang-0.4.0/sglang/srt/distributed/communication_op.py +34 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/__init__.py +0 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/pynccl.py +204 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
- sglang-0.4.0/sglang/srt/distributed/parallel_state.py +1275 -0
- sglang-0.4.0/sglang/srt/distributed/utils.py +223 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/hf_transformers_utils.py +37 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/flashinfer_backend.py +13 -15
- sglang-0.4.0/sglang/srt/layers/attention/torch_native_backend.py +285 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_patch.py +20 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/linear.py +1 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/logits_processor.py +17 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/quantization/__init__.py +34 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/vocab_parallel_embedding.py +1 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/lora/lora.py +1 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/data_parallel_controller.py +7 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/detokenizer_manager.py +7 -4
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/image_processor.py +1 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/io_struct.py +48 -12
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/schedule_batch.py +42 -36
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/schedule_policy.py +7 -4
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/scheduler.py +111 -46
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/session_controller.py +0 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/tokenizer_manager.py +169 -100
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/tp_worker.py +36 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/tp_worker_overlap_thread.py +32 -5
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_executor/cuda_graph_runner.py +16 -7
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_executor/forward_batch_info.py +9 -4
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_executor/model_runner.py +136 -150
- sglang-0.4.0/sglang/srt/model_loader/__init__.py +34 -0
- sglang-0.4.0/sglang/srt/model_loader/loader.py +1139 -0
- sglang-0.4.0/sglang/srt/model_loader/utils.py +41 -0
- sglang-0.4.0/sglang/srt/model_loader/weight_utils.py +640 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/baichuan.py +9 -10
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/chatglm.py +6 -15
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/commandr.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/dbrx.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/deepseek.py +4 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/deepseek_v2.py +3 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/exaone.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gemma.py +2 -6
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gemma2.py +3 -14
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gemma2_reward.py +0 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gpt2.py +5 -12
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gpt_bigcode.py +6 -22
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/grok.py +14 -51
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/internlm2.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/internlm2_reward.py +0 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama.py +97 -27
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama_classification.py +1 -2
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama_embedding.py +1 -2
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama_reward.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llava.py +10 -12
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llavavid.py +1 -2
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/minicpm.py +4 -7
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/minicpm3.py +6 -19
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mixtral.py +12 -5
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mixtral_quant.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mllama.py +3 -7
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/olmo.py +2 -8
- sglang-0.4.0/sglang/srt/models/olmo2.py +391 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/olmoe.py +3 -5
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/phi3_small.py +8 -8
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen2.py +10 -9
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen2_moe.py +4 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen2_vl.py +12 -9
- sglang-0.4.0/sglang/srt/models/registry.py +99 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/stablelm.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/torch_native_llama.py +6 -12
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/xverse.py +2 -4
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/xverse_moe.py +4 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/yivl.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/openai_api/adapter.py +10 -6
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/openai_api/protocol.py +1 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/server.py +303 -204
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/server_args.py +65 -31
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/utils.py +253 -48
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_utils.py +27 -7
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/utils.py +2 -2
- sglang-0.4.0/sglang/version.py +1 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/PKG-INFO +2 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/SOURCES.txt +23 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/requires.txt +1 -0
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/layer.py +0 -630
- sglang-0.3.6.post2/sglang/version.py +0 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/LICENSE +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/README.md +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/setup.cfg +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/api.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_latency.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/global_config.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/choices.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/ir.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/xgrammar_backend.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/conversation.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/custom_op_util.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_triton/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_triton/fused_moe.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_triton/layer.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/runners.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/top_level.txt +0 -0
```diff
--- sglang-0.3.6.post2/PKG-INFO
+++ sglang-0.4.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.6.post2
+Version: 0.4.0
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -241,6 +241,7 @@ Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
+Requires-Dist: flashinfer>=0.1.6; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
```
```diff
--- sglang-0.3.6.post2/pyproject.toml
+++ sglang-0.4.0/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.6.post2"
+version = "0.4.0"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,7 +23,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
     "xgrammar>=0.1.4"]
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python", "flashinfer>=0.1.6"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
```
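The practical effect of this change is that the `srt` extra now pulls in `flashinfer>=0.1.6`. One way to confirm the new pin in an environment where sglang 0.4.0 is installed (a sketch; only standard-library metadata inspection is used):

```python
# Confirm the new flashinfer pin in the installed package metadata.
# Assumes sglang 0.4.0 is installed in the current environment.
from importlib.metadata import requires

srt_requirements = [r for r in requires("sglang") if 'extra == "srt"' in r]
print(srt_requirements)  # expected to include 'flashinfer>=0.1.6; extra == "srt"'
```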
```diff
--- sglang-0.3.6.post2/sglang/bench_offline_throughput.py
+++ sglang-0.4.0/sglang/bench_offline_throughput.py
@@ -14,20 +14,20 @@ import argparse
 import dataclasses
 import json
 import logging
+import os
 import random
 import time
 from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 
-from sglang.api import Engine
 from sglang.bench_serving import (
     get_dataset,
     get_tokenizer,
     sample_random_requests,
     set_ulimit,
 )
-from sglang.srt.server import Runtime
+from sglang.srt.server import Engine, Runtime
 from sglang.srt.server_args import ServerArgs
 
 
@@ -52,6 +52,7 @@ class BenchArgs:
     seed: int = 1
     skip_warmup: bool = False
     do_not_exit: bool = False
+    profile: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -156,6 +157,12 @@ class BenchArgs:
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
+        parser.add_argument(
+            "--profile",
+            action="store_true",
+            help="Use Torch Profiler. The endpoint must be launched with "
+            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -169,6 +176,7 @@ def throughput_test_once(
     reqs: List[Tuple[str, int, int]],
     ignore_eos: bool,
     extra_request_body: Dict,
+    profile: bool,
 ):
     measurement_results = {
         "backend": backend_name,
@@ -194,7 +202,15 @@
     ]
 
     st = time.perf_counter()
+    if profile:
+        backend.start_profile()
+
     gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+
+    if profile:
+        backend.stop_profile()
+        monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
+
     latency = time.perf_counter() - st
 
     if backend_name == "runtime":
@@ -221,6 +237,41 @@
     return measurement_results
 
 
+def monitor_trace_file(directory, interval=1):
+
+    print(f"Monitoring {directory} for new trace files...")
+
+    known_files = set(os.listdir(directory))
+
+    while True:
+        flag = False
+        time.sleep(interval)
+        current_files = set(os.listdir(directory))
+
+        new_files = current_files - known_files
+        for new_file in new_files:
+            new_file_path = os.path.join(directory, new_file)
+            print(f"New file detected: {new_file}")
+
+            previous_size = 0
+            while True:
+                try:
+                    current_size = os.path.getsize(new_file_path)
+                except FileNotFoundError:
+                    print(f"File {new_file} is no longer accessible.")
+                    break
+
+                if current_size > previous_size:
+                    previous_size = current_size
+                else:
+                    flag = True
+                    break
+
+                time.sleep(interval)
+        if flag:
+            break
+
+
 def throughput_test(
     server_args: ServerArgs,
     bench_args: BenchArgs,
@@ -268,6 +319,7 @@
         reqs=warmup_requests,
         ignore_eos=not bench_args.disable_ignore_eos,
         extra_request_body=extra_request_body,
+        profile=False,
     )
 
     logging.info("\nBenchmark...")
@@ -277,6 +329,7 @@
         reqs=input_requests,
         ignore_eos=not bench_args.disable_ignore_eos,
         extra_request_body=extra_request_body,
+        profile=bench_args.profile,
     )
 
     if bench_args.result_filename:
```
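The new `--profile` flag wires the Torch Profiler into the offline throughput run: the engine writes a trace into `SGLANG_TORCH_PROFILER_DIR`, and `monitor_trace_file` polls that directory until the written trace file stops growing. A sketch of driving it (the model path and trace directory below are examples, not part of this diff):

```python
# Sketch of invoking the new --profile path. The model path and trace
# directory are example values; bench_offline_throughput will poll the
# trace dir via monitor_trace_file until the trace file stops growing.
import os
import subprocess

os.makedirs("/tmp/sglang_traces", exist_ok=True)
env = dict(os.environ, SGLANG_TORCH_PROFILER_DIR="/tmp/sglang_traces")
subprocess.run(
    [
        "python", "-m", "sglang.bench_offline_throughput",
        "--model-path", "meta-llama/Llama-3.1-8B-Instruct",  # example model
        "--profile",
    ],
    env=env,
    check=True,
)
```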
```diff
--- sglang-0.3.6.post2/sglang/bench_one_batch.py
+++ sglang-0.4.0/sglang/bench_one_batch.py
@@ -47,6 +47,7 @@ import itertools
 import json
 import logging
 import multiprocessing
+import os
 import time
 from typing import Tuple
 
@@ -62,11 +63,7 @@ from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server import _set_envs_and_config
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import (
-    configure_logger,
-    kill_child_process,
-    suppress_other_loggers,
-)
+from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
 
 
 @dataclasses.dataclass
@@ -114,8 +111,12 @@ def load_model(server_args, port_args, tp_rank):
     model_config = ModelConfig(
         server_args.model_path,
         trust_remote_code=server_args.trust_remote_code,
+        revision=server_args.revision,
         context_length=server_args.context_length,
         model_override_args=server_args.json_model_override_args,
+        is_embedding=server_args.is_embedding,
+        dtype=server_args.dtype,
+        quantization=server_args.quantization,
     )
     model_runner = ModelRunner(
         model_config=model_config,
@@ -468,4 +469,4 @@ if __name__ == "__main__":
         main(server_args, bench_args)
     finally:
         if server_args.tp_size != 1:
-            kill_child_process()
+            kill_process_tree(os.getpid(), include_parent=False)
```
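Several entry points in this release switch from `kill_child_process` to a new `kill_process_tree(pid, include_parent=...)` helper. Its implementation lives in `sglang/srt/utils.py`, which this section does not show; a rough sketch of the semantics these call sites assume, using `psutil` (an assumption — the shipped helper may differ in detail):

```python
# Rough sketch of the kill_process_tree semantics assumed by the call
# sites above. The shipped helper is in sglang/srt/utils.py and is not
# shown in this diff; treat this as an illustration only.
import psutil


def kill_process_tree(pid: int, include_parent: bool = True) -> None:
    """Kill all descendants of `pid`, and optionally `pid` itself."""
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return
    for child in parent.children(recursive=True):
        try:
            child.kill()
        except psutil.NoSuchProcess:
            pass
    if include_parent:
        try:
            parent.kill()
        except psutil.NoSuchProcess:
            pass
```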
```diff
--- sglang-0.3.6.post2/sglang/bench_one_batch_server.py
+++ sglang-0.4.0/sglang/bench_one_batch_server.py
@@ -15,6 +15,7 @@ import dataclasses
 import itertools
 import json
 import multiprocessing
+import os
 import time
 from typing import Tuple
 
@@ -23,7 +24,7 @@ import requests
 
 from sglang.srt.server import launch_server
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_child_process
+from sglang.srt.utils import kill_process_tree
 
 
 @dataclasses.dataclass
@@ -69,7 +70,7 @@ def launch_server_internal(server_args):
     except Exception as e:
         raise e
     finally:
-        kill_child_process()
+        kill_process_tree(os.getpid(), include_parent=False)
 
 
 def launch_server_process(server_args: ServerArgs):
@@ -175,7 +176,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         )
     finally:
         if proc:
-            kill_child_process(proc.pid)
+            kill_process_tree(proc.pid)
 
     print(f"\nResults are saved to {bench_args.result_filename}")
 
```
```diff
--- sglang-0.3.6.post2/sglang/bench_serving.py
+++ sglang-0.4.0/sglang/bench_serving.py
@@ -51,6 +51,7 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
+    lora_name: str
    extra_request_body: Dict[str, Any]
 
 
@@ -319,6 +320,7 @@ async def async_request_sglang_generate(
             "ignore_eos": not args.disable_ignore_eos,
         },
         "stream": not args.disable_stream,
+        "lora_path": request_func_input.lora_name,
         **request_func_input.extra_request_body,
     }
     headers = {}
@@ -884,6 +886,7 @@ async def benchmark(
     request_rate: float,
     max_concurrency: Optional[int],
     disable_tqdm: bool,
+    lora_name: str,
     extra_request_body: Dict[str, Any],
     profile: bool,
 ):
@@ -909,6 +912,7 @@
         api_url=api_url,
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        lora_name=lora_name,
         extra_request_body=extra_request_body,
     )
     test_output = await request_func(request_func_input=test_input)
@@ -942,6 +946,7 @@
             api_url=api_url,
             prompt_len=prompt_len,
             output_len=output_len,
+            lora_name=lora_name,
             extra_request_body=extra_request_body,
         )
         tasks.append(
@@ -1247,6 +1252,7 @@ def run_benchmark(args_: argparse.Namespace):
             request_rate=args.request_rate,
             max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
+            lora_name=args.lora_name,
             extra_request_body=extra_request_body,
             profile=args.profile,
         )
@@ -1267,6 +1273,7 @@ def run_benchmark(args_: argparse.Namespace):
             request_rate=rate,
             max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
+            lora_name=args.lora_name,
             extra_request_body=extra_request_body,
             profile=args.profile,
         )
@@ -1451,5 +1458,11 @@ if __name__ == "__main__":
         help="Use Torch Profiler. The endpoint must be launched with "
         "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
     )
+    parser.add_argument(
+        "--lora-name",
+        type=str,
+        default=None,
+        help="The name of LoRA adapter",
+    )
     args = parser.parse_args()
     run_benchmark(args)
```
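The new `--lora-name` flag threads an adapter name through `RequestFuncInput` into the `/generate` payload as `lora_path`. A sketch of the request body the benchmark now sends (the adapter name and port are example values; the name must match an adapter registered on the server, e.g. via `--lora-paths`):

```python
# Sketch of the payload async_request_sglang_generate builds when
# --lora-name is set. "my-adapter" and the port are examples only.
import requests

payload = {
    "text": "The capital of France is",
    "sampling_params": {"temperature": 0.0, "max_new_tokens": 32},
    "stream": False,
    "lora_path": "my-adapter",  # hypothetical adapter name
}
print(requests.post("http://127.0.0.1:30000/generate", json=payload).json())
```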
```diff
--- sglang-0.3.6.post2/sglang/launch_server.py
+++ sglang-0.4.0/sglang/launch_server.py
@@ -1,10 +1,11 @@
 """Launch the inference server."""
 
+import os
 import sys
 
 from sglang.srt.server import launch_server
 from sglang.srt.server_args import prepare_server_args
-from sglang.srt.utils import kill_child_process
+from sglang.srt.utils import kill_process_tree
 
 if __name__ == "__main__":
     server_args = prepare_server_args(sys.argv[1:])
@@ -12,4 +13,4 @@ if __name__ == "__main__":
     try:
         launch_server(server_args)
     finally:
-        kill_child_process()
+        kill_process_tree(os.getpid(), include_parent=False)
```
```diff
--- /dev/null
+++ sglang-0.4.0/sglang/srt/_custom_ops.py
@@ -0,0 +1,118 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/_custom_ops.py
+import contextlib
+import functools
+import importlib
+import logging
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+
+import torch
+import torch.library
+
+from sglang.srt.utils import is_hpu
+
+logger = logging.getLogger(__name__)
+
+if not is_hpu():
+    try:
+        import custom_ar
+    except ImportError as e:
+        logger.warning("Failed to import from custom_ar with %r", e)
+
+
+def hint_on_error(fn):
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        try:
+            return fn(*args, **kwargs)
+
+        except NotImplementedError as e:
+            msg = (
+                "Error in calling custom op %s: %s\n"
+                "Not implemented or built, mostly likely because the current current device "
+                "does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
+                "incorrectly while building)"
+            )
+            logger.error(msg, fn.__name__, e)
+            raise NotImplementedError(msg % (fn.__name__, e)) from e
+        except AttributeError as e:
+            msg = (
+                "Error in calling custom op %s: %s\n"
+                "Possibly you have built or installed an obsolete version of vllm.\n"
+                "Please try a clean build and install of vllm,"
+                "or remove old built files such as vllm/*cpython*.so and build/ ."
+            )
+            logger.error(msg, fn.__name__, e)
+            raise e
+
+    return wrapper
+
+
+# custom ar
+def init_custom_ar(
+    ipc_tensors: List[torch.Tensor],
+    rank_data: torch.Tensor,
+    rank: int,
+    full_nvlink: bool,
+) -> int:
+    return torch.ops._C_vllm_ar.init_custom_ar(
+        ipc_tensors, rank_data, rank, full_nvlink
+    )
+
+
+def all_reduce(
+    fa: int,
+    inp: torch.Tensor,
+    out: torch.Tensor,
+    reg_buffer: int,
+    reg_buffer_sz_bytes: int,
+) -> None:
+    torch.ops._C_vllm_ar.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
+
+
+def dispose(fa: int) -> None:
+    torch.ops._C_vllm_ar.dispose(fa)
+
+
+def meta_size() -> int:
+    return torch.ops._C_vllm_ar.meta_size()
+
+
+def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
+    return torch.ops._C_vllm_ar.register_buffer(fa, ipc_tensors)
+
+
+def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+    return torch.ops._C_vllm_ar.get_graph_buffer_ipc_meta(fa)
+
+
+def register_graph_buffers(
+    fa: int, handles: List[List[int]], offsets: List[List[int]]
+) -> None:
+    torch.ops._C_vllm_ar.register_graph_buffers(fa, handles, offsets)
+
+
+# temporary fix for https://github.com/vllm-project/vllm/issues/5456
+# TODO: remove this in v0.6.0
+names_and_values = globals()
+names_and_values_to_update = {}
+# prepare variables to avoid dict size change during iteration
+k, v, arg = None, None, None
+fn_type = type(lambda x: x)
+for k, v in names_and_values.items():
+    # find functions that are defined in this file and have torch.Tensor
+    # in their annotations. `arg == "torch.Tensor"` is used to handle
+    # the case when users use `import __annotations__` to turn type
+    # hints into strings.
+    if (
+        isinstance(v, fn_type)
+        and v.__code__.co_filename == __file__
+        and any(
+            arg is torch.Tensor or arg == "torch.Tensor"
+            for arg in v.__annotations__.values()
+        )
+    ):
+        names_and_values_to_update[k] = hint_on_error(v)
+
+names_and_values.update(names_and_values_to_update)
+del names_and_values_to_update, names_and_values, v, k, fn_type
```
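The module ends with an unusual idiom: at import time it rewrites its own `globals()`, wrapping every function whose annotations mention `torch.Tensor` with `hint_on_error`, so kernel-dispatch failures surface with a build/arch hint instead of a bare `NotImplementedError`. A toy reproduction of the idiom (all names here are illustrative, not sglang code):

```python
# Toy reproduction of the decorate-by-annotation idiom: wrap every
# module-level function whose annotations mention torch.Tensor.
# All names here are illustrative; this is not sglang code.
import torch


def scale(x: torch.Tensor, s: float) -> torch.Tensor:
    return x * s


def plain(n: int) -> int:  # no tensor annotation, left untouched
    return n + 1


def traced(fn):
    def wrapper(*args, **kwargs):
        print(f"calling {fn.__name__}")
        return fn(*args, **kwargs)
    return wrapper


_globals = globals()
_to_update = {}
# iterate over a snapshot so assigning loop variables cannot resize
# the dict mid-iteration (the original pre-binds k, v for this reason)
for _name, _obj in list(_globals.items()):
    if (
        callable(_obj)
        and getattr(_obj, "__annotations__", None)
        and any(a is torch.Tensor for a in _obj.__annotations__.values())
    ):
        _to_update[_name] = traced(_obj)
_globals.update(_to_update)

scale(torch.ones(2), 2.0)  # prints "calling scale"
plain(3)                   # silent
```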
```diff
--- /dev/null
+++ sglang-0.4.0/sglang/srt/configs/device_config.py
@@ -0,0 +1,17 @@
+import logging
+from typing import Optional
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+class DeviceConfig:
+    device: Optional[torch.device]
+
+    def __init__(self, device: str = "cuda") -> None:
+        if device in ["cuda", "xpu", "hpu"]:
+            self.device_type = device
+        else:
+            raise RuntimeError(f"Not supported device type: {device}")
+        self.device = torch.device(self.device_type)
```
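A usage sketch for the new config class (the failing call is included only to show the validation path):

```python
# Usage sketch: DeviceConfig validates the device string and keeps a
# torch.device handle for it.
from sglang.srt.configs.device_config import DeviceConfig

cfg = DeviceConfig("cuda")
print(cfg.device_type)  # "cuda"
print(cfg.device)       # device(type='cuda')

DeviceConfig("cpu")  # raises RuntimeError: Not supported device type: cpu
```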
```diff
--- /dev/null
+++ sglang-0.4.0/sglang/srt/configs/load_config.py
@@ -0,0 +1,84 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+import enum
+import json
+import logging
+from dataclasses import dataclass, field
+from typing import List, Optional, Union
+
+from sglang.srt.utils import is_hip
+
+logger = logging.getLogger(__name__)
+
+
+class LoadFormat(str, enum.Enum):
+    AUTO = "auto"
+    PT = "pt"
+    SAFETENSORS = "safetensors"
+    NPCACHE = "npcache"
+    DUMMY = "dummy"
+    SHARDED_STATE = "sharded_state"
+    GGUF = "gguf"
+    BITSANDBYTES = "bitsandbytes"
+    MISTRAL = "mistral"
+
+
+@dataclass
+class LoadConfig:
+    """
+    download_dir: Directory to download and load the weights, default to the
+        default cache directory of huggingface.
+    load_format: The format of the model weights to load:
+        "auto" will try to load the weights in the safetensors format and
+            fall back to the pytorch bin format if safetensors format is
+            not available.
+        "pt" will load the weights in the pytorch bin format.
+        "safetensors" will load the weights in the safetensors format.
+        "npcache" will load the weights in pytorch format and store
+            a numpy cache to speed up the loading.
+        "dummy" will initialize the weights with random values, which is
+            mainly for profiling.
+        "bitsandbytes" will load nf4 type weights.
+    ignore_patterns: The list of patterns to ignore when loading the model.
+        Default to "original/**/*" to avoid repeated loading of llama's
+        checkpoints.
+
+    """
+
+    load_format: Union[str, LoadFormat] = LoadFormat.AUTO
+    download_dir: Optional[str] = None
+    model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
+    ignore_patterns: Optional[Union[List[str], str]] = None
+
+    def __post_init__(self):
+        model_loader_extra_config = self.model_loader_extra_config or {}
+        if isinstance(model_loader_extra_config, str):
+            self.model_loader_extra_config = json.loads(model_loader_extra_config)
+        self._verify_load_format()
+
+        if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
+            logger.info(
+                "Ignoring the following patterns when downloading weights: %s",
+                self.ignore_patterns,
+            )
+        else:
+            self.ignore_patterns = ["original/**/*"]
+
+    def _verify_load_format(self) -> None:
+        if not isinstance(self.load_format, str):
+            return
+
+        load_format = self.load_format.lower()
+        self.load_format = LoadFormat(load_format)
+
+        rocm_not_supported_load_format: List[str] = []
+        if is_hip() and load_format in rocm_not_supported_load_format:
+            rocm_supported_load_format = [
+                f
+                for f in LoadFormat.__members__
+                if (f not in rocm_not_supported_load_format)
+            ]
+            raise ValueError(
+                f"load format '{load_format}' is not supported in ROCm. "
+                f"Supported load formats are "
+                f"{rocm_supported_load_format}"
+            )
```
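A usage sketch for `LoadConfig`: string load formats are normalized into the `LoadFormat` enum and a JSON string passed as extra config is parsed in `__post_init__` (the `"some_key"` below is a made-up example key):

```python
# Usage sketch for LoadConfig; "some_key" is a made-up example key.
from sglang.srt.configs.load_config import LoadConfig, LoadFormat

cfg = LoadConfig(
    load_format="safetensors",
    model_loader_extra_config='{"some_key": 1}',
)
assert cfg.load_format is LoadFormat.SAFETENSORS
assert cfg.model_loader_extra_config == {"some_key": 1}
assert cfg.ignore_patterns == ["original/**/*"]

LoadConfig(load_format="not-a-format")  # raises ValueError via LoadFormat()
```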