sglang 0.4.0__tar.gz → 0.4.0.post2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.4.0 → sglang-0.4.0.post2}/PKG-INFO +15 -9
- {sglang-0.4.0 → sglang-0.4.0.post2}/README.md +8 -4
- {sglang-0.4.0 → sglang-0.4.0.post2}/pyproject.toml +7 -6
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/__init__.py +1 -1
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/bench_offline_throughput.py +18 -6
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/bench_one_batch.py +13 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/bench_serving.py +8 -1
- sglang-0.4.0.post2/sglang/check_env.py +305 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/runtime_endpoint.py +1 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/chat_template.py +32 -0
- sglang-0.4.0.post2/sglang/llama3_eval.py +316 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/constrained/outlines_backend.py +5 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/constrained/xgrammar_backend.py +9 -6
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/__init__.py +5 -2
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/flashinfer_backend.py +22 -5
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/torch_native_backend.py +22 -8
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/triton_backend.py +38 -33
- sglang-0.4.0.post2/sglang/srt/layers/attention/triton_ops/decode_attention.py +669 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
- sglang-0.4.0.post2/sglang/srt/layers/ep_moe/__init__.py +0 -0
- sglang-0.4.0.post2/sglang/srt/layers/ep_moe/kernels.py +349 -0
- sglang-0.4.0.post2/sglang/srt/layers/ep_moe/layer.py +665 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/fused_moe_triton/fused_moe.py +64 -21
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/fused_moe_triton/layer.py +1 -1
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/logits_processor.py +133 -95
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/quantization/__init__.py +2 -47
- sglang-0.4.0.post2/sglang/srt/layers/quantization/fp8.py +607 -0
- sglang-0.4.0.post2/sglang/srt/layers/quantization/fp8_utils.py +27 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/radix_attention.py +11 -2
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/sampler.py +29 -5
- sglang-0.4.0.post2/sglang/srt/layers/torchao_utils.py +108 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/detokenizer_manager.py +37 -17
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/io_struct.py +39 -10
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/schedule_batch.py +39 -24
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/schedule_policy.py +64 -5
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/scheduler.py +236 -197
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/tokenizer_manager.py +99 -58
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +7 -5
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mem_cache/chunk_cache.py +2 -2
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mem_cache/memory_pool.py +5 -1
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mem_cache/radix_cache.py +12 -2
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_executor/cuda_graph_runner.py +39 -11
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_executor/model_runner.py +24 -9
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_parallel.py +67 -10
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/commandr.py +2 -2
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/deepseek_v2.py +87 -7
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/gemma2.py +34 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/gemma2_reward.py +0 -1
- sglang-0.4.0.post2/sglang/srt/models/granite.py +517 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/grok.py +72 -13
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llama.py +22 -5
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llama_classification.py +11 -23
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llama_reward.py +0 -2
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llava.py +37 -14
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/mixtral.py +12 -9
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/phi3_small.py +0 -5
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/qwen2.py +20 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/qwen2_moe.py +0 -5
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/torch_native_llama.py +0 -5
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/openai_api/adapter.py +4 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/openai_api/protocol.py +9 -4
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/sampling_batch_info.py +9 -8
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/server.py +4 -4
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/server_args.py +62 -13
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/utils.py +57 -10
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/test_utils.py +3 -2
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/utils.py +10 -3
- sglang-0.4.0.post2/sglang/version.py +1 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang.egg-info/PKG-INFO +15 -9
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang.egg-info/SOURCES.txt +7 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang.egg-info/requires.txt +6 -4
- sglang-0.4.0/sglang/check_env.py +0 -213
- sglang-0.4.0/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -714
- sglang-0.4.0/sglang/srt/layers/torchao_utils.py +0 -95
- sglang-0.4.0/sglang/version.py +0 -1
- {sglang-0.4.0 → sglang-0.4.0.post2}/LICENSE +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/setup.cfg +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/api.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/bench_latency.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/global_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/choices.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/ir.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/launch_server.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/conversation.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/parallel_state.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/custom_op_util.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/fused_moe_patch.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_loader/loader.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_loader/weight_utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/runners.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.0 → sglang-0.4.0.post2}/sglang.egg-info/top_level.txt +0 -0
--- sglang-0.4.0/PKG-INFO
+++ sglang-0.4.0.post2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.0
+Version: 0.4.0.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -215,6 +215,7 @@ Requires-Dist: requests
 Requires-Dist: tqdm
 Requires-Dist: numpy
 Requires-Dist: IPython
+Requires-Dist: setproctitle
 Provides-Extra: runtime-common
 Requires-Dist: aiohttp; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
@@ -232,16 +233,17 @@ Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
-Requires-Dist: torchao; extra == "runtime-common"
+Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
+Requires-Dist: gemlite; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar>=0.1.
+Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm
+Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer
+Requires-Dist: flashinfer==0.1.6; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -311,10 +313,14 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
-|
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
+| [**Documentation**](https://sgl-project.github.io/)
+| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
+| [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
+| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -346,13 +352,13 @@ The core features include:
 - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
-Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
+Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
--- sglang-0.4.0/README.md
+++ sglang-0.4.0.post2/README.md
@@ -12,10 +12,14 @@
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
-|
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
+| [**Documentation**](https://sgl-project.github.io/)
+| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
+| [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
+| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -47,13 +51,13 @@ The core features include:
 - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
-Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
+Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
--- sglang-0.4.0/pyproject.toml
+++ sglang-0.4.0.post2/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.0"
+version = "0.4.0.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -13,7 +13,7 @@ classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: Apache Software License",
 ]
-dependencies = ["requests", "tqdm", "numpy", "IPython"]
+dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
 
 [project.optional-dependencies]
 runtime_common = ["aiohttp", "decord", "fastapi",
@@ -21,9 +21,9 @@ runtime_common = ["aiohttp", "decord", "fastapi",
     "orjson", "outlines>=0.0.44,<0.1.0",
     "packaging", "pillow", "prometheus-client>=0.20.0",
     "psutil", "pydantic", "python-multipart",
-    "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
-    "xgrammar>=0.1.
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python", "flashinfer
+    "pyzmq>=25.1.2", "torchao>=0.7.0", "gemlite", "uvicorn", "uvloop",
+    "xgrammar>=0.1.6"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -33,7 +33,7 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
 srt_xpu = ["sglang[runtime_common]"]
 #For Intel Gaudi(device : hpu) follow the installation guide
 #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
-srt_hpu =
+srt_hpu = ["sglang[runtime_common]"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -50,6 +50,7 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
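Net effect of the dependency changes above: `setproctitle` becomes a core dependency, `torchao` is floored at 0.7.0, `gemlite` joins `runtime_common`, and the `srt` extra now caps `vllm` at 0.6.4.post1 and pins `flashinfer` to exactly 0.1.6. Installing via the extras, e.g. `pip install "sglang[all]==0.4.0.post2"`, picks up all of these pins at once.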
--- sglang-0.4.0/sglang/bench_offline_throughput.py
+++ sglang-0.4.0.post2/sglang/bench_offline_throughput.py
@@ -201,18 +201,17 @@ def throughput_test_once(
         for r in reqs
     ]
 
-    st = time.perf_counter()
     if profile:
         backend.start_profile()
 
+    st = time.perf_counter()
     gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+    latency = time.perf_counter() - st
 
     if profile:
         backend.stop_profile()
         monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
 
-    latency = time.perf_counter() - st
-
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)
 
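The reordering above tightens the timing window: `st` now starts only after `backend.start_profile()` returns, and `latency` is captured as soon as `generate()` finishes, so profiler start-up and trace post-processing no longer inflate the measured latency.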
@@ -285,7 +284,7 @@ def throughput_test(
     else:
         raise ValueError('Please set backend to either "engine" or "runtime"')
 
-    tokenizer_id = server_args.model_path
+    tokenizer_id = server_args.tokenizer_path or server_args.model_path
     tokenizer = get_tokenizer(tokenizer_id)
 
     # Set global environmnets
@@ -304,8 +303,8 @@ def throughput_test(
     warmup_requests = sample_random_requests(
         input_len=256,
         output_len=16,
-        num_prompts=16,
-        range_ratio=0
+        num_prompts=min(bench_args.num_prompts, 16),
+        range_ratio=1.0,
         tokenizer=tokenizer,
         dataset_path=bench_args.dataset_path,
     )
@@ -321,6 +320,19 @@ def throughput_test(
         extra_request_body=extra_request_body,
         profile=False,
     )
+    time.sleep(0.5)
+
+    try:
+        import os
+        import pwd
+
+        from gemlite.core import GemLiteLinearTriton
+
+        GemLiteLinearTriton.cache_config(
+            f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
+        )
+    except ImportError:
+        pass
 
     logging.info("\nBenchmark...")
     result = throughput_test_once(
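The same guarded gemlite hook is added after warmup in both benchmark entry points (here and in bench_one_batch.py below). A minimal standalone sketch of the pattern, assuming, as the method name suggests, that `GemLiteLinearTriton.cache_config` persists autotuned kernel configurations to the given JSON file so later runs can skip re-tuning:

```python
import os
import pwd

try:
    # gemlite is an optional runtime_common dependency; do nothing if absent.
    from gemlite.core import GemLiteLinearTriton

    # Derive a per-user cache file name from the account's GECOS field.
    user = pwd.getpwuid(os.getuid()).pw_gecos
    GemLiteLinearTriton.cache_config(f"/tmp/{user}_gemlite.json")
except ImportError:
    pass
```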
--- sglang-0.4.0/sglang/bench_one_batch.py
+++ sglang-0.4.0.post2/sglang/bench_one_batch.py
@@ -385,6 +385,19 @@ def latency_test(
         8,  # shorter decoding to speed up the warmup
         server_args.device,
     )
+
+    try:
+        import os
+        import pwd
+
+        from gemlite.core import GemLiteLinearTriton
+
+        GemLiteLinearTriton.cache_config(
+            f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
+        )
+    except ImportError:
+        pass
+
     rank_print("Benchmark ...")
 
     # Run the sweep
--- sglang-0.4.0/sglang/bench_serving.py
+++ sglang-0.4.0.post2/sglang/bench_serving.py
@@ -321,6 +321,8 @@ async def async_request_sglang_generate(
         },
         "stream": not args.disable_stream,
         "lora_path": request_func_input.lora_name,
+        "return_logprob": args.return_logprob,
+        "logprob_start_len": -1,
         **request_func_input.extra_request_body,
     }
     headers = {}
@@ -911,7 +913,7 @@ async def benchmark(
         prompt=test_prompt,
         api_url=api_url,
         prompt_len=test_prompt_len,
-        output_len=test_output_len,
+        output_len=min(test_output_len, 32),
         lora_name=lora_name,
         extra_request_body=extra_request_body,
     )
@@ -1413,6 +1415,11 @@ if __name__ == "__main__":
         action="store_true",
         help="Disable ignoring EOS.",
     )
+    parser.add_argument(
+        "--return-logprob",
+        action="store_true",
+        help="Return logprob.",
+    )
     parser.add_argument(
         "--extra-request-body",
         metavar='{"key1": "value1", "key2": "value2"}',
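Taken together, the three bench_serving hunks add a `--return-logprob` flag that forwards `"return_logprob"` (with `logprob_start_len` fixed at -1) in every request payload, and cap the warmup request's `output_len` at 32 tokens. A typical invocation against an already-running server would be `python3 -m sglang.bench_serving --backend sglang --return-logprob`.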
--- /dev/null
+++ sglang-0.4.0.post2/sglang/check_env.py
@@ -0,0 +1,305 @@
+"""Check environment configurations and dependency versions."""
+
+import importlib
+import os
+import resource
+import subprocess
+import sys
+from collections import OrderedDict, defaultdict
+
+import torch
+
+from sglang.srt.utils import is_hip
+
+
+def is_cuda_v2():
+    return torch.version.cuda is not None
+
+
+# List of packages to check versions
+PACKAGE_LIST = [
+    "sglang",
+    "flashinfer",
+    "triton",
+    "transformers",
+    "torchao",
+    "numpy",
+    "aiohttp",
+    "fastapi",
+    "hf_transfer",
+    "huggingface_hub",
+    "interegular",
+    "modelscope",
+    "orjson",
+    "outlines",
+    "packaging",
+    "psutil",
+    "pydantic",
+    "multipart",
+    "zmq",
+    "torchao",
+    "uvicorn",
+    "uvloop",
+    "vllm",
+    "xgrammar",
+    "openai",
+    "tiktoken",
+    "anthropic",
+    "litellm",
+    "decord",
+]
+
+
+def get_package_versions(packages):
+    """
+    Get versions of specified packages.
+    """
+    versions = {}
+    for package in packages:
+        package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
+        try:
+            module = importlib.import_module(package_name)
+            if hasattr(module, "__version__"):
+                versions[package_name] = module.__version__
+        except ModuleNotFoundError:
+            versions[package_name] = "Module Not Found"
+    return versions
+
+
+def get_cuda_info():
+    """
+    Get CUDA-related information if available.
+    """
+    if is_cuda_v2():
+        cuda_info = {"CUDA available": torch.cuda.is_available()}
+
+        if cuda_info["CUDA available"]:
+            cuda_info.update(_get_gpu_info())
+            cuda_info.update(_get_cuda_version_info())
+
+        return cuda_info
+    elif is_hip():
+        cuda_info = {"ROCM available": torch.cuda.is_available()}
+
+        if cuda_info["ROCM available"]:
+            cuda_info.update(_get_gpu_info())
+            cuda_info.update(_get_cuda_version_info())
+
+        return cuda_info
+
+
+def _get_gpu_info():
+    """
+    Get information about available GPUs.
+    """
+    devices = defaultdict(list)
+    capabilities = defaultdict(list)
+    for k in range(torch.cuda.device_count()):
+        devices[torch.cuda.get_device_name(k)].append(str(k))
+        capability = torch.cuda.get_device_capability(k)
+        capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))
+
+    gpu_info = {}
+    for name, device_ids in devices.items():
+        gpu_info[f"GPU {','.join(device_ids)}"] = name
+
+    if len(capabilities) == 1:
+        # All GPUs have the same compute capability
+        cap, gpu_ids = list(capabilities.items())[0]
+        gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+    else:
+        # GPUs have different compute capabilities
+        for cap, gpu_ids in capabilities.items():
+            gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+
+    return gpu_info
+
+
+def _get_cuda_version_info():
+    """
+    Get CUDA version information.
+    """
+    if is_cuda_v2():
+        from torch.utils.cpp_extension import CUDA_HOME
+
+        cuda_info = {"CUDA_HOME": CUDA_HOME}
+
+        if CUDA_HOME and os.path.isdir(CUDA_HOME):
+            cuda_info.update(_get_nvcc_info())
+            cuda_info.update(_get_cuda_driver_version())
+
+        return cuda_info
+    elif is_hip():
+        from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME
+
+        cuda_info = {"ROCM_HOME": ROCM_HOME}
+
+        if ROCM_HOME and os.path.isdir(ROCM_HOME):
+            cuda_info.update(_get_nvcc_info())
+            cuda_info.update(_get_cuda_driver_version())
+
+        return cuda_info
+    else:
+        cuda_info = {"CUDA_HOME": ""}
+        return cuda_info
+
+
+def _get_nvcc_info():
+    """
+    Get NVCC version information.
+    """
+    if is_cuda_v2():
+        from torch.utils.cpp_extension import CUDA_HOME
+
+        try:
+            nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
+            nvcc_output = (
+                subprocess.check_output(f'"{nvcc}" -V', shell=True)
+                .decode("utf-8")
+                .strip()
+            )
+            return {
+                "NVCC": nvcc_output[
+                    nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind(
+                        "Build"
+                    )
+                ].strip()
+            }
+        except subprocess.SubprocessError:
+            return {"NVCC": "Not Available"}
+    elif is_hip():
+        from torch.utils.cpp_extension import ROCM_HOME
+
+        try:
+            hipcc = os.path.join(ROCM_HOME, "bin/hipcc")
+            hipcc_output = (
+                subprocess.check_output(f'"{hipcc}" --version', shell=True)
+                .decode("utf-8")
+                .strip()
+            )
+            return {
+                "HIPCC": hipcc_output[
+                    hipcc_output.rfind("HIP version") : hipcc_output.rfind("AMD clang")
+                ].strip()
+            }
+        except subprocess.SubprocessError:
+            return {"HIPCC": "Not Available"}
+    else:
+        return {"NVCC": "Not Available"}
+
+
+def _get_cuda_driver_version():
+    """
+    Get CUDA driver version.
+    """
+    versions = set()
+    if is_cuda_v2():
+        try:
+            output = subprocess.check_output(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=driver_version",
+                    "--format=csv,noheader,nounits",
+                ]
+            )
+            versions = set(output.decode().strip().split("\n"))
+            if len(versions) == 1:
+                return {"CUDA Driver Version": versions.pop()}
+            else:
+                return {"CUDA Driver Versions": ", ".join(sorted(versions))}
+        except subprocess.SubprocessError:
+            return {"CUDA Driver Version": "Not Available"}
+    elif is_hip():
+        try:
+            output = subprocess.check_output(
+                [
+                    "rocm-smi",
+                    "--showdriverversion",
+                    "--csv",
+                ]
+            )
+            versions = set(output.decode().strip().split("\n"))
+            versions.discard("name, value")
+            ver = versions.pop()
+            ver = ver.replace('"Driver version", ', "").replace('"', "")
+
+            return {"ROCM Driver Version": ver}
+        except subprocess.SubprocessError:
+            return {"ROCM Driver Version": "Not Available"}
+    else:
+        return {"CUDA Driver Version": "Not Available"}
+
+
+def get_gpu_topology():
+    """
+    Get GPU topology information.
+    """
+    if is_cuda_v2():
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "topo", "-m"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+            return "\n" + result.stdout if result.returncode == 0 else None
+        except subprocess.SubprocessError:
+            return None
+    elif is_hip():
+        try:
+            result = subprocess.run(
+                ["rocm-smi", "--showtopotype"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+            return "\n" + result.stdout if result.returncode == 0 else None
+        except subprocess.SubprocessError:
+            return None
+    else:
+        return None
+
+
+def get_hypervisor_vendor():
+    try:
+        output = subprocess.check_output(["lscpu"], text=True)
+        for line in output.split("\n"):
+            if "Hypervisor vendor:" in line:
+                return line.split(":")[1].strip()
+        return None
+    except:
+        return None
+
+
+def check_env():
+    """
+    Check and print environment information.
+    """
+    env_info = OrderedDict()
+    env_info["Python"] = sys.version.replace("\n", "")
+    env_info.update(get_cuda_info())
+    env_info["PyTorch"] = torch.__version__
+    env_info.update(get_package_versions(PACKAGE_LIST))
+
+    gpu_topo = get_gpu_topology()
+    if gpu_topo:
+        if is_cuda_v2():
+            env_info["NVIDIA Topology"] = gpu_topo
+        elif is_hip():
+            env_info["AMD Topology"] = gpu_topo
+
+    hypervisor_vendor = get_hypervisor_vendor()
+    if hypervisor_vendor:
+        env_info["Hypervisor vendor"] = hypervisor_vendor
+
+    ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
+    env_info["ulimit soft"] = ulimit_soft
+
+    for k, v in env_info.items():
+        print(f"{k}: {v}")
+
+
+if __name__ == "__main__":
+    check_env()
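check_env.py grows from 213 to 305 lines, mainly to mirror every CUDA probe with a ROCm/HIP equivalent and to report the hypervisor vendor and the soft open-file ulimit. A minimal usage sketch:

```python
# Equivalent to running `python3 -m sglang.check_env` from a shell.
from sglang.check_env import check_env

# Prints Python, CUDA/ROCm, PyTorch, and package versions, plus
# GPU topology, hypervisor vendor (if any), and the soft ulimit.
check_env()
```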
--- sglang-0.4.0/sglang/lang/chat_template.py
+++ sglang-0.4.0.post2/sglang/lang/chat_template.py
@@ -320,6 +320,28 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="granite-3-instruct",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_of_role|>system<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+            "user": (
+                "<|start_of_role|>user<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+            "assistant": (
+                "<|start_of_role|>assistant<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+        },
+        stop_str=("<|end_of_text|>",),
+    )
+)
+
 
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
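For orientation, an illustrative sketch (not part of the diff; `render_turn` is a hypothetical helper, not sglang API) of how the granite-3-instruct prefixes and suffixes compose into a prompt:

```python
ROLE_PREFIX_AND_SUFFIX = {
    "user": ("<|start_of_role|>user<|end_of_role|>", "<|end_of_text|>"),
    "assistant": ("<|start_of_role|>assistant<|end_of_role|>", "<|end_of_text|>"),
}


def render_turn(role: str, content: str) -> str:
    # Wrap a single message in its role's prefix and suffix.
    prefix, suffix = ROLE_PREFIX_AND_SUFFIX[role]
    return f"{prefix}{content}{suffix}"


# One user turn, then the open assistant prefix the model completes:
prompt = render_turn("user", "Hello!") + "<|start_of_role|>assistant<|end_of_role|>"
```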
@@ -402,6 +424,16 @@ def match_c4ai_command_r(model_path: str):
     return get_chat_template("c4ai-command-r")
 
 
+@register_chat_template_matching_function
+def match_granite_instruct(model_path: str):
+    model_path = model_path.lower()
+    # When future versions of Granite are released, this code may
+    # need to be updated. For now, assume that the Granite 3.0
+    # template works across the board.
+    if "granite" in model_path and "instruct" in model_path:
+        return get_chat_template("granite-3-instruct")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default