sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +321 -31
- sglang/bench_serving.py +10 -3
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +8 -0
- sglang/srt/configs/model_config.py +160 -105
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/constrained/base_grammar_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +6 -4
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/common/conn.py +266 -98
- sglang/srt/disaggregation/decode.py +50 -9
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/mooncake/conn.py +51 -541
- sglang/srt/disaggregation/nixl/conn.py +148 -39
- sglang/srt/disaggregation/prefill.py +31 -14
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +135 -80
- sglang/srt/entrypoints/engine.py +23 -3
- sglang/srt/entrypoints/grpc_request_manager.py +330 -55
- sglang/srt/entrypoints/grpc_server.py +232 -102
- sglang/srt/entrypoints/http_server.py +49 -9
- sglang/srt/entrypoints/openai/protocol.py +110 -5
- sglang/srt/entrypoints/openai/serving_base.py +25 -6
- sglang/srt/entrypoints/openai/serving_chat.py +178 -49
- sglang/srt/entrypoints/openai/serving_completions.py +5 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +42 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/function_call/function_call_parser.py +3 -2
- sglang/srt/function_call/glm4_moe_detector.py +3 -3
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
- sglang/srt/layers/activation.py +7 -6
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +108 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +112 -194
- sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
- sglang/srt/layers/attention/mamba/mamba.py +566 -1
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +42 -9
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +11 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +2 -0
- sglang/srt/layers/linear.py +21 -4
- sglang/srt/layers/logits_processor.py +15 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +147 -74
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
- sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
- sglang/srt/layers/moe/utils.py +10 -0
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/modelopt_quant.py +44 -9
- sglang/srt/layers/quantization/mxfp4.py +12 -4
- sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
- sglang/srt/layers/quantization/w4afp8.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +15 -3
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +52 -4
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +10 -4
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +17 -6
- sglang/srt/lora/mem_pool.py +1 -1
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +42 -142
- sglang/srt/managers/data_parallel_controller.py +11 -46
- sglang/srt/managers/detokenizer_manager.py +11 -11
- sglang/srt/managers/io_struct.py +162 -118
- sglang/srt/managers/mm_utils.py +43 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +167 -86
- sglang/srt/managers/schedule_policy.py +143 -16
- sglang/srt/managers/scheduler.py +359 -214
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
- sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
- sglang/srt/managers/tokenizer_manager.py +84 -136
- sglang/srt/managers/tp_worker.py +39 -29
- sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +40 -1
- sglang/srt/mem_cache/hiradix_cache.py +119 -32
- sglang/srt/mem_cache/memory_pool.py +188 -10
- sglang/srt/mem_cache/memory_pool_host.py +134 -182
- sglang/srt/mem_cache/radix_cache.py +222 -71
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
- sglang/srt/mem_cache/swa_radix_cache.py +25 -34
- sglang/srt/metrics/collector.py +82 -120
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -32
- sglang/srt/model_executor/forward_batch_info.py +23 -38
- sglang/srt/model_executor/model_runner.py +131 -183
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/loader.py +14 -10
- sglang/srt/model_loader/weight_utils.py +156 -2
- sglang/srt/models/bailing_moe.py +27 -4
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +536 -153
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +3 -3
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +1 -1
- sglang/srt/models/glm4v_moe.py +1 -1
- sglang/srt/models/gpt_oss.py +7 -30
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/longcat_flash.py +1 -1
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +15 -4
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +2 -2
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +64 -1
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +31 -3
- sglang/srt/models/qwen3_next.py +36 -9
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +2 -3
- sglang/srt/multimodal/processors/internvl.py +20 -8
- sglang/srt/multimodal/processors/qwen_vl.py +8 -1
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +20 -2
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +753 -295
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
- sglang/srt/speculative/eagle_worker.py +57 -25
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +47 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +399 -74
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +1 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +12 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +355 -4
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py
CHANGED
@@ -9,15 +9,18 @@ import os
|
|
9
9
|
import random
|
10
10
|
import re
|
11
11
|
import subprocess
|
12
|
+
import sys
|
12
13
|
import threading
|
13
14
|
import time
|
14
15
|
import unittest
|
15
16
|
from concurrent.futures import ThreadPoolExecutor
|
16
17
|
from dataclasses import dataclass
|
18
|
+
from datetime import datetime
|
17
19
|
from functools import partial
|
18
20
|
from pathlib import Path
|
19
21
|
from types import SimpleNamespace
|
20
|
-
from typing import Awaitable, Callable, List, Optional, Tuple
|
22
|
+
from typing import Any, Awaitable, Callable, List, Optional, Tuple
|
23
|
+
from urllib.parse import quote
|
21
24
|
|
22
25
|
import aiohttp
|
23
26
|
import numpy as np
|
@@ -41,6 +44,7 @@ from sglang.utils import get_exception_traceback
|
|
41
44
|
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
|
42
45
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
|
43
46
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
|
47
|
+
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
|
44
48
|
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
45
49
|
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
|
46
50
|
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
|
@@ -75,11 +79,13 @@ DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
|
|
75
79
|
# EAGLE
|
76
80
|
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
|
77
81
|
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
|
78
|
-
|
82
|
+
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST_EAGLE3 = "meta-llama/Llama-3.1-8B-Instruct"
|
83
|
+
DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B"
|
79
84
|
DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST = (
|
80
85
|
"meta-llama/Llama-3.1-8B-Instruct"
|
81
86
|
)
|
82
87
|
DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
|
88
|
+
DEFAULT_NGRAM_SPECULATIVE_TARGET_MODEL_FOR_TEST = "Qwen/Qwen2.5-Coder-7B-Instruct"
|
83
89
|
|
84
90
|
# Other use cases
|
85
91
|
DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
|
@@ -561,11 +567,30 @@ def popen_launch_server(
|
|
561
567
|
if return_stdout_stderr:
|
562
568
|
process = subprocess.Popen(
|
563
569
|
command,
|
564
|
-
stdout=
|
565
|
-
stderr=
|
570
|
+
stdout=subprocess.PIPE,
|
571
|
+
stderr=subprocess.PIPE,
|
566
572
|
env=env,
|
567
573
|
text=True,
|
574
|
+
bufsize=1,
|
568
575
|
)
|
576
|
+
|
577
|
+
def _dump(src, sinks):
|
578
|
+
for line in iter(src.readline, ""):
|
579
|
+
for sink in sinks:
|
580
|
+
sink.write(line)
|
581
|
+
sink.flush()
|
582
|
+
src.close()
|
583
|
+
|
584
|
+
threading.Thread(
|
585
|
+
target=_dump,
|
586
|
+
args=(process.stdout, [return_stdout_stderr[0], sys.stdout]),
|
587
|
+
daemon=True,
|
588
|
+
).start()
|
589
|
+
threading.Thread(
|
590
|
+
target=_dump,
|
591
|
+
args=(process.stderr, [return_stdout_stderr[1], sys.stderr]),
|
592
|
+
daemon=True,
|
593
|
+
).start()
|
569
594
|
else:
|
570
595
|
process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
|
571
596
|
|
@@ -869,6 +894,154 @@ def run_bench_serving(
|
|
869
894
|
return res
|
870
895
|
|
871
896
|
|
897
|
+
def run_score_benchmark(
    model,
    num_requests=100,
    batch_size=5,
    other_server_args=None,
    need_warmup=False,
    device="auto",
):
    """Launch a server, send sequential ``/v1/score`` requests and report stats.

    The server is started with ``popen_launch_server`` at
    ``DEFAULT_URL_FOR_TEST`` and is always killed before returning, even when
    the benchmark raises.

    Args:
        model: Model path; used to launch the server, to load the tokenizer
            and as the ``model`` field of every request.
        num_requests: Number of score requests sent one after another.
        batch_size: Number of items carried by each score request.
        other_server_args: Extra CLI arguments for the server (default: none).
        need_warmup: When true, send one best-effort warmup request first.
        device: ``"auto"`` resolves the device through ``auto_config_device``.

    Returns:
        A dict with ``completed``, ``total_requests``, ``throughput``
        (successful requests per second), ``avg_latency_ms``,
        ``p95_latency_ms`` and ``successful_requests``. All numeric stats are
        ``0`` when no request succeeded.
    """
    if other_server_args is None:
        other_server_args = []

    if device == "auto":
        # NOTE(review): the resolved device is not used further in this
        # function; kept because the call was present in the original.
        device = auto_config_device()

    # Launch the server (consistent with run_bench_serving)
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )

    async def _run_benchmark():
        # Load tokenizer for generating test data
        from sglang.srt.utils.hf_transformers_utils import get_tokenizer

        tokenizer = get_tokenizer(model)

        # Score API configuration
        score_query_tokens = 120
        score_item_tokens = 180
        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
        special_token = "<|im_start|>"

        def generate_text_with_token_count(num_tokens):
            """Generate text with precise token count using replicated token."""
            text = special_token * num_tokens
            actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
            if actual_tokens != num_tokens:
                # The token does not encode to a single id: scale the
                # repetition count down by its encoded length.
                text = special_token * (
                    num_tokens
                    // len(tokenizer.encode(special_token, add_special_tokens=False))
                )
            return text

        # The generated text depends only on the token count, so build each
        # prompt once instead of re-tokenizing for every request and item.
        query_text = generate_text_with_token_count(score_query_tokens)
        item_text = generate_text_with_token_count(score_item_tokens)
        score_url = f"{base_url}/v1/score"

        if need_warmup:
            warmup_data = {
                "query": query_text,
                "items": [item_text for _ in range(3)],
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }

            async with aiohttp.ClientSession() as session:
                try:
                    # Use the response as a context manager so the
                    # connection is released, and drain the body so the
                    # warmup request has really finished.
                    async with session.post(
                        score_url,
                        json=warmup_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ) as warmup_response:
                        await warmup_response.read()
                except Exception:
                    # Warmup is best-effort. Catch Exception rather than
                    # using a bare except so cancellation and
                    # KeyboardInterrupt still propagate.
                    pass

        test_requests = [
            {
                "query": query_text,
                "items": [item_text for _ in range(batch_size)],
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }
            for _ in range(num_requests)
        ]

        start_time = time.monotonic()
        successful_requests = 0
        total_latency = 0
        latencies = []

        async with aiohttp.ClientSession() as session:
            for request_data in test_requests:
                try:
                    request_start = time.monotonic()
                    async with session.post(
                        score_url,
                        json=request_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ) as response:
                        if response.status == 200:
                            response_data = await response.json()
                            request_end = time.monotonic()

                            # Only responses carrying a result payload count
                            # as successful.
                            if "scores" in response_data or "logprobs" in response_data:
                                latency_ms = (request_end - request_start) * 1000
                                latencies.append(latency_ms)
                                total_latency += latency_ms
                                successful_requests += 1
                except Exception:
                    # A failed request is simply not counted.
                    continue

        end_time = time.monotonic()
        total_time = end_time - start_time

        throughput = 0
        avg_latency = 0
        p95_latency = 0
        if successful_requests > 0:
            throughput = successful_requests / total_time
            avg_latency = total_latency / successful_requests
            latencies.sort()
            p95_latency = latencies[int(len(latencies) * 0.95)] if latencies else 0

        return {
            "completed": successful_requests,
            "total_requests": num_requests,
            "throughput": throughput,
            "avg_latency_ms": avg_latency,
            "p95_latency_ms": p95_latency,
            "successful_requests": successful_requests,
        }

    try:
        res = asyncio.run(_run_benchmark())
    finally:
        kill_process_tree(process.pid)

    assert res["completed"] == res["successful_requests"]
    return res
|
1043
|
+
|
1044
|
+
|
872
1045
|
def run_bench_serving_multi(
|
873
1046
|
model,
|
874
1047
|
base_url,
|
@@ -1390,6 +1563,41 @@ async def send_concurrent_generate_requests(
|
|
1390
1563
|
return await asyncio.gather(*tasks)
|
1391
1564
|
|
1392
1565
|
|
1566
|
+
async def send_concurrent_generate_requests_with_custom_params(
    base_url: str,
    custom_params: List[dict[str, Any]],
) -> List[Tuple[int, Any]]:
    """Send one ``/generate`` request per entry of ``custom_params``, concurrently.

    Every request starts from a fixed base payload (a short prompt with
    ``temperature`` 0 and ``max_new_tokens`` 50); the entry's keys are merged
    on top of it, replacing same-named top-level keys. All requests are
    started at once, so the maximum concurrency is ``len(custom_params)``.

    Args:
        base_url: Server base URL; ``/generate`` is appended.
        custom_params: One dict of payload overrides per request.

    Returns:
        A list of ``(status_code, response_json)`` tuples in the same order
        as ``custom_params``.
    """
    # NOTE(review): leading whitespace inside this prompt could not be
    # recovered from the diff view — confirm against the real file.
    base_payload = {
        "text": (
            "\n"
            "System: You are a helpful assistant.\n"
            "User: What is the capital of France?\n"
            "Assistant: The capital of France is\n"
        ),
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 50,
        },
    }

    async def async_generate_with_priority(req):
        # A dedicated session per request keeps requests from queueing
        # behind a shared connection pool.
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{base_url}/generate",
                json=req,
            ) as response:
                resp_json = await response.json()
                return (response.status, resp_json)

    tasks = []
    for overrides in custom_params:
        # Shallow copy is enough: overrides replace top-level keys and the
        # nested base dict is never mutated.
        req = base_payload.copy()
        req.update(overrides)
        tasks.append(asyncio.create_task(async_generate_with_priority(req)))
    return await asyncio.gather(*tasks)
|
1599
|
+
|
1600
|
+
|
1393
1601
|
class CustomTestCase(unittest.TestCase):
|
1394
1602
|
def _callTestMethod(self, method):
|
1395
1603
|
max_retry = int(
|
@@ -1431,3 +1639,146 @@ def dump_bench_raw_result(
|
|
1431
1639
|
def _ensure_remove_suffix(text: str, suffix: str):
|
1432
1640
|
assert text.endswith(suffix)
|
1433
1641
|
return text.removesuffix(suffix)
|
1642
|
+
|
1643
|
+
|
1644
|
+
class ModelDeploySetup:
    """A model path together with the server arguments used to deploy it.

    ``--enable-multimodal`` and ``--trust-remote-code`` are appended to the
    arguments when they are not already present.
    """

    def __init__(self, model_path: str, extra_args: Optional[List[str]] = None):
        """Store the model path and the completed argument list.

        Args:
            model_path: Path or identifier of the model.
            extra_args: Additional server arguments; defaults to none. The
                caller's list is copied, never modified.
        """
        self.model_path = model_path
        # Work on a private copy: a shared mutable default (or the caller's
        # own list) would otherwise accumulate flags across instances.
        args = list(extra_args) if extra_args is not None else []
        for flag in ("--enable-multimodal", "--trust-remote-code"):
            if flag not in args:
                args.append(flag)

        self.extra_args = args
|
1653
|
+
|
1654
|
+
|
1655
|
+
class ModelEvalMetrics:
    """Plain holder for the two numbers produced by a model evaluation."""

    def __init__(self, accuracy: float, eval_time: float):
        """Record the accuracy and the evaluation time as given."""
        self.accuracy, self.eval_time = accuracy, eval_time
|
1659
|
+
|
1660
|
+
|
1661
|
+
def extract_trace_link_from_bench_one_batch_server_output(output: str) -> Optional[str]:
    """Return the target of the first ``[Profile](...)`` markdown link in ``output``.

    Args:
        output: Text to search.

    Returns:
        The text between the parentheses of the first ``[Profile](...)``
        link, or ``None`` when no such link is present.
    """
    match = re.search(r"\[Profile\]\((.*?)\)", output)
    if match is None:
        return None
    return match.group(1)
|
1667
|
+
|
1668
|
+
|
1669
|
+
def parse_models(model_string: str):
    """Split a comma-separated model string into a list of trimmed names.

    Entries that are empty after trimming whitespace are dropped.
    """
    names = []
    for chunk in model_string.split(","):
        cleaned = chunk.strip()
        if cleaned:
            names.append(cleaned)
    return names
|
1671
|
+
|
1672
|
+
|
1673
|
+
def check_evaluation_test_results(
    results,
    test_name,
    model_accuracy_thresholds,
    model_latency_thresholds=None,
    model_count=None,
):
    """Compare evaluation results with per-model thresholds and report them.

    A markdown table is printed (and written to the GitHub step summary when
    running in CI). Models listed in ``model_accuracy_thresholds`` but absent
    from ``results`` are reported as failed.

    Args:
        results: list of tuple of (model_path, accuracy, latency)
        test_name: Title used for the GitHub step summary.
        model_accuracy_thresholds: Mapping of model to minimum accepted score.
        model_latency_thresholds: Optional mapping of model to maximum
            accepted latency. A model without an entry has no latency limit.
        model_count: Not used by this function.

    Raises:
        AssertionError: If any model misses a threshold or has no result.
    """
    # Effectively "no limit"; also used when a model has no latency entry so
    # the comparison below never sees ``None``.
    no_latency_limit = 1e9
    check_latency = model_latency_thresholds is not None

    failed_models = []
    if check_latency:
        summary = " | model | status | score | score_threshold | latency | latency_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | ------- | ----------------- | \n"
    else:
        summary = " | model | status | score | score_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | \n"

    results_dict = {res[0]: (res[1], res[2]) for res in results}

    for model, accuracy_threshold in sorted(model_accuracy_thresholds.items()):
        latency_threshold = no_latency_limit
        if check_latency:
            configured = model_latency_thresholds.get(model)
            if configured is not None:
                latency_threshold = configured

        if model in results_dict:
            accuracy, latency = results_dict[model]
            is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
            status_emoji = "✅" if is_success else "❌"

            if not is_success:
                if accuracy < accuracy_threshold:
                    failed_models.append(
                        f"\nScore Check Failed: {model}\n"
                        f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
                    )
                if latency > latency_threshold:
                    failed_models.append(
                        f"\nLatency Check Failed: {model}\n"
                        f"Model {model} latency ({latency:.4f}) is above threshold ({latency_threshold:.4f})"
                    )

            if check_latency:
                line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
            else:
                line = (
                    f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
                )
        else:
            status_emoji = "❌"
            failed_models.append(f"Model failed to launch or be evaluated: {model}")
            if check_latency:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold} | N/A | {latency_threshold}\n"
            else:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold}\n"

        summary += line

    print(summary)

    if is_in_ci():
        write_github_step_summary(f"## {test_name}\n{summary}")

    if failed_models:
        print("Some models failed the evaluation.")
        raise AssertionError("\n".join(failed_models))
|
1741
|
+
|
1742
|
+
|
1743
|
+
# Bench knobs for bench_one_batch_server (override by env)
|
1744
|
+
def _parse_int_list_env(name: str, default_val: str):
    """Read a comma-separated list of integers from an environment variable.

    Args:
        name: Environment variable to read.
        default_val: Comma-separated string used when the variable is unset.

    Returns:
        The parsed integers in order. Empty and whitespace-only entries are
        skipped.

    Raises:
        ValueError: If a non-blank entry is not a valid integer.
    """
    val = os.environ.get(name, default_val)
    # Test the stripped entry so a stray " " between commas is ignored
    # instead of reaching int() and raising.
    return [int(x) for x in val.split(",") if x.strip()]
|
1747
|
+
|
1748
|
+
|
1749
|
+
# Return filenames
|
1750
|
+
def find_traces_under_path(path: str) -> List[str]:
    """Collect the names of all ``.trace.json.gz`` files found below ``path``.

    The directory tree is walked recursively. Only bare file names are
    returned, not their directories.
    """
    trace_names = []
    for _root, _subdirs, filenames in os.walk(path):
        trace_names.extend(
            f"{name}" for name in filenames if name.endswith(".trace.json.gz")
        )
    return trace_names
|
1757
|
+
|
1758
|
+
|
1759
|
+
def write_results_to_json(model, metrics, mode="a"):
    """Record one evaluation result in ``results.json`` in the working directory.

    Args:
        model: Model identifier stored with the result.
        metrics: Mapping of metric values; must contain ``"score"`` and may
            contain ``"latency"``.
        mode: ``"a"`` appends to the results already in the file; any other
            value starts a fresh list.

    Raises:
        KeyError: If ``metrics`` has no ``"score"`` entry.
    """
    result = {
        "timestamp": datetime.now().isoformat(),
        "model": model,
        "metrics": metrics,
        "score": metrics["score"],
    }

    if "latency" in metrics:
        # Store the plain value; a stray trailing comma previously wrapped it
        # in a 1-tuple, which serialises as a one-element JSON array.
        result["latency"] = metrics["latency"]

    existing_results = []
    if mode == "a" and os.path.exists("results.json"):
        try:
            with open("results.json", "r") as f:
                existing_results = json.load(f)
        except json.JSONDecodeError:
            # An unreadable file is treated as empty and overwritten below.
            existing_results = []

    if isinstance(existing_results, list):
        existing_results.append(result)
    else:
        # The file held something other than a list: start over.
        existing_results = [result]

    with open("results.json", "w") as f:
        json.dump(existing_results, f, indent=2)
|
sglang/utils.py
CHANGED
@@ -6,6 +6,7 @@ import logging
|
|
6
6
|
import os
|
7
7
|
import random
|
8
8
|
import socket
|
9
|
+
import ssl
|
9
10
|
import subprocess
|
10
11
|
import sys
|
11
12
|
import time
|
@@ -155,7 +156,15 @@ def http_request(
|
|
155
156
|
data = bytes(dumps(json), encoding="utf-8")
|
156
157
|
|
157
158
|
try:
|
158
|
-
|
159
|
+
if sys.version_info >= (3, 13):
|
160
|
+
# Python 3.13+: Use SSL context (cafile removed)
|
161
|
+
if verify and isinstance(verify, str):
|
162
|
+
context = ssl.create_default_context(cafile=verify)
|
163
|
+
else:
|
164
|
+
context = ssl.create_default_context()
|
165
|
+
resp = urllib.request.urlopen(req, data=data, context=context)
|
166
|
+
else:
|
167
|
+
resp = urllib.request.urlopen(req, data=data, cafile=verify)
|
159
168
|
return HttpResponse(resp)
|
160
169
|
except urllib.error.HTTPError as e:
|
161
170
|
return HttpResponse(e)
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.
|
1
|
+
__version__ = "0.5.3rc2"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.5.3rc0
|
3
|
+
Version: 0.5.3rc2
|
4
4
|
Summary: SGLang is a fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -211,18 +211,17 @@ Classifier: License :: OSI Approved :: Apache Software License
|
|
211
211
|
Requires-Python: >=3.10
|
212
212
|
Description-Content-Type: text/markdown
|
213
213
|
License-File: LICENSE
|
214
|
-
Requires-Dist: aiohttp
|
215
|
-
Requires-Dist: requests
|
216
|
-
Requires-Dist: tqdm
|
217
|
-
Requires-Dist: numpy
|
218
214
|
Requires-Dist: IPython
|
219
|
-
Requires-Dist:
|
215
|
+
Requires-Dist: aiohttp
|
216
|
+
Requires-Dist: anthropic>=0.20.0
|
220
217
|
Requires-Dist: blobfile==3.0.0
|
221
218
|
Requires-Dist: build
|
222
219
|
Requires-Dist: compressed-tensors
|
220
|
+
Requires-Dist: cuda-python
|
223
221
|
Requires-Dist: datasets
|
224
222
|
Requires-Dist: einops
|
225
223
|
Requires-Dist: fastapi
|
224
|
+
Requires-Dist: flashinfer_python==0.4.0rc3
|
226
225
|
Requires-Dist: hf_transfer
|
227
226
|
Requires-Dist: huggingface_hub
|
228
227
|
Requires-Dist: interegular
|
@@ -230,8 +229,10 @@ Requires-Dist: llguidance<0.8.0,>=0.7.11
|
|
230
229
|
Requires-Dist: modelscope
|
231
230
|
Requires-Dist: msgspec
|
232
231
|
Requires-Dist: ninja
|
233
|
-
Requires-Dist:
|
232
|
+
Requires-Dist: numpy
|
233
|
+
Requires-Dist: nvidia-cutlass-dsl==4.2.1
|
234
234
|
Requires-Dist: openai-harmony==0.0.4
|
235
|
+
Requires-Dist: openai==1.99.1
|
235
236
|
Requires-Dist: orjson
|
236
237
|
Requires-Dist: outlines==0.1.11
|
237
238
|
Requires-Dist: packaging
|
@@ -239,32 +240,34 @@ Requires-Dist: partial_json_parser
|
|
239
240
|
Requires-Dist: pillow
|
240
241
|
Requires-Dist: prometheus-client>=0.20.0
|
241
242
|
Requires-Dist: psutil
|
243
|
+
Requires-Dist: py-spy
|
242
244
|
Requires-Dist: pybase64
|
243
245
|
Requires-Dist: pydantic
|
244
246
|
Requires-Dist: pynvml
|
245
247
|
Requires-Dist: python-multipart
|
246
248
|
Requires-Dist: pyzmq>=25.1.2
|
249
|
+
Requires-Dist: requests
|
247
250
|
Requires-Dist: scipy
|
248
251
|
Requires-Dist: sentencepiece
|
252
|
+
Requires-Dist: setproctitle
|
253
|
+
Requires-Dist: sgl-kernel==0.3.14.post1
|
249
254
|
Requires-Dist: soundfile==0.13.1
|
250
|
-
Requires-Dist: timm==1.0.16
|
251
255
|
Requires-Dist: tiktoken
|
256
|
+
Requires-Dist: timm==1.0.16
|
257
|
+
Requires-Dist: torch==2.8.0
|
258
|
+
Requires-Dist: torch_memory_saver==0.0.9rc2
|
252
259
|
Requires-Dist: torchao==0.9.0
|
253
|
-
Requires-Dist:
|
260
|
+
Requires-Dist: torchaudio==2.8.0
|
261
|
+
Requires-Dist: torchvision
|
262
|
+
Requires-Dist: tqdm
|
263
|
+
Requires-Dist: transformers==4.57.0
|
254
264
|
Requires-Dist: uvicorn
|
255
265
|
Requires-Dist: uvloop
|
256
266
|
Requires-Dist: xgrammar==0.1.24
|
257
|
-
Requires-Dist:
|
258
|
-
Requires-Dist:
|
259
|
-
|
260
|
-
Requires-Dist:
|
261
|
-
Requires-Dist: cuda-python
|
262
|
-
Requires-Dist: flashinfer_python==0.3.1
|
263
|
-
Requires-Dist: openai==1.99.1
|
264
|
-
Requires-Dist: tiktoken
|
265
|
-
Requires-Dist: anthropic>=0.20.0
|
266
|
-
Requires-Dist: torch_memory_saver==0.0.8
|
267
|
-
Requires-Dist: decord
|
267
|
+
Requires-Dist: grpcio==1.75.1
|
268
|
+
Requires-Dist: grpcio-tools==1.75.1
|
269
|
+
Provides-Extra: decord
|
270
|
+
Requires-Dist: decord; extra == "decord"
|
268
271
|
Provides-Extra: test
|
269
272
|
Requires-Dist: accelerate; extra == "test"
|
270
273
|
Requires-Dist: expecttest; extra == "test"
|
@@ -272,21 +275,27 @@ Requires-Dist: jsonlines; extra == "test"
|
|
272
275
|
Requires-Dist: matplotlib; extra == "test"
|
273
276
|
Requires-Dist: pandas; extra == "test"
|
274
277
|
Requires-Dist: peft; extra == "test"
|
275
|
-
Requires-Dist: sentence_transformers; extra == "test"
|
276
278
|
Requires-Dist: pytest; extra == "test"
|
279
|
+
Requires-Dist: sentence_transformers; extra == "test"
|
277
280
|
Requires-Dist: tabulate; extra == "test"
|
278
281
|
Provides-Extra: tracing
|
279
|
-
Requires-Dist: opentelemetry-sdk; extra == "tracing"
|
280
282
|
Requires-Dist: opentelemetry-api; extra == "tracing"
|
281
283
|
Requires-Dist: opentelemetry-exporter-otlp; extra == "tracing"
|
282
284
|
Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "tracing"
|
285
|
+
Requires-Dist: opentelemetry-sdk; extra == "tracing"
|
283
286
|
Provides-Extra: all
|
284
287
|
Requires-Dist: sglang[test]; extra == "all"
|
285
|
-
|
286
|
-
|
287
|
-
Requires-Dist: sglang[test]; extra == "
|
288
|
+
Requires-Dist: sglang[decord]; extra == "all"
|
289
|
+
Provides-Extra: all-aarch64
|
290
|
+
Requires-Dist: sglang[test]; extra == "all-aarch64"
|
288
291
|
Provides-Extra: dev
|
289
292
|
Requires-Dist: sglang[test]; extra == "dev"
|
293
|
+
Requires-Dist: sglang[decord]; extra == "dev"
|
294
|
+
Provides-Extra: blackwell
|
295
|
+
Requires-Dist: sglang[test]; extra == "blackwell"
|
296
|
+
Requires-Dist: sglang[decord]; extra == "blackwell"
|
297
|
+
Provides-Extra: blackwell-aarch64
|
298
|
+
Requires-Dist: sglang[test]; extra == "blackwell-aarch64"
|
290
299
|
Dynamic: license-file
|
291
300
|
|
292
301
|
<div align="center" id="sglangtop">
|