sglang 0.3.6.post3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +4 -0
- sglang/bench_serving.py +13 -0
- sglang/check_env.py +1 -1
- sglang/srt/_custom_ops.py +118 -0
- sglang/srt/configs/device_config.py +17 -0
- sglang/srt/configs/load_config.py +84 -0
- sglang/srt/configs/model_config.py +161 -4
- sglang/srt/configs/qwen2vl.py +5 -8
- sglang/srt/constrained/outlines_backend.py +6 -1
- sglang/srt/constrained/outlines_jump_forward.py +8 -1
- sglang/srt/distributed/__init__.py +3 -0
- sglang/srt/distributed/communication_op.py +34 -0
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
- sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
- sglang/srt/distributed/device_communicators/pynccl.py +204 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
- sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
- sglang/srt/distributed/parallel_state.py +1275 -0
- sglang/srt/distributed/utils.py +223 -0
- sglang/srt/hf_transformers_utils.py +37 -1
- sglang/srt/layers/attention/flashinfer_backend.py +13 -15
- sglang/srt/layers/attention/torch_native_backend.py +285 -0
- sglang/srt/layers/fused_moe_patch.py +20 -11
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/logits_processor.py +17 -3
- sglang/srt/layers/quantization/__init__.py +34 -0
- sglang/srt/layers/vocab_parallel_embedding.py +1 -0
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/managers/io_struct.py +48 -2
- sglang/srt/managers/schedule_batch.py +18 -14
- sglang/srt/managers/schedule_policy.py +7 -4
- sglang/srt/managers/scheduler.py +76 -20
- sglang/srt/managers/tokenizer_manager.py +166 -68
- sglang/srt/managers/tp_worker.py +36 -3
- sglang/srt/managers/tp_worker_overlap_thread.py +21 -3
- sglang/srt/model_executor/cuda_graph_runner.py +16 -7
- sglang/srt/model_executor/forward_batch_info.py +9 -4
- sglang/srt/model_executor/model_runner.py +136 -150
- sglang/srt/model_loader/__init__.py +34 -0
- sglang/srt/model_loader/loader.py +1139 -0
- sglang/srt/model_loader/utils.py +41 -0
- sglang/srt/model_loader/weight_utils.py +640 -0
- sglang/srt/models/baichuan.py +9 -10
- sglang/srt/models/chatglm.py +6 -15
- sglang/srt/models/commandr.py +2 -3
- sglang/srt/models/dbrx.py +2 -3
- sglang/srt/models/deepseek.py +4 -11
- sglang/srt/models/deepseek_v2.py +3 -11
- sglang/srt/models/exaone.py +2 -3
- sglang/srt/models/gemma.py +2 -6
- sglang/srt/models/gemma2.py +3 -14
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/gpt2.py +5 -12
- sglang/srt/models/gpt_bigcode.py +6 -22
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/internlm2.py +2 -3
- sglang/srt/models/internlm2_reward.py +0 -1
- sglang/srt/models/llama.py +97 -27
- sglang/srt/models/llama_classification.py +1 -2
- sglang/srt/models/llama_embedding.py +1 -2
- sglang/srt/models/llama_reward.py +2 -3
- sglang/srt/models/llava.py +1 -4
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +4 -7
- sglang/srt/models/minicpm3.py +6 -19
- sglang/srt/models/mixtral.py +12 -5
- sglang/srt/models/mixtral_quant.py +2 -3
- sglang/srt/models/mllama.py +3 -7
- sglang/srt/models/olmo.py +2 -8
- sglang/srt/models/olmo2.py +0 -1
- sglang/srt/models/olmoe.py +3 -5
- sglang/srt/models/phi3_small.py +8 -8
- sglang/srt/models/qwen.py +2 -3
- sglang/srt/models/qwen2.py +10 -9
- sglang/srt/models/qwen2_moe.py +4 -11
- sglang/srt/models/qwen2_vl.py +2 -6
- sglang/srt/models/registry.py +99 -0
- sglang/srt/models/stablelm.py +2 -3
- sglang/srt/models/torch_native_llama.py +6 -12
- sglang/srt/models/xverse.py +2 -4
- sglang/srt/models/xverse_moe.py +4 -11
- sglang/srt/models/yivl.py +2 -3
- sglang/srt/openai_api/adapter.py +9 -5
- sglang/srt/openai_api/protocol.py +1 -0
- sglang/srt/server.py +267 -170
- sglang/srt/server_args.py +65 -31
- sglang/srt/utils.py +245 -28
- sglang/test/test_utils.py +7 -0
- sglang/version.py +1 -1
- {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/METADATA +1 -1
- sglang-0.4.0.dist-info/RECORD +184 -0
- sglang-0.3.6.post3.dist-info/RECORD +0 -162
- {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/LICENSE +0 -0
- {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -30,6 +30,7 @@ import subprocess
|
|
30
30
|
import tempfile
|
31
31
|
import time
|
32
32
|
import warnings
|
33
|
+
from functools import lru_cache
|
33
34
|
from importlib.metadata import PackageNotFoundError, version
|
34
35
|
from io import BytesIO
|
35
36
|
from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
|
@@ -38,6 +39,7 @@ import numpy as np
|
|
38
39
|
import psutil
|
39
40
|
import requests
|
40
41
|
import torch
|
42
|
+
import torch.distributed
|
41
43
|
import torch.distributed as dist
|
42
44
|
import triton
|
43
45
|
import zmq
|
@@ -67,6 +69,22 @@ def is_hip() -> bool:
|
|
67
69
|
return torch.version.hip is not None
|
68
70
|
|
69
71
|
|
72
|
+
def is_cuda():
|
73
|
+
return hasattr(torch, "cuda") and torch.cuda.is_available()
|
74
|
+
|
75
|
+
|
76
|
+
def is_cuda_alike():
|
77
|
+
return is_cuda() or is_hip()
|
78
|
+
|
79
|
+
|
80
|
+
def is_hpu() -> bool:
|
81
|
+
return hasattr(torch, "hpu") and torch.hpu.is_available()
|
82
|
+
|
83
|
+
|
84
|
+
def is_xpu() -> bool:
|
85
|
+
return hasattr(torch, "xpu") and torch.xpu.is_available()
|
86
|
+
|
87
|
+
|
70
88
|
def is_flashinfer_available():
|
71
89
|
"""
|
72
90
|
Check whether flashinfer is available.
|
@@ -412,16 +430,12 @@ def suppress_other_loggers():
|
|
412
430
|
from vllm.logger import logger as vllm_default_logger
|
413
431
|
|
414
432
|
vllm_default_logger.setLevel(logging.WARN)
|
415
|
-
logging.getLogger("vllm.config").setLevel(logging.ERROR)
|
416
433
|
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
|
417
434
|
logging.WARN
|
418
435
|
)
|
419
436
|
logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel(
|
420
437
|
logging.WARN
|
421
438
|
)
|
422
|
-
logging.getLogger("vllm.selector").setLevel(logging.WARN)
|
423
|
-
logging.getLogger("vllm.utils").setLevel(logging.ERROR)
|
424
|
-
logging.getLogger("vllm.model_executor.model_loader.loader").setLevel(logging.ERROR)
|
425
439
|
|
426
440
|
warnings.filterwarnings(
|
427
441
|
"ignore", category=UserWarning, message="The given NumPy array is not writable"
|
@@ -474,27 +488,6 @@ def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = N
|
|
474
488
|
pass
|
475
489
|
|
476
490
|
|
477
|
-
def monkey_patch_vllm_model_config():
|
478
|
-
from vllm.config import ModelConfig
|
479
|
-
|
480
|
-
if not hasattr(ModelConfig, "_resolve_task"):
|
481
|
-
return
|
482
|
-
|
483
|
-
def _resolve_task(
|
484
|
-
self,
|
485
|
-
task_option,
|
486
|
-
hf_config,
|
487
|
-
):
|
488
|
-
supported_tasks = {
|
489
|
-
"generate": True,
|
490
|
-
"embedding": False,
|
491
|
-
}
|
492
|
-
selected_task = "generate"
|
493
|
-
return supported_tasks, selected_task
|
494
|
-
|
495
|
-
setattr(ModelConfig, "_resolve_task", _resolve_task)
|
496
|
-
|
497
|
-
|
498
491
|
def monkey_patch_vllm_p2p_access_check(gpu_id: int):
|
499
492
|
"""
|
500
493
|
Monkey patch the slow p2p access check in vllm.
|
@@ -557,6 +550,29 @@ def monkey_patch_vllm_all_gather(reverse: bool = False):
|
|
557
550
|
setattr(GroupCoordinator, "all_gather", all_gather)
|
558
551
|
|
559
552
|
|
553
|
+
def monkey_patch_vllm_gguf_config():
|
554
|
+
from vllm.model_executor.layers.linear import LinearBase
|
555
|
+
from vllm.model_executor.layers.quantization.gguf import (
|
556
|
+
GGUFConfig,
|
557
|
+
GGUFEmbeddingMethod,
|
558
|
+
GGUFLinearMethod,
|
559
|
+
)
|
560
|
+
|
561
|
+
from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
|
562
|
+
|
563
|
+
def get_quant_method_with_embedding_replaced(
|
564
|
+
self, layer: torch.nn.Module, prefix: str
|
565
|
+
) -> Optional["QuantizeMethodBase"]:
|
566
|
+
if isinstance(layer, LinearBase):
|
567
|
+
return GGUFLinearMethod(self)
|
568
|
+
elif isinstance(layer, VocabParallelEmbedding):
|
569
|
+
# patch to own VocabParallelEmbedding
|
570
|
+
return GGUFEmbeddingMethod(self)
|
571
|
+
return None
|
572
|
+
|
573
|
+
setattr(GGUFConfig, "get_quant_method", get_quant_method_with_embedding_replaced)
|
574
|
+
|
575
|
+
|
560
576
|
def maybe_set_triton_cache_manager() -> None:
|
561
577
|
"""Set environment variable to tell Triton to use a
|
562
578
|
custom cache manager"""
|
@@ -862,7 +878,9 @@ def get_amdgpu_memory_capacity():
|
|
862
878
|
try:
|
863
879
|
# Run rocm-smi and capture the output
|
864
880
|
result = subprocess.run(
|
865
|
-
[
|
881
|
+
[
|
882
|
+
"rocminfo | grep 'gfx' -A 100 | grep 'Pool 1' -A 5 | grep 'Size:' | awk '{print $2}'"
|
883
|
+
],
|
866
884
|
stdout=subprocess.PIPE,
|
867
885
|
stderr=subprocess.PIPE,
|
868
886
|
shell=True,
|
@@ -873,9 +891,8 @@ def get_amdgpu_memory_capacity():
|
|
873
891
|
|
874
892
|
# Parse the output to extract memory values in MiB
|
875
893
|
memory_values = [
|
876
|
-
float(mem) / 1024
|
894
|
+
float(mem.split("(")[0].strip()) / 1024
|
877
895
|
for mem in result.stdout.strip().split("\n")
|
878
|
-
if re.match(r"^\d+(\.\d+)?$", mem.strip())
|
879
896
|
]
|
880
897
|
|
881
898
|
if not memory_values:
|
@@ -922,11 +939,88 @@ def get_nvgpu_memory_capacity():
|
|
922
939
|
)
|
923
940
|
|
924
941
|
|
942
|
+
# Copy from pytorch and OpenRLHF to allow creating multiple main groups.
|
943
|
+
# https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
|
944
|
+
# https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
|
945
|
+
def init_custom_process_group(
|
946
|
+
backend=None,
|
947
|
+
init_method=None,
|
948
|
+
timeout=None,
|
949
|
+
world_size=-1,
|
950
|
+
rank=-1,
|
951
|
+
store=None,
|
952
|
+
group_name=None,
|
953
|
+
pg_options=None,
|
954
|
+
):
|
955
|
+
from torch.distributed.distributed_c10d import (
|
956
|
+
Backend,
|
957
|
+
PrefixStore,
|
958
|
+
_new_process_group_helper,
|
959
|
+
_world,
|
960
|
+
default_pg_timeout,
|
961
|
+
rendezvous,
|
962
|
+
)
|
963
|
+
|
964
|
+
assert (store is None) or (
|
965
|
+
init_method is None
|
966
|
+
), "Cannot specify both init_method and store."
|
967
|
+
|
968
|
+
if store is not None:
|
969
|
+
assert world_size > 0, "world_size must be positive if using store"
|
970
|
+
assert rank >= 0, "rank must be non-negative if using store"
|
971
|
+
elif init_method is None:
|
972
|
+
init_method = "env://"
|
973
|
+
|
974
|
+
if backend:
|
975
|
+
backend = Backend(backend)
|
976
|
+
else:
|
977
|
+
backend = Backend("undefined")
|
978
|
+
|
979
|
+
if timeout is None:
|
980
|
+
timeout = default_pg_timeout
|
981
|
+
|
982
|
+
# backward compatible API
|
983
|
+
if store is None:
|
984
|
+
rendezvous_iterator = rendezvous(init_method, rank, world_size, timeout=timeout)
|
985
|
+
store, rank, world_size = next(rendezvous_iterator)
|
986
|
+
store.set_timeout(timeout)
|
987
|
+
|
988
|
+
# Use a PrefixStore to avoid accidental overrides of keys used by
|
989
|
+
# different systems (e.g. RPC) in case the store is multi-tenant.
|
990
|
+
store = PrefixStore(group_name, store)
|
991
|
+
|
992
|
+
# NOTE: The pg_options parameter was renamed into backend_options in PyTorch 2.6.0
|
993
|
+
# https://github.com/pytorch/pytorch/commit/a0c7029a75628cd5fa8df83c0de0ea98ee7fd844
|
994
|
+
# We need to determine the appropriate parameter name based on PyTorch version
|
995
|
+
pg_options_param_name = (
|
996
|
+
"backend_options" if str(torch.__version__) >= "2.6" else "pg_options"
|
997
|
+
)
|
998
|
+
pg, _ = _new_process_group_helper(
|
999
|
+
world_size,
|
1000
|
+
rank,
|
1001
|
+
[],
|
1002
|
+
backend,
|
1003
|
+
store,
|
1004
|
+
group_name=group_name,
|
1005
|
+
**{pg_options_param_name: pg_options},
|
1006
|
+
timeout=timeout,
|
1007
|
+
)
|
1008
|
+
|
1009
|
+
_world.pg_group_ranks[pg] = {i: i for i in range(world_size)}
|
1010
|
+
|
1011
|
+
return pg
|
1012
|
+
|
1013
|
+
|
925
1014
|
def crash_on_warnings():
|
926
1015
|
# Crash on warning if we are running CI tests
|
927
1016
|
return get_bool_env_var("SGLANG_IS_IN_CI")
|
928
1017
|
|
929
1018
|
|
1019
|
+
def print_warning_once(msg: str) -> None:
|
1020
|
+
# Set the stacklevel to 2 to print the caller's line info
|
1021
|
+
logger.warning(msg, stacklevel=2)
|
1022
|
+
|
1023
|
+
|
930
1024
|
def get_device_name(device_id: int = 0) -> str:
|
931
1025
|
if hasattr(torch, "cuda") and torch.cuda.is_available():
|
932
1026
|
return torch.cuda.get_device_name(device_id)
|
@@ -941,9 +1035,42 @@ def get_device_name(device_id: int = 0) -> str:
|
|
941
1035
|
return torch.hpu.get_device_name(device_id)
|
942
1036
|
|
943
1037
|
|
1038
|
+
def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
|
1039
|
+
major, minor = None, None
|
1040
|
+
if hasattr(torch, "cuda") and torch.cuda.is_available():
|
1041
|
+
major, minor = torch.cuda.get_device_capability(device_id)
|
1042
|
+
|
1043
|
+
if hasattr(torch, "hip") and torch.hip.is_available():
|
1044
|
+
major, minor = torch.cuda.get_device_capability(device_id)
|
1045
|
+
|
1046
|
+
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
1047
|
+
major, minor, *_ = torch.xpu.get_device_capability(device_id)["version"].split(
|
1048
|
+
"."
|
1049
|
+
)
|
1050
|
+
major, minor = int(major), int(minor)
|
1051
|
+
|
1052
|
+
# TODO(HandH1998): `get_device_capability` is not supported by `torch.hpu` for now.
|
1053
|
+
# Update this once the support is available.
|
1054
|
+
if hasattr(torch, "hpu") and torch.hpu.is_available():
|
1055
|
+
try:
|
1056
|
+
major, minor = torch.hpu.get_device_capability(device_id)
|
1057
|
+
except Exception as e:
|
1058
|
+
raise RuntimeError(
|
1059
|
+
f"An error occurred while getting device capability of hpu: {e}."
|
1060
|
+
) from e
|
1061
|
+
|
1062
|
+
return major, minor
|
1063
|
+
|
1064
|
+
|
944
1065
|
sglang_lib = Library("sglang", "FRAGMENT") # noqa
|
945
1066
|
|
946
1067
|
|
1068
|
+
# Some backends use pytorch version < 2.4.0 which doesn't
|
1069
|
+
# support `torch.library.custom_op`.
|
1070
|
+
def supports_custom_op() -> bool:
|
1071
|
+
return hasattr(torch.library, "custom_op")
|
1072
|
+
|
1073
|
+
|
947
1074
|
def direct_register_custom_op(
|
948
1075
|
op_name: str,
|
949
1076
|
op_func: Callable,
|
@@ -1020,3 +1147,93 @@ def set_gpu_proc_affinity(
|
|
1020
1147
|
def get_bool_env_var(name: str, default: str = "false") -> bool:
|
1021
1148
|
value = os.getenv(name, default)
|
1022
1149
|
return value.lower() in ("true", "1")
|
1150
|
+
|
1151
|
+
|
1152
|
+
@lru_cache(maxsize=8)
|
1153
|
+
def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) -> int:
|
1154
|
+
# Note: cuda_visible_devices is not used, but we keep it as an argument for
|
1155
|
+
# LRU Cache purposes.
|
1156
|
+
|
1157
|
+
# Code below is based on
|
1158
|
+
# https://github.com/pytorch/pytorch/blob/
|
1159
|
+
# c1cd946818442aca8c7f812b16d187ce1586c3bc/
|
1160
|
+
# torch/cuda/__init__.py#L831C1-L831C17
|
1161
|
+
import torch.cuda
|
1162
|
+
import torch.version
|
1163
|
+
|
1164
|
+
if not torch.cuda._is_compiled():
|
1165
|
+
return 0
|
1166
|
+
if is_hip():
|
1167
|
+
# ROCm uses amdsmi instead of nvml for stateless device count
|
1168
|
+
# This requires a sufficiently modern version of Torch 2.4.0
|
1169
|
+
raw_count = (
|
1170
|
+
torch.cuda._device_count_amdsmi()
|
1171
|
+
if (hasattr(torch.cuda, "_device_count_amdsmi"))
|
1172
|
+
else -1
|
1173
|
+
)
|
1174
|
+
else:
|
1175
|
+
raw_count = torch.cuda._device_count_nvml()
|
1176
|
+
r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count
|
1177
|
+
return r
|
1178
|
+
|
1179
|
+
|
1180
|
+
# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/utils.py
|
1181
|
+
def cuda_device_count_stateless() -> int:
|
1182
|
+
"""Get number of CUDA devices, caching based on the value of
|
1183
|
+
CUDA_VISIBLE_DEVICES at the time of call.
|
1184
|
+
|
1185
|
+
This should be used instead of torch.cuda.device_count()
|
1186
|
+
unless CUDA_VISIBLE_DEVICES has already been set to the desired
|
1187
|
+
value."""
|
1188
|
+
|
1189
|
+
# This can be removed and simply replaced with torch.cuda.get_device_count
|
1190
|
+
# after https://github.com/pytorch/pytorch/pull/122815 is released.
|
1191
|
+
return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES", None))
|
1192
|
+
|
1193
|
+
|
1194
|
+
def should_use_tensor_core(
|
1195
|
+
kv_cache_dtype: torch.dtype,
|
1196
|
+
num_attention_heads: int,
|
1197
|
+
num_kv_heads: int,
|
1198
|
+
) -> bool:
|
1199
|
+
"""
|
1200
|
+
Determine whether to use tensor cores for attention computation.
|
1201
|
+
|
1202
|
+
Args:
|
1203
|
+
kv_cache_dtype: Data type of the KV cache
|
1204
|
+
num_attention_heads: Number of attention heads
|
1205
|
+
num_kv_heads: Number of key/value heads
|
1206
|
+
|
1207
|
+
Returns:
|
1208
|
+
bool: Whether to use tensor cores
|
1209
|
+
"""
|
1210
|
+
# Try to use environment variable first
|
1211
|
+
env_override = os.environ.get("SGLANG_FLASHINFER_USE_TENSOR_CORE")
|
1212
|
+
if env_override is not None:
|
1213
|
+
return env_override.lower() == "true"
|
1214
|
+
|
1215
|
+
# Try to use _grouped_size_compiled_for_decode_kernels if available
|
1216
|
+
# This is for flashinfer <=0.1.6. Otherwise, there is an accuracy bug
|
1217
|
+
try:
|
1218
|
+
from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
|
1219
|
+
|
1220
|
+
if not _grouped_size_compiled_for_decode_kernels(
|
1221
|
+
num_attention_heads,
|
1222
|
+
num_kv_heads,
|
1223
|
+
):
|
1224
|
+
return True
|
1225
|
+
else:
|
1226
|
+
return False
|
1227
|
+
except (ImportError, AttributeError):
|
1228
|
+
pass
|
1229
|
+
|
1230
|
+
# Calculate GQA group size
|
1231
|
+
gqa_group_size = num_attention_heads // num_kv_heads
|
1232
|
+
|
1233
|
+
# Determine based on dtype and GQA group size
|
1234
|
+
if kv_cache_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
|
1235
|
+
return True
|
1236
|
+
elif kv_cache_dtype in (torch.float16, torch.half, torch.bfloat16):
|
1237
|
+
return gqa_group_size > 4
|
1238
|
+
else:
|
1239
|
+
return False
|
sglang/test/test_utils.py
CHANGED
@@ -424,6 +424,7 @@ def popen_launch_server(
|
|
424
424
|
port,
|
425
425
|
*other_args,
|
426
426
|
]
|
427
|
+
|
427
428
|
if api_key:
|
428
429
|
command += ["--api-key", api_key]
|
429
430
|
|
@@ -567,6 +568,7 @@ def run_bench_serving(
|
|
567
568
|
disable_tqdm=False,
|
568
569
|
disable_stream=disable_stream,
|
569
570
|
disable_ignore_eos=False,
|
571
|
+
lora_name=None,
|
570
572
|
extra_request_body=None,
|
571
573
|
profile=None,
|
572
574
|
)
|
@@ -814,3 +816,8 @@ def run_mulit_request_test(
|
|
814
816
|
chunked_prefill_size,
|
815
817
|
assert_has_abort=False,
|
816
818
|
)
|
819
|
+
|
820
|
+
|
821
|
+
def write_github_step_summary(content):
|
822
|
+
with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
|
823
|
+
f.write(content)
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.3.6.post3"
|
1
|
+
__version__ = "0.4.0"
|
@@ -0,0 +1,184 @@
|
|
1
|
+
sglang/__init__.py,sha256=3M0oz0ZA8fULhV5LwQ4hxh-MRdHsOJRD1D63C60pdG4,1616
|
2
|
+
sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
|
3
|
+
sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
|
4
|
+
sglang/bench_offline_throughput.py,sha256=3OrFI26PmoVTU3pQrBFC50AZI7HpKKuk4vYycbkDjhY,12428
|
5
|
+
sglang/bench_one_batch.py,sha256=vxXSCQRTMeJUtJKsSoP6tLdoWTdFp1mhwsLpKHccs2c,15858
|
6
|
+
sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
|
7
|
+
sglang/bench_serving.py,sha256=Oa_Qi7YApv37jGDAmuIaZSIhayvRpKq9GZGZLXBU-9I,52924
|
8
|
+
sglang/check_env.py,sha256=q1sdYL-gcKSCeIZMk7sUMh9rjM71f-EUgp07OGPSbZM,5446
|
9
|
+
sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
|
10
|
+
sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
|
11
|
+
sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
|
12
|
+
sglang/utils.py,sha256=r4Dw-xffcrTRposls-gqyoYxjgJNYhVduK_6bDN_Vj4,11526
|
13
|
+
sglang/version.py,sha256=42STGor_9nKYXumfeV5tiyD_M8VdcddX7CEexmibPBk,22
|
14
|
+
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
+
sglang/lang/chat_template.py,sha256=jprS3-In2FTUoedKwZg-HYvDwU8RTIYntOlf2zoN2sU,14814
|
16
|
+
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
17
|
+
sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
|
18
|
+
sglang/lang/interpreter.py,sha256=SBjejhLhTKzNM0HbjtTg5r17WPJ64WFSk6lcM_SCWKs,30717
|
19
|
+
sglang/lang/ir.py,sha256=zpzzAO1YVldhE95Vwz5hU_TQltu-xt8A6rfFr0PuIDA,18410
|
20
|
+
sglang/lang/tracer.py,sha256=o-jLAPPSuy2vBfsGGrTAnbuWtORzQ50B4C_P5zvYkx8,8291
|
21
|
+
sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
+
sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
|
23
|
+
sglang/lang/backend/base_backend.py,sha256=tdoh9YF3CyekY1BKiX9n7-aA4srDWIuA4RDJLM7q8qg,1985
|
24
|
+
sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
|
25
|
+
sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
|
26
|
+
sglang/lang/backend/runtime_endpoint.py,sha256=IWbrAKrUkzNOvwV6V9_y6pkTr2SUYEkKBT-3kirgad0,10514
|
27
|
+
sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
|
28
|
+
sglang/srt/_custom_ops.py,sha256=Y4gyTDGhWz-W2Igq25Ojm8XFiyvkawW9I-79iwYvxJ0,3574
|
29
|
+
sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21201
|
30
|
+
sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
|
31
|
+
sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
|
32
|
+
sglang/srt/model_parallel.py,sha256=QR-Alqo0sElDXPJ79N1PhUHHKiEHPQn3dyXduMP-SHQ,3664
|
33
|
+
sglang/srt/server.py,sha256=uApz59VUGkRx_HCy6kaiKJFTmaOAvAM3o2yYjEWi9EM,34594
|
34
|
+
sglang/srt/server_args.py,sha256=NubTM6ocZY0YDSvPQbvBdURK-7E5kSKHA2Ze09O8_1I,32182
|
35
|
+
sglang/srt/utils.py,sha256=p-eYNMVQkw6Yw1b7MBSNWhVvykCuNfejqMjl7cLDq7A,40708
|
36
|
+
sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
|
37
|
+
sglang/srt/configs/device_config.py,sha256=dResqHjkg_dq10v6rnVpbXpvABZRB0jylOm-2_JAnx0,428
|
38
|
+
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
39
|
+
sglang/srt/configs/load_config.py,sha256=TcPi_HY6xu5SiVZsxPOoB5pGeDUNebOk7muoUH9VBDg,3083
|
40
|
+
sglang/srt/configs/model_config.py,sha256=OjEeigs5tMNKP-RImJk2NHVFXv-fyQfsGREWMO3rqhM,15839
|
41
|
+
sglang/srt/configs/qwen2vl.py,sha256=ZjLy9v2eZY4wptUfY3CWgYKg2B5DDrkfCSyTy_Zf_bg,4351
|
42
|
+
sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
|
43
|
+
sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
|
44
|
+
sglang/srt/constrained/outlines_backend.py,sha256=LmezsMAmTlQgaPqO4Axl4EcSAqKr_ZYZASfAoVxT17A,6670
|
45
|
+
sglang/srt/constrained/outlines_jump_forward.py,sha256=iZWXeR3gNYoMubLGyFmLPO4V2YsN5DiGjD71Xk9iFaE,6418
|
46
|
+
sglang/srt/constrained/xgrammar_backend.py,sha256=4ZCQgcjWEY2Lg4r2V9sAiYJJblkQ_uVbEnvsjqhR1Pc,4548
|
47
|
+
sglang/srt/distributed/__init__.py,sha256=__tl9Frrf3PFrSyNYcn5i-y2rL-J4-Qn6RJwrsZ4xgc,83
|
48
|
+
sglang/srt/distributed/communication_op.py,sha256=ZoIhboZyefiAwr-1K-wF3rAFSQ4Wt-RxXpsX443Gbt4,1157
|
49
|
+
sglang/srt/distributed/parallel_state.py,sha256=HplRH5S0AWdwSdhoHYX9_UWQZlFjh2Z1LHaz68EXlpE,47555
|
50
|
+
sglang/srt/distributed/utils.py,sha256=riYflM9l1-Yi-8Ce8Acxa4mAjZaxHRQfta8Dtah4yG0,8500
|
51
|
+
sglang/srt/distributed/device_communicators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
52
|
+
sglang/srt/distributed/device_communicators/cuda_wrapper.py,sha256=u8_kYVOBDrNZMiQCJC538yJvpZgq6ZEpB28tCrp04yM,7065
|
53
|
+
sglang/srt/distributed/device_communicators/custom_all_reduce.py,sha256=5ARfr-1_V4QoxjvdfxOKPtSK_Rax8qAQTPoA5z_Emtc,13567
|
54
|
+
sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py,sha256=qq8GTZl0br0ggfosb8mH3U6cXbm4NWfr8y_B83W4fDg,11081
|
55
|
+
sglang/srt/distributed/device_communicators/hpu_communicator.py,sha256=LXGOhoNT5iVu1JWlRvGfHMB0wRW6lkhDamVT9JhVD94,1755
|
56
|
+
sglang/srt/distributed/device_communicators/pynccl.py,sha256=cDEoHU24C8ph-4fJAIDjZfl53aSzrjCG3FAFkt4vjrM,7186
|
57
|
+
sglang/srt/distributed/device_communicators/pynccl_wrapper.py,sha256=pU4xhG-WKytSHJ-cpcPEs0WG4dAg44jpOgv2dAmHisE,11990
|
58
|
+
sglang/srt/distributed/device_communicators/shm_broadcast.py,sha256=WVxBd1QfIgRWzVGtN2axxO-3PFT-Qww8GQ82Yg5PPYU,22824
|
59
|
+
sglang/srt/distributed/device_communicators/xpu_communicator.py,sha256=P3WKgddcfpUhBa-_5PvjYxH146ZE-N1cotTzEpPRKlY,1620
|
60
|
+
sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwxBQ,5471
|
61
|
+
sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
|
62
|
+
sglang/srt/layers/fused_moe_patch.py,sha256=DMIyrwOON7OSidKZdreL5HzMhP0AD5Ues0xdY-ADOQw,4471
|
63
|
+
sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
|
64
|
+
sglang/srt/layers/linear.py,sha256=dF2HvqiMbhWlCjvkLFRCcgUFGhG-B0keM_CIpjvgTtg,46154
|
65
|
+
sglang/srt/layers/logits_processor.py,sha256=oZNu9pNNgmswhuw8irlLm0SfpVrD7cFf-GdfPsLZGHE,13227
|
66
|
+
sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
|
67
|
+
sglang/srt/layers/radix_attention.py,sha256=C_mK4mfmKlxMRNeKYP9E5R3PRd3eT-OcE_g3mo36dJM,2058
|
68
|
+
sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
|
69
|
+
sglang/srt/layers/sampler.py,sha256=_enfER8MSxsCYrR6_NgyFxKA_XqKtii_asOZUFUUsd8,4580
|
70
|
+
sglang/srt/layers/torchao_utils.py,sha256=v0hyr4hLsM42QwOPCdKb-ftRTjVokBZbqvRj4O4C-Nw,3415
|
71
|
+
sglang/srt/layers/vocab_parallel_embedding.py,sha256=slGwLiWjuFLCUdRe-GTlfumyZpqVX9VF6No_UGOT-hA,21624
|
72
|
+
sglang/srt/layers/attention/__init__.py,sha256=EL1o6Q5vLgViN3pOr2A7F6K9FlNEpMdBypFAVMeq_HA,2445
|
73
|
+
sglang/srt/layers/attention/double_sparsity_backend.py,sha256=BlX7uXteQpnoOnKsdBKh8h20zMVMEiibB5F_PkZSlNI,10706
|
74
|
+
sglang/srt/layers/attention/flashinfer_backend.py,sha256=q0yPeKUjGbxKnOqbZLHs5fyYkkVE3YEkBBMWtUaiDL4,24611
|
75
|
+
sglang/srt/layers/attention/torch_native_backend.py,sha256=amR8xGaaCONvo3M8I4nuYMv6OxPHyYoxu3m8vAqjWfo,10295
|
76
|
+
sglang/srt/layers/attention/triton_backend.py,sha256=gjxed2cvc2-8QEHkzyTVv6ui7oYOp2b_vgIUQVD1XuM,6538
|
77
|
+
sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=BE63WhKiutSNkhJLsRwvfsRy-ExvuAv7FZyoWv73ul8,18744
|
78
|
+
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
|
79
|
+
sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=Gfct-0_l-S2ZrP4F-zkzNiFbmd3C3f7uJovacOuDxaA,11472
|
80
|
+
sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
|
81
|
+
sglang/srt/layers/fused_moe_triton/__init__.py,sha256=PHKFqd2hPOO-g9kSMseg2g76lpg9OGXQDThWU6bt9vs,902
|
82
|
+
sglang/srt/layers/fused_moe_triton/fused_moe.py,sha256=qwfRBOeY5DT48Q6z71Eh9cjFehvs_K6eLIVWNL044Ug,28363
|
83
|
+
sglang/srt/layers/fused_moe_triton/layer.py,sha256=URDkTt8xEqnqpO5tb_3L7JlhlO53VWfqDDNSRYEu-LY,21545
|
84
|
+
sglang/srt/layers/quantization/__init__.py,sha256=0LnBJ2LOtk7HnMzOZm14tMd4Y0eSPSgqUhj9uOn53Es,5961
|
85
|
+
sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
|
86
|
+
sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
|
87
|
+
sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
|
88
|
+
sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
|
89
|
+
sglang/srt/managers/data_parallel_controller.py,sha256=psI4FAuBGjtdnEuwagnGdtRqvqSSxOROfNKQqVDqlVA,8382
|
90
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=TtrtE37XT5XcJzk8-R5rHZ16NHTPd5XZi8hf3h-sB2A,7462
|
91
|
+
sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
|
92
|
+
sglang/srt/managers/io_struct.py,sha256=d_kctmHcNBzzaP5lUEIpdrVVsob4dNOetMHkobUJZz4,14439
|
93
|
+
sglang/srt/managers/schedule_batch.py,sha256=p6OmeCH3arGZ99Ch7JrnTgGU-SsPgjQY7Z6arVzeqao,44998
|
94
|
+
sglang/srt/managers/schedule_policy.py,sha256=7QuIsJDRzkrvs3IJk10oOfL4Me0UZwDYvRniT1fSFuo,12620
|
95
|
+
sglang/srt/managers/scheduler.py,sha256=fmvwVcZS5DsDdxIJ4bjlLV-qJsRbTxjsqPP5a2SS0C8,59372
|
96
|
+
sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
|
97
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=XPaSXB6b23u95viFqlqd-tdyrNMMOOiSDWviz_g7UBM,29890
|
98
|
+
sglang/srt/managers/tp_worker.py,sha256=X1EwFX3FSsmXx7jeeX2tjZRocaujabQYWm-M-0CFEBE,7363
|
99
|
+
sglang/srt/managers/tp_worker_overlap_thread.py,sha256=ZBLCAYz-Ls1coObyO8dNtNsO6gG2rn40KulEmALB_J8,8686
|
100
|
+
sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
|
101
|
+
sglang/srt/mem_cache/chunk_cache.py,sha256=VcCpyrf5FOQ5xoKeOouCI5ZQLkZo_pgY1SPbDDkagGg,2492
|
102
|
+
sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
|
103
|
+
sglang/srt/mem_cache/memory_pool.py,sha256=41fjuj_sD0yfJq-sy-X99cc2djBa6w4dy2y47V0WqNU,10934
|
104
|
+
sglang/srt/mem_cache/radix_cache.py,sha256=DzLCO_gYQ7X_C2NJSEHzzMZhb5HzWjKF9wXJQsnzr8M,10427
|
105
|
+
sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTluMU,6861
|
106
|
+
sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
|
107
|
+
sglang/srt/model_executor/cuda_graph_runner.py,sha256=ffypnhzwAdiBAngHEpK9HVmpvX0H3dFIKGa45tS0IGI,15197
|
108
|
+
sglang/srt/model_executor/forward_batch_info.py,sha256=L5mVoW5SaO6To-7nGk0TZM-FFB5_78cARpJ-aC2rwD0,12883
|
109
|
+
sglang/srt/model_executor/model_runner.py,sha256=wU06s5tpHgtxitmAWGQC1NjLIcp68iAdbFDWN-TtkBU,29260
|
110
|
+
sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
|
111
|
+
sglang/srt/model_loader/loader.py,sha256=VBrY4W9CiVvS_D8yXhdkW9jReV9rSMSkJplabz0Fxgk,43528
|
112
|
+
sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
|
113
|
+
sglang/srt/model_loader/weight_utils.py,sha256=kQo9KPThjH3HAOCfC_tdwdrshdWuWJOVpPR0skSyaRY,24193
|
114
|
+
sglang/srt/models/baichuan.py,sha256=PzBOFcEAixakPEkQSaJwC0Xc1fu-yCsN9T0I67r8QmY,14919
|
115
|
+
sglang/srt/models/chatglm.py,sha256=DOrEhmb0s-yPId88R6nJeLOTUEtogk-vkB69qT2JdWc,12913
|
116
|
+
sglang/srt/models/commandr.py,sha256=-cswAK2_ZTcOsvBkv5Z8SZ1x60cxJryslbSPHDOcnbM,14169
|
117
|
+
sglang/srt/models/dbrx.py,sha256=2Wqcf3sv57l4gi2xH8yrb5WSmY-4_kbbf6fhpJ4aKWw,14581
|
118
|
+
sglang/srt/models/deepseek.py,sha256=BVNICGoLjQoHmR5lc31YrZ6YbxSRTBilHqlLsALr2u8,15693
|
119
|
+
sglang/srt/models/deepseek_v2.py,sha256=2AWdDb5qZ6dj_M6ZY-rCG0omTmerUxbhwuvI9Um4Bg4,31967
|
120
|
+
sglang/srt/models/exaone.py,sha256=dkERTZVxrRroqu5AGLP7D4N6n8HvDqlNaDQUIe15mZY,13038
|
121
|
+
sglang/srt/models/gemma.py,sha256=ydRqsG-7004r1fAiz01LHUmcj_6XN0Tn4xO1keJnMQk,12126
|
122
|
+
sglang/srt/models/gemma2.py,sha256=vPrAasJajitQHB9ZqMFut58xNsOm3fk2m05a-feQL10,14600
|
123
|
+
sglang/srt/models/gemma2_reward.py,sha256=hJw0hXNPyQSpazkVJVYiW04OtTZH0GiLI-JJef_kaGs,2529
|
124
|
+
sglang/srt/models/gpt2.py,sha256=2je1kE09sGcaORWnJuGYAkcwwOrT9EK-KhQaoCKjCSA,9517
|
125
|
+
sglang/srt/models/gpt_bigcode.py,sha256=tovyOdJu2x3LkzmkdFXX_iJdkxuyChIDxwgvPBy6UPo,9528
|
126
|
+
sglang/srt/models/grok.py,sha256=lJQEk2D8zyAXE0g4vXLCEmiWM938K_qly02EScwwV_k,13942
|
127
|
+
sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
|
128
|
+
sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
|
129
|
+
sglang/srt/models/llama.py,sha256=JVwVDU-8L8RSNOYH0CK8FI557xqjPU2Ts_pQhVfGZv4,19550
|
130
|
+
sglang/srt/models/llama_classification.py,sha256=EdXmiMyfJ9NH5P-Wel7SRhf_v8ddFFhVJMQgzDt0oVk,3377
|
131
|
+
sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
|
132
|
+
sglang/srt/models/llama_reward.py,sha256=JVaiTK4gVXNMimeq3kKkv7dt5Hc77hPqF4ewvmzjJes,4622
|
133
|
+
sglang/srt/models/llava.py,sha256=l9mqS9wl_l6ARC-K1UUe7XsB5k9sZratMNQEwx5IjR0,25229
|
134
|
+
sglang/srt/models/llavavid.py,sha256=dYUkKfHoE15vF_VXA_s_ICCTUMSmSgvP181fk8dUi0g,12185
|
135
|
+
sglang/srt/models/minicpm.py,sha256=ws4AqhOfAvYHGd04QuXCZel-Oxy9_vN4p4rTjs9RSz0,13723
|
136
|
+
sglang/srt/models/minicpm3.py,sha256=YIKJDTpwjmpLlv1sNT93k2yZMvGQlI_H87czjf6QYyo,24707
|
137
|
+
sglang/srt/models/mistral.py,sha256=EYifJUUzN2Z2-iL37eJiNZF_DB0H4pa0mKlgYRIxM70,838
|
138
|
+
sglang/srt/models/mixtral.py,sha256=KvvtLWn-GLtwbdrt4PBq9gIFJiJH-JFcj0BLfcLcXjo,14322
|
139
|
+
sglang/srt/models/mixtral_quant.py,sha256=uuVO1nWUZJiDhbqZN6gzSMwyfpyZorMuFXHeMCGo7N0,14022
|
140
|
+
sglang/srt/models/mllama.py,sha256=3kX-UqeTSYZL5kPNdkfKEAEv3DpSAW1ArAAoeiXVzIc,37739
|
141
|
+
sglang/srt/models/olmo.py,sha256=OCDMtX1OI83r80mzU4FMC3Tg8cleQ-7C8Tpoe8zgzss,11708
|
142
|
+
sglang/srt/models/olmo2.py,sha256=aC7svioN7XT5owRxPrvhvWBNMON9QXGQBWJ1KHMyXeA,13442
|
143
|
+
sglang/srt/models/olmoe.py,sha256=Rw-3YrHWd90MZQFnmcfUQ-3wAaI0PCFKb0DIrCDND3s,15347
|
144
|
+
sglang/srt/models/phi3_small.py,sha256=rQFv-4aUGkNPTQjTEME5bcOL9DEQqqFb3aDKrj2joOA,15083
|
145
|
+
sglang/srt/models/qwen.py,sha256=_FKDbwaS5C07uJyyivZpBrXJVej4Ph9ivzJdzWJPxJ4,9904
|
146
|
+
sglang/srt/models/qwen2.py,sha256=Kh6mW0H2jQdrPS9dJnJShLpo0BNEq6oI4oy5VMHGzac,12444
|
147
|
+
sglang/srt/models/qwen2_moe.py,sha256=5-ZTYW5bERhTG5LnNXMqYfWB17NBkvxmQJu6DuDAFCo,16819
|
148
|
+
sglang/srt/models/qwen2_vl.py,sha256=3EaUlTbyWOTRXA7eViK1WqmVbCFhXLIpnos49zzf-yM,26561
|
149
|
+
sglang/srt/models/registry.py,sha256=inKh9iwOp3LFYm3nqujg-OtABClOP-ifc1stA9cZegA,3434
|
150
|
+
sglang/srt/models/stablelm.py,sha256=iBlIkM7CQmqI25nsujWk0LLCQD7TshzUU8qzZYYrt20,11311
|
151
|
+
sglang/srt/models/torch_native_llama.py,sha256=dHuUln7RcsRcdyOIUw7HKtOOG9OPmnfOpK8j5gYXzJY,19039
|
152
|
+
sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
|
153
|
+
sglang/srt/models/xverse_moe.py,sha256=AawKEQw--oAl-yzwCjoaZRG7q3rdkyDiam3FS0zjf_c,15537
|
154
|
+
sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
|
155
|
+
sglang/srt/openai_api/adapter.py,sha256=gZEaG1dVSFv9WLj0369Ke1yrNNgi_gpgKxPt5Ju9mUw,53775
|
156
|
+
sglang/srt/openai_api/protocol.py,sha256=4T9hGCrpfCUSBjKZFvemTfj49CkTUzpCcx6izLv3ir0,10246
|
157
|
+
sglang/srt/sampling/sampling_batch_info.py,sha256=YC-KPyDWyLGNPL4YVcst4xwP8Wlz2zcCNJHB_5zljXQ,8470
|
158
|
+
sglang/srt/sampling/sampling_params.py,sha256=n7RbBg_bS5fYhsiWa8uJYnfoXy_i5DvtTBOkuFnHDNU,5286
|
159
|
+
sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
|
160
|
+
sglang/srt/sampling/penaltylib/orchestrator.py,sha256=J-DEemZcKm1--o37kf3qDOE8SZ_6H3d5oex49Mgq2ZU,10762
|
161
|
+
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD60mwD1tCcSG0x5IYo0v4z9ce-q_YwbJ9f8,2490
|
162
|
+
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=_Nxv0XgUPirZjw2SEJYp_Cd9ZcLwmt7h6JE6J4hhFq4,3629
|
163
|
+
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=5tOgCg7OvE9kSN9VMCpH1hwqo1YMxt9iS5PVpct9HpU,2468
|
164
|
+
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=m22Rfn1RuB1HpImBDECsiJ2VooBYpsFADAwnk1EPzk0,2751
|
165
|
+
sglang/test/few_shot_gsm8k.py,sha256=7yDbEQe49gZeJhz2wFFX-gf_59ThDKsCS1xwfogNc7k,4034
|
166
|
+
sglang/test/few_shot_gsm8k_engine.py,sha256=QQbrwOX6-cJDD3RZC_e7zPnt6aSo8JdF8X_lRHSjdDM,3886
|
167
|
+
sglang/test/run_eval.py,sha256=9yO0hXZOcn4abEOs96T-XPguDEklK16Ltco0pGF3zCg,4020
|
168
|
+
sglang/test/runners.py,sha256=ANzjrHkT_1E0G3UcD47O8XEKst3Si4AOfx-uErbFS7o,15129
|
169
|
+
sglang/test/simple_eval_common.py,sha256=joqrGysuLnJFtzDRIgFkMsRyKUSyjVPFWp0_PHAL3Ik,12378
|
170
|
+
sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
|
171
|
+
sglang/test/simple_eval_humaneval.py,sha256=zmV3xWYc2OrpiT9Dy55RTKZL5DEROD1cJ0NA_-cU5zI,5685
|
172
|
+
sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
|
173
|
+
sglang/test/simple_eval_mgsm.py,sha256=rd7TSUyxdKbrXaVoewo24V8lCo_6kO8zxPhhmvylpw8,10259
|
174
|
+
sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
|
175
|
+
sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
|
176
|
+
sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
|
177
|
+
sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
|
178
|
+
sglang/test/test_utils.py,sha256=0lY3ZNfS3JCB4LqSRJgBfB8I0MA8TUT-BJmnrvQC8vw,23797
|
179
|
+
sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
|
180
|
+
sglang-0.4.0.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
|
181
|
+
sglang-0.4.0.dist-info/METADATA,sha256=dXh43Qx9ImYKIF-eB8vuiz_0DDbmNgUrOjCukOXCBTA,22165
|
182
|
+
sglang-0.4.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
183
|
+
sglang-0.4.0.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
184
|
+
sglang-0.4.0.dist-info/RECORD,,
|