sglang 0.3.6__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_one_batch.py +2 -4
- sglang/bench_serving.py +75 -26
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/srt/configs/model_config.py +13 -14
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +13 -15
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +38 -57
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +13 -13
- sglang/srt/layers/attention/flashinfer_backend.py +13 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +13 -14
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +13 -15
- sglang/srt/layers/logits_processor.py +13 -15
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +25 -19
- sglang/srt/managers/detokenizer_manager.py +13 -16
- sglang/srt/managers/io_struct.py +43 -28
- sglang/srt/managers/schedule_batch.py +55 -26
- sglang/srt/managers/schedule_policy.py +13 -15
- sglang/srt/managers/scheduler.py +89 -70
- sglang/srt/managers/session_controller.py +14 -15
- sglang/srt/managers/tokenizer_manager.py +29 -22
- sglang/srt/managers/tp_worker.py +13 -15
- sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +20 -19
- sglang/srt/model_executor/forward_batch_info.py +19 -17
- sglang/srt/model_executor/model_runner.py +42 -30
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +15 -15
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +24 -19
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +13 -15
- sglang/srt/models/llavavid.py +13 -15
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +21 -19
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +15 -17
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +13 -15
- sglang/srt/openai_api/protocol.py +13 -15
- sglang/srt/sampling/sampling_batch_info.py +4 -1
- sglang/srt/sampling/sampling_params.py +13 -15
- sglang/srt/server.py +59 -34
- sglang/srt/server_args.py +22 -22
- sglang/srt/utils.py +196 -17
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +13 -14
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +24 -15
- sglang-0.3.6.post1.dist-info/RECORD +164 -0
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.6.dist-info/RECORD +0 -161
- /sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -1,22 +1,21 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Common utilities."""
 
 import base64
 import ipaddress
+import itertools
 import json
 import logging
 import os
@@ -33,7 +32,7 @@ import time
 import warnings
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
 
 import numpy as np
 import psutil
@@ -46,6 +45,8 @@ from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from starlette.routing import Mount
 from torch import nn
+from torch.func import functional_call
+from torch.library import Library
 from torch.profiler import ProfilerActivity, profile, record_function
 from triton.runtime.cache import (
     FileCacheManager,
@@ -192,6 +193,94 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
     return free_gpu_memory / (1 << 30)
 
 
+def is_pin_memory_available() -> bool:
+    return torch.cuda.is_available()
+
+
+_CPU_OFFLOAD_BYTES = 0
+_CPU_OFFLOAD_MAX_BYTES = 0
+
+
+def set_cpu_offload_max_bytes(max_bytes: int) -> None:
+    global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
+    _CPU_OFFLOAD_BYTES = 0
+    _CPU_OFFLOAD_MAX_BYTES = max_bytes
+
+
+def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
+    device = next(module.parameters()).device
+
+    if device == torch.device("cpu"):
+        return module
+
+    global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
+    if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
+        return module
+
+    pin_memory = is_pin_memory_available()
+    # offload parameters to CPU
+    # use pin_memory if possible, which helps cudagraph capture speed
+    offloaded_parameters = False
+    for p in module.parameters():
+        if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
+            # we use per-parameter offloading
+            # one module might have some parameters offloaded and some not
+            break
+
+        # `torch.empty_like` does not support `pin_memory` argument
+        cpu_data = torch.empty_strided(
+            size=p.data.size(),
+            stride=p.data.stride(),
+            dtype=p.data.dtype,
+            layout=p.data.layout,
+            device="cpu",
+            pin_memory=pin_memory,
+        )
+        cpu_data.copy_(p.data)
+        p.data = cpu_data
+        _CPU_OFFLOAD_BYTES += p.data.numel() * p.data.element_size()
+        offloaded_parameters = True
+
+    if offloaded_parameters:
+        original_forward = module.forward
+
+        def forward(*args, **kwargs):
+            module.forward = original_forward
+            device_state = {
+                # here we blindly call `to(device)`
+                # if the parameter is already on the device, it will be a no-op
+                k: v.to(device, non_blocking=True)
+                for k, v in module.state_dict().items()
+            }
+            output = functional_call(module, device_state, args=args, kwargs=kwargs)
+            module.forward = forward
+            return output
+
+        module.forward = forward
+
+    return module
+
+
+class LayerFn(Protocol):
+
+    def __call__(self, layer_id: int, prefix: str) -> torch.nn.Module: ...
+
+
+def make_layers(
+    num_hidden_layers: int,
+    layer_fn: LayerFn,
+    prefix: str = "",
+) -> Tuple[int, int, torch.nn.ModuleList]:
+    """Make a list of layers with the given layer function"""
+    modules = torch.nn.ModuleList(
+        [
+            maybe_offload_to_cpu(layer_fn(idx=idx, prefix=f"{prefix}.{idx}"))
+            for idx in range(num_hidden_layers)
+        ]
+    )
+    return modules
+
+
 def set_random_seed(seed: int) -> None:
     """Set the random seed for all libraries."""
     random.seed(seed)
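A minimal usage sketch (not part of the diff) for the CPU-offload helpers added in the hunk above. The 4 GiB budget, the `Linear(4096, 4096)` layer, and the tensor shapes are made-up illustration values, and a CUDA device is assumed; `maybe_offload_to_cpu` copies parameters to (optionally pinned) host memory and streams them back to the original device inside the patched `forward`:

    import torch
    from sglang.srt.utils import maybe_offload_to_cpu, set_cpu_offload_max_bytes

    # Budget shared by every subsequent maybe_offload_to_cpu call (4 GiB here).
    set_cpu_offload_max_bytes(4 * 1024**3)

    layer = torch.nn.Linear(4096, 4096).cuda()
    layer = maybe_offload_to_cpu(layer)  # parameters now live in pinned CPU memory

    x = torch.randn(8, 4096, device="cuda")
    y = layer(x)  # weights are moved back to the GPU on the fly via functional_call

The new `make_layers` helper applies the same wrapper to every layer it constructs, so stacked transformer blocks share the single byte budget set by `set_cpu_offload_max_bytes`.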
@@ -842,4 +931,94 @@ def get_nvgpu_memory_capacity():
 
 def crash_on_warnings():
     # Crash on warning if we are running CI tests
-    return os.getenv("SGLANG_IS_IN_CI", "false") == "true"
+    return os.getenv("SGLANG_IS_IN_CI", "false").lower() == "true"
+
+
+def get_device_name(device_id: int = 0) -> str:
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        return torch.cuda.get_device_name(device_id)
+
+    if hasattr(torch, "hip") and torch.hip.is_available():
+        return torch.hip.get_device_name(device_id)
+
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        return torch.xpu.get_device_name(device_id)
+
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
+        return torch.hpu.get_device_name(device_id)
+
+
+sglang_lib = Library("sglang", "FRAGMENT")  # noqa
+
+
+def direct_register_custom_op(
+    op_name: str,
+    op_func: Callable,
+    mutates_args: List[str],
+    fake_impl: Optional[Callable] = None,
+    target_lib: Optional[Library] = None,
+):
+    """
+    `torch.library.custom_op` can have significant overhead because it
+    needs to consider complicated dispatching logic. This function
+    directly registers a custom op and dispatches it to the CUDA backend.
+    See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
+    for more details.
+
+    By default, the custom op is registered to the vLLM library. If you
+    want to register it to a different library, you can pass the library
+    object to the `target_lib` argument.
+
+    IMPORTANT: the lifetime of the operator is tied to the lifetime of the
+    library object. If you want to bind the operator to a different library,
+    make sure the library object is alive when the operator is used.
+    """
+    import torch.library
+
+    if hasattr(torch.library, "infer_schema"):
+        schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
+    else:
+        # for pytorch 2.4
+        import torch._custom_op.impl
+
+        schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
+
+    my_lib = target_lib or sglang_lib
+    my_lib.define(op_name + schema_str)
+    my_lib.impl(op_name, op_func, "CUDA")
+    if fake_impl is not None:
+        my_lib._register_fake(op_name, fake_impl)
+
+
+def gpu_proc_affinity(
+    tp_size: int,
+    nnodes: int,
+    gpu_id: int,
+):
+    # current process
+    pid = os.getpid()
+    p = psutil.Process(pid)
+
+    tp_size_per_node = tp_size // nnodes
+
+    # total physical cores
+    total_pcores = psutil.cpu_count(logical=False)
+    # physical cores per TP (N.B. more Cores than GPUs on node)
+    num_cores_bind = total_pcores // tp_size_per_node
+
+    # able to handle multiple DP per node
+    start_cpu_id = (gpu_id * num_cores_bind) % total_pcores
+    end_cpu_id = start_cpu_id + num_cores_bind
+
+    if psutil.cpu_count() != psutil.cpu_count(logical=False):
+        # HT on
+        upper_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
+        lower_cpu_ids = [id + total_pcores for id in range(start_cpu_id, end_cpu_id)]
+        bind_cpu_ids = list(itertools.chain(upper_cpu_ids, lower_cpu_ids))
+    else:
+        # HT off
+        bind_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
+
+    # set cpu_affinity to current process
+    p.cpu_affinity(bind_cpu_ids)
+    logger.info(f"Process {pid} gpu_id {gpu_id} is running on CPUs: {p.cpu_affinity()}")
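A hypothetical sketch (not from the diff) of how the new `direct_register_custom_op` helper can be invoked; the op name `scaled_add`, both functions, and the tensor shapes are made up for illustration, and a CUDA device is assumed. The real function runs under the "CUDA" dispatch key, while the fake implementation only describes output metadata for tracing:

    import torch
    from sglang.srt.utils import direct_register_custom_op

    def scaled_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        # actual implementation dispatched when the op runs on CUDA tensors
        return x + 2 * y

    def scaled_add_fake(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        # shape/dtype-only stub used during tracing (e.g. torch.compile)
        return torch.empty_like(x)

    direct_register_custom_op(
        op_name="scaled_add",
        op_func=scaled_add,
        mutates_args=[],
        fake_impl=scaled_add_fake,
    )

    # The op is now reachable through the "sglang" library namespace:
    out = torch.ops.sglang.scaled_add(
        torch.ones(4, device="cuda"), torch.ones(4, device="cuda")
    )

Note that the docstring still says the op is registered "to the vLLM library" by default; in this copy the default `target_lib` is the module-level `sglang_lib`.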
sglang/test/few_shot_gsm8k.py
CHANGED
@@ -48,9 +48,13 @@ def run_eval(args):
     # Select backend
     set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
 
-    # Read data
-    url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
-    filename = download_and_cache_file(url)
+    if args.data_path is None:
+        # Read data
+        url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+        filename = download_and_cache_file(url)
+    else:
+        filename = args.data_path
+
     lines = list(read_jsonl(filename))
 
     # Construct prompts
@@ -131,7 +135,7 @@ def run_eval(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--num-shots", type=int, default=5)
-    parser.add_argument("--data-path", type=str
+    parser.add_argument("--data-path", type=str)
     parser.add_argument("--num-questions", type=int, default=200)
     parser.add_argument("--max-new-tokens", type=int, default=512)
     parser.add_argument("--parallel", type=int, default=128)
sglang/test/runners.py
CHANGED
@@ -1,17 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 import json
 import multiprocessing as mp
sglang/test/test_utils.py
CHANGED
@@ -44,7 +44,7 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
 
 def is_in_ci():
     """Return whether it is in CI runner."""
-    return os.getenv("SGLANG_IS_IN_CI", "false") == "true"
+    return os.getenv("SGLANG_IS_IN_CI", "false").lower() == "true"
 
 
 if is_in_ci():
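This change (mirrored in `crash_on_warnings` in `sglang/srt/utils.py` above) makes the CI environment check case-insensitive. A tiny sketch of the effect, with a hypothetical environment value; the comparison below is the same expression the new code uses:

    import os

    os.environ["SGLANG_IS_IN_CI"] = "True"  # previously only the exact string "true" matched

    # equivalent to the new is_in_ci() / crash_on_warnings() check
    assert os.getenv("SGLANG_IS_IN_CI", "false").lower() == "true"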
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.6"
+__version__ = "0.3.6.post1"
{sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE
CHANGED
@@ -186,7 +186,7 @@
 same "printed page" as the copyright notice for easier
 identification within third-party archives.
 
-Copyright
+Copyright 2023-2024 SGLang Team
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
{sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.6
+Version: 0.3.6.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -190,7 +190,7 @@ License: Apache License
 same "printed page" as the copyright notice for easier
 identification within third-party archives.
 
-Copyright
+Copyright 2023-2024 SGLang Team
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -222,6 +222,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
@@ -234,7 +235,7 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist:
+Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
@@ -245,6 +246,8 @@ Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+Provides-Extra: srt-hpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
|
 Requires-Dist: sglang[openai]; extra == "all-xpu"
 Requires-Dist: sglang[anthropic]; extra == "all-xpu"
 Requires-Dist: sglang[litellm]; extra == "all-xpu"
+Provides-Extra: all-hpu
+Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
+Requires-Dist: sglang[openai]; extra == "all-hpu"
+Requires-Dist: sglang[anthropic]; extra == "all-hpu"
+Requires-Dist: sglang[litellm]; extra == "all-hpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -283,6 +291,9 @@ Requires-Dist: sglang[test]; extra == "dev-hip"
 Provides-Extra: dev-xpu
 Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
 Requires-Dist: sglang[test]; extra == "dev-xpu"
+Provides-Extra: dev-hpu
+Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
+Requires-Dist: sglang[test]; extra == "dev-hpu"
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -321,21 +332,16 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
-Install SGLang
-
-
-
-## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
-## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+- [Install SGLang](https://sgl-project.github.io/start/install.html)
+- [Send requests](https://sgl-project.github.io/start/send_request.html)
+- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -343,6 +349,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
-##
+## Adoption and Sponsorship
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, and xAI.
+
+## Acknowledgment and Citation
+We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
-We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
sglang-0.3.6.post1.dist-info/RECORD
ADDED
@@ -0,0 +1,164 @@
+sglang/__init__.py,sha256=3M0oz0ZA8fULhV5LwQ4hxh-MRdHsOJRD1D63C60pdG4,1616
+sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
+sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
+sglang/bench_offline_throughput.py,sha256=z6uA6Gxa_nFZa0cOXi7MJDuX82xcqk5WfqBMavd8a-s,10929
+sglang/bench_one_batch.py,sha256=WxrQUkMcxz5GV8OEHj0ckHgpC76HgO6YxmDvJFRDeyU,15670
+sglang/bench_one_batch_server.py,sha256=nzeF_bcaXanQuYLBxAvd3OO4fwbKproMcahXdHIVR6w,5920
+sglang/bench_serving.py,sha256=hI7FjaERyqKBrYtKewDU6E4rSufKxqsUPyUgtWtTKSI,52545
+sglang/check_env.py,sha256=nR2m0a9WbQmkimJihUx-Lqi7XjN0jyWTCO2vYyA7R2M,5356
+sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
+sglang/launch_server.py,sha256=_XIqBcXArYtHTqilOFkYWKZBYXGCMHAxbYOST08LGj0,415
+sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
+sglang/utils.py,sha256=eCvD3fZCALr-MuyZxJL7HAeeqqpxAxf4LJrf7OiCbco,11547
+sglang/version.py,sha256=YrfhKDmn6rTAj_qREKEXk2FahHCqSbHd4BNoD7wlIi0,28
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=jprS3-In2FTUoedKwZg-HYvDwU8RTIYntOlf2zoN2sU,14814
+sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
+sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
+sglang/lang/interpreter.py,sha256=SBjejhLhTKzNM0HbjtTg5r17WPJ64WFSk6lcM_SCWKs,30717
+sglang/lang/ir.py,sha256=zpzzAO1YVldhE95Vwz5hU_TQltu-xt8A6rfFr0PuIDA,18410
+sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
+sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
+sglang/lang/backend/base_backend.py,sha256=tdoh9YF3CyekY1BKiX9n7-aA4srDWIuA4RDJLM7q8qg,1985
+sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
+sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+sglang/lang/backend/runtime_endpoint.py,sha256=IWbrAKrUkzNOvwV6V9_y6pkTr2SUYEkKBT-3kirgad0,10514
+sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
+sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21201
+sglang/srt/hf_transformers_utils.py,sha256=sUUCpjbTHuYDMuwOaz00nH5fataXKjliD8gCxXU64sw,6712
+sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
+sglang/srt/model_parallel.py,sha256=QR-Alqo0sElDXPJ79N1PhUHHKiEHPQn3dyXduMP-SHQ,3664
+sglang/srt/server.py,sha256=7PSxAUhiS796yQFeiQxiilRhLQ3FpV0wL53CfDgkCIk,30851
+sglang/srt/server_args.py,sha256=CfmpU6_EDnxJzpJiRx2n6AhOPCtrHPOf-7wEtTF__L0,30834
+sglang/srt/utils.py,sha256=APZEUancLC0jRI1JMbv7e5bIZy3OEySGyZspxGA60yQ,33509
+sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
+sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
+sglang/srt/configs/model_config.py,sha256=dQ58mYKN3M5IwldFZkwIb4CCBa6dREb5Om4Kg2kffOE,9565
+sglang/srt/configs/qwen2vl.py,sha256=AYHuFgJ0bwhWYkD7S6fvP7yJejJnuhy4xp5Q2W-O6ps,4424
+sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
+sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
+sglang/srt/constrained/outlines_backend.py,sha256=IDpyzXJS-ydRXYOHHzx1bO9VjiMRF8E5knn4CLFwPU8,6447
+sglang/srt/constrained/outlines_jump_forward.py,sha256=IGg6mThDepugfez0jnQ6HfLSHtiUl_Mq7bsPFppb3DA,6196
+sglang/srt/constrained/xgrammar_backend.py,sha256=4ZCQgcjWEY2Lg4r2V9sAiYJJblkQ_uVbEnvsjqhR1Pc,4548
+sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwxBQ,5471
+sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
+sglang/srt/layers/fused_moe_patch.py,sha256=dxjcBMY_zAqA0pnmy5KDUZZJSd5Q64Xlxhxyb33cdMk,4240
+sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
+sglang/srt/layers/linear.py,sha256=EOdlpAf6srqxzvPpxcv10KFJKedNc22CGP1qEvpRbDg,46131
+sglang/srt/layers/logits_processor.py,sha256=V8fHxeQK8lzUhGD2Xc7MY1Y9qBhzFyh6hqp31RJVefg,12669
+sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
+sglang/srt/layers/radix_attention.py,sha256=C_mK4mfmKlxMRNeKYP9E5R3PRd3eT-OcE_g3mo36dJM,2058
+sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
+sglang/srt/layers/sampler.py,sha256=zgNwgUx7fozkWsEJFRKDV9SipHBijfpU9pTroNst6Ho,4552
+sglang/srt/layers/torchao_utils.py,sha256=v0hyr4hLsM42QwOPCdKb-ftRTjVokBZbqvRj4O4C-Nw,3415
+sglang/srt/layers/vocab_parallel_embedding.py,sha256=RmaZbgXbFnGKX1eGYxlmiko-6JwaJX6seHupUSCtAm8,21583
+sglang/srt/layers/attention/__init__.py,sha256=EL1o6Q5vLgViN3pOr2A7F6K9FlNEpMdBypFAVMeq_HA,2445
+sglang/srt/layers/attention/double_sparsity_backend.py,sha256=BlX7uXteQpnoOnKsdBKh8h20zMVMEiibB5F_PkZSlNI,10706
+sglang/srt/layers/attention/flashinfer_backend.py,sha256=oblYMbmYzK94H3EA9lMhKWaKdi8HLH5NqAiZmjzj4Es,24875
+sglang/srt/layers/attention/triton_backend.py,sha256=gjxed2cvc2-8QEHkzyTVv6ui7oYOp2b_vgIUQVD1XuM,6538
+sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=BE63WhKiutSNkhJLsRwvfsRy-ExvuAv7FZyoWv73ul8,18744
+sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
+sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=Gfct-0_l-S2ZrP4F-zkzNiFbmd3C3f7uJovacOuDxaA,11472
+sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
+sglang/srt/layers/fused_moe_grok/__init__.py,sha256=rj_JBzcP--eaaM6LGQ-u580uQvqLisp5JtGBAs1fVYc,80
+sglang/srt/layers/fused_moe_grok/fused_moe.py,sha256=bxRcjdALxeY3FDnKivGOoNr6Er1kh6CCPtlAp7pjz50,23844
+sglang/srt/layers/fused_moe_grok/layer.py,sha256=v-o5YHYEU2HIEZwouyuc3UyfNj7YQrEYOO_BXKELU7Y,23453
+sglang/srt/layers/fused_moe_triton/__init__.py,sha256=PHKFqd2hPOO-g9kSMseg2g76lpg9OGXQDThWU6bt9vs,902
+sglang/srt/layers/fused_moe_triton/fused_moe.py,sha256=qwfRBOeY5DT48Q6z71Eh9cjFehvs_K6eLIVWNL044Ug,28363
+sglang/srt/layers/fused_moe_triton/layer.py,sha256=URDkTt8xEqnqpO5tb_3L7JlhlO53VWfqDDNSRYEu-LY,21545
+sglang/srt/layers/quantization/__init__.py,sha256=f9tCC_9sHjp7JCPvyZIvuoTB4KooIucGA9S2w7ADevw,4849
+sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
+sglang/srt/lora/lora.py,sha256=KhhO9aKCyFWvJnhI07lZKANIvNjtt882HrTYFNBZMv0,15065
+sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
+sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
+sglang/srt/managers/data_parallel_controller.py,sha256=JxRtJJTVn1FU2iD292rLZPftAsR4_8j4d3yF8j0dvBc,8327
+sglang/srt/managers/detokenizer_manager.py,sha256=nWBn54pz3aQ8tzVvViwwL2k0V4WATi0qw11H0Bzua-Q,7389
+sglang/srt/managers/image_processor.py,sha256=Pk_dtXzljTkFt7Acsv1RyDzEqvCvjc7BMngxGhtkpDU,13817
+sglang/srt/managers/io_struct.py,sha256=WLXz-tyn0jR7zNO9feRBXgyjphVa8qR55OoEOUdzoVI,13751
+sglang/srt/managers/schedule_batch.py,sha256=-5oYdkStPiYjPWl0tCkUVRjTGB7fjA0wIngK-09da7w,43111
+sglang/srt/managers/schedule_policy.py,sha256=ayFz4iPLIlG8mx5i1glTCAMHJPGpFedMP9UgRtqkNhA,12526
+sglang/srt/managers/scheduler.py,sha256=8owHPXG6fxZtsCWSJ6K7EOlFDcPxYinZC1DwKMJcEVM,55930
+sglang/srt/managers/session_controller.py,sha256=jXoPHxMGh8T1iYWIEjSXoPVwaL6NEjv3QtqlsrvPE1c,2355
+sglang/srt/managers/tokenizer_manager.py,sha256=zYbKEKNuM1B3PXzA7jnDpxew-0rZXSX-7dHmVLWG3e4,26477
+sglang/srt/managers/tp_worker.py,sha256=1SQJ60iKS9e5vGY555fT1iZ4OtLumXzeWfB08fSWKbk,6176
+sglang/srt/managers/tp_worker_overlap_thread.py,sha256=7vhPebaOS4JamaS08CGf_hwxnUO7Gy_SXZXEPwNHKoY,7621
+sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
+sglang/srt/mem_cache/chunk_cache.py,sha256=VcCpyrf5FOQ5xoKeOouCI5ZQLkZo_pgY1SPbDDkagGg,2492
+sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
+sglang/srt/mem_cache/memory_pool.py,sha256=41fjuj_sD0yfJq-sy-X99cc2djBa6w4dy2y47V0WqNU,10934
+sglang/srt/mem_cache/radix_cache.py,sha256=DzLCO_gYQ7X_C2NJSEHzzMZhb5HzWjKF9wXJQsnzr8M,10427
+sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTluMU,6861
+sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=4hbCtE3gt5kvMNHrnxkE8YPRFcgmVo0Bwz3lgbYZw_E,14805
+sglang/srt/model_executor/forward_batch_info.py,sha256=n5yk927COTU0klDAkQuwrFzamMygfkHxmDp1I6bJYD8,12612
+sglang/srt/model_executor/model_runner.py,sha256=AafFWd_EDWbOe0o5etAyutGum5O8_9tO55KRcaAWDW4,29680
+sglang/srt/models/baichuan.py,sha256=RyvPQvi7wy9VUGvLwG17XttcTp43yRj6c3zNRImBToA,15005
+sglang/srt/models/chatglm.py,sha256=OikygdK8Mi6F2QPPhAr2E_P4l2V0yWQjDJOdnBAApPE,13216
+sglang/srt/models/commandr.py,sha256=XkzpfsdDPDx-W5oOac8nFIe39JJZvmv65K5GIpgJTz0,14212
+sglang/srt/models/dbrx.py,sha256=ucn3UJ1s4nx2qa5hUb8VhJmfVrDZ59e9oNetMU5EWq8,14624
+sglang/srt/models/deepseek.py,sha256=B5OuW--kDIPfZesOhvGGUhHQNWh0pMPNCYmdsv9lv5U,15922
+sglang/srt/models/deepseek_v2.py,sha256=shdHVtZGmLEZMZwGlIPz8NPoSb1c_n6hQxWKG45WahE,32265
+sglang/srt/models/exaone.py,sha256=6LJ1Mr9MbHOXdH_nK9Dba3SR28LMCJvdH1k53w9M9Vg,13081
+sglang/srt/models/gemma.py,sha256=079CfoQqBnrLIbW0LWcLp-nmb1aPVN1Tw6PxMQQ3Lsk,12289
+sglang/srt/models/gemma2.py,sha256=lbfQhQpUhf1MAEB_00Uo6rp20k4Hr353UbPKKuMsxec,15020
+sglang/srt/models/gemma2_reward.py,sha256=cQawatbsfBuWQTueivYHl_17ZoQUHEelI1sr1y5pvfY,2556
+sglang/srt/models/gpt2.py,sha256=Th7_Dnkw82GFBOuMOTrHtA44JBPHRUtY3Qd73rQwzMc,9741
+sglang/srt/models/gpt_bigcode.py,sha256=lYo4ajy49VvvPkaduaFtOaCRT_ItqyNUE158S-BI5QA,10136
+sglang/srt/models/grok.py,sha256=rDIH_SFzauuEHcL_vCOSrYLjdBC3i3o_AcceL3amsJw,14927
+sglang/srt/models/internlm2.py,sha256=DxbA15d9QR0tLOczpC6DkB8QyNHXJRdZatY6Nskwv1k,12170
+sglang/srt/models/internlm2_reward.py,sha256=Lr-JA0vfTQJt9q5oDMiopGuoXAevyEv5PAoDe2rsTJk,2425
+sglang/srt/models/llama.py,sha256=FSGuM3BamhuT5h2jedh5cSFwFYduOJwkAZJJ672awRw,16423
+sglang/srt/models/llama_classification.py,sha256=c8WZ1ADa3f6s2IJVoP10ouVgeCwv_ndns_qMgLrC6QI,3413
+sglang/srt/models/llama_embedding.py,sha256=2ex2jrz31osaAd9V8sJeN0qyxmk-L5NgOBkXL1puGhI,3166
+sglang/srt/models/llama_reward.py,sha256=prhHDPpf1k6tlQtGE6zq5gx0uSZAD3W5v7W28bdgy4U,4619
+sglang/srt/models/llava.py,sha256=72DnZXIwu78zYqU8YIElq_AaSIFO_icYOPTHXE0_-YQ,24941
+sglang/srt/models/llavavid.py,sha256=DeWqGSmXgIYGuLyy2ZrxjM9WqbRjueP4chNmXt7Bnus,12221
+sglang/srt/models/minicpm.py,sha256=KbiTf-kaDAJxSo9Z4IGMTrs9WrYYji1KXO1kA2iy-as,13816
+sglang/srt/models/minicpm3.py,sha256=C43mTr2Qjccj4sXuTDgzbfZhvCNbsEHNggMRXQ7SrWs,25108
+sglang/srt/models/mistral.py,sha256=EYifJUUzN2Z2-iL37eJiNZF_DB0H4pa0mKlgYRIxM70,838
+sglang/srt/models/mixtral.py,sha256=E3d8I7V3Dp1nCEHRbhh-PKBG8UaVK5XOHwl9QyIjcX0,14043
+sglang/srt/models/mixtral_quant.py,sha256=o-oTG8BGtWuNu-o6muHSarMNBQwrjQowyBFOQhuclZ8,14065
+sglang/srt/models/mllama.py,sha256=pET1x8wY04yoS8HMCncKx0tFPqGp78K8rlA7Eq7XioE,37889
+sglang/srt/models/olmo.py,sha256=DEUPNDM0z83N-Qdhkj2WJMtbiz5JNbSBMIjUaYZN9RM,12068
+sglang/srt/models/olmoe.py,sha256=jVKrjqQQrWLdlkGSGUaMPdT9PHzNH4X-RVwON29eaGw,15412
+sglang/srt/models/phi3_small.py,sha256=fxqGU0xphJzTeuBW38SRRYpRb2rcsg53JxuObK0pZig,15141
+sglang/srt/models/qwen.py,sha256=P9zcFnz_Tsz73tVtLRwZ8uWzCtMxWOrzlv2o9Ys_Gck,9947
+sglang/srt/models/qwen2.py,sha256=ApFFASNwvrkDXi-KkCNA7fTk4uLMuJWoMg15zCaAKdA,12514
+sglang/srt/models/qwen2_moe.py,sha256=1oxDsKDq3jlHKx9jMi1SfHOqCRVyN5n76uw3M-CUODE,17048
+sglang/srt/models/qwen2_vl.py,sha256=G3FNa_N2-CzB56LVrukwBtJazxMrDC_GPNjK6Wqxc4s,26415
+sglang/srt/models/stablelm.py,sha256=jpmsyWMJo_9JapOESnuV7ObNCh78BRznXY0iFvvIbZE,11354
+sglang/srt/models/torch_native_llama.py,sha256=vNQxsnbVAY1bdyMCCWDZAtWdbaFIiJXhmVxHjk5BB9Y,19400
+sglang/srt/models/xverse.py,sha256=LGe0ma0wOir3x-OLBT_cRocw8JEo9d3AYNxgA2OcLrk,13659
+sglang/srt/models/xverse_moe.py,sha256=YqbzkSsnTFt-8-aI8YobF9qJA70qrBjbS1Kjn1KNqVY,15766
+sglang/srt/models/yivl.py,sha256=yj4aWsOBVGQBLurSrLmYXVC7zGIPH7EYHHtAaAZ7Liw,4859
+sglang/srt/openai_api/adapter.py,sha256=MhOcWZjcLv4_OuvLvDMcAu6K_u2joJvhaZxaKm0hi3M,53634
+sglang/srt/openai_api/protocol.py,sha256=vBgrbTqtECsZ5dG0rgP1FHsTBt4eR9zbDX3FBIN-rz4,10172
+sglang/srt/sampling/sampling_batch_info.py,sha256=YC-KPyDWyLGNPL4YVcst4xwP8Wlz2zcCNJHB_5zljXQ,8470
+sglang/srt/sampling/sampling_params.py,sha256=n7RbBg_bS5fYhsiWa8uJYnfoXy_i5DvtTBOkuFnHDNU,5286
+sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
+sglang/srt/sampling/penaltylib/orchestrator.py,sha256=J-DEemZcKm1--o37kf3qDOE8SZ_6H3d5oex49Mgq2ZU,10762
+sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD60mwD1tCcSG0x5IYo0v4z9ce-q_YwbJ9f8,2490
+sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=_Nxv0XgUPirZjw2SEJYp_Cd9ZcLwmt7h6JE6J4hhFq4,3629
+sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=5tOgCg7OvE9kSN9VMCpH1hwqo1YMxt9iS5PVpct9HpU,2468
+sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=m22Rfn1RuB1HpImBDECsiJ2VooBYpsFADAwnk1EPzk0,2751
+sglang/test/few_shot_gsm8k.py,sha256=7yDbEQe49gZeJhz2wFFX-gf_59ThDKsCS1xwfogNc7k,4034
+sglang/test/few_shot_gsm8k_engine.py,sha256=QQbrwOX6-cJDD3RZC_e7zPnt6aSo8JdF8X_lRHSjdDM,3886
+sglang/test/run_eval.py,sha256=9yO0hXZOcn4abEOs96T-XPguDEklK16Ltco0pGF3zCg,4020
+sglang/test/runners.py,sha256=ANzjrHkT_1E0G3UcD47O8XEKst3Si4AOfx-uErbFS7o,15129
+sglang/test/simple_eval_common.py,sha256=joqrGysuLnJFtzDRIgFkMsRyKUSyjVPFWp0_PHAL3Ik,12378
+sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
+sglang/test/simple_eval_humaneval.py,sha256=zmV3xWYc2OrpiT9Dy55RTKZL5DEROD1cJ0NA_-cU5zI,5685
+sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
+sglang/test/simple_eval_mgsm.py,sha256=rd7TSUyxdKbrXaVoewo24V8lCo_6kO8zxPhhmvylpw8,10259
+sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
+sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
+sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
+sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
+sglang/test/test_utils.py,sha256=ULF7C3pLXkMevXgE_Dodt29OBfvvXKUnRvwKhaBg1ys,23470
+sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
+sglang-0.3.6.post1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
+sglang-0.3.6.post1.dist-info/METADATA,sha256=XwhCEL8SbEVcT7LQLk26g6tzduS6mByBE7dDqZYpQxo,22073
+sglang-0.3.6.post1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+sglang-0.3.6.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.6.post1.dist-info/RECORD,,
sglang/srt/layers/fused_moe/__init__.py
DELETED
@@ -1 +0,0 @@
-from sglang.srt.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase