sglang 0.3.1__py3-none-any.whl → 0.3.1.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +10 -3
- sglang/bench_server_latency.py +187 -0
- sglang/bench_serving.py +1 -1
- sglang/global_config.py +5 -13
- sglang/lang/interpreter.py +0 -3
- sglang/srt/constrained/fsm_cache.py +5 -1
- sglang/srt/layers/activation.py +16 -1
- sglang/srt/layers/attention_backend.py +12 -12
- sglang/srt/layers/fused_moe/layer.py +27 -7
- sglang/srt/layers/layernorm.py +21 -6
- sglang/srt/layers/sampler.py +40 -98
- sglang/srt/lora/lora_manager.py +11 -8
- sglang/srt/managers/io_struct.py +3 -0
- sglang/srt/managers/policy_scheduler.py +49 -93
- sglang/srt/managers/schedule_batch.py +2 -1
- sglang/srt/managers/tp_worker.py +19 -13
- sglang/srt/model_executor/cuda_graph_runner.py +25 -13
- sglang/srt/model_executor/model_runner.py +37 -46
- sglang/srt/models/deepseek_v2.py +8 -3
- sglang/srt/models/llama.py +1 -3
- sglang/srt/models/llama_classification.py +2 -3
- sglang/srt/models/minicpm3.py +7 -3
- sglang/srt/models/olmoe.py +415 -0
- sglang/srt/models/xverse.py +1 -3
- sglang/srt/models/xverse_moe.py +1 -4
- sglang/srt/sampling/sampling_batch_info.py +3 -50
- sglang/srt/server.py +6 -1
- sglang/srt/server_args.py +39 -10
- sglang/srt/utils.py +7 -51
- sglang/test/few_shot_gsm8k.py +8 -2
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/METADATA +4 -5
- {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/RECORD +37 -35
- {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/WHEEL +1 -1
- {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -21,6 +21,8 @@ import logging
|
|
21
21
|
import random
|
22
22
|
from typing import List, Optional, Union
|
23
23
|
|
24
|
+
from sglang.srt.utils import is_hip
|
25
|
+
|
24
26
|
logger = logging.getLogger(__name__)
|
25
27
|
|
26
28
|
|
@@ -59,6 +61,7 @@ class ServerArgs:
|
|
59
61
|
tp_size: int = 1
|
60
62
|
stream_interval: int = 1
|
61
63
|
random_seed: Optional[int] = None
|
64
|
+
constrained_json_whitespace_pattern: Optional[str] = None
|
62
65
|
|
63
66
|
# Logging
|
64
67
|
log_level: str = "info"
|
@@ -94,11 +97,12 @@ class ServerArgs:
|
|
94
97
|
disable_cuda_graph_padding: bool = False
|
95
98
|
disable_disk_cache: bool = False
|
96
99
|
disable_custom_all_reduce: bool = False
|
100
|
+
disable_mla: bool = False
|
97
101
|
enable_mixed_chunk: bool = False
|
98
102
|
enable_torch_compile: bool = False
|
103
|
+
max_torch_compile_bs: int = 32
|
99
104
|
torchao_config: str = ""
|
100
105
|
enable_p2p_check: bool = False
|
101
|
-
enable_mla: bool = False
|
102
106
|
triton_attention_reduce_in_fp32: bool = False
|
103
107
|
|
104
108
|
# LoRA
|
@@ -152,11 +156,12 @@ class ServerArgs:
|
|
152
156
|
)
|
153
157
|
self.sampling_backend = "pytorch"
|
154
158
|
|
155
|
-
#
|
156
|
-
if
|
157
|
-
logger.info("MLA optimization is tunred on. Use triton backend.")
|
159
|
+
# ROCm: flashinfer is not available yet, so use the triton attention backend and the pytorch sampling backend
|
160
|
+
if is_hip():
|
158
161
|
self.attention_backend = "triton"
|
162
|
+
self.sampling_backend = "pytorch"
|
159
163
|
|
164
|
+
# Default kernel backends
|
160
165
|
if self.attention_backend is None:
|
161
166
|
self.attention_backend = "flashinfer"
|
162
167
|
|
@@ -359,6 +364,12 @@ class ServerArgs:
|
|
359
364
|
default=ServerArgs.random_seed,
|
360
365
|
help="The random seed.",
|
361
366
|
)
|
367
|
+
parser.add_argument(
|
368
|
+
"--constrained-json-whitespace-pattern",
|
369
|
+
type=str,
|
370
|
+
default=ServerArgs.constrained_json_whitespace_pattern,
|
371
|
+
help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
|
372
|
+
)
|
362
373
|
parser.add_argument(
|
363
374
|
"--log-level",
|
364
375
|
type=str,
|
@@ -488,6 +499,11 @@ class ServerArgs:
|
|
488
499
|
default=False,
|
489
500
|
help="Disable the custom all-reduce kernel and fall back to NCCL.",
|
490
501
|
)
|
502
|
+
parser.add_argument(
|
503
|
+
"--disable-mla",
|
504
|
+
action="store_true",
|
505
|
+
help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
|
506
|
+
)
|
491
507
|
parser.add_argument(
|
492
508
|
"--enable-mixed-chunk",
|
493
509
|
action="store_true",
|
@@ -498,6 +514,12 @@ class ServerArgs:
|
|
498
514
|
action="store_true",
|
499
515
|
help="Optimize the model with torch.compile. Experimental feature.",
|
500
516
|
)
|
517
|
+
parser.add_argument(
|
518
|
+
"--max-torch-compile-bs",
|
519
|
+
type=int,
|
520
|
+
default=ServerArgs.max_torch_compile_bs,
|
521
|
+
help="Set the maximum batch size when using torch compile.",
|
522
|
+
)
|
501
523
|
parser.add_argument(
|
502
524
|
"--torchao-config",
|
503
525
|
type=str,
|
@@ -509,11 +531,6 @@ class ServerArgs:
|
|
509
531
|
action="store_true",
|
510
532
|
help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
|
511
533
|
)
|
512
|
-
parser.add_argument(
|
513
|
-
"--enable-mla",
|
514
|
-
action="store_true",
|
515
|
-
help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
|
516
|
-
)
|
517
534
|
parser.add_argument(
|
518
535
|
"--triton-attention-reduce-in-fp32",
|
519
536
|
action="store_true",
|
@@ -532,7 +549,8 @@ class ServerArgs:
|
|
532
549
|
type=str,
|
533
550
|
nargs="*",
|
534
551
|
default=None,
|
535
|
-
|
552
|
+
action=LoRAPathAction,
|
553
|
+
help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}",
|
536
554
|
)
|
537
555
|
parser.add_argument(
|
538
556
|
"--max-loras-per-batch",
|
@@ -590,3 +608,14 @@ class PortArgs:
|
|
590
608
|
controller_port: int
|
591
609
|
detokenizer_port: int
|
592
610
|
nccl_ports: List[int]
|
611
|
+
|
612
|
+
|
613
|
+
class LoRAPathAction(argparse.Action):
|
614
|
+
def __call__(self, parser, namespace, values, option_string=None):
|
615
|
+
setattr(namespace, self.dest, {})
|
616
|
+
for lora_path in values:
|
617
|
+
if "=" in lora_path:
|
618
|
+
name, path = lora_path.split("=", 1)
|
619
|
+
getattr(namespace, self.dest)[name] = path
|
620
|
+
else:
|
621
|
+
getattr(namespace, self.dest)[lora_path] = lora_path
|
sglang/srt/utils.py
CHANGED
@@ -51,6 +51,11 @@ show_time_cost = False
|
|
51
51
|
time_infos = {}
|
52
52
|
|
53
53
|
|
54
|
+
# Whether the installed torch is a HIP (ROCm) build, i.e. targets AMD GPUs
|
55
|
+
def is_hip() -> bool:
|
56
|
+
return torch.version.hip is not None
|
57
|
+
|
58
|
+
|
54
59
|
def enable_show_time_cost():
|
55
60
|
global show_time_cost
|
56
61
|
show_time_cost = True
|
@@ -187,7 +192,7 @@ def allocate_init_ports(
|
|
187
192
|
cur_port += 1
|
188
193
|
|
189
194
|
if port is not None and ret_ports[0] != port:
|
190
|
-
logger.
|
195
|
+
logger.warning(
|
191
196
|
f"WARNING: Port {port} is not available. Use port {ret_ports[0]} instead."
|
192
197
|
)
|
193
198
|
|
@@ -623,56 +628,7 @@ def set_ulimit(target_soft_limit=65535):
|
|
623
628
|
try:
|
624
629
|
resource.setrlimit(resource_type, (target_soft_limit, current_hard))
|
625
630
|
except ValueError as e:
|
626
|
-
logger.
|
627
|
-
|
628
|
-
|
629
|
-
def is_llama3_405b_fp8_head_16(model_config):
|
630
|
-
"""Return whether the model is meta-llama/Meta-Llama-3.1-405B-FP8 with 16 kv heads."""
|
631
|
-
if (
|
632
|
-
model_config.hf_config.architectures[0] == "LlamaForCausalLM"
|
633
|
-
and model_config.hf_config.hidden_size == 16384
|
634
|
-
and model_config.hf_config.intermediate_size == 53248
|
635
|
-
and model_config.hf_config.num_hidden_layers == 126
|
636
|
-
and model_config.hf_config.num_key_value_heads == 16
|
637
|
-
and hasattr(model_config.hf_config, "quantization_config")
|
638
|
-
and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
|
639
|
-
):
|
640
|
-
return True
|
641
|
-
return False
|
642
|
-
|
643
|
-
|
644
|
-
def monkey_patch_vllm_qvk_linear_loader():
|
645
|
-
"""A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints."""
|
646
|
-
from vllm.model_executor.layers.linear import QKVParallelLinear
|
647
|
-
|
648
|
-
origin_weight_loader = QKVParallelLinear.weight_loader
|
649
|
-
|
650
|
-
def get_original_weight(loaded_weight, head_dim):
|
651
|
-
n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
|
652
|
-
dim = loaded_weight.shape[1]
|
653
|
-
for i in range(n_kv_head):
|
654
|
-
loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
|
655
|
-
2 * i * head_dim : (2 * i + 1) * head_dim, :
|
656
|
-
]
|
657
|
-
original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
|
658
|
-
assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
|
659
|
-
return original_kv_weight
|
660
|
-
|
661
|
-
def weight_loader_srt(
|
662
|
-
self,
|
663
|
-
param: Parameter,
|
664
|
-
loaded_weight: torch.Tensor,
|
665
|
-
loaded_shard_id: Optional[str] = None,
|
666
|
-
):
|
667
|
-
if (
|
668
|
-
loaded_shard_id in ["k", "v"]
|
669
|
-
and loaded_weight.shape[0] == self.head_size * self.total_num_kv_heads * 2
|
670
|
-
):
|
671
|
-
loaded_weight = get_original_weight(loaded_weight, self.head_size)
|
672
|
-
|
673
|
-
origin_weight_loader(self, param, loaded_weight, loaded_shard_id)
|
674
|
-
|
675
|
-
setattr(QKVParallelLinear, "weight_loader", weight_loader_srt)
|
631
|
+
logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")
|
676
632
|
|
677
633
|
|
678
634
|
def add_api_key_middleware(app, api_key: str):
|
sglang/test/few_shot_gsm8k.py
CHANGED
@@ -44,7 +44,7 @@ def get_answer_value(answer_str):
|
|
44
44
|
return INVALID
|
45
45
|
|
46
46
|
|
47
|
-
def
|
47
|
+
def run_eval(args):
|
48
48
|
# Select backend
|
49
49
|
set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
|
50
50
|
|
@@ -119,6 +119,12 @@ def main(args):
|
|
119
119
|
# Dump results
|
120
120
|
dump_state_text("tmp_output_gsm8k.txt", states)
|
121
121
|
|
122
|
+
return {
|
123
|
+
"accuracy": acc,
|
124
|
+
"latency": latency,
|
125
|
+
"output_throughput": output_throughput,
|
126
|
+
}
|
127
|
+
|
122
128
|
|
123
129
|
if __name__ == "__main__":
|
124
130
|
parser = argparse.ArgumentParser()
|
@@ -129,4 +135,4 @@ if __name__ == "__main__":
|
|
129
135
|
parser.add_argument("--host", type=str, default="http://127.0.0.1")
|
130
136
|
parser.add_argument("--port", type=int, default=30000)
|
131
137
|
args = parser.parse_args()
|
132
|
-
|
138
|
+
run_eval(args)
|
sglang/test/test_utils.py
CHANGED
@@ -22,6 +22,7 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
|
22
22
|
from sglang.srt.utils import kill_child_process
|
23
23
|
from sglang.utils import get_exception_traceback
|
24
24
|
|
25
|
+
DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
|
25
26
|
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
26
27
|
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
27
28
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
|
@@ -304,7 +305,6 @@ def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
|
|
304
305
|
def select_sglang_backend(args: argparse.Namespace):
|
305
306
|
if args.backend.startswith("srt"):
|
306
307
|
if args.backend == "srt-no-parallel":
|
307
|
-
global_config.enable_parallel_decoding = False
|
308
308
|
global_config.enable_parallel_encoding = False
|
309
309
|
backend = RuntimeEndpoint(f"{args.host}:{args.port}")
|
310
310
|
elif args.backend.startswith("gpt-"):
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.3.1"
|
1
|
+
__version__ = "0.3.1.post2"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.3.1
|
3
|
+
Version: 0.3.1.post2
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -269,7 +269,7 @@ Requires-Dist: peft; extra == "test"
|
|
269
269
|
|
270
270
|
--------------------------------------------------------------------------------
|
271
271
|
|
272
|
-
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
|
272
|
+
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
|
273
273
|
|
274
274
|
SGLang is a fast serving framework for large language models and vision language models.
|
275
275
|
It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
|
@@ -278,7 +278,7 @@ The core features include:
|
|
278
278
|
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
|
279
279
|
- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
|
280
280
|
- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
|
281
|
-
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption
|
281
|
+
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|
282
282
|
|
283
283
|
## News
|
284
284
|
- [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
|
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
318
318
|
### Method 2: From source
|
319
319
|
```
|
320
320
|
# Use the last release branch
|
321
|
-
git clone -b v0.3.1 https://github.com/sgl-project/sglang.git
|
321
|
+
git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
|
322
322
|
cd sglang
|
323
323
|
|
324
324
|
pip install --upgrade pip
|
@@ -483,7 +483,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
483
483
|
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
|
484
484
|
- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
|
485
485
|
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
|
486
|
-
- To enable DeepSeek MLA acceleration, add `--enable-mla`.
|
487
486
|
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
|
488
487
|
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
|
489
488
|
```
|
@@ -1,18 +1,19 @@
|
|
1
1
|
sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
|
2
2
|
sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
|
3
|
-
sglang/bench_latency.py,sha256=
|
4
|
-
sglang/
|
3
|
+
sglang/bench_latency.py,sha256=bA50iUYOxEnLjzY2S4AgwxtSAqujUbGfQFwbLZj5XNc,17160
|
4
|
+
sglang/bench_server_latency.py,sha256=KvFJgKQTSons7KOG0CBqnnOOx1gW29bBM1Z3GQO_6-E,5599
|
5
|
+
sglang/bench_serving.py,sha256=3gIJ1O2x51Fwd4wYJjgwluTbWKXL-azckQte7YC5zIc,36261
|
5
6
|
sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
|
6
|
-
sglang/global_config.py,sha256=
|
7
|
+
sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
|
7
8
|
sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
|
8
9
|
sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
|
9
10
|
sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
|
10
|
-
sglang/version.py,sha256=
|
11
|
+
sglang/version.py,sha256=U9F0UlFDynnYN5dX-kxehylWCwXo9a6E6W4FfDusfRg,28
|
11
12
|
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
13
|
sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
|
13
14
|
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
14
15
|
sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
|
15
|
-
sglang/lang/interpreter.py,sha256=
|
16
|
+
sglang/lang/interpreter.py,sha256=rOquFbMzxry7IItZlAn5TwtQfxMy718JPxOkiXO-yrg,30234
|
16
17
|
sglang/lang/ir.py,sha256=W3UfZikcGeT86PDDjDjw-yNzrKY2e2UYO4DTatMCfm0,17704
|
17
18
|
sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
|
18
19
|
sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -25,82 +26,83 @@ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bE
|
|
25
26
|
sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19663
|
26
27
|
sglang/srt/hf_transformers_utils.py,sha256=6HlqcmGPIvnSGaEEICeuzwag1QylSoSGbXRVvUdIMDo,6016
|
27
28
|
sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
|
28
|
-
sglang/srt/server.py,sha256=
|
29
|
-
sglang/srt/server_args.py,sha256=
|
30
|
-
sglang/srt/utils.py,sha256=
|
29
|
+
sglang/srt/server.py,sha256=n4QRn36_t-HAH-lSME3tiZSCUGRQwqMUckgs0paHq5g,20179
|
30
|
+
sglang/srt/server_args.py,sha256=3XjDt6SSjTfbOe0HSXA--2aUvrpWSnQmAHYwmeS1-M0,23159
|
31
|
+
sglang/srt/utils.py,sha256=8yxiMRttCcfswynkNPWD3yZFNAGFz2P1PzSuxHCBGns,22340
|
31
32
|
sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
|
32
33
|
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
33
34
|
sglang/srt/configs/model_config.py,sha256=OqHrucJQHbH-wxgkGj-Dcx_B888uUGASpLRjz40HaLY,6651
|
34
35
|
sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5UOS_4,2070
|
35
36
|
sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
|
36
|
-
sglang/srt/constrained/fsm_cache.py,sha256=
|
37
|
+
sglang/srt/constrained/fsm_cache.py,sha256=k7DRUAaiLTEX5UarfJ17gEYQ-QWQAGfvykjYFkM_Y2U,2982
|
37
38
|
sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
|
38
|
-
sglang/srt/layers/activation.py,sha256=
|
39
|
-
sglang/srt/layers/attention_backend.py,sha256=
|
39
|
+
sglang/srt/layers/activation.py,sha256=i3omgj3GdUIZBqJNUjpdJsMc2UM3Lx07FT2J1WICrqA,5171
|
40
|
+
sglang/srt/layers/attention_backend.py,sha256=lqMsY4VaOO_szIWoTAinXf1DnP2UsbF32kzvwFySz9w,18119
|
40
41
|
sglang/srt/layers/flashinfer_utils.py,sha256=jyaO7XiEisFZg_dfaCbfRCHSHSKYoM1wOzfHa0h1q14,7413
|
41
|
-
sglang/srt/layers/layernorm.py,sha256=
|
42
|
+
sglang/srt/layers/layernorm.py,sha256=p_7bnmSpJ_slpoP0Gk5wQPpHtLllUu3imSIRBqGqTP0,3737
|
42
43
|
sglang/srt/layers/logits_processor.py,sha256=Js2qSk1Z3uPL2cYO1ARai51f2i8OedV3qdwByQVSJtI,12439
|
43
44
|
sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
|
44
45
|
sglang/srt/layers/radix_attention.py,sha256=EcVO0fUSmgvE_9R-MlpgJq0O_uT8ACuHzbMi19bANYc,1874
|
45
|
-
sglang/srt/layers/sampler.py,sha256=
|
46
|
+
sglang/srt/layers/sampler.py,sha256=Y0o1bndTGRD713fHMbN5-LRUiyneBkb7bH_QlkkeqSs,3836
|
46
47
|
sglang/srt/layers/torchao_utils.py,sha256=rTECwKSXhj_ylh_iSzfbopz9_lZOFHatquQrNJNLZlE,2703
|
47
48
|
sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
|
48
49
|
sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
|
49
|
-
sglang/srt/layers/fused_moe/layer.py,sha256=
|
50
|
+
sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
|
50
51
|
sglang/srt/layers/triton_attention/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
|
51
52
|
sglang/srt/layers/triton_attention/extend_attention.py,sha256=XTUTMrE-5jfMEufQUifZ-8NJQABSPcF47qhnNT5Z1iI,11050
|
52
53
|
sglang/srt/layers/triton_attention/prefill_attention.py,sha256=QkXPcT02c13zha2M4mBm2S5dh_sS-Gc4FkkrcywRqvc,5377
|
53
54
|
sglang/srt/lora/lora.py,sha256=ksj866lgDul6zxO30Jm7Nrjv-mFAMrzdvP8sez3Pl6U,14938
|
54
55
|
sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
|
55
|
-
sglang/srt/lora/lora_manager.py,sha256=
|
56
|
+
sglang/srt/lora/lora_manager.py,sha256=7J7cGmyy1Ph4HCvLdM-ViAizAbV1snZqD-S7JLWXasI,9561
|
56
57
|
sglang/srt/managers/controller_multi.py,sha256=KolZDso2WqH1ZhQw9p1eTmlFRgo4bcvzBxE44_sNE_o,6300
|
57
58
|
sglang/srt/managers/controller_single.py,sha256=DiZALP_iIPZQMRx09a-LwT5_Dg7p-WU8HXyMoxJ9sRA,4955
|
58
59
|
sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
|
59
|
-
sglang/srt/managers/io_struct.py,sha256=
|
60
|
-
sglang/srt/managers/policy_scheduler.py,sha256=
|
61
|
-
sglang/srt/managers/schedule_batch.py,sha256=
|
60
|
+
sglang/srt/managers/io_struct.py,sha256=yNV5BmeUzLPqv19j79kXQ50Iaqdk4vP-_TciiRf4OEE,11396
|
61
|
+
sglang/srt/managers/policy_scheduler.py,sha256=PVo0DV0-5ODNN7FkPkeF1Y8BQ6uuLldPETOlB_YvvL4,11560
|
62
|
+
sglang/srt/managers/schedule_batch.py,sha256=ns2qkaYAvzul-LCV1BEB6q1t5jKyftNsReMv62PC8M0,27386
|
62
63
|
sglang/srt/managers/tokenizer_manager.py,sha256=ql-sObjl1oRigJwnLtqqTaaw-i7gPTDMoNXDEMftr40,29643
|
63
|
-
sglang/srt/managers/tp_worker.py,sha256=
|
64
|
+
sglang/srt/managers/tp_worker.py,sha256=0Y0k-roDrBxWZxD0axv5CCvUUW8vsJ8n78TANHLzEFs,39503
|
64
65
|
sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
|
65
66
|
sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
|
66
67
|
sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
|
67
68
|
sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
|
68
69
|
sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
|
69
|
-
sglang/srt/model_executor/cuda_graph_runner.py,sha256=
|
70
|
+
sglang/srt/model_executor/cuda_graph_runner.py,sha256=gZ0Wukqz6u67MMIj4MC8JET9jcHdh0rotYzpuPlHruY,10512
|
70
71
|
sglang/srt/model_executor/forward_batch_info.py,sha256=yvkhayY9Zu6gysoojcGT73lADGOtfHKkFKWdJLRyACI,6141
|
71
|
-
sglang/srt/model_executor/model_runner.py,sha256=
|
72
|
+
sglang/srt/model_executor/model_runner.py,sha256=X7AG1k9AI_kqS8q1i5Bfv-kFysIdqJAVWMGGZoAPThY,22726
|
72
73
|
sglang/srt/models/baichuan.py,sha256=NrG1rMJXhemkrUCEf8xKOSDQVsOD-nN8RQz6MWHOg84,15124
|
73
74
|
sglang/srt/models/chatglm.py,sha256=KwxLHBEvK02McXDvBS0gnRxfIvOAu2QP7lgibrj9Nbc,13371
|
74
75
|
sglang/srt/models/commandr.py,sha256=2rAXRZRb4PkJZ4NWEqP_rIgsjxbdZyHpuoMOarqTWzQ,14163
|
75
76
|
sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
|
76
77
|
sglang/srt/models/deepseek.py,sha256=7UJgde1EV9ey6d-CKRcEyTKh1_WhZdatpZiltIuqpik,16006
|
77
|
-
sglang/srt/models/deepseek_v2.py,sha256=
|
78
|
+
sglang/srt/models/deepseek_v2.py,sha256=1J0pt1jZRcBBGYbgt1wGiuxPcrdpfTEUEaGFqju6TVA,28431
|
78
79
|
sglang/srt/models/exaone.py,sha256=3I5ZoiLotf7U-8c9QJRubpgf6JDx9I_z-ViXQlCC-x8,13087
|
79
80
|
sglang/srt/models/gemma.py,sha256=GkwgGFHgGlXgBZN7s7Wooz5tMyCp1YtgLahU2NOo66M,12273
|
80
81
|
sglang/srt/models/gemma2.py,sha256=sFfCNEm0_OOWElRSTDuroRv8wNMX8v_81Uko9m546KA,14923
|
81
82
|
sglang/srt/models/gpt_bigcode.py,sha256=kzHYogeGXZF4KHpkXA-RGqvs016mA-6klWxD2QJTi9E,10195
|
82
83
|
sglang/srt/models/grok.py,sha256=6I4OwQwNyAbh5GF24_SRm12XYBvM9iGWB-T4TSTJ0wU,14929
|
83
84
|
sglang/srt/models/internlm2.py,sha256=6j7JH0p3yib8GZDH8Cmrs-pgwfH3eOlAK6V3Cq64O7w,12202
|
84
|
-
sglang/srt/models/llama.py,sha256=
|
85
|
-
sglang/srt/models/llama_classification.py,sha256=
|
85
|
+
sglang/srt/models/llama.py,sha256=nbJwRcG9DnurVNSGLKJjnmBmTXP1_5WZpudth_0PVpw,15216
|
86
|
+
sglang/srt/models/llama_classification.py,sha256=HF-69J9qIYdfX0R5wEtIgvafMzprKcXdvF3W_orl_kA,3394
|
86
87
|
sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
|
87
88
|
sglang/srt/models/llava.py,sha256=O4XGdl70Hh4tM_OHapFGHbReC82mbe9xLw6GELKWKhU,24881
|
88
89
|
sglang/srt/models/llavavid.py,sha256=ou5uIuskBoBo0lXvqFFfDLBYYVfehx27n-Lu8X9gpLs,11992
|
89
90
|
sglang/srt/models/minicpm.py,sha256=ioqCsTCE_oF8xqGF5fm5cK9dclK5Y0EQ1UJfyteIDDo,13825
|
90
|
-
sglang/srt/models/minicpm3.py,sha256=
|
91
|
+
sglang/srt/models/minicpm3.py,sha256=McPWyy2fQqfHUhi9Nk36rkvvPAS8RmLOY7Vh4ah5c1w,25216
|
91
92
|
sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
|
92
93
|
sglang/srt/models/mixtral.py,sha256=oRC7mKBrPJhvzkWSabrbeQQQac-jtF4EV6H2Sgjc5JY,13897
|
93
94
|
sglang/srt/models/mixtral_quant.py,sha256=wMACJq78OTWj7HlqPDRNEh8cjrVAjKqJEsOG3CO5xow,14072
|
95
|
+
sglang/srt/models/olmoe.py,sha256=d0ECpU-IXXwGYg9tkVeMARUbqVcqEnWfpH3rrNiGKA0,15336
|
94
96
|
sglang/srt/models/qwen.py,sha256=nqSRzkiZzpRVG6WGQ1MBUclQnXyw8jlvoOq-euM8j5s,9954
|
95
97
|
sglang/srt/models/qwen2.py,sha256=9_M-VkHN1_T1XN-gsl_L636QMQ9BLF2WqvTcx_1L6aw,12432
|
96
98
|
sglang/srt/models/qwen2_moe.py,sha256=s7b5XnSvsBYtZZUkjPp442m59CqPJ3HxGUIwXBVWsXw,17153
|
97
99
|
sglang/srt/models/stablelm.py,sha256=30ngpc0Xq3VxzXJlf6svP1oax8Q3krMJkxM8PVKtZWU,11359
|
98
|
-
sglang/srt/models/xverse.py,sha256=
|
99
|
-
sglang/srt/models/xverse_moe.py,sha256=
|
100
|
+
sglang/srt/models/xverse.py,sha256=L3g32-je_7JmzF2-hztaIVshHYCIv7jOM3oFs-fb2MY,13658
|
101
|
+
sglang/srt/models/xverse_moe.py,sha256=CgDD9cR83UVfTsPU6WcbHVYBrkYKv_kTdwncTIx7Q7U,15842
|
100
102
|
sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
|
101
103
|
sglang/srt/openai_api/adapter.py,sha256=CJ47YftRHAip1FMcHIhtCorBtzlIkv7F0Wz_JUcI4T4,51032
|
102
104
|
sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
|
103
|
-
sglang/srt/sampling/sampling_batch_info.py,sha256=
|
105
|
+
sglang/srt/sampling/sampling_batch_info.py,sha256=GewqyxCrW2PFwuzGHaCR59Pvw6j0n2dKGrlJWYQWwW4,6149
|
104
106
|
sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
|
105
107
|
sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
|
106
108
|
sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
|
@@ -108,7 +110,7 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq
|
|
108
110
|
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
|
109
111
|
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
|
110
112
|
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
|
111
|
-
sglang/test/few_shot_gsm8k.py,sha256=
|
113
|
+
sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
|
112
114
|
sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
|
113
115
|
sglang/test/runners.py,sha256=ZoWhT1TDXfLBVdbivXx1KUu9dhPlGjL_xrP18WLzVLo,11404
|
114
116
|
sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
|
@@ -120,10 +122,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
|
|
120
122
|
sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
|
121
123
|
sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
|
122
124
|
sglang/test/test_programs.py,sha256=3-XKnppQdCNWjaJb6jwib5Z9OSpgKvH8SFLJbE4J9qI,17001
|
123
|
-
sglang/test/test_utils.py,sha256=
|
125
|
+
sglang/test/test_utils.py,sha256=dsHRd1xLzcjlarxUnDIz2XEHfut7HvqVPwx2Fn7vf10,17179
|
124
126
|
sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
|
125
|
-
sglang-0.3.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
126
|
-
sglang-0.3.1.dist-info/METADATA,sha256=
|
127
|
-
sglang-0.3.1.dist-info/WHEEL,sha256=
|
128
|
-
sglang-0.3.1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
129
|
-
sglang-0.3.1.dist-info/RECORD,,
|
127
|
+
sglang-0.3.1.post2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
128
|
+
sglang-0.3.1.post2.dist-info/METADATA,sha256=WxMy8Ur_rjPxqVOoWSFoM3eBHWt0cKGyrtwOUfWL-Vc,38114
|
129
|
+
sglang-0.3.1.post2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
130
|
+
sglang-0.3.1.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
131
|
+
sglang-0.3.1.post2.dist-info/RECORD,,
|
File without changes
|
File without changes
|