sglang 0.3.1.post1__py3-none-any.whl → 0.3.1.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +11 -2
- sglang/bench_server_latency.py +187 -0
- sglang/bench_serving.py +1 -1
- sglang/srt/layers/activation.py +8 -4
- sglang/srt/layers/attention_backend.py +3 -1
- sglang/srt/layers/layernorm.py +10 -7
- sglang/srt/layers/linear.py +1133 -0
- sglang/srt/layers/quantization/__init__.py +76 -0
- sglang/srt/layers/quantization/base_config.py +122 -0
- sglang/srt/layers/sampler.py +9 -2
- sglang/srt/managers/io_struct.py +3 -0
- sglang/srt/managers/policy_scheduler.py +49 -93
- sglang/srt/managers/schedule_batch.py +1 -1
- sglang/srt/managers/tp_worker.py +11 -6
- sglang/srt/model_executor/cuda_graph_runner.py +15 -14
- sglang/srt/model_executor/model_runner.py +13 -5
- sglang/srt/models/baichuan.py +1 -1
- sglang/srt/models/chatglm.py +6 -6
- sglang/srt/models/commandr.py +7 -7
- sglang/srt/models/dbrx.py +7 -7
- sglang/srt/models/deepseek.py +7 -7
- sglang/srt/models/deepseek_v2.py +9 -9
- sglang/srt/models/exaone.py +6 -6
- sglang/srt/models/gemma.py +6 -6
- sglang/srt/models/gemma2.py +6 -6
- sglang/srt/models/gpt_bigcode.py +6 -6
- sglang/srt/models/grok.py +6 -6
- sglang/srt/models/internlm2.py +6 -6
- sglang/srt/models/llama.py +7 -9
- sglang/srt/models/llama_classification.py +3 -4
- sglang/srt/models/llava.py +1 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +6 -6
- sglang/srt/models/minicpm3.py +3 -3
- sglang/srt/models/mixtral.py +6 -6
- sglang/srt/models/mixtral_quant.py +6 -6
- sglang/srt/models/olmoe.py +1 -1
- sglang/srt/models/qwen.py +6 -6
- sglang/srt/models/qwen2.py +6 -6
- sglang/srt/models/qwen2_moe.py +7 -7
- sglang/srt/models/stablelm.py +6 -6
- sglang/srt/models/xverse.py +2 -4
- sglang/srt/models/xverse_moe.py +2 -5
- sglang/srt/models/yivl.py +1 -1
- sglang/srt/server_args.py +17 -21
- sglang/srt/utils.py +21 -1
- sglang/test/few_shot_gsm8k.py +8 -2
- sglang/test/test_utils.py +5 -2
- sglang/version.py +1 -1
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/METADATA +5 -5
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/RECORD +54 -50
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/top_level.txt +0 -0
sglang/srt/models/xverse_moe.py
CHANGED
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.linear import (
|
|
34
34
|
ReplicatedLinear,
|
35
35
|
RowParallelLinear,
|
36
36
|
)
|
37
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
38
37
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
39
38
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
40
39
|
ParallelLMHead,
|
@@ -43,6 +42,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
|
|
43
42
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
44
43
|
|
45
44
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
45
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
46
46
|
from sglang.srt.layers.radix_attention import RadixAttention
|
47
47
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
48
48
|
|
@@ -383,8 +383,6 @@ class XverseMoeForCausalLM(nn.Module):
|
|
383
383
|
)
|
384
384
|
self.logits_processor = LogitsProcessor(config)
|
385
385
|
|
386
|
-
self.param_dict = dict(self.named_parameters())
|
387
|
-
|
388
386
|
@torch.no_grad()
|
389
387
|
def forward(
|
390
388
|
self,
|
@@ -406,8 +404,7 @@ class XverseMoeForCausalLM(nn.Module):
|
|
406
404
|
("gate_up_proj", "gate_proj", 0),
|
407
405
|
("gate_up_proj", "up_proj", 1),
|
408
406
|
]
|
409
|
-
|
410
|
-
params_dict = self.param_dict
|
407
|
+
params_dict = dict(self.named_parameters())
|
411
408
|
|
412
409
|
for name, loaded_weight in weights:
|
413
410
|
if "rotary_emb.inv_freq" in name:
|
sglang/srt/models/yivl.py
CHANGED
@@ -21,9 +21,9 @@ import torch
|
|
21
21
|
import torch.nn as nn
|
22
22
|
from transformers import CLIPVisionModel, LlavaConfig
|
23
23
|
from vllm.config import CacheConfig
|
24
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
25
24
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
26
25
|
|
26
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
27
27
|
from sglang.srt.models.llava import LlavaLlamaForCausalLM
|
28
28
|
|
29
29
|
|
sglang/srt/server_args.py
CHANGED
@@ -26,17 +26,6 @@ from sglang.srt.utils import is_hip
|
|
26
26
|
logger = logging.getLogger(__name__)
|
27
27
|
|
28
28
|
|
29
|
-
class LoRAPathAction(argparse.Action):
|
30
|
-
def __call__(self, parser, namespace, values, option_string=None):
|
31
|
-
setattr(namespace, self.dest, {})
|
32
|
-
for lora_path in values:
|
33
|
-
if "=" in lora_path:
|
34
|
-
name, path = lora_path.split("=", 1)
|
35
|
-
getattr(namespace, self.dest)[name] = path
|
36
|
-
else:
|
37
|
-
getattr(namespace, self.dest)[lora_path] = lora_path
|
38
|
-
|
39
|
-
|
40
29
|
@dataclasses.dataclass
|
41
30
|
class ServerArgs:
|
42
31
|
# Model and tokenizer
|
@@ -108,12 +97,12 @@ class ServerArgs:
|
|
108
97
|
disable_cuda_graph_padding: bool = False
|
109
98
|
disable_disk_cache: bool = False
|
110
99
|
disable_custom_all_reduce: bool = False
|
100
|
+
disable_mla: bool = False
|
111
101
|
enable_mixed_chunk: bool = False
|
112
102
|
enable_torch_compile: bool = False
|
113
103
|
max_torch_compile_bs: int = 32
|
114
104
|
torchao_config: str = ""
|
115
105
|
enable_p2p_check: bool = False
|
116
|
-
enable_mla: bool = False
|
117
106
|
triton_attention_reduce_in_fp32: bool = False
|
118
107
|
|
119
108
|
# LoRA
|
@@ -173,10 +162,6 @@ class ServerArgs:
|
|
173
162
|
self.sampling_backend = "pytorch"
|
174
163
|
|
175
164
|
# Default kernel backends
|
176
|
-
if self.enable_mla:
|
177
|
-
logger.info("MLA optimization is tunred on. Use triton backend.")
|
178
|
-
self.attention_backend = "triton"
|
179
|
-
|
180
165
|
if self.attention_backend is None:
|
181
166
|
self.attention_backend = "flashinfer"
|
182
167
|
|
@@ -514,6 +499,11 @@ class ServerArgs:
|
|
514
499
|
default=False,
|
515
500
|
help="Disable the custom all-reduce kernel and fall back to NCCL.",
|
516
501
|
)
|
502
|
+
parser.add_argument(
|
503
|
+
"--disable-mla",
|
504
|
+
action="store_true",
|
505
|
+
help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
|
506
|
+
)
|
517
507
|
parser.add_argument(
|
518
508
|
"--enable-mixed-chunk",
|
519
509
|
action="store_true",
|
@@ -541,11 +531,6 @@ class ServerArgs:
|
|
541
531
|
action="store_true",
|
542
532
|
help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
|
543
533
|
)
|
544
|
-
parser.add_argument(
|
545
|
-
"--enable-mla",
|
546
|
-
action="store_true",
|
547
|
-
help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
|
548
|
-
)
|
549
534
|
parser.add_argument(
|
550
535
|
"--triton-attention-reduce-in-fp32",
|
551
536
|
action="store_true",
|
@@ -623,3 +608,14 @@ class PortArgs:
|
|
623
608
|
controller_port: int
|
624
609
|
detokenizer_port: int
|
625
610
|
nccl_ports: List[int]
|
611
|
+
|
612
|
+
|
613
|
+
class LoRAPathAction(argparse.Action):
|
614
|
+
def __call__(self, parser, namespace, values, option_string=None):
|
615
|
+
setattr(namespace, self.dest, {})
|
616
|
+
for lora_path in values:
|
617
|
+
if "=" in lora_path:
|
618
|
+
name, path = lora_path.split("=", 1)
|
619
|
+
getattr(namespace, self.dest)[name] = path
|
620
|
+
else:
|
621
|
+
getattr(namespace, self.dest)[lora_path] = lora_path
|
sglang/srt/utils.py
CHANGED
@@ -26,7 +26,7 @@ import struct
|
|
26
26
|
import time
|
27
27
|
from importlib.metadata import PackageNotFoundError, version
|
28
28
|
from io import BytesIO
|
29
|
-
from typing import List, Optional, Union
|
29
|
+
from typing import Any, Dict, List, Optional, Union
|
30
30
|
|
31
31
|
import numpy as np
|
32
32
|
import psutil
|
@@ -682,3 +682,23 @@ def replace_submodule(
|
|
682
682
|
target_name = module_name.split(".")[-1]
|
683
683
|
setattr(parent, target_name, new_module)
|
684
684
|
return new_module
|
685
|
+
|
686
|
+
|
687
|
+
def set_weight_attrs(
|
688
|
+
weight: torch.Tensor,
|
689
|
+
weight_attrs: Optional[Dict[str, Any]],
|
690
|
+
):
|
691
|
+
"""Set attributes on a weight tensor.
|
692
|
+
|
693
|
+
This method is used to set attributes on a weight tensor. This method
|
694
|
+
will not overwrite existing attributes.
|
695
|
+
|
696
|
+
Args:
|
697
|
+
weight: The weight tensor.
|
698
|
+
weight_attrs: A dictionary of attributes to set on the weight tensor.
|
699
|
+
"""
|
700
|
+
if weight_attrs is None:
|
701
|
+
return
|
702
|
+
for key, value in weight_attrs.items():
|
703
|
+
assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}"
|
704
|
+
setattr(weight, key, value)
|
sglang/test/few_shot_gsm8k.py
CHANGED
@@ -44,7 +44,7 @@ def get_answer_value(answer_str):
|
|
44
44
|
return INVALID
|
45
45
|
|
46
46
|
|
47
|
-
def main(args):
|
47
|
+
def run_eval(args):
|
48
48
|
# Select backend
|
49
49
|
set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
|
50
50
|
|
@@ -119,6 +119,12 @@ def main(args):
|
|
119
119
|
# Dump results
|
120
120
|
dump_state_text("tmp_output_gsm8k.txt", states)
|
121
121
|
|
122
|
+
return {
|
123
|
+
"accuracy": acc,
|
124
|
+
"latency": latency,
|
125
|
+
"output_throughput": output_throughput,
|
126
|
+
}
|
127
|
+
|
122
128
|
|
123
129
|
if __name__ == "__main__":
|
124
130
|
parser = argparse.ArgumentParser()
|
@@ -129,4 +135,4 @@ if __name__ == "__main__":
|
|
129
135
|
parser.add_argument("--host", type=str, default="http://127.0.0.1")
|
130
136
|
parser.add_argument("--port", type=int, default=30000)
|
131
137
|
args = parser.parse_args()
|
132
|
-
|
138
|
+
run_eval(args)
|
sglang/test/test_utils.py
CHANGED
@@ -22,13 +22,16 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
|
22
22
|
from sglang.srt.utils import kill_child_process
|
23
23
|
from sglang.utils import get_exception_traceback
|
24
24
|
|
25
|
+
DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
|
25
26
|
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
26
27
|
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
28
|
+
DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
|
27
29
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
|
28
30
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
|
29
|
-
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
|
31
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
|
30
32
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
|
31
|
-
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
|
33
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
|
34
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
|
32
35
|
|
33
36
|
|
34
37
|
def is_in_ci():
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.3.1.post1"
|
1
|
+
__version__ = "0.3.1.post3"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.3.1.post1
|
3
|
+
Version: 0.3.1.post3
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -269,7 +269,7 @@ Requires-Dist: peft; extra == "test"
|
|
269
269
|
|
270
270
|
--------------------------------------------------------------------------------
|
271
271
|
|
272
|
-
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
|
272
|
+
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
|
273
273
|
|
274
274
|
SGLang is a fast serving framework for large language models and vision language models.
|
275
275
|
It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
|
@@ -278,7 +278,7 @@ The core features include:
|
|
278
278
|
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
|
279
279
|
- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
|
280
280
|
- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
|
281
|
-
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption
|
281
|
+
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|
282
282
|
|
283
283
|
## News
|
284
284
|
- [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
|
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
318
318
|
### Method 2: From source
|
319
319
|
```
|
320
320
|
# Use the last release branch
|
321
|
-
git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
|
321
|
+
git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
|
322
322
|
cd sglang
|
323
323
|
|
324
324
|
pip install --upgrade pip
|
@@ -483,7 +483,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
483
483
|
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
|
484
484
|
- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
|
485
485
|
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
|
486
|
-
- To enable DeepSeek MLA acceleration, add `--enable-mla`.
|
487
486
|
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
|
488
487
|
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
|
489
488
|
```
|
@@ -500,6 +499,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
500
499
|
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
501
500
|
- Mistral / Mixtral / Mistral NeMo
|
502
501
|
- Gemma / Gemma 2
|
502
|
+
- OLMoE
|
503
503
|
- Qwen / Qwen 2 / Qwen 2 MoE
|
504
504
|
- DeepSeek / DeepSeek 2
|
505
505
|
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
@@ -1,13 +1,14 @@
|
|
1
1
|
sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
|
2
2
|
sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
|
3
|
-
sglang/bench_latency.py,sha256=
|
4
|
-
sglang/
|
3
|
+
sglang/bench_latency.py,sha256=lyA_AwlhDbLMrH9Ca5_X3NUYQdwbHn_vpNbMyvqOZic,17342
|
4
|
+
sglang/bench_server_latency.py,sha256=KvFJgKQTSons7KOG0CBqnnOOx1gW29bBM1Z3GQO_6-E,5599
|
5
|
+
sglang/bench_serving.py,sha256=3gIJ1O2x51Fwd4wYJjgwluTbWKXL-azckQte7YC5zIc,36261
|
5
6
|
sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
|
6
7
|
sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
|
7
8
|
sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
|
8
9
|
sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
|
9
10
|
sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
|
10
|
-
sglang/version.py,sha256=
|
11
|
+
sglang/version.py,sha256=vtapUd7gvia5JFNpZOX5Q2A4TqgNWABeKFK66x_VeZU,28
|
11
12
|
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
13
|
sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
|
13
14
|
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
@@ -26,8 +27,8 @@ sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19
|
|
26
27
|
sglang/srt/hf_transformers_utils.py,sha256=6HlqcmGPIvnSGaEEICeuzwag1QylSoSGbXRVvUdIMDo,6016
|
27
28
|
sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
|
28
29
|
sglang/srt/server.py,sha256=n4QRn36_t-HAH-lSME3tiZSCUGRQwqMUckgs0paHq5g,20179
|
29
|
-
sglang/srt/server_args.py,sha256=
|
30
|
-
sglang/srt/utils.py,sha256=
|
30
|
+
sglang/srt/server_args.py,sha256=3XjDt6SSjTfbOe0HSXA--2aUvrpWSnQmAHYwmeS1-M0,23159
|
31
|
+
sglang/srt/utils.py,sha256=Vly46zMM_rz__DaU15vbidYtS0Gh2s7TnAMj4WLyAO4,22954
|
31
32
|
sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
|
32
33
|
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
33
34
|
sglang/srt/configs/model_config.py,sha256=OqHrucJQHbH-wxgkGj-Dcx_B888uUGASpLRjz40HaLY,6651
|
@@ -35,18 +36,21 @@ sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5
|
|
35
36
|
sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
|
36
37
|
sglang/srt/constrained/fsm_cache.py,sha256=k7DRUAaiLTEX5UarfJ17gEYQ-QWQAGfvykjYFkM_Y2U,2982
|
37
38
|
sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
|
38
|
-
sglang/srt/layers/activation.py,sha256=
|
39
|
-
sglang/srt/layers/attention_backend.py,sha256=
|
39
|
+
sglang/srt/layers/activation.py,sha256=tRWHxIjcIopkOremkb5Jy5O0rgdB1PAhHfIEONfyj6Y,5166
|
40
|
+
sglang/srt/layers/attention_backend.py,sha256=TMxsN1HwgqAURD1i77c-TN-3Xy53H9Kbg6HgpRHHoj0,18167
|
40
41
|
sglang/srt/layers/flashinfer_utils.py,sha256=jyaO7XiEisFZg_dfaCbfRCHSHSKYoM1wOzfHa0h1q14,7413
|
41
|
-
sglang/srt/layers/layernorm.py,sha256
|
42
|
+
sglang/srt/layers/layernorm.py,sha256=p_7bnmSpJ_slpoP0Gk5wQPpHtLllUu3imSIRBqGqTP0,3737
|
43
|
+
sglang/srt/layers/linear.py,sha256=9rjCiSb_QOn5RgpVjIhEKdReRvSYVfcTSjbWBEbApLI,45173
|
42
44
|
sglang/srt/layers/logits_processor.py,sha256=Js2qSk1Z3uPL2cYO1ARai51f2i8OedV3qdwByQVSJtI,12439
|
43
45
|
sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
|
44
46
|
sglang/srt/layers/radix_attention.py,sha256=EcVO0fUSmgvE_9R-MlpgJq0O_uT8ACuHzbMi19bANYc,1874
|
45
|
-
sglang/srt/layers/sampler.py,sha256=
|
47
|
+
sglang/srt/layers/sampler.py,sha256=Y0o1bndTGRD713fHMbN5-LRUiyneBkb7bH_QlkkeqSs,3836
|
46
48
|
sglang/srt/layers/torchao_utils.py,sha256=rTECwKSXhj_ylh_iSzfbopz9_lZOFHatquQrNJNLZlE,2703
|
47
49
|
sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
|
48
50
|
sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
|
49
51
|
sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
|
52
|
+
sglang/srt/layers/quantization/__init__.py,sha256=wl9mIOeA6mtKIaW1LWUJABWPdqOb-2uZ-kSijWoxLtU,3095
|
53
|
+
sglang/srt/layers/quantization/base_config.py,sha256=vlpSPvSrFmUe65ETg4SoPocQ9bVNY6As3QuHdr_3Dr4,4023
|
50
54
|
sglang/srt/layers/triton_attention/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
|
51
55
|
sglang/srt/layers/triton_attention/extend_attention.py,sha256=XTUTMrE-5jfMEufQUifZ-8NJQABSPcF47qhnNT5Z1iI,11050
|
52
56
|
sglang/srt/layers/triton_attention/prefill_attention.py,sha256=QkXPcT02c13zha2M4mBm2S5dh_sS-Gc4FkkrcywRqvc,5377
|
@@ -56,49 +60,49 @@ sglang/srt/lora/lora_manager.py,sha256=7J7cGmyy1Ph4HCvLdM-ViAizAbV1snZqD-S7JLWXa
|
|
56
60
|
sglang/srt/managers/controller_multi.py,sha256=KolZDso2WqH1ZhQw9p1eTmlFRgo4bcvzBxE44_sNE_o,6300
|
57
61
|
sglang/srt/managers/controller_single.py,sha256=DiZALP_iIPZQMRx09a-LwT5_Dg7p-WU8HXyMoxJ9sRA,4955
|
58
62
|
sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
|
59
|
-
sglang/srt/managers/io_struct.py,sha256=
|
60
|
-
sglang/srt/managers/policy_scheduler.py,sha256=
|
61
|
-
sglang/srt/managers/schedule_batch.py,sha256=
|
63
|
+
sglang/srt/managers/io_struct.py,sha256=yNV5BmeUzLPqv19j79kXQ50Iaqdk4vP-_TciiRf4OEE,11396
|
64
|
+
sglang/srt/managers/policy_scheduler.py,sha256=PVo0DV0-5ODNN7FkPkeF1Y8BQ6uuLldPETOlB_YvvL4,11560
|
65
|
+
sglang/srt/managers/schedule_batch.py,sha256=ns2qkaYAvzul-LCV1BEB6q1t5jKyftNsReMv62PC8M0,27386
|
62
66
|
sglang/srt/managers/tokenizer_manager.py,sha256=ql-sObjl1oRigJwnLtqqTaaw-i7gPTDMoNXDEMftr40,29643
|
63
|
-
sglang/srt/managers/tp_worker.py,sha256=
|
67
|
+
sglang/srt/managers/tp_worker.py,sha256=0Y0k-roDrBxWZxD0axv5CCvUUW8vsJ8n78TANHLzEFs,39503
|
64
68
|
sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
|
65
69
|
sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
|
66
70
|
sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
|
67
71
|
sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
|
68
72
|
sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
|
69
|
-
sglang/srt/model_executor/cuda_graph_runner.py,sha256=
|
73
|
+
sglang/srt/model_executor/cuda_graph_runner.py,sha256=gZ0Wukqz6u67MMIj4MC8JET9jcHdh0rotYzpuPlHruY,10512
|
70
74
|
sglang/srt/model_executor/forward_batch_info.py,sha256=yvkhayY9Zu6gysoojcGT73lADGOtfHKkFKWdJLRyACI,6141
|
71
|
-
sglang/srt/model_executor/model_runner.py,sha256=
|
72
|
-
sglang/srt/models/baichuan.py,sha256=
|
73
|
-
sglang/srt/models/chatglm.py,sha256=
|
74
|
-
sglang/srt/models/commandr.py,sha256=
|
75
|
-
sglang/srt/models/dbrx.py,sha256=
|
76
|
-
sglang/srt/models/deepseek.py,sha256=
|
77
|
-
sglang/srt/models/deepseek_v2.py,sha256=
|
78
|
-
sglang/srt/models/exaone.py,sha256=
|
79
|
-
sglang/srt/models/gemma.py,sha256=
|
80
|
-
sglang/srt/models/gemma2.py,sha256=
|
81
|
-
sglang/srt/models/gpt_bigcode.py,sha256=
|
82
|
-
sglang/srt/models/grok.py,sha256=
|
83
|
-
sglang/srt/models/internlm2.py,sha256=
|
84
|
-
sglang/srt/models/llama.py,sha256=
|
85
|
-
sglang/srt/models/llama_classification.py,sha256=
|
75
|
+
sglang/srt/model_executor/model_runner.py,sha256=X7AG1k9AI_kqS8q1i5Bfv-kFysIdqJAVWMGGZoAPThY,22726
|
76
|
+
sglang/srt/models/baichuan.py,sha256=d2PFmyLBXjzS7X7FL9uz139_CpBPb5WYhzcHgF--gRE,15115
|
77
|
+
sglang/srt/models/chatglm.py,sha256=chDkgLTRU3bPxTUilhW_FGnsUWj_2fkvulCi9pdDxBY,13353
|
78
|
+
sglang/srt/models/commandr.py,sha256=FspSRkMRAXUjD3xzAkxkMiGiRg91czn9T5bagrf3l9M,14136
|
79
|
+
sglang/srt/models/dbrx.py,sha256=UmpbTCuf8rYe2Grut7YUPU1gEwsDhgNIs8vW4DNiaf0,14634
|
80
|
+
sglang/srt/models/deepseek.py,sha256=TWwfwKYvZZyu2UbimvimeyU_7u7HyIYZlRdlPtOCTfo,15988
|
81
|
+
sglang/srt/models/deepseek_v2.py,sha256=36iH4HrObMasOY801Tacub_40BR_0ImdqdKcJ6nHOD8,28413
|
82
|
+
sglang/srt/models/exaone.py,sha256=0OTgeAzyi_xvoQTx4TwYkCxRq8sMa-4EYL0_KJRmiAU,13069
|
83
|
+
sglang/srt/models/gemma.py,sha256=qo-4F602DKuv33zp4i4dayteFoVhnTYgVbFWKYms5Og,12255
|
84
|
+
sglang/srt/models/gemma2.py,sha256=8wGqNQPaPjuTtgHiKsUP4nowOukPvXwRywD4lkAW9Dg,14905
|
85
|
+
sglang/srt/models/gpt_bigcode.py,sha256=k_pZa4Sg5GEsr4ln0kjP765moGUPNs5a6iANPjE2W8U,10177
|
86
|
+
sglang/srt/models/grok.py,sha256=71Zx-4Q3wggNMtRYlXuPMA-auK-sHBYukI1Usn8LVrE,14911
|
87
|
+
sglang/srt/models/internlm2.py,sha256=nEr6MSHFkTjPLvWl1jQQdGFO7iOHex6YtE-I4rYuLao,12184
|
88
|
+
sglang/srt/models/llama.py,sha256=bdIt9IfZBgsg6CoZT3lvB-dqXhfxempdRHLkY3Su_VU,15198
|
89
|
+
sglang/srt/models/llama_classification.py,sha256=UpwYsgNVS1065t7Yjmi2XGbk9Or8bq2cF82zH1Yx2Mg,3385
|
86
90
|
sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
|
87
|
-
sglang/srt/models/llava.py,sha256=
|
88
|
-
sglang/srt/models/llavavid.py,sha256=
|
89
|
-
sglang/srt/models/minicpm.py,sha256=
|
90
|
-
sglang/srt/models/minicpm3.py,sha256=
|
91
|
+
sglang/srt/models/llava.py,sha256=1MG1JDDQb7xc67BSimDo98Gmvza6PmrHQHmKybsDui4,24872
|
92
|
+
sglang/srt/models/llavavid.py,sha256=RqOUFROt-gqTlFYqnySAVBXJO9g-NMU2yke-AW5cV6o,11983
|
93
|
+
sglang/srt/models/minicpm.py,sha256=Xvy99mkfwzRZCLOe3BhfmNSuJyDhGjjAJq0YOpepu_Q,13807
|
94
|
+
sglang/srt/models/minicpm3.py,sha256=yuiwWNfJeWvfUgwkbEfpuc9_uPB6odqBCbdYj8t9aDQ,25207
|
91
95
|
sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
|
92
|
-
sglang/srt/models/mixtral.py,sha256=
|
93
|
-
sglang/srt/models/mixtral_quant.py,sha256=
|
94
|
-
sglang/srt/models/olmoe.py,sha256=
|
95
|
-
sglang/srt/models/qwen.py,sha256=
|
96
|
-
sglang/srt/models/qwen2.py,sha256=
|
97
|
-
sglang/srt/models/qwen2_moe.py,sha256=
|
98
|
-
sglang/srt/models/stablelm.py,sha256=
|
99
|
-
sglang/srt/models/xverse.py,sha256=
|
100
|
-
sglang/srt/models/xverse_moe.py,sha256=
|
101
|
-
sglang/srt/models/yivl.py,sha256=
|
96
|
+
sglang/srt/models/mixtral.py,sha256=QzWIhjk8gW9DquTvgQsWK3VK0ccdTMT0hCDDHI03KPI,13879
|
97
|
+
sglang/srt/models/mixtral_quant.py,sha256=e2x1AykUSVRqEVw6Pg7uKW1Uj8xyn4jZSfLJL4Kl5o8,14054
|
98
|
+
sglang/srt/models/olmoe.py,sha256=hGh2IlCg9kr1WIeGyRWwNpa1CfyZH163vq7eSx5d598,15327
|
99
|
+
sglang/srt/models/qwen.py,sha256=Vs6f8Jn1TswEzgiPS0G9qxeDU_DdC60JnhDeRDTH3FQ,9936
|
100
|
+
sglang/srt/models/qwen2.py,sha256=pamZrETUcaXbWN4tVTjObFPNjqaMu49-8g267NzxkFI,12414
|
101
|
+
sglang/srt/models/qwen2_moe.py,sha256=2BFsp1oPs7o_3uc8xvIGfGRNNU2TKkmKZY9P1qtgtlQ,17135
|
102
|
+
sglang/srt/models/stablelm.py,sha256=v67JM1SHb-LinrsX598WMsLVeyzjoKquW6G5G30X5fQ,11341
|
103
|
+
sglang/srt/models/xverse.py,sha256=VThXXKg3DzepcEP1JHcqSyhRBvq6yL14oh4uj5TJOEM,13649
|
104
|
+
sglang/srt/models/xverse_moe.py,sha256=BqmV-uk9ipp4nrj6-lnFfvkwUcuKmV7yfGAYB6Ob-UQ,15833
|
105
|
+
sglang/srt/models/yivl.py,sha256=N3noJ5M-FiZS-E_zfaJs4prQOu_ineRt11MWloYgOR8,4826
|
102
106
|
sglang/srt/openai_api/adapter.py,sha256=CJ47YftRHAip1FMcHIhtCorBtzlIkv7F0Wz_JUcI4T4,51032
|
103
107
|
sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
|
104
108
|
sglang/srt/sampling/sampling_batch_info.py,sha256=GewqyxCrW2PFwuzGHaCR59Pvw6j0n2dKGrlJWYQWwW4,6149
|
@@ -109,7 +113,7 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq
|
|
109
113
|
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
|
110
114
|
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
|
111
115
|
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
|
112
|
-
sglang/test/few_shot_gsm8k.py,sha256=
|
116
|
+
sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
|
113
117
|
sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
|
114
118
|
sglang/test/runners.py,sha256=ZoWhT1TDXfLBVdbivXx1KUu9dhPlGjL_xrP18WLzVLo,11404
|
115
119
|
sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
|
@@ -121,10 +125,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
|
|
121
125
|
sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
|
122
126
|
sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
|
123
127
|
sglang/test/test_programs.py,sha256=3-XKnppQdCNWjaJb6jwib5Z9OSpgKvH8SFLJbE4J9qI,17001
|
124
|
-
sglang/test/test_utils.py,sha256=
|
128
|
+
sglang/test/test_utils.py,sha256=OnAFpTA94GmQCHCV5XpaYImn11U7Cg4yfSw0nC17GRs,17504
|
125
129
|
sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
|
126
|
-
sglang-0.3.1.
|
127
|
-
sglang-0.3.1.
|
128
|
-
sglang-0.3.1.
|
129
|
-
sglang-0.3.1.
|
130
|
-
sglang-0.3.1.
|
130
|
+
sglang-0.3.1.post3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
131
|
+
sglang-0.3.1.post3.dist-info/METADATA,sha256=uhvB-z9UZsAafHaPfU9qYU6oKxrC6BLcyBspbtoFAY8,38122
|
132
|
+
sglang-0.3.1.post3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
133
|
+
sglang-0.3.1.post3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
134
|
+
sglang-0.3.1.post3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|