sglang 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/bench_serving.py +243 -25
- sglang/global_config.py +3 -2
- sglang/lang/interpreter.py +1 -0
- sglang/srt/hf_transformers_utils.py +13 -1
- sglang/srt/layers/logits_processor.py +4 -5
- sglang/srt/layers/radix_attention.py +38 -49
- sglang/srt/managers/controller/cuda_graph_runner.py +58 -16
- sglang/srt/managers/controller/infer_batch.py +51 -22
- sglang/srt/managers/controller/model_runner.py +7 -4
- sglang/srt/managers/controller/schedule_heuristic.py +8 -3
- sglang/srt/managers/controller/tp_worker.py +9 -11
- sglang/srt/memory_pool.py +13 -5
- sglang/srt/models/deepseek.py +430 -0
- sglang/srt/models/gpt_bigcode.py +282 -0
- sglang/srt/models/llama2.py +19 -10
- sglang/srt/server.py +20 -1
- sglang/srt/server_args.py +12 -6
- sglang/srt/utils.py +49 -0
- {sglang-0.1.22.dist-info → sglang-0.1.24.dist-info}/METADATA +9 -5
- {sglang-0.1.22.dist-info → sglang-0.1.24.dist-info}/RECORD +24 -22
- {sglang-0.1.22.dist-info → sglang-0.1.24.dist-info}/WHEEL +1 -1
- {sglang-0.1.22.dist-info → sglang-0.1.24.dist-info}/LICENSE +0 -0
- {sglang-0.1.22.dist-info → sglang-0.1.24.dist-info}/top_level.txt +0 -0
sglang/srt/models/llama2.py
CHANGED
@@ -5,14 +5,10 @@
 from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
-import tqdm
 from torch import nn
 from transformers import LlamaConfig
 from vllm.config import CacheConfig
-from vllm.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
@@ -39,6 +35,7 @@ class LlamaMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -46,12 +43,14 @@ class LlamaMLP(nn.Module):
             [intermediate_size] * 2,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
             intermediate_size,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -70,6 +69,7 @@ class LlamaMLP(nn.Module):
 class LlamaAttention(nn.Module):
     def __init__(
         self,
+        config: LlamaConfig,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
@@ -79,6 +79,7 @@ class LlamaAttention(nn.Module):
         rope_is_neox_style: bool = True,
         max_position_embeddings: int = 8192,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -96,7 +97,10 @@ class LlamaAttention(nn.Module):
         # the KV heads across multiple tensor parallel GPUs.
         assert tp_size % self.total_num_kv_heads == 0
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        self.head_dim = hidden_size // self.total_num_heads
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -110,12 +114,14 @@ class LlamaAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
@@ -154,6 +160,7 @@ class LlamaDecoderLayer(nn.Module):
         config: LlamaConfig,
         layer_id: int = 0,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -168,6 +175,7 @@ class LlamaDecoderLayer(nn.Module):
         rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = LlamaAttention(
+            config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
@@ -177,12 +185,14 @@ class LlamaDecoderLayer(nn.Module):
             rope_is_neox_style=rope_is_neox_style,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
         )
         self.mlp = LlamaMLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
         )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
@@ -230,7 +240,9 @@ class LlamaModel(nn.Module):
         )
         self.layers = nn.ModuleList(
             [
-                LlamaDecoderLayer(config, i, quant_config=quant_config)
+                LlamaDecoderLayer(
+                    config, i, quant_config=quant_config, prefix=f"model.layers.{i}"
+                )
                 for i in range(config.num_hidden_layers)
             ]
         )
@@ -370,9 +382,6 @@ class LlamaForCausalLM(nn.Module):
             weight_loader(param, loaded_weight)
 
         if name is None or loaded_weight is None:
-            if get_tensor_model_parallel_rank() == 0:
-                weights = tqdm.tqdm(weights, total=int(len(params_dict) * 1.5))
-
             for name, loaded_weight in weights:
                 load_weights_per_param(name, loaded_weight)
         else:
sglang/srt/server.py
CHANGED
@@ -157,6 +157,19 @@ def _set_global_server_args(server_args: ServerArgs):
     }
 
 
+def _set_torch_compile_config():
+    # The following configurations are for torch compile optimizations
+    import torch._dynamo.config
+    import torch._inductor.config
+
+    torch._inductor.config.coordinate_descent_tuning = True
+    torch._inductor.config.triton.unique_kernel_names = True
+    torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
+
+    # FIXME: tmp workaround
+    torch._dynamo.config.accumulated_cache_size_limit = 256
+
+
 def launch_server(
     server_args: ServerArgs,
     model_overide_args: Optional[dict] = None,
@@ -174,6 +187,7 @@ def launch_server(
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
     os.environ["NCCL_CUMEM_ENABLE"] = "0"
     os.environ["NCCL_NVLS_ENABLE"] = "0"
+    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     set_ulimit()
     if server_args.show_time_cost:
         enable_show_time_cost()
@@ -182,7 +196,7 @@ def launch_server(
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.
+            "0.1.1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -190,6 +204,10 @@ def launch_server(
     if server_args.chat_template:
         # TODO: replace this with huggingface transformers template
         load_chat_template_for_openai_api(server_args.chat_template)
+
+    if server_args.enable_torch_compile:
+        _set_torch_compile_config()
+
     _set_global_server_args(server_args)
 
     # Allocate ports
@@ -205,6 +223,7 @@ def launch_server(
         detokenizer_port=ports[2],
         nccl_ports=ports[3:],
     )
+    logger.info(f"{server_args=}")
 
     # Handle multi-node tensor parallelism
     if server_args.nnodes > 1:
sglang/srt/server_args.py
CHANGED
@@ -29,7 +29,7 @@ class ServerArgs:
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     schedule_heuristic: str = "lpm"
-    schedule_conservativeness: float = 0
+    schedule_conservativeness: float = 1.0
 
     # Other runtime options
     tp_size: int = 1
@@ -55,6 +55,7 @@ class ServerArgs:
     disable_regex_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_disk_cache: bool = False
+    enable_torch_compile: bool = False
     attention_reduce_in_fp32: bool = False
     enable_p2p_check: bool = False
     efficient_weight_load: bool = False
@@ -69,15 +70,15 @@ class ServerArgs:
             self.tokenizer_path = self.model_path
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.80
             elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.84
             elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.86
             elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.85
-            else:
                 self.mem_fraction_static = 0.88
+            else:
+                self.mem_fraction_static = 0.89
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
@@ -317,6 +318,11 @@ class ServerArgs:
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
+        parser.add_argument(
+            "--enable-torch-compile",
+            action="store_true",
+            help="Optimize the model with torch.compile, experimental feature.",
+        )
         parser.add_argument(
            "--attention-reduce-in-fp32",
            action="store_true",
sglang/srt/utils.py
CHANGED
@@ -312,6 +312,9 @@ def suppress_other_loggers():
     logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
         logging.WARN
     )
+    logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel(
+        logging.WARN
+    )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
     logging.getLogger("vllm.utils").setLevel(logging.WARN)
 
@@ -411,6 +414,52 @@ def monkey_patch_vllm_dummy_weight_loader():
     setattr(DummyModelLoader, "load_model", load_model)
 
 
+vllm_all_gather_backup = None
+
+
+def monkey_patch_vllm_all_gather(reverse: bool = False):
+    """Monkey patch all-gather to remove in-place operations."""
+    from torch.distributed import _functional_collectives as funcol
+    from vllm.distributed.parallel_state import GroupCoordinator
+
+    global vllm_all_gather_backup
+    if vllm_all_gather_backup is None:
+        vllm_all_gather_backup = GroupCoordinator.all_gather
+
+    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        # Bypass the function if we are using only 1 GPU.
+        if world_size == 1:
+            return input_
+        assert (
+            -input_.dim() <= dim < input_.dim()
+        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        input_size = input_.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            (world_size,) + input_size, dtype=input_.dtype, device=input_.device
+        )
+
+        output_tensor = funcol.all_gather_tensor(
+            input_, gather_dim=0, group=self.device_group
+        ).view((world_size,) + input_size)
+
+        # Reshape
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor
+
+    if reverse:
+        setattr(GroupCoordinator, "all_gather", vllm_all_gather_backup)
+    else:
+        setattr(GroupCoordinator, "all_gather", all_gather)
+
+
 API_KEY_HEADER_NAME = "X-API-Key"
 
 
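The point of the patched `all_gather` is that `funcol.all_gather_tensor` returns a fresh tensor rather than writing into a preallocated buffer, presumably so the op stays traceable by torch.compile; the rest is shape bookkeeping. A single-process illustration of that bookkeeping, with `torch.stack` standing in for the collective:

```python
import torch

world_size, dim = 2, -1
input_ = torch.arange(6.0).reshape(2, 3)

# Stand-in for funcol.all_gather_tensor: a stacked (world_size, *input_size) result.
gathered = torch.stack([input_, input_ + 10])

if dim < 0:
    dim += input_.dim()
input_size = input_.size()

# Move the world axis into position and fold it into the gather dimension.
out = gathered.movedim(0, dim).reshape(
    input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
)
assert out.shape == (2, 6)  # a (2, 3) tensor gathered along the last dim across 2 ranks
```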
{sglang-0.1.22.dist-info → sglang-0.1.24.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.22
+Version: 0.1.24
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -240,7 +240,7 @@ Requires-Dist: torch ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
 Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm ==0.5.
+Requires-Dist: vllm ==0.5.3.post1 ; extra == 'srt'
 Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 <div align="center">
@@ -282,6 +282,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
+pip install --upgrade pip setuptools wheel
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -293,6 +294,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
+pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
@@ -390,15 +392,16 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
-- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
 
 # Node 1
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 
 ### Supported Models
 
@@ -420,6 +423,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Mistral NeMo
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
{sglang-0.1.22.dist-info → sglang-0.1.24.dist-info}/RECORD
CHANGED
@@ -1,10 +1,10 @@
-sglang/__init__.py,sha256=
+sglang/__init__.py,sha256=nMs6lYeKcQpYArIaZLQ2VGNleY1dVvdBFaHyG7fpOsA,1141
 sglang/api.py,sha256=1JARbc1wNYF6tODdUpgmNgTyLOvMnxdTBctLvEwzGTY,5565
 sglang/bench.py,sha256=p34wnfMRdiedOUf9GKGZkkNxehmyTzK6Q1O20q_SGjY,21841
 sglang/bench_latency.py,sha256=UPy6WhrddMTDX7HqIeHNhCn5vF0YMOKxJlQRvhMC8zU,10552
-sglang/bench_serving.py,sha256=
+sglang/bench_serving.py,sha256=zKGgVX3S-ggUvOxvEM4AszzXRPRVU6NGNnBG5vAAvRY,34577
 sglang/check_env.py,sha256=CscuPMlf68dkgZf0m-FiLpUisNNDoihMck4qhLOeV1Q,4124
-sglang/global_config.py,sha256=
+sglang/global_config.py,sha256=QG-ABVJksKK_llvUx7fSZcmK4GGCs-hBUVcM4LCr7Nw,1749
 sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
 sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
 sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
@@ -18,7 +18,7 @@ sglang/backend/vertexai.py,sha256=98toR-L0OTi4dYHaSmmzJdlQ2qN_0lImoKZFlVgYLRE,48
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
-sglang/lang/interpreter.py,sha256=
+sglang/lang/interpreter.py,sha256=27j7H9p7TY4uUfF9f5E17FxK1xCNeNju4aut_PaWCrQ,29693
 sglang/lang/ir.py,sha256=5VVK2JnbspdysrhcGgkmp_JlAprd2XqqRnS_GfP_XWc,16645
 sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
 sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -30,16 +30,16 @@ sglang/lang/backend/runtime_endpoint.py,sha256=TZ0NV89or5_3MIZZFnc1JXAAjnv7tCfeQ
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/conversation.py,sha256=Il7JJuu4o42k2xdBWVfONNmstTsAM-4idX6AcEOnrXQ,15526
 sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
-sglang/srt/hf_transformers_utils.py,sha256=
-sglang/srt/memory_pool.py,sha256=
+sglang/srt/hf_transformers_utils.py,sha256=94mOI93B2xOmXKqfJfEoGxqHgwwlWNbPHgsA47AQJK8,11245
+sglang/srt/memory_pool.py,sha256=FhJk5GtYortO3MJIsMMQ-o49agwDHVX1aEQH2LITq6c,3949
 sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
 sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
 sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
 sglang/srt/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
 sglang/srt/sampling_params.py,sha256=OI11asr1Bd_E5soDjih614v4flgWxdMZU9HAF0aBafQ,3062
-sglang/srt/server.py,sha256=
-sglang/srt/server_args.py,sha256=
-sglang/srt/utils.py,sha256=
+sglang/srt/server.py,sha256=JC6rs8mkWg2mWwriwZvYEZyO514_HJFOUNda-pu8U_4,14369
+sglang/srt/server_args.py,sha256=aF6L35mEB-FU3BL_ooKuCIcOXLhYLxA9-MjpaOTQRCo,13189
+sglang/srt/utils.py,sha256=ZB9WLlZ_GpKVpPJiETrYkqH10J8iWrN_4buxDnQoA88,18568
 sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
 sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
 sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
@@ -48,33 +48,35 @@ sglang/srt/layers/context_flashattention_nopad.py,sha256=7ps_9W_ia9zikL9HqsSUwWH
 sglang/srt/layers/extend_attention.py,sha256=aYAAL9HZJpaSASp-ulMvbmSmyMcqdYUsgVQC-Lbm7_U,12008
 sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
 sglang/srt/layers/linear.py,sha256=qLwFkOiRAljzE7LkAkLRdcCdVMk-t7b56jEjwQAuYDM,33953
-sglang/srt/layers/logits_processor.py,sha256=
-sglang/srt/layers/radix_attention.py,sha256=
+sglang/srt/layers/logits_processor.py,sha256=KyRYANCiq9Cfu_VPjrIbSBAlqN_clcAgF3JrG9waU5k,9674
+sglang/srt/layers/radix_attention.py,sha256=A3J_wOlysjblFXHgehAqRHBQmpYAHLyUovyLFsrMJ7A,6386
 sglang/srt/layers/token_attention.py,sha256=EJ4gjbVLfshOZ_vr1iB-Eq8_B-4F26n_wPDj6e1Zrww,7386
 sglang/srt/layers/quantization/__init__.py,sha256=PQFzdPpul98DvywBA6YMBOnrMjtHE1LMlMpJ7FM8J3I,1971
 sglang/srt/layers/quantization/fp8.py,sha256=jaqgRFnHC--IL8iqB6Qygi-KXYPYBKKqt_j4Rk55_h4,24946
 sglang/srt/managers/detokenizer_manager.py,sha256=8rN2cdMr61LWy07lingEqLnNy0W5Rebdn14IsTQ9PCs,5049
 sglang/srt/managers/io_struct.py,sha256=Y6jW3p0cNg0jcrEQNki1H8MMEWxwWA4p6Y-xVgUVWaI,5404
 sglang/srt/managers/tokenizer_manager.py,sha256=SbivhFhZUR9HU9pLTe93MlYprAFAHzOU3KMBA2piQUk,19308
-sglang/srt/managers/controller/cuda_graph_runner.py,sha256=
+sglang/srt/managers/controller/cuda_graph_runner.py,sha256=0aRqA1_34oJ557Zn8PjpJecex5bBWJdnCmBlcDVvYO0,8509
 sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
-sglang/srt/managers/controller/infer_batch.py,sha256=
+sglang/srt/managers/controller/infer_batch.py,sha256=SKwCwhnZ_CNlG0mVCEc4X0e4HNjJFke-c8zdWP3TzjQ,34186
 sglang/srt/managers/controller/manager_multi.py,sha256=DT8Y9RF5OyTxlrLEZYz4claNWir3UrVztdOZaVPiA6g,6077
 sglang/srt/managers/controller/manager_single.py,sha256=2xO_iWK6tWvc0B31nKbe2N3klxwQBJmPTnFhNjzhVSI,4566
-sglang/srt/managers/controller/model_runner.py,sha256=
+sglang/srt/managers/controller/model_runner.py,sha256=927tf6nJjLjEDgz2wCDj2kvpZ-E_rAVm8PVKFVfP4p8,13951
 sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
-sglang/srt/managers/controller/schedule_heuristic.py,sha256=
-sglang/srt/managers/controller/tp_worker.py,sha256=
+sglang/srt/managers/controller/schedule_heuristic.py,sha256=SQAGzPS3aB_TPj7rnPBhewwyR6W1sVwW4D3zG3JUY00,2714
+sglang/srt/managers/controller/tp_worker.py,sha256=yjz-Xzl0zEy4QSU-EYneZH5vi3oHtBuXTtYe4VuDp2g,30517
 sglang/srt/model_loader/model_loader.py,sha256=VS8VQL5ITN3akZ9eU_-uHWMan1axLMNG2_O12HzGysA,10132
 sglang/srt/model_loader/utils.py,sha256=I2PS5HIH5Cg-p7xKO_Cw_foK2vQ61xVc3zQv7CbeGEw,10120
 sglang/srt/models/chatglm.py,sha256=pH8g2Dj8qQLGPYpWVTb-IONfXsdfmpWi0-IEYNdSi4s,13296
 sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
 sglang/srt/models/dbrx.py,sha256=rRxOusGPu670ommeqXg62AllwB1apzE4yZoWc1fcr2M,14095
+sglang/srt/models/deepseek.py,sha256=YtoPmv4fKmiH_jsRMSab9Wxq3aOZga9pCPGnkCs3Vvs,15457
 sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
 sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+sglang/srt/models/gpt_bigcode.py,sha256=XHO1naPdXfiKYQRQ6uZe1fN3PBDhKH3-bchsaaZvfE4,9637
 sglang/srt/models/grok.py,sha256=611zrlIchvFaVfztRdBY7z97oU3KB-anykbOZy1hK6M,27295
 sglang/srt/models/internlm2.py,sha256=8MNcwxU5Th9IxWa314HqqmbCRlPUFScnfneBDs0riIU,11659
-sglang/srt/models/llama2.py,sha256=
+sglang/srt/models/llama2.py,sha256=OyAf_lun5aZEsT80WmrIYBF8QXTXRpW8sUlylr4AZIc,14204
 sglang/srt/models/llama_classification.py,sha256=foCPvNyP2bTZ0YcRBF-qkmBv-gT24lhLNCXP30Oq4VU,4370
 sglang/srt/models/llava.py,sha256=vBI6EEeOG_9o23Shi9h8k58rxTOHZnSKMmPl3B3Q3uc,17924
 sglang/srt/models/llavavid.py,sha256=SrNQ-U2wekHvP_up-ZXRkCSros2NzheHpPfXHrp0YBU,13050
@@ -96,8 +98,8 @@ sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq65
 sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
 sglang/test/test_programs.py,sha256=uefeHUFKT2NJESOujj-CsnPXdw1aQQN2TzUbPCHJjGs,13654
 sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
-sglang-0.1.22.dist-info/LICENSE,sha256=
-sglang-0.1.22.dist-info/METADATA,sha256=
-sglang-0.1.22.dist-info/WHEEL,sha256=
-sglang-0.1.22.dist-info/top_level.txt,sha256=
-sglang-0.1.22.dist-info/RECORD,,
+sglang-0.1.24.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.24.dist-info/METADATA,sha256=_HKFljParVedu-eht7OKKb_RpEkVcB-Wh_P_jRW3TJk,30933
+sglang-0.1.24.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+sglang-0.1.24.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.24.dist-info/RECORD,,
{sglang-0.1.22.dist-info → sglang-0.1.24.dist-info}/LICENSE
File without changes
{sglang-0.1.22.dist-info → sglang-0.1.24.dist-info}/top_level.txt
File without changes