sglang 0.4.1.post3__py3-none-any.whl → 0.4.1.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -0
- sglang/srt/layers/attention/__init__.py +14 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
- sglang/srt/layers/attention/flashinfer_backend.py +211 -81
- sglang/srt/layers/attention/torch_native_backend.py +1 -38
- sglang/srt/layers/attention/triton_backend.py +20 -11
- sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
- sglang/srt/layers/logits_processor.py +167 -212
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +187 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -6
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/sampler.py +57 -21
- sglang/srt/layers/torchao_utils.py +17 -3
- sglang/srt/managers/io_struct.py +1 -2
- sglang/srt/managers/schedule_batch.py +26 -2
- sglang/srt/managers/schedule_policy.py +159 -90
- sglang/srt/managers/scheduler.py +62 -26
- sglang/srt/managers/tokenizer_manager.py +22 -20
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/model_executor/cuda_graph_runner.py +118 -73
- sglang/srt/model_executor/forward_batch_info.py +33 -8
- sglang/srt/model_executor/model_runner.py +63 -61
- sglang/srt/models/deepseek_v2.py +34 -7
- sglang/srt/models/grok.py +97 -26
- sglang/srt/openai_api/adapter.py +0 -17
- sglang/srt/openai_api/protocol.py +3 -3
- sglang/srt/sampling/sampling_batch_info.py +21 -0
- sglang/srt/sampling/sampling_params.py +9 -1
- sglang/srt/server.py +9 -5
- sglang/srt/server_args.py +108 -57
- sglang/srt/speculative/build_eagle_tree.py +347 -0
- sglang/srt/speculative/eagle_utils.py +618 -0
- sglang/srt/speculative/eagle_worker.py +170 -0
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/utils.py +15 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/METADATA +9 -8
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/RECORD +63 -39
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/WHEEL +1 -1
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,170 @@
|
|
1
|
+
from typing import List, Optional, Union
|
2
|
+
|
3
|
+
import torch
|
4
|
+
|
5
|
+
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
6
|
+
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
|
7
|
+
from sglang.srt.managers.tp_worker import TpModelWorker
|
8
|
+
from sglang.srt.model_executor.forward_batch_info import (
|
9
|
+
CaptureHiddenMode,
|
10
|
+
ForwardBatch,
|
11
|
+
ForwardMode,
|
12
|
+
)
|
13
|
+
from sglang.srt.model_executor.model_runner import ModelRunner
|
14
|
+
from sglang.srt.server_args import ServerArgs
|
15
|
+
from sglang.srt.speculative.eagle_utils import EAGLEDraftInput
|
16
|
+
|
17
|
+
|
18
|
+
class EAGLEWorker(TpModelWorker):
|
19
|
+
|
20
|
+
def __init__(
|
21
|
+
self,
|
22
|
+
server_args: ServerArgs,
|
23
|
+
gpu_id: int,
|
24
|
+
tp_rank: int,
|
25
|
+
dp_rank: Optional[int],
|
26
|
+
nccl_port: int,
|
27
|
+
target_worker: TpModelWorker,
|
28
|
+
):
|
29
|
+
# Do not capture cuda graph in `super().__init__()`
|
30
|
+
# We will capture it later
|
31
|
+
backup_disable_cuda_graph = server_args.disable_cuda_graph
|
32
|
+
server_args.disable_cuda_graph = True
|
33
|
+
super().__init__(
|
34
|
+
gpu_id=gpu_id,
|
35
|
+
tp_rank=tp_rank,
|
36
|
+
server_args=server_args,
|
37
|
+
nccl_port=nccl_port,
|
38
|
+
dp_rank=dp_rank,
|
39
|
+
is_draft_worker=True,
|
40
|
+
)
|
41
|
+
self.target_worker = target_worker
|
42
|
+
self.server_args = server_args
|
43
|
+
|
44
|
+
# Share the embedding and lm_head
|
45
|
+
embed, head = self.target_worker.model_runner.model.get_embed_and_head()
|
46
|
+
self.model_runner.model.set_embed_and_head(embed, head)
|
47
|
+
self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
|
48
|
+
self.model_runner.init_cuda_graphs()
|
49
|
+
|
50
|
+
def forward_draft_decode(self, batch: ScheduleBatch):
|
51
|
+
batch.spec_info.prepare_for_decode(batch)
|
52
|
+
model_worker_batch = batch.get_model_worker_batch()
|
53
|
+
forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
|
54
|
+
forward_batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
|
55
|
+
logits_output = self.model_runner.forward(forward_batch)
|
56
|
+
self.capture_for_decode(logits_output, forward_batch)
|
57
|
+
|
58
|
+
def forward_draft_extend(self, batch: ScheduleBatch):
|
59
|
+
self._swap_mem_pool(batch, self.model_runner)
|
60
|
+
batch.spec_info.prepare_for_extend(batch)
|
61
|
+
model_worker_batch = batch.get_model_worker_batch()
|
62
|
+
forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
|
63
|
+
forward_batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
|
64
|
+
logits_output = self.model_runner.forward(forward_batch)
|
65
|
+
self.capture_for_decode(logits_output, forward_batch)
|
66
|
+
self._swap_mem_pool(batch, self.target_worker.model_runner)
|
67
|
+
|
68
|
+
def forward_batch_speculative_generation(self, batch: ScheduleBatch):
|
69
|
+
if batch.forward_mode.is_decode():
|
70
|
+
prev_spec_info = batch.spec_info
|
71
|
+
self._swap_mem_pool(batch, self.model_runner)
|
72
|
+
for i in range(self.server_args.speculative_num_steps):
|
73
|
+
self.forward_draft_decode(batch)
|
74
|
+
batch.spec_info.clear_draft_cache(batch)
|
75
|
+
self._swap_mem_pool(batch, self.target_worker.model_runner)
|
76
|
+
(
|
77
|
+
next_draft_input,
|
78
|
+
logits_output,
|
79
|
+
verified_id,
|
80
|
+
self.finish_extend_len,
|
81
|
+
model_worker_batch,
|
82
|
+
) = self.verify(batch)
|
83
|
+
next_draft_input.init(self.server_args)
|
84
|
+
batch.spec_info = next_draft_input
|
85
|
+
# If it is None, it means all requests are finished
|
86
|
+
if batch.spec_info.verified_id is not None:
|
87
|
+
self.forward_extend_after_decode(batch)
|
88
|
+
batch.spec_info = prev_spec_info
|
89
|
+
return logits_output, verified_id, model_worker_batch, next_draft_input
|
90
|
+
|
91
|
+
else:
|
92
|
+
spec_info = EAGLEDraftInput()
|
93
|
+
spec_info.init(self.server_args)
|
94
|
+
model_worker_batch = batch.get_model_worker_batch()
|
95
|
+
model_worker_batch.spec_info = spec_info
|
96
|
+
spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
|
97
|
+
logits_output, next_token_ids = self.target_worker.forward_batch_generation(
|
98
|
+
model_worker_batch
|
99
|
+
)
|
100
|
+
model_worker_batch.spec_info.verified_id = next_token_ids
|
101
|
+
model_worker_batch.spec_info.hidden_states = logits_output.hidden_states
|
102
|
+
batch.spec_info = spec_info
|
103
|
+
self.forward_draft_extend(batch)
|
104
|
+
batch.spec_info = None
|
105
|
+
return logits_output, next_token_ids, model_worker_batch, spec_info
|
106
|
+
|
107
|
+
def verify(self, batch: ScheduleBatch):
|
108
|
+
verify_input = batch.spec_info.prepare_for_verify(batch)
|
109
|
+
batch.forward_mode = ForwardMode.TARGET_VERIFY
|
110
|
+
verify_input.prepare_for_verify(batch)
|
111
|
+
batch.spec_info = verify_input
|
112
|
+
batch.spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
|
113
|
+
model_worker_batch = batch.get_model_worker_batch()
|
114
|
+
logits_output, _ = self.target_worker.forward_batch_generation(
|
115
|
+
model_worker_batch, skip_sample=True
|
116
|
+
)
|
117
|
+
verify_input.hidden_states = logits_output.hidden_states
|
118
|
+
res = verify_input.verify(batch, logits_output)
|
119
|
+
batch.forward_mode = ForwardMode.DECODE
|
120
|
+
return res + (model_worker_batch,)
|
121
|
+
|
122
|
+
def _swap_mem_pool(self, batch: ScheduleBatch, runner: ModelRunner):
|
123
|
+
batch.token_to_kv_pool = runner.token_to_kv_pool
|
124
|
+
batch.req_to_token_pool = runner.req_to_token_pool
|
125
|
+
|
126
|
+
def forward_extend_after_decode(self, batch: ScheduleBatch):
|
127
|
+
self._swap_mem_pool(batch, self.model_runner)
|
128
|
+
batch.forward_mode = ForwardMode.DRAFT_EXTEND
|
129
|
+
if batch.spec_info.has_finished:
|
130
|
+
index = batch.spec_info.unfinished_index
|
131
|
+
seq_lens = batch.seq_lens
|
132
|
+
batch.seq_lens = batch.seq_lens[index]
|
133
|
+
batch.spec_info.prepare_extend_after_decode(batch)
|
134
|
+
model_worker_batch = batch.get_model_worker_batch()
|
135
|
+
forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
|
136
|
+
forward_batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
|
137
|
+
logits_output = self.model_runner.forward(forward_batch)
|
138
|
+
batch.spec_info.hidden_states = logits_output.hidden_states
|
139
|
+
self.capture_for_decode(logits_output, forward_batch)
|
140
|
+
batch.forward_mode = ForwardMode.DECODE
|
141
|
+
if batch.spec_info.has_finished:
|
142
|
+
batch.seq_lens = seq_lens
|
143
|
+
self._swap_mem_pool(batch, self.target_worker.model_runner)
|
144
|
+
|
145
|
+
def capture_for_decode(self, logits_output, forward_batch):
|
146
|
+
if isinstance(logits_output, LogitsProcessorOutput):
|
147
|
+
logits = logits_output.next_token_logits
|
148
|
+
sample_output = torch.softmax(
|
149
|
+
logits, dim=-1
|
150
|
+
) # TODO: Support more sampling method @kavioyu
|
151
|
+
forward_batch.spec_info.capture_for_decode(
|
152
|
+
sample_output, logits_output.hidden_states, forward_batch.forward_mode
|
153
|
+
)
|
154
|
+
|
155
|
+
# Don't support prefix share now.
|
156
|
+
def finish_request(self, reqs: Union[Req, List[Req]]):
|
157
|
+
if not isinstance(reqs, List):
|
158
|
+
reqs = [reqs]
|
159
|
+
for req in reqs:
|
160
|
+
req_len = (
|
161
|
+
len(req.origin_input_ids)
|
162
|
+
+ len(req.output_ids)
|
163
|
+
- self.finish_extend_len[req.rid]
|
164
|
+
- 1
|
165
|
+
)
|
166
|
+
kv_indices = self.model_runner.req_to_token_pool.req_to_token[
|
167
|
+
req.req_pool_idx
|
168
|
+
][:req_len]
|
169
|
+
self.model_runner.token_to_kv_pool.free(kv_indices)
|
170
|
+
self.model_runner.req_to_token_pool.free(req.req_pool_idx)
|
@@ -2,8 +2,12 @@ from enum import IntEnum, auto
|
|
2
2
|
|
3
3
|
|
4
4
|
class SpeculativeAlgorithm(IntEnum):
|
5
|
+
NONE = auto()
|
5
6
|
EAGLE = auto()
|
6
7
|
|
8
|
+
def is_none(self):
|
9
|
+
return self == SpeculativeAlgorithm.NONE
|
10
|
+
|
7
11
|
def is_eagle(self):
|
8
12
|
return self == SpeculativeAlgorithm.EAGLE
|
9
13
|
|
@@ -11,6 +15,7 @@ class SpeculativeAlgorithm(IntEnum):
|
|
11
15
|
def from_string(name: str):
|
12
16
|
name_map = {
|
13
17
|
"EAGLE": SpeculativeAlgorithm.EAGLE,
|
18
|
+
None: SpeculativeAlgorithm.NONE,
|
14
19
|
}
|
15
20
|
return name_map[name]
|
16
21
|
|
sglang/srt/utils.py
CHANGED
@@ -15,6 +15,7 @@
|
|
15
15
|
|
16
16
|
import base64
|
17
17
|
import dataclasses
|
18
|
+
import io
|
18
19
|
import ipaddress
|
19
20
|
import itertools
|
20
21
|
import json
|
@@ -34,6 +35,7 @@ import warnings
|
|
34
35
|
from functools import lru_cache
|
35
36
|
from importlib.metadata import PackageNotFoundError, version
|
36
37
|
from io import BytesIO
|
38
|
+
from multiprocessing.reduction import ForkingPickler
|
37
39
|
from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
|
38
40
|
|
39
41
|
import numpy as np
|
@@ -60,7 +62,6 @@ from triton.runtime.cache import (
|
|
60
62
|
|
61
63
|
logger = logging.getLogger(__name__)
|
62
64
|
|
63
|
-
|
64
65
|
show_time_cost = False
|
65
66
|
time_infos = {}
|
66
67
|
|
@@ -1206,7 +1207,6 @@ def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) ->
|
|
1206
1207
|
# https://github.com/pytorch/pytorch/blob/
|
1207
1208
|
# c1cd946818442aca8c7f812b16d187ce1586c3bc/
|
1208
1209
|
# torch/cuda/__init__.py#L831C1-L831C17
|
1209
|
-
import torch.cuda
|
1210
1210
|
import torch.version
|
1211
1211
|
|
1212
1212
|
if not torch.cuda._is_compiled():
|
@@ -1335,3 +1335,16 @@ def parse_tool_response(text, tools, **kwargs):
|
|
1335
1335
|
for call_info in call_info_list
|
1336
1336
|
]
|
1337
1337
|
return text, call_info_list
|
1338
|
+
|
1339
|
+
|
1340
|
+
class MultiprocessingSerializer:
|
1341
|
+
@staticmethod
|
1342
|
+
def serialize(obj):
|
1343
|
+
buf = io.BytesIO()
|
1344
|
+
ForkingPickler(buf).dump(obj)
|
1345
|
+
buf.seek(0)
|
1346
|
+
return buf.read()
|
1347
|
+
|
1348
|
+
@staticmethod
|
1349
|
+
def deserialize(data):
|
1350
|
+
return ForkingPickler.loads(data)
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.4.1.post3"
|
1
|
+
__version__ = "0.4.1.post4"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.4.1.post3
|
3
|
+
Version: 0.4.1.post4
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -243,11 +243,11 @@ Requires-Dist: torch; extra == "srt"
|
|
243
243
|
Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
|
244
244
|
Requires-Dist: cuda-python; extra == "srt"
|
245
245
|
Requires-Dist: flashinfer==0.1.6; extra == "srt"
|
246
|
-
Requires-Dist: sgl-kernel>=0.0.2.
|
246
|
+
Requires-Dist: sgl-kernel>=0.0.2.post11; extra == "srt"
|
247
247
|
Provides-Extra: srt-hip
|
248
248
|
Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
|
249
249
|
Requires-Dist: torch; extra == "srt-hip"
|
250
|
-
Requires-Dist: vllm==0.6.3.
|
250
|
+
Requires-Dist: vllm==0.6.3.post2.dev1; extra == "srt-hip"
|
251
251
|
Provides-Extra: srt-xpu
|
252
252
|
Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
|
253
253
|
Provides-Extra: srt-hpu
|
@@ -315,7 +315,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
|
|
315
315
|
|
316
316
|
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
|
317
317
|
| [**Documentation**](https://sgl-project.github.io/)
|
318
|
-
| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-
|
318
|
+
| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw)
|
319
319
|
| [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
|
320
320
|
| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
|
321
321
|
|
@@ -347,9 +347,10 @@ The core features include:
|
|
347
347
|
|
348
348
|
## Getting Started
|
349
349
|
- [Install SGLang](https://sgl-project.github.io/start/install.html)
|
350
|
-
- [
|
351
|
-
- [Backend
|
352
|
-
- [Frontend
|
350
|
+
- [Quick Start](https://sgl-project.github.io/start/send_request.html)
|
351
|
+
- [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html)
|
352
|
+
- [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html)
|
353
|
+
- [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html)
|
353
354
|
|
354
355
|
## Benchmark and Performance
|
355
356
|
Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
|
@@ -361,5 +362,5 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
|
|
361
362
|
The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
|
362
363
|
|
363
364
|
## Acknowledgment and Citation
|
364
|
-
We learned
|
365
|
+
We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
|
365
366
|
Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
|
@@ -2,7 +2,7 @@ sglang/__init__.py,sha256=b2oIdWzp5P8SzieeOs2TzJoN3Do3tfJbV8gZS_imVcs,1619
|
|
2
2
|
sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
|
3
3
|
sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
|
4
4
|
sglang/bench_offline_throughput.py,sha256=r-uBvpnx-30mAnVwQB4WlqiXxy2fn5a1NUARwZcaIo4,12533
|
5
|
-
sglang/bench_one_batch.py,sha256=
|
5
|
+
sglang/bench_one_batch.py,sha256=uw__0H3e3lY_6EDz4IAZUoYxq9kQIOPbbcyguYxttSA,15975
|
6
6
|
sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
|
7
7
|
sglang/bench_serving.py,sha256=YQiCZreejCPBTqMmZsCB99RMi1N-Jx-dZtaafcQ8-14,53377
|
8
8
|
sglang/check_env.py,sha256=4OqpZaEJOfBM6-vtPILto5kqDmgiZM1Koc7lK78A7CI,8427
|
@@ -11,7 +11,7 @@ sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
|
|
11
11
|
sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
|
12
12
|
sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
|
13
13
|
sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
|
14
|
-
sglang/version.py,sha256=
|
14
|
+
sglang/version.py,sha256=efEbFOIgkOX__fKbqiqjj6UK2e0KofwnPDZo0VFdehs,28
|
15
15
|
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
16
|
sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
|
17
17
|
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
@@ -32,9 +32,9 @@ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21
|
|
32
32
|
sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
|
33
33
|
sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
|
34
34
|
sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
|
35
|
-
sglang/srt/server.py,sha256=
|
36
|
-
sglang/srt/server_args.py,sha256=
|
37
|
-
sglang/srt/utils.py,sha256=
|
35
|
+
sglang/srt/server.py,sha256=2HPaIwN8-KijB44ADUnpWD4A2mTKwW9CPl4NJBu9yzE,35068
|
36
|
+
sglang/srt/server_args.py,sha256=oIwBCTwSoj6qyKJ5zD38dPfgdXcNol64wV7l6UjmJNo,36208
|
37
|
+
sglang/srt/utils.py,sha256=Xn5Zf_HzRVSsGF6_lvd85hS-0I2MdQXbe0Yo46WsTOc,44447
|
38
38
|
sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
|
39
39
|
sglang/srt/configs/device_config.py,sha256=dResqHjkg_dq10v6rnVpbXpvABZRB0jylOm-2_JAnx0,428
|
40
40
|
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
@@ -63,19 +63,19 @@ sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwx
|
|
63
63
|
sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
|
64
64
|
sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
|
65
65
|
sglang/srt/layers/linear.py,sha256=KyRFU0VcoNuN-hnQB9QQcBN9NCpeqPtLzzufIHUpV6w,47064
|
66
|
-
sglang/srt/layers/logits_processor.py,sha256=
|
66
|
+
sglang/srt/layers/logits_processor.py,sha256=Yd7GisSfEgSq3cLMzz5lYiB5Cv-YgE4AMmVcACMBBZ4,12991
|
67
67
|
sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
|
68
68
|
sglang/srt/layers/radix_attention.py,sha256=E4cmvkcCdCtb6VyLNrCKy1D6VwHQ063oH3JQXPaRy6w,2178
|
69
69
|
sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
|
70
|
-
sglang/srt/layers/sampler.py,sha256=
|
71
|
-
sglang/srt/layers/torchao_utils.py,sha256=
|
70
|
+
sglang/srt/layers/sampler.py,sha256=HQWi1zb1gmD9pHMQyEP3WPjnL8vy-ncZDVMENbjQW7c,6944
|
71
|
+
sglang/srt/layers/torchao_utils.py,sha256=8c2vzt106iP_QKbJtfN1GuABW8nCuP5dElQLUeci6qg,3934
|
72
72
|
sglang/srt/layers/vocab_parallel_embedding.py,sha256=slGwLiWjuFLCUdRe-GTlfumyZpqVX9VF6No_UGOT-hA,21624
|
73
|
-
sglang/srt/layers/attention/__init__.py,sha256=
|
74
|
-
sglang/srt/layers/attention/double_sparsity_backend.py,sha256=
|
75
|
-
sglang/srt/layers/attention/flashinfer_backend.py,sha256=
|
76
|
-
sglang/srt/layers/attention/torch_native_backend.py,sha256=
|
77
|
-
sglang/srt/layers/attention/triton_backend.py,sha256
|
78
|
-
sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=
|
73
|
+
sglang/srt/layers/attention/__init__.py,sha256=GUoygIsXzDFt9I1w9p0GO7leaDK6l2J3cBRQfpp4oDI,2869
|
74
|
+
sglang/srt/layers/attention/double_sparsity_backend.py,sha256=QEDF8tQKMkh-nbt4jHKHZhhgHuV0Fla_BPzzoo9JfT4,9231
|
75
|
+
sglang/srt/layers/attention/flashinfer_backend.py,sha256=I8b3Dq1O9PijLN40lEK0Gjj8GNS46WF4K-QVOtVccTg,33218
|
76
|
+
sglang/srt/layers/attention/torch_native_backend.py,sha256=KrcAqTLVZLtwgOmB0xhwUUsX32M-5LYZpNxaRNT4VuA,9252
|
77
|
+
sglang/srt/layers/attention/triton_backend.py,sha256=44ScKsVs-rFvqsaAZG_mREEpczhGaUBvaflvWqrukVE,6743
|
78
|
+
sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=ltWcZ00ugpglSYvszpGb-UCpGIixdG25cWtSrOOOMik,17943
|
79
79
|
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
|
80
80
|
sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=DWOZXSTVN5ZbcFjDjcqs-nPdUkxSwum0SVXhVKqwh2g,11688
|
81
81
|
sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
|
@@ -85,8 +85,8 @@ sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
85
85
|
sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
|
86
86
|
sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
|
87
87
|
sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
|
88
|
-
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=
|
89
|
-
sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=
|
88
|
+
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=JjSn3fNTHgXFDxcAINZUYAttbYOxl9VSIF305NT73Wg,36255
|
89
|
+
sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=MZF6BHJVjduz-XerTrHvCP3qSZ3NW0pUK2p8zNwDuac,20798
|
90
90
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json",sha256=iNGsE2ZeVnQEnN4A8UJ9Jv0d3hbRF2MJ9oBgjup5Szk,2737
|
91
91
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=JJN0hryyLr5Zv3dSS7C8cPFhAwTT6XxUVnBGMZvV6JA,2752
|
92
92
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json",sha256=ouRyZ5PEMPP2njPftCNhs-1g1y6wueWLmhI7G1SjV1k,4131
|
@@ -123,43 +123,64 @@ sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=BclDj5JyCy-8Bfue4broL1-IG
|
|
123
123
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json",sha256=uv-RwTNZT2n264dLo4eWxUpB3g7QqUyf2MFEGiRvoqQ,3251
|
124
124
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=XbCRIOmiNqVKh89p-0UxvvspINRDA1iV83f9l5yORwA,3254
|
125
125
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=4uQnjGPWokscrxiXDIvexOA8OkK5vkoIulmvvMFIEog,3250
|
126
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=i5AXj26mWI-xEoOnLCZDXUzz8jk2RjDcGuaiT1QYSbY,3263
|
127
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json",sha256=fJKk5LEM_LSnq1yc3ekLqAfbUWzPojQA6yX3XgSFo-o,3254
|
126
128
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=iJVlnCYTNDMb6U1UnV46ZuL_8LcpOv_XFaYWIeRFeNA,3263
|
129
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=KfvSYCMG48vnRb35d9WOxYyZulI-RBrUGXUHQxXi4hk,3264
|
130
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json",sha256=60yuOluuk6q88Ze0toPJB8hzMBvF7ZWyMZpriMdQf3g,3252
|
127
131
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=DA4PrCu_BNLSWWVTwOicNfbyqUNW7BTZC2dyFz9DVbU,3265
|
128
132
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=prj-QxdfS9Ns5WRPvahY_Tr7CyqlaVgNHPT89SS5zzg,3239
|
133
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=AUgoHK1PmAFehSNmsbxunlBdzM50Q5nFvdnG9FSOjOw,3265
|
134
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json",sha256=pC5fdtEFc5aVNzpj_REHhz1QPrGvgI9iQCvlodDP7J8,3244
|
129
135
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=oxOKFDrgmw1YmgxTtRa1uoe3p09ylTLrkj_jOTqNh1Q,3249
|
130
136
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json",sha256=bfr70r6PmM95w7raabQOaOOSPiwU2OQCOZh-kKXIehY,3248
|
131
137
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json",sha256=4g9lABUJnB-iVwXfYPYcI05XFPG4jY8o0yJUK7kSPZM,3253
|
132
138
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=RGLqrAGvPCFZ0jMPBCJ0TqsnrSdW-EbUaSZu61cWGN8,3265
|
133
139
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=sjEVPVTgUAlp4s8tZLGSyeNzbW6zTtUm2IioH3nZsIg,3254
|
140
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=eD3Y9sOwHFcgVdOfya8KxPhvLx_b4whfEWm4d8Y2HW8,3268
|
141
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json",sha256=KIfpZydSl31FOEqq0EBfxTyWRj1QTDwTjkPHFjNO3_A,3253
|
134
142
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json",sha256=OnadAdmDbX17Ni9VPrNXYSsxYhbtBeniCxxhhb0UmUk,4733
|
135
143
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=V_sgDtEtGEuBsGVa0maYJHhhGqe1NE7l-1ek2ed9WP8,3082
|
144
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=LD4Z5MRR5Ivi4bYB5hMgymtvmFyVJwq6gmehA7fzecc,3271
|
145
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json",sha256=GLIH4egg-pE-NWU5XqKuJCoRXciHN6GSc3NaE4PaeYg,3261
|
136
146
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json",sha256=bKsYVJm-IvWHWpxUG-lMPkyNz0nQpDb4UEIv895c9JI,4730
|
137
147
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json",sha256=AffDc0_51ML8HiA3757zbD10TZJdUsUDIYIqO4g0yUw,3250
|
138
148
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=IEYBNjt9HGnzoOVSWvL0A0jUqq926QD0_BvVYR4RA1Y,3252
|
139
149
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=Ns9Y12aZbJnFhcG3nwb67bDqqiQAo9tdTAIe8K2Ajz4,3255
|
150
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=obNtHlqs6l6EBqGm0e0TD2wR9TYoQV_N9Y7om847WJk,3268
|
151
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json",sha256=juxJYeWYFHeLb-83_IDgrHEpoeSEursjXD43mTHBdLE,3246
|
140
152
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=HOxWmCI2ifHmWc0or2y8nEen86jDeLDov1-tuMzuhxo,3256
|
141
153
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=csHezh0HGWaNwrblGzMgcE95hqbqjWS8HImLRJYr_ts,3266
|
142
154
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=_5weLBinQCDzyV75hHKIT95Y0ce94KWft2_5BC6EkbQ,3254
|
155
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=4O4VeMpgFNrqWyWqWgYgcYAgBQnOlAXvt26CRSXK-sY,3270
|
156
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json",sha256=qfjbXqbl902TuiyzzomUy2sMvs-Dud8ZphDRY5WIPBM,3260
|
143
157
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json",sha256=_bw1_oads8tz51i4RVQUAjNi8r3b2Q2jPbi50TLFzlY,4732
|
144
158
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json",sha256=Ru460ZgnUP4U8OsJfwF8n-AI-gfcolNR3_qzoxG6DtY,3254
|
145
159
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=K6BGrKw_oHTAtHjsZldcjp-BUM1dIecKXrrRn9OpRGs,3254
|
146
160
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json",sha256=4Q_-yITMfijOMoguUM2n96clARh-DUFsS-4oW_a3Jpc,3252
|
147
161
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=qqFoMaObuO8pFWcSb9q0wYsdC4eSCO7B-_ruQhR1N9M,3264
|
148
162
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=-5nkLIunjG1ghPoUEtt2AXEQw9oGiilP7K3UvQv9CqE,3252
|
163
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=BXjSVGdvgP_-7xTvbHOO6ZrXWe0qSXiQChxoHGgWL7o,3263
|
164
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json",sha256=Pi2coJlJlpgqXiPRd77B_eCmmi7sCdBuoSGK1RA5YO8,3258
|
149
165
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json",sha256=p2qlRhTt7owWB8keEmoCrPZpo39IAxsKnULFQ7R38SI,3873
|
150
166
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json",sha256=AKIX43JVc26ERb862pNOMEfGhsgyk1OGa42EptAfG1s,4409
|
151
167
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=DxYu8regZOSFu8ugFGA_QbwWK4g8xwQUZF9a_nNY4Cs,3255
|
152
168
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=obzfE_9XgsbFNfC9biYOHxR-V_Bgc7PKT8qZZJaiJJc,3262
|
153
169
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=qwKy8oaMsd3QrXgQbM_x9xcfYiHK_Ou1CEwDPL5Gbgo,3259
|
170
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=rR8b-OuQ3watb8b2zuNlxKDSZpzlAagm9nb-FdKkt7s,3270
|
171
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json",sha256=8G_QqV_DhvZ6xSavMSpeE6qcXPVpsVjEtJabydybKqY,3263
|
154
172
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json",sha256=54KpHTMGt_zDQHqbdopuVHPpiI44ZsN_5LBUBZ_woY4,4733
|
155
173
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=BAJnXTZoewwCtzJLUPJ0oYuALv640MvDuLseGcsYaaw,3252
|
156
174
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=-Tj7ImS6ZFDof_0VTyq7kVm8XD9B54RD6CUOPSf3Jjg,3265
|
157
175
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=tme0ydWzIxdABZLk4tU8G_X2dJUYGGZNkQzNGcmcvUc,3261
|
176
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=3YQakSmUKhpw1KO7Hn-tEc-yyD1fEj01_6JlSYnrrlI,3274
|
177
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json",sha256=W2ka_U8pzwjzX62NEGKXR32uuSR_zfHD1XjXYf5bgBs,3262
|
158
178
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json",sha256=aMP7oZmh8BZnPOrl0MFibcdhTn3VmOSjqoKoK2rMSbU,4323
|
159
179
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=sY2nWMPh9lsIkhPCjkHO245wpnfFbrHmzdcZDVFPVww,3265
|
180
|
+
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=Uz5X80VcNBOaxshwVNUEittHk2zqB4HQCfTJ4TPG5aM,3274
|
160
181
|
sglang/srt/layers/quantization/__init__.py,sha256=VPYXShHvbvkOgVBlkIqic4RhdJ1y6EZ3r34T-nZMT1k,4606
|
161
182
|
sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
|
162
|
-
sglang/srt/layers/quantization/fp8.py,sha256=
|
183
|
+
sglang/srt/layers/quantization/fp8.py,sha256=ypIb8wUN18trzMhot8QKUj9sSdCXgPC1i2Qi-ESToWw,32670
|
163
184
|
sglang/srt/layers/quantization/fp8_kernel.py,sha256=cYF4ckqrUyhCO9Ha7zi05R8EhRaqSa8rFpYisz-9Ed0,10743
|
164
185
|
sglang/srt/layers/quantization/fp8_utils.py,sha256=qBVJXxbxqmf8-Juq0t-IXWjlaZoePJqFNYcs9-oT5Yo,4150
|
165
186
|
"sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json",sha256=tkLjwLC_aVXhzuvo-2QHkojXZauPJsf3jNHFn1S7uRA,3244
|
@@ -206,14 +227,14 @@ sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rds
|
|
206
227
|
sglang/srt/managers/data_parallel_controller.py,sha256=psI4FAuBGjtdnEuwagnGdtRqvqSSxOROfNKQqVDqlVA,8382
|
207
228
|
sglang/srt/managers/detokenizer_manager.py,sha256=XvyxUhY_SNXlAcVsx9zczrGllpEMzj7p2Vbh6M_yHy8,8555
|
208
229
|
sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
|
209
|
-
sglang/srt/managers/io_struct.py,sha256=
|
210
|
-
sglang/srt/managers/schedule_batch.py,sha256=
|
211
|
-
sglang/srt/managers/schedule_policy.py,sha256=
|
212
|
-
sglang/srt/managers/scheduler.py,sha256=
|
230
|
+
sglang/srt/managers/io_struct.py,sha256=02NMBHRCjs9TUSdhKJmvMp3HculMC-50SkCGOEaYEHg,16197
|
231
|
+
sglang/srt/managers/schedule_batch.py,sha256=WlJstk0MP4QA434zu6_VZ4FhDByEPjSggFDT8em4GvQ,46851
|
232
|
+
sglang/srt/managers/schedule_policy.py,sha256=aHkIL9pZtc4Kdmy8XU9tsjaDzdChVN2dnGKvJkSyqFg,17965
|
233
|
+
sglang/srt/managers/scheduler.py,sha256=sTnZDLN8gLLHkiQW4UvMZsdKBMkgKfUk-rAHUZ9zNR0,65272
|
213
234
|
sglang/srt/managers/session_controller.py,sha256=3laMRIXEYWDjfytCjPs0vw_Tw__k-nKBY-bYzycYbfc,5482
|
214
|
-
sglang/srt/managers/tokenizer_manager.py,sha256=
|
215
|
-
sglang/srt/managers/tp_worker.py,sha256
|
216
|
-
sglang/srt/managers/tp_worker_overlap_thread.py,sha256=
|
235
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=Xryex_dgdZzRmPtSe16WUz1p9IlGysjVWmocaHjOJz4,33686
|
236
|
+
sglang/srt/managers/tp_worker.py,sha256=-bvUFCo544QQSEHqPPjeOvCWMEFn01Bva6AeO39Qe3o,8043
|
237
|
+
sglang/srt/managers/tp_worker_overlap_thread.py,sha256=rdHz2thdGSmceDedrolHOqjNPhrralyDTuNREL56oNI,9067
|
217
238
|
sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
|
218
239
|
sglang/srt/mem_cache/chunk_cache.py,sha256=R2gHAuqKd5ayQW3NnsgoGUH31---Z5izCDyCqLL0FjQ,2524
|
219
240
|
sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
|
@@ -221,9 +242,9 @@ sglang/srt/mem_cache/memory_pool.py,sha256=oxk3UtiiFA3_1iIP6eFsk8HIcRI_8Z1-FE2KO
|
|
221
242
|
sglang/srt/mem_cache/radix_cache.py,sha256=c5voySV5L855c0G9cBEc9iQ4nR7PDDmg0V6fWWJHcq4,10945
|
222
243
|
sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTluMU,6861
|
223
244
|
sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
|
224
|
-
sglang/srt/model_executor/cuda_graph_runner.py,sha256=
|
225
|
-
sglang/srt/model_executor/forward_batch_info.py,sha256=
|
226
|
-
sglang/srt/model_executor/model_runner.py,sha256=
|
245
|
+
sglang/srt/model_executor/cuda_graph_runner.py,sha256=PFK4aRu8ffOBQw0zU_yYofUK_poi5C1vCc3ePixj0JY,18243
|
246
|
+
sglang/srt/model_executor/forward_batch_info.py,sha256=Z4VrcyQt3f4jPYvvm1vimMx3hSNNeUEONAFvSziy6N0,14788
|
247
|
+
sglang/srt/model_executor/model_runner.py,sha256=MZv0CNevcZ85L2JYVc1MnXRKbvUWoSuCFcShES07YN4,29889
|
227
248
|
sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
|
228
249
|
sglang/srt/model_loader/loader.py,sha256=7OG_8-66vFDFZ9kVKGNK1BFBjZ6ql449dlyvdCbMqvE,43876
|
229
250
|
sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
|
@@ -233,7 +254,7 @@ sglang/srt/models/chatglm.py,sha256=DOrEhmb0s-yPId88R6nJeLOTUEtogk-vkB69qT2JdWc,
|
|
233
254
|
sglang/srt/models/commandr.py,sha256=PNXgfOZF84h-rSH0edEECUmEGW8YLb44V75Z_oDhFiA,14223
|
234
255
|
sglang/srt/models/dbrx.py,sha256=okIpIwdr8Cfrz_thzc1F75XqCUfHhFLvZ1B6BaswKoA,14585
|
235
256
|
sglang/srt/models/deepseek.py,sha256=_cVOvR6eSEgRf6TUBpTD5uMdijDWFw4sSt4lGzl8tbg,15697
|
236
|
-
sglang/srt/models/deepseek_v2.py,sha256
|
257
|
+
sglang/srt/models/deepseek_v2.py,sha256=vbRhgI8yD7EmHUpq5pzI_sVpGLnkeyJ7ew-3Pl6D8F4,38499
|
237
258
|
sglang/srt/models/exaone.py,sha256=dkERTZVxrRroqu5AGLP7D4N6n8HvDqlNaDQUIe15mZY,13038
|
238
259
|
sglang/srt/models/gemma.py,sha256=ydRqsG-7004r1fAiz01LHUmcj_6XN0Tn4xO1keJnMQk,12126
|
239
260
|
sglang/srt/models/gemma2.py,sha256=-bFN-Te3YWAunLCrF-XFk_6fJS7gHM4Ca6h6aesXUTM,16362
|
@@ -241,7 +262,7 @@ sglang/srt/models/gemma2_reward.py,sha256=nJ01KfqLSJtqMLm3sG8p2mGZFK1xhhjh7I7Ccb
|
|
241
262
|
sglang/srt/models/gpt2.py,sha256=2je1kE09sGcaORWnJuGYAkcwwOrT9EK-KhQaoCKjCSA,9517
|
242
263
|
sglang/srt/models/gpt_bigcode.py,sha256=tovyOdJu2x3LkzmkdFXX_iJdkxuyChIDxwgvPBy6UPo,9528
|
243
264
|
sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,20409
|
244
|
-
sglang/srt/models/grok.py,sha256=
|
265
|
+
sglang/srt/models/grok.py,sha256=PbLmYP-UEbImJgbEpkHhTjkqibIvt0oENPlAIrK0qSE,17751
|
245
266
|
sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
|
246
267
|
sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
|
247
268
|
sglang/srt/models/llama.py,sha256=4UPKF7erp7qqBD11uvvQkO1Fo_wDs71BmA8Y2csXRcA,20302
|
@@ -271,17 +292,20 @@ sglang/srt/models/torch_native_llama.py,sha256=YeXHorFm6QfnczLXwPb5TG9a-He0uiA9R
|
|
271
292
|
sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
|
272
293
|
sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQs4,15541
|
273
294
|
sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
|
274
|
-
sglang/srt/openai_api/adapter.py,sha256=
|
275
|
-
sglang/srt/openai_api/protocol.py,sha256=
|
276
|
-
sglang/srt/sampling/sampling_batch_info.py,sha256=
|
277
|
-
sglang/srt/sampling/sampling_params.py,sha256=
|
295
|
+
sglang/srt/openai_api/adapter.py,sha256=Yv-rEA0Jd54iFlnkVy-OZM4EnPqkW_NLtDPGCiPWVWo,56386
|
296
|
+
sglang/srt/openai_api/protocol.py,sha256=v_YUwH1PF4vIVqSE5rj1ODdSglprTe_vGiXoS99cOV4,11613
|
297
|
+
sglang/srt/sampling/sampling_batch_info.py,sha256=4FZIt_w5pDerRbny3uUplQO23xxnU8lmtG91OPCB_4w,9254
|
298
|
+
sglang/srt/sampling/sampling_params.py,sha256=KjUhZzRJvNTQZgJul2zSq3U8r352WzMKLbXfhP3V-nU,5685
|
278
299
|
sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
|
279
300
|
sglang/srt/sampling/penaltylib/orchestrator.py,sha256=J-DEemZcKm1--o37kf3qDOE8SZ_6H3d5oex49Mgq2ZU,10762
|
280
301
|
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD60mwD1tCcSG0x5IYo0v4z9ce-q_YwbJ9f8,2490
|
281
302
|
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=_Nxv0XgUPirZjw2SEJYp_Cd9ZcLwmt7h6JE6J4hhFq4,3629
|
282
303
|
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=5tOgCg7OvE9kSN9VMCpH1hwqo1YMxt9iS5PVpct9HpU,2468
|
283
304
|
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=m22Rfn1RuB1HpImBDECsiJ2VooBYpsFADAwnk1EPzk0,2751
|
284
|
-
sglang/srt/speculative/
|
305
|
+
sglang/srt/speculative/build_eagle_tree.py,sha256=SIKuOFUOIzMLyanL5vViPmFBEiUHm_ezwiGuIyLmauE,9886
|
306
|
+
sglang/srt/speculative/eagle_utils.py,sha256=VI8P8j9f1R5fRBIjiLXC8iYteMpqqeTQmaZ9OEudFuE,22983
|
307
|
+
sglang/srt/speculative/eagle_worker.py,sha256=cAqw91E-5Tl6Sgltf0q2eYwUEH_trWHdibMWqy1rPKQ,7548
|
308
|
+
sglang/srt/speculative/spec_info.py,sha256=D7A27UU1iOwIBEjXTgAxZ7jdftbTiVlMCvK8GmYr2zg,488
|
285
309
|
sglang/test/few_shot_gsm8k.py,sha256=7yDbEQe49gZeJhz2wFFX-gf_59ThDKsCS1xwfogNc7k,4034
|
286
310
|
sglang/test/few_shot_gsm8k_engine.py,sha256=QQbrwOX6-cJDD3RZC_e7zPnt6aSo8JdF8X_lRHSjdDM,3886
|
287
311
|
sglang/test/run_eval.py,sha256=9yO0hXZOcn4abEOs96T-XPguDEklK16Ltco0pGF3zCg,4020
|
@@ -298,8 +322,8 @@ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c
|
|
298
322
|
sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
|
299
323
|
sglang/test/test_utils.py,sha256=HJG7kUQOk6n9FBbH89PDtQ41C3kt1cfJODhAEcFT0AQ,23823
|
300
324
|
sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
|
301
|
-
sglang-0.4.1.
|
302
|
-
sglang-0.4.1.
|
303
|
-
sglang-0.4.1.
|
304
|
-
sglang-0.4.1.
|
305
|
-
sglang-0.4.1.
|
325
|
+
sglang-0.4.1.post4.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
|
326
|
+
sglang-0.4.1.post4.dist-info/METADATA,sha256=nI0C5ivDIygS7D_lOrLwV_xqvHAnlthIEA6zXmf_-54,22601
|
327
|
+
sglang-0.4.1.post4.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
|
328
|
+
sglang-0.4.1.post4.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
329
|
+
sglang-0.4.1.post4.dist-info/RECORD,,
|
File without changes
|
File without changes
|