sglang 0.3.1.post3__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +4 -10
- sglang/bench_server_latency.py +0 -6
- sglang/srt/hf_transformers_utils.py +1 -0
- sglang/srt/layers/attention_backend.py +3 -11
- sglang/srt/layers/fused_moe/patch.py +117 -0
- sglang/srt/managers/schedule_batch.py +3 -5
- sglang/srt/managers/tokenizer_manager.py +1 -0
- sglang/srt/managers/tp_worker.py +1 -1
- sglang/srt/mem_cache/radix_cache.py +5 -5
- sglang/srt/model_executor/cuda_graph_runner.py +10 -6
- sglang/srt/model_executor/forward_batch_info.py +2 -4
- sglang/srt/model_executor/model_runner.py +0 -3
- sglang/srt/models/llama.py +8 -0
- sglang/srt/openai_api/adapter.py +7 -0
- sglang/test/runners.py +7 -9
- sglang/test/test_utils.py +35 -0
- sglang/version.py +1 -1
- {sglang-0.3.1.post3.dist-info → sglang-0.3.2.dist-info}/METADATA +7 -6
- {sglang-0.3.1.post3.dist-info → sglang-0.3.2.dist-info}/RECORD +22 -21
- {sglang-0.3.1.post3.dist-info → sglang-0.3.2.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.2.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.2.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py
CHANGED
@@ -260,7 +260,7 @@ def correctness_test(
 
     # Decode
    output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
-    for _ in range(bench_args.output_len[0]):
+    for _ in range(bench_args.output_len[0] - 1):
        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
        for i in range(len(reqs)):
            output_ids[i].append(next_token_ids[i])
@@ -311,7 +311,7 @@ def latency_test_run_once(
 
    # Decode
    decode_latencies = []
-    for i in range(output_len):
+    for i in range(output_len - 1):
        torch.cuda.synchronize()
        tic = time.time()
        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
@@ -491,18 +491,10 @@ def main(server_args, bench_args):
 
 
 if __name__ == "__main__":
-    multiprocessing.set_start_method("spawn", force=True)
-
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
-    # For this script, model-path is not required
-    assert (
-        parser._actions[1].option_strings[0] == "--model-path"
-    ), "options changed, this code need to be updated"
-    parser._actions[1].required = False
     args = parser.parse_args()
-
     server_args = ServerArgs.from_cli_args(args)
     bench_args = BenchArgs.from_cli_args(args)
 
@@ -511,6 +503,8 @@ if __name__ == "__main__":
         format="%(message)s",
     )
 
+    multiprocessing.set_start_method("spawn", force=True)
+
     try:
         main(server_args, bench_args)
     except Exception as e:
sglang/bench_server_latency.py
CHANGED
@@ -174,13 +174,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
-    # For this script, model-path is not required
-    assert (
-        parser._actions[1].option_strings[0] == "--model-path"
-    ), "options changed, this code need to be updated"
-    parser._actions[1].required = False
     args = parser.parse_args()
-
     server_args = ServerArgs.from_cli_args(args)
     bench_args = BenchArgs.from_cli_args(args)
 
sglang/srt/layers/attention_backend.py
CHANGED
@@ -86,17 +86,9 @@ class FlashInferAttnBackend(AttentionBackend):
         super().__init__()
         self.model_runner = model_runner
 
-        local_num_qo_heads = (
-            model_runner.model_config.num_attention_heads // model_runner.tp_size
-        )
-        local_num_kv_heads = model_runner.model_config.get_num_kv_heads(
-            model_runner.tp_size
-        )
-        if (
-            not _grouped_size_compiled_for_decode_kernels(
-                local_num_qo_heads, local_num_kv_heads
-            )
-            or local_num_qo_heads // local_num_kv_heads > 4
+        if not _grouped_size_compiled_for_decode_kernels(
+            model_runner.model_config.num_attention_heads // model_runner.tp_size,
+            model_runner.model_config.get_num_kv_heads(model_runner.tp_size),
         ):
             self.decode_use_tensor_cores = True
         else:
sglang/srt/layers/fused_moe/patch.py
ADDED
@@ -0,0 +1,117 @@
+from typing import Optional
+
+import torch
+from torch.nn import functional as F
+
+
+def fused_topk_native(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+    M, _ = hidden_states.shape
+    topk_weights = torch.empty(
+        M, topk, dtype=torch.float32, device=hidden_states.device
+    )
+    topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
+    topk_weights = F.softmax(gating_output.float(), dim=-1)
+    topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    return topk_weights, topk_ids
+
+
+# This is used by the Deepseek-V2 model
+def grouped_topk(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: int = 0,
+    topk_group: int = 0,
+):
+
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    scores = torch.softmax(gating_output, dim=-1)
+    num_token = scores.shape[0]
+    group_scores = (
+        scores.view(num_token, num_expert_group, -1).max(dim=-1).values
+    )  # [n, n_group]
+    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
+        1
+    ]  # [n, top_k_group]
+    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+    score_mask = (
+        group_mask.unsqueeze(-1)
+        .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
+        .reshape(num_token, -1)
+    )  # [n, e]
+    tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
+    topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    return topk_weights, topk_ids
+
+
+def select_experts_native(
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    top_k: int,
+    use_grouped_topk: bool,
+    renormalize: bool,
+    topk_group: Optional[int] = None,
+    num_expert_group: Optional[int] = None,
+):
+    # DeekSeekv2 uses grouped_top_k
+    if use_grouped_topk:
+        assert topk_group is not None
+        assert num_expert_group is not None
+        topk_weights, topk_ids = grouped_topk(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=top_k,
+            renormalize=renormalize,
+            num_expert_group=num_expert_group,
+            topk_group=topk_group,
+        )
+    else:
+        topk_weights, topk_ids = fused_topk_native(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=top_k,
+            renormalize=renormalize,
+        )
+    return topk_weights, topk_ids
+
+
+def fused_moe_forward_native(
+    layer: torch.nn.Module,
+    x: torch.Tensor,
+    use_grouped_topk: bool,
+    top_k: int,
+    router_logits: torch.Tensor,
+    renormalize: bool,
+    topk_group: Optional[int] = None,
+    num_expert_group: Optional[int] = None,
+) -> torch.Tensor:
+    topk_weights, topk_ids = select_experts_native(
+        hidden_states=x,
+        router_logits=router_logits,
+        use_grouped_topk=use_grouped_topk,
+        top_k=top_k,
+        renormalize=renormalize,
+        topk_group=topk_group,
+        num_expert_group=num_expert_group,
+    )
+    w13_weights = layer.w13_weight[topk_ids]
+    w1_weights, w3_weights = torch.chunk(w13_weights, 2, dim=2)
+    w2_weights = layer.w2_weight[topk_ids]
+    x1 = F.silu(torch.einsum("ti,taoi -> tao", x, w1_weights))
+    x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
+    expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
+    return torch.einsum("tai,ta -> ti", expert_outs, topk_weights)
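The new `patch.py` provides a pure-PyTorch MoE forward (`fused_moe_forward_native`) that the cuda graph runner can substitute for the fused CUDA kernel when torch.compile is enabled. Below is a minimal sketch of calling it directly; the `SimpleNamespace` stand-in for a FusedMoE layer, the shapes, and all tensor values are hypothetical and only illustrate the expected layout (`w13_weight` packs the gate and up projections per expert, `w2_weight` is the down projection).

```
# Hedged usage sketch for fused_moe_forward_native; shapes and the fake
# "layer" object are made up for illustration, not taken from the package.
from types import SimpleNamespace

import torch

from sglang.srt.layers.fused_moe.patch import fused_moe_forward_native

tokens, hidden, experts, intermediate, top_k = 4, 8, 6, 16, 2
layer = SimpleNamespace(
    w13_weight=torch.randn(experts, 2 * intermediate, hidden),  # gate + up proj
    w2_weight=torch.randn(experts, hidden, intermediate),       # down proj
)
x = torch.randn(tokens, hidden)
router_logits = torch.randn(tokens, experts)

out = fused_moe_forward_native(
    layer,
    x,
    use_grouped_topk=False,
    top_k=top_k,
    router_logits=router_logits,
    renormalize=True,
)
print(out.shape)  # torch.Size([4, 8]) -- one hidden vector per token
```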
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -429,7 +429,7 @@ class ScheduleBatch:
     def prepare_for_extend(self, vocab_size: int):
         self.forward_mode = ForwardMode.EXTEND
 
-        bs = self.
+        bs = len(self.reqs)
         reqs = self.reqs
         input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs]
         extend_num_tokens = sum(len(ids) for ids in input_ids)
@@ -509,7 +509,7 @@ class ScheduleBatch:
         self.extend_logprob_start_lens_cpu.extend([0] * running_bs)
 
     def check_decode_mem(self):
-        bs = self.
+        bs = len(self.reqs)
         if self.token_to_kv_pool.available_size() >= bs:
             return True
 
@@ -680,14 +680,12 @@ class ScheduleBatch:
             r.output_ids[-1] if r.output_ids else r.origin_input_ids[-1]
             for r in self.reqs
         ]
-        else:
-            self.sampling_info.penalizer_orchestrator.cumulate_input_tokens(input_ids)
 
         self.input_ids = torch.tensor(input_ids, dtype=torch.int32, device="cuda")
         self.seq_lens.add_(1)
 
         # Alloc mem
-        bs = self.
+        bs = len(self.reqs)
         self.out_cache_loc = self.alloc_token_slots(bs)
 
         self.req_to_token_pool.req_to_token[
sglang/srt/managers/tp_worker.py
CHANGED
@@ -215,6 +215,7 @@ class ModelTpServer:
         self.new_token_ratio_decay = global_config.new_token_ratio_decay
         self.do_not_get_new_batch = False
 
+    @torch.inference_mode()
     def exposed_step(self, recv_reqs: List):
         try:
             # Recv requests
@@ -246,7 +247,6 @@ class ModelTpServer:
             self.out_pyobjs = []
         return ret
 
-    @torch.inference_mode()
     def forward_step(self):
         if self.do_not_get_new_batch and self.current_inflight_req is None:
             new_batch = None
sglang/srt/mem_cache/radix_cache.py
CHANGED
@@ -291,15 +291,15 @@ class RadixCache(BasePrefixCache):
 
     def _collect_leaves(self):
         ret_list = []
+        stack = [self.root_node]
 
-        def dfs_(cur_node):
+        while stack:
+            cur_node = stack.pop()
             if len(cur_node.children) == 0:
                 ret_list.append(cur_node)
+            else:
+                stack.extend(cur_node.children.values())
 
-            for x in cur_node.children.values():
-                dfs_(x)
-
-        dfs_(self.root_node)
         return ret_list
 
 
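The `_collect_leaves` change replaces a recursive DFS with an explicit stack, which avoids Python recursion-depth limits on very deep radix trees. A stand-alone sketch of the same traversal on a toy node type (the `Node` class below is a made-up stand-in, not the cache's real node class):

```
# Toy illustration of the stack-based leaf collection; Node is hypothetical.
class Node:
    def __init__(self, children=None):
        self.children = children or {}


def collect_leaves(root):
    ret_list = []
    stack = [root]
    while stack:
        cur_node = stack.pop()
        if len(cur_node.children) == 0:
            ret_list.append(cur_node)  # no children -> leaf
        else:
            stack.extend(cur_node.children.values())
    return ret_list


root = Node({"a": Node(), "b": Node({"c": Node()})})
print(len(collect_leaves(root)))  # 2 -- the "a" and "c" nodes
```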
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -25,6 +25,7 @@ import torch
 from vllm.distributed.parallel_state import graph_capture
 from vllm.model_executor.custom_op import CustomOp
 
+from sglang.srt.layers.fused_moe.patch import fused_moe_forward_native
 from sglang.srt.layers.logits_processor import (
     LogitsMetadata,
     LogitsProcessor,
@@ -41,14 +42,15 @@ if TYPE_CHECKING:
 def _to_torch(model: torch.nn.Module, reverse: bool = False):
     for sub in model._modules.values():
         if isinstance(sub, CustomOp):
-            # NOTE: FusedMoE torch native implementaiton is not efficient
-            if "FusedMoE" in sub.__class__.__name__:
-                continue
             if reverse:
                 sub._forward_method = sub.forward_cuda
                 setattr(sub, "is_torch_compile", False)
             else:
-                sub._forward_method = sub.forward_native
+                # NOTE: Temporarily workaround MoE
+                if "FusedMoE" in sub.__class__.__name__:
+                    sub._forward_method = fused_moe_forward_native
+                else:
+                    sub._forward_method = sub.forward_native
                 setattr(sub, "is_torch_compile", True)
         if isinstance(sub, torch.nn.Module):
             _to_torch(sub, reverse)
@@ -67,7 +69,9 @@ def patch_model(
         monkey_patch_vllm_all_gather()
         backup_ca_comm = tp_group.ca_comm
         tp_group.ca_comm = None
-        yield torch.compile(
+        yield torch.compile(
+            torch.no_grad()(model.forward), mode="max-autotune-no-cudagraphs"
+        )
     else:
         yield model.forward
 finally:
@@ -150,7 +154,7 @@ class CudaGraphRunner:
                 f"Capture cuda graph failed: {e}\n"
                 "Possible solutions:\n"
                 "1. disable cuda graph by --disable-cuda-graph\n"
-                "2. set --mem-fraction-static to a smaller value\n"
+                "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                 "3. disable torch compile by not using --enable-torch-compile\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )
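The `patch_model` change wraps the forward function in `torch.no_grad()` before compiling, so the compiled graph never records autograd state. A minimal, self-contained sketch of that pattern (the toy `fn` below is hypothetical and stands in for `model.forward`):

```
# Sketch of compiling a no-grad-wrapped callable, as done in patch_model above.
import torch


def fn(x):
    # stand-in for model.forward
    return x * 2 + 1


compiled = torch.compile(torch.no_grad()(fn), mode="max-autotune-no-cudagraphs")
print(compiled(torch.ones(4)))  # tensor([3., 3., 3., 3.]), no grad tracked
```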
sglang/srt/model_executor/forward_batch_info.py
CHANGED
@@ -97,14 +97,12 @@ class InputMetadata:
         self.modalities = [r.modalities for r in reqs]
 
     def compute_positions(self, batch: ScheduleBatch):
-        position_ids_offsets = batch.position_ids_offsets
-
         if self.forward_mode.is_decode():
             if True:
                 self.positions = self.seq_lens - 1
             else:
                 # Deprecated
-                self.positions = (self.seq_lens - 1) + position_ids_offsets
+                self.positions = (self.seq_lens - 1) + batch.position_ids_offsets
         else:
             if True:
                 self.positions = torch.tensor(
@@ -119,7 +117,7 @@ class InputMetadata:
                 )
             else:
                 # Deprecated
-                position_ids_offsets_cpu = position_ids_offsets.cpu().numpy()
+                position_ids_offsets_cpu = batch.position_ids_offsets.cpu().numpy()
                 self.positions = torch.tensor(
                     np.concatenate(
                         [
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -467,7 +467,6 @@ class ModelRunner:
         logger.info("Capture cuda graph begin. This can take up to several minutes.")
         self.cuda_graph_runner = CudaGraphRunner(self)
 
-    @torch.inference_mode()
     def forward_decode(self, batch: ScheduleBatch):
         if self.server_args.lora_paths is not None:
             self.lora_manager.prepare_lora_batch(batch)
@@ -481,7 +480,6 @@ class ModelRunner:
             batch.input_ids, input_metadata.positions, input_metadata
         )
 
-    @torch.inference_mode()
     def forward_extend(self, batch: ScheduleBatch):
         input_metadata = InputMetadata.from_schedule_batch(self, batch)
         if self.server_args.lora_paths is not None:
@@ -500,7 +498,6 @@ class ModelRunner:
             get_embedding=True,
         )
 
-    @torch.inference_mode()
     def forward_extend_multi_modal(self, batch: ScheduleBatch):
         input_metadata = InputMetadata.from_schedule_batch(self, batch)
         return self.model.forward(
sglang/srt/models/llama.py
CHANGED
@@ -403,6 +403,14 @@ class LlamaForCausalLM(nn.Module):
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
 
+        if (
+            hasattr(self.config, "tie_word_embeddings")
+            and self.config.tie_word_embeddings
+        ):
+            # Tie output embedding layer to input embedding layer, to solve issues where lm_head.weight is missing
+            param = self.lm_head.weight
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, self.model.embed_tokens.weight)
         apply_torchao_config_(self, params_dict, set(["proj.weight"]))
 
 
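The new block ties `lm_head.weight` to `model.embed_tokens.weight` when the checkpoint sets `tie_word_embeddings`, so models that ship without an explicit `lm_head.weight` tensor still load correctly. A tiny toy illustration of what weight tying means (module sizes below are made up):

```
# Toy illustration of output/input embedding tying; sizes are hypothetical.
import torch.nn as nn

vocab, hidden = 100, 16
embed_tokens = nn.Embedding(vocab, hidden)
lm_head = nn.Linear(hidden, vocab, bias=False)

# After tying, both modules share one tensor, so loading (or training)
# the embedding also fills the output projection.
lm_head.weight = embed_tokens.weight
assert lm_head.weight.data_ptr() == embed_tokens.weight.data_ptr()
```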
sglang/srt/openai_api/adapter.py
CHANGED
@@ -858,11 +858,18 @@ def v1_chat_generate_request(
                 openai_compatible_messages.append(
                     {"role": message.role, "content": content["text"]}
                 )
+        if openai_compatible_messages[-1]["role"] == "assistant":
+            assistant_prefix = openai_compatible_messages[-1]["content"]
+            openai_compatible_messages = openai_compatible_messages[:-1]
+        else:
+            assistant_prefix = None
         prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
             openai_compatible_messages,
             tokenize=True,
             add_generation_prompt=True,
         )
+        if assistant_prefix:
+            prompt_ids += tokenizer_manager.tokenizer.encode(assistant_prefix)
         stop = request.stop
         image_data = None
         modalities = []
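This adapter change lets a chat request end with a partial assistant message: the trailing message is stripped before the chat template is applied, then its text is re-encoded and appended, so generation continues the given prefix. A stand-alone restatement of that logic with a stub tokenizer (everything below is illustrative, not the adapter's actual code path):

```
# Illustrative stub mirroring the assistant-prefix handling above.
class StubTokenizer:
    # Fake tokenizer for the sketch only; the real code uses the HF tokenizer.
    def apply_chat_template(self, messages, tokenize=True, add_generation_prompt=True):
        return [hash(m["content"]) % 1000 for m in messages]

    def encode(self, text):
        return [hash(text) % 1000]


def build_prompt_ids(messages, tokenizer):
    # A trailing assistant message becomes a prefix for the model to continue.
    if messages[-1]["role"] == "assistant":
        assistant_prefix = messages[-1]["content"]
        messages = messages[:-1]
    else:
        assistant_prefix = None
    prompt_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True
    )
    if assistant_prefix:
        prompt_ids += tokenizer.encode(assistant_prefix)
    return prompt_ids


msgs = [
    {"role": "user", "content": "Write a haiku about GPUs."},
    {"role": "assistant", "content": "Silicon rivers"},  # partial answer to continue
]
print(build_prompt_ids(msgs, StubTokenizer()))
```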
sglang/test/runners.py
CHANGED
@@ -21,19 +21,19 @@ from typing import List, Union
 
 import torch
 import torch.nn.functional as F
-from
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM
 
+from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.server import Runtime
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER
 
 DEFAULT_PROMPTS = [
-    # the output of gemma-2-2b from SRT is unstable on the commented prompt
-    # "The capital of France is",
     "Apple is red. Banana is Yellow. " * 800 + "Apple is",
     "The capital of the United Kingdom is",
     "Today is a sunny day and I like",
     "AI is a field of computer science focused on",
+    # the output of gemma-2-2b from SRT is unstable on the commented prompt
+    # "The capital of France is",
 ]
 
 dirpath = os.path.dirname(__file__)
@@ -93,11 +93,7 @@ class HFRunner:
         self.model_proc.start()
 
     def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
-        self.tokenizer =
-            model_path,
-            torch_dtype=torch_dtype,
-        )
-
+        self.tokenizer = get_tokenizer(model_path)
         if self.is_generation:
             self.base_model = AutoModelForCausalLM.from_pretrained(
                 model_path,
@@ -132,6 +128,8 @@ class HFRunner:
             input_ids = torch.tensor([p], device="cuda")
 
             if lora_paths is not None and lora_paths[i] is not None:
+                from peft import PeftModel
+
                 self.model = PeftModel.from_pretrained(
                     self.base_model,
                     lora_paths[i],
sglang/test/test_utils.py
CHANGED
@@ -26,6 +26,7 @@ DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
@@ -587,3 +588,37 @@ def run_bench_latency(model, other_args):
         kill_child_process(process.pid)
 
     return output_throughput
+
+
+def lcs(X, Y):
+    m = len(X)
+    n = len(Y)
+    L = [[0] * (n + 1) for _ in range(m + 1)]
+
+    for i in range(m + 1):
+        for j in range(n + 1):
+            if i == 0 or j == 0:
+                L[i][j] = 0
+            elif X[i - 1] == Y[j - 1]:
+                L[i][j] = L[i - 1][j - 1] + 1
+            else:
+                L[i][j] = max(L[i - 1][j], L[i][j - 1])
+
+    return L[m][n]
+
+
+def calculate_rouge_l(output_strs_list1, output_strs_list2):
+    """calculate the ROUGE-L score"""
+    rouge_l_scores = []
+
+    for s1, s2 in zip(output_strs_list1, output_strs_list2):
+        lcs_len = lcs(s1, s2)
+        precision = lcs_len / len(s1) if len(s1) > 0 else 0
+        recall = lcs_len / len(s2) if len(s2) > 0 else 0
+        if precision + recall > 0:
+            fmeasure = (2 * precision * recall) / (precision + recall)
+        else:
+            fmeasure = 0.0
+        rouge_l_scores.append(fmeasure)
+
+    return rouge_l_scores
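The new `lcs`/`calculate_rouge_l` helpers score how closely two lists of outputs match, using a longest-common-subsequence F-measure, typically to compare HF and SRT generations in tests. A small usage sketch (the example strings are made up):

```
# Usage sketch for the new helpers; example strings are hypothetical.
from sglang.test.test_utils import calculate_rouge_l

reference = ["hello world"]
candidate = ["hello there world"]

# Character-level LCS length is 11 ("hello world"), so precision = 11/11,
# recall = 11/17, and the F-measure is 22/28, roughly 0.786.
print(calculate_rouge_l(reference, candidate))  # approx [0.786]
```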
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.
+__version__ = "0.3.2"
{sglang-0.3.1.post3.dist-info → sglang-0.3.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.
+Version: 0.3.2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.
+git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -348,9 +348,9 @@ docker run --gpus all \
 <summary>More</summary>
 
 > This method is recommended if you plan to serve it as a service.
-> A better approach is to use the [k8s-sglang-service.yaml](
+> A better approach is to use the [k8s-sglang-service.yaml](docker/k8s-sglang-service.yaml).
 
-1. Copy the [compose.yml](
+1. Copy the [compose.yml](docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
 </details>
 
@@ -521,6 +521,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - BaiChuan2
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
+- SmolLM
 
 
 **Embedding Models**
@@ -529,7 +530,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - gte-Qwen2
   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
 
-Instructions for supporting a new model are [here](
+Instructions for supporting a new model are [here](docs/en/model_support.md).
 
 #### Use Models From ModelScope
 <details>
@@ -824,7 +825,7 @@ def chat_example(s):
 Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 
 ## Roadmap
-[Development Roadmap (2024
+[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
{sglang-0.3.1.post3.dist-info → sglang-0.3.2.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
 sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
-sglang/bench_latency.py,sha256=
-sglang/bench_server_latency.py,sha256=
+sglang/bench_latency.py,sha256=8Mb_Z8jZk7pDD9OisGfZapyJOsbcwtfxURy2lQ7bNYI,17128
+sglang/bench_server_latency.py,sha256=rRSDqjJ5jan9AzppOGx75KRUjZCU2dUG2h06CQOdJgk,5377
 sglang/bench_serving.py,sha256=3gIJ1O2x51Fwd4wYJjgwluTbWKXL-azckQte7YC5zIc,36261
 sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
 sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
 sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
 sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
 sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
-sglang/version.py,sha256=
+sglang/version.py,sha256=vNiWJ14r_cw5t_7UDqDQIVZvladKFGyHH2avsLpN7Vg,22
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -24,7 +24,7 @@ sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI
 sglang/lang/backend/runtime_endpoint.py,sha256=MEyMl5cIAMwaWmp4j0HtuCOQ_XdJoyywztvAOGsicao,9832
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19663
-sglang/srt/hf_transformers_utils.py,sha256=
+sglang/srt/hf_transformers_utils.py,sha256=rt6flb6BoYTO8fw7AKCXmQLJx5XuSUuRmZX-VJHmuLQ,6064
 sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
 sglang/srt/server.py,sha256=n4QRn36_t-HAH-lSME3tiZSCUGRQwqMUckgs0paHq5g,20179
 sglang/srt/server_args.py,sha256=3XjDt6SSjTfbOe0HSXA--2aUvrpWSnQmAHYwmeS1-M0,23159
@@ -37,7 +37,7 @@ sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCfl
 sglang/srt/constrained/fsm_cache.py,sha256=k7DRUAaiLTEX5UarfJ17gEYQ-QWQAGfvykjYFkM_Y2U,2982
 sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
 sglang/srt/layers/activation.py,sha256=tRWHxIjcIopkOremkb5Jy5O0rgdB1PAhHfIEONfyj6Y,5166
-sglang/srt/layers/attention_backend.py,sha256=
+sglang/srt/layers/attention_backend.py,sha256=ySiSEHQnhZdQ6kV_9gkAOAP_UEANXSxaSOuLx3rZGzk,17946
 sglang/srt/layers/flashinfer_utils.py,sha256=jyaO7XiEisFZg_dfaCbfRCHSHSKYoM1wOzfHa0h1q14,7413
 sglang/srt/layers/layernorm.py,sha256=p_7bnmSpJ_slpoP0Gk5wQPpHtLllUu3imSIRBqGqTP0,3737
 sglang/srt/layers/linear.py,sha256=9rjCiSb_QOn5RgpVjIhEKdReRvSYVfcTSjbWBEbApLI,45173
@@ -49,6 +49,7 @@ sglang/srt/layers/torchao_utils.py,sha256=rTECwKSXhj_ylh_iSzfbopz9_lZOFHatquQrNJ
 sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
 sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
 sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
+sglang/srt/layers/fused_moe/patch.py,sha256=B9cDtHqHfnWE0QqZAffvUi6cVRKcMBMKDGJWGIaKh3U,3898
 sglang/srt/layers/quantization/__init__.py,sha256=wl9mIOeA6mtKIaW1LWUJABWPdqOb-2uZ-kSijWoxLtU,3095
 sglang/srt/layers/quantization/base_config.py,sha256=vlpSPvSrFmUe65ETg4SoPocQ9bVNY6As3QuHdr_3Dr4,4023
 sglang/srt/layers/triton_attention/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
@@ -62,17 +63,17 @@ sglang/srt/managers/controller_single.py,sha256=DiZALP_iIPZQMRx09a-LwT5_Dg7p-WU8
 sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
 sglang/srt/managers/io_struct.py,sha256=yNV5BmeUzLPqv19j79kXQ50Iaqdk4vP-_TciiRf4OEE,11396
 sglang/srt/managers/policy_scheduler.py,sha256=PVo0DV0-5ODNN7FkPkeF1Y8BQ6uuLldPETOlB_YvvL4,11560
-sglang/srt/managers/schedule_batch.py,sha256=
-sglang/srt/managers/tokenizer_manager.py,sha256=
-sglang/srt/managers/tp_worker.py,sha256=
+sglang/srt/managers/schedule_batch.py,sha256=rbBwX-Yy98WhaNfazgyyx4p5L3CaTOKMTOOYqpzEWng,27276
+sglang/srt/managers/tokenizer_manager.py,sha256=oo6UwyHMUGWMyWnGVlbpgzh-kiq3QSA1XU3eGGQNcA8,29727
+sglang/srt/managers/tp_worker.py,sha256=qTzR773tJdssLENqdkAcfAD0gn0c1Tlgx2IynJDlQcU,39503
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
 sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
 sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
 sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
-sglang/srt/mem_cache/radix_cache.py,sha256=
-sglang/srt/model_executor/cuda_graph_runner.py,sha256=
-sglang/srt/model_executor/forward_batch_info.py,sha256=
-sglang/srt/model_executor/model_runner.py,sha256=
+sglang/srt/mem_cache/radix_cache.py,sha256=00bghOihUm7lA1i4gxxMYQLept9LaHg2ZSXZryuFZZI,10121
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=GgD0iIzJQ6xmyTIozOQCluBkM58EcsXHXaP-wpbkHYQ,10698
+sglang/srt/model_executor/forward_batch_info.py,sha256=eDARLwjSnUGXzsLprTEQRtwC5kiRCk3NpbbfqkFDwS8,6094
+sglang/srt/model_executor/model_runner.py,sha256=CxBX35i7epmdVBFCoSl57JTZz8yOLxEj5WjSPs88tus,22642
 sglang/srt/models/baichuan.py,sha256=d2PFmyLBXjzS7X7FL9uz139_CpBPb5WYhzcHgF--gRE,15115
 sglang/srt/models/chatglm.py,sha256=chDkgLTRU3bPxTUilhW_FGnsUWj_2fkvulCi9pdDxBY,13353
 sglang/srt/models/commandr.py,sha256=FspSRkMRAXUjD3xzAkxkMiGiRg91czn9T5bagrf3l9M,14136
@@ -85,7 +86,7 @@ sglang/srt/models/gemma2.py,sha256=8wGqNQPaPjuTtgHiKsUP4nowOukPvXwRywD4lkAW9Dg,1
 sglang/srt/models/gpt_bigcode.py,sha256=k_pZa4Sg5GEsr4ln0kjP765moGUPNs5a6iANPjE2W8U,10177
 sglang/srt/models/grok.py,sha256=71Zx-4Q3wggNMtRYlXuPMA-auK-sHBYukI1Usn8LVrE,14911
 sglang/srt/models/internlm2.py,sha256=nEr6MSHFkTjPLvWl1jQQdGFO7iOHex6YtE-I4rYuLao,12184
-sglang/srt/models/llama.py,sha256=
+sglang/srt/models/llama.py,sha256=hTEi7Ce1RkbrTaAe_JuCdQprTbD1XkvglD1t9YecyvM,15629
 sglang/srt/models/llama_classification.py,sha256=UpwYsgNVS1065t7Yjmi2XGbk9Or8bq2cF82zH1Yx2Mg,3385
 sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
 sglang/srt/models/llava.py,sha256=1MG1JDDQb7xc67BSimDo98Gmvza6PmrHQHmKybsDui4,24872
@@ -103,7 +104,7 @@ sglang/srt/models/stablelm.py,sha256=v67JM1SHb-LinrsX598WMsLVeyzjoKquW6G5G30X5fQ
 sglang/srt/models/xverse.py,sha256=VThXXKg3DzepcEP1JHcqSyhRBvq6yL14oh4uj5TJOEM,13649
 sglang/srt/models/xverse_moe.py,sha256=BqmV-uk9ipp4nrj6-lnFfvkwUcuKmV7yfGAYB6Ob-UQ,15833
 sglang/srt/models/yivl.py,sha256=N3noJ5M-FiZS-E_zfaJs4prQOu_ineRt11MWloYgOR8,4826
-sglang/srt/openai_api/adapter.py,sha256=
+sglang/srt/openai_api/adapter.py,sha256=ULX1lo23r6semogKcbUOXGSgPJi8NJ7IuC0WVvEbVbs,51458
 sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
 sglang/srt/sampling/sampling_batch_info.py,sha256=GewqyxCrW2PFwuzGHaCR59Pvw6j0n2dKGrlJWYQWwW4,6149
 sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
@@ -115,7 +116,7 @@ sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959
 sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
 sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
 sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
-sglang/test/runners.py,sha256=
+sglang/test/runners.py,sha256=ZKNGNxlXsgqIEatXO1xwnDkcybfNZ1U3sLfcMZRECdY,11400
 sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
 sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
 sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
@@ -125,10 +126,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
 sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
 sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
 sglang/test/test_programs.py,sha256=3-XKnppQdCNWjaJb6jwib5Z9OSpgKvH8SFLJbE4J9qI,17001
-sglang/test/test_utils.py,sha256=
+sglang/test/test_utils.py,sha256=6hVc0r_7bj1BTPeBPBwM1_rDJPqJElL9xfctvSJCrAI,18532
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
-sglang-0.3.
-sglang-0.3.
-sglang-0.3.
-sglang-0.3.
-sglang-0.3.
+sglang-0.3.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.3.2.dist-info/METADATA,sha256=9jaNpOSL-vIIWWpigGVUKX-mSoTY6OiVYg0VhwnDwiI,38068
+sglang-0.3.2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+sglang-0.3.2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.2.dist-info/RECORD,,
{sglang-0.3.1.post3.dist-info → sglang-0.3.2.dist-info}/LICENSE
File without changes
{sglang-0.3.1.post3.dist-info → sglang-0.3.2.dist-info}/WHEEL
File without changes
{sglang-0.3.1.post3.dist-info → sglang-0.3.2.dist-info}/top_level.txt
File without changes