sglang 0.2.15__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/backend/runtime_endpoint.py +8 -4
- sglang/srt/hf_transformers_utils.py +1 -1
- sglang/srt/layers/sampler.py +34 -10
- sglang/srt/managers/schedule_batch.py +6 -3
- sglang/srt/managers/tokenizer_manager.py +7 -7
- sglang/srt/model_executor/cuda_graph_runner.py +2 -0
- sglang/srt/model_executor/model_runner.py +2 -11
- sglang/srt/models/chatglm.py +5 -3
- sglang/srt/models/exaone.py +7 -38
- sglang/srt/models/{llama2.py → llama.py} +18 -42
- sglang/srt/models/llama_classification.py +33 -40
- sglang/srt/models/llama_embedding.py +7 -6
- sglang/srt/models/llava.py +8 -11
- sglang/srt/models/llavavid.py +5 -6
- sglang/srt/models/mistral.py +2 -3
- sglang/srt/sampling/sampling_batch_info.py +11 -15
- sglang/test/test_programs.py +68 -0
- sglang/test/test_utils.py +4 -0
- sglang/utils.py +39 -0
- sglang/version.py +1 -1
- {sglang-0.2.15.dist-info → sglang-0.3.0.dist-info}/METADATA +4 -3
- {sglang-0.2.15.dist-info → sglang-0.3.0.dist-info}/RECORD +25 -25
- {sglang-0.2.15.dist-info → sglang-0.3.0.dist-info}/WHEEL +1 -1
- {sglang-0.2.15.dist-info → sglang-0.3.0.dist-info}/LICENSE +0 -0
- {sglang-0.2.15.dist-info → sglang-0.3.0.dist-info}/top_level.txt +0 -0
sglang/lang/backend/runtime_endpoint.py
CHANGED
@@ -4,7 +4,7 @@ from typing import List, Optional

 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.lang.chat_template import get_chat_template, get_chat_template_by_model_path
 from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
 from sglang.lang.interpreter import StreamExecutor
 from sglang.lang.ir import (
@@ -23,6 +23,7 @@ class RuntimeEndpoint(BaseBackend):
         base_url: str,
         api_key: Optional[str] = None,
         verify: Optional[str] = None,
+        chat_template_name: Optional[str] = None,
     ):
         super().__init__()
         self.support_concate_and_append = True
@@ -39,9 +40,12 @@ class RuntimeEndpoint(BaseBackend):
         self._assert_success(res)
         self.model_info = res.json()

-
-        self.
-
+        if chat_template_name:
+            self.chat_template = get_chat_template(chat_template_name)
+        else:
+            self.chat_template = get_chat_template_by_model_path(
+                self.model_info["model_path"]
+            )

     def get_model_name(self):
         return self.model_info["model_path"]
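For orientation, the new `chat_template_name` argument lets a client force a specific chat template instead of having one inferred from the server's model path. A minimal usage sketch, assuming a local server URL and the template name "chatml" purely for illustration:

```python
# Hypothetical usage of the new argument; URL and template name are assumptions.
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint

backend = RuntimeEndpoint(
    base_url="http://localhost:30000",
    chat_template_name="chatml",  # overrides get_chat_template_by_model_path()
)
```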
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -92,7 +92,7 @@ def get_context_length(config):
     """Get the context length of a model from a huggingface model configs."""
     rope_scaling = getattr(config, "rope_scaling", None)
     if rope_scaling:
-        rope_scaling_factor = config.rope_scaling
+        rope_scaling_factor = config.rope_scaling.get("factor", 1)
         if "original_max_position_embeddings" in rope_scaling:
            rope_scaling_factor = 1
        if config.rope_scaling.get("rope_type", None) == "llama3":
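The practical effect of this change appears to be that configs whose `rope_scaling` dict has no `"factor"` key now fall back to a factor of 1 instead of failing the lookup. A small sketch of the new behavior, with `SimpleNamespace` standing in for a HuggingFace config object:

```python
# Sketch of the new lookup only; the config object here is a stand-in.
from types import SimpleNamespace

config = SimpleNamespace(rope_scaling={"rope_type": "llama3"})  # no "factor" key
rope_scaling_factor = config.rope_scaling.get("factor", 1)       # -> 1, no KeyError
```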
sglang/srt/layers/sampler.py
CHANGED
@@ -1,6 +1,6 @@
 import dataclasses
 import logging
-from typing import Union
+from typing import Tuple, Union

 import torch
 from flashinfer.sampling import (
@@ -9,6 +9,7 @@ from flashinfer.sampling import (
     top_k_top_p_sampling_from_probs,
     top_p_renorm_prob,
 )
+from torch.library import custom_op as torch_custom_op
 from vllm.model_executor.custom_op import CustomOp

 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
@@ -30,6 +31,9 @@ class SampleOutput:
 class Sampler(CustomOp):
     def __init__(self):
         super().__init__()
+        # FIXME: torch.multinomial has too many bugs
+        self.forward_native = self.forward_cuda
+        self.is_torch_compile = False

     def _apply_penalties(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
         # min-token, presence, frequency
@@ -46,16 +50,11 @@ class Sampler(CustomOp):

         return logits

-    def _get_probs(
-        self,
-        logits: torch.Tensor,
-        sampling_info: SamplingBatchInfo,
-        is_torch_compile: bool = False,
-    ):
+    def _get_probs(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
         # Post process logits
         logits = logits.contiguous()
         logits.div_(sampling_info.temperatures)
-        if is_torch_compile:
+        if self.is_torch_compile:
             # FIXME: Temporary workaround for unknown bugs in torch.compile
             logits.add_(0)

@@ -91,7 +90,7 @@ class Sampler(CustomOp):
                     probs, uniform_samples, sampling_info.min_ps
                 )
             else:
-                batch_next_token_ids, success =
+                batch_next_token_ids, success = flashinfer_top_k_top_p(
                     probs, uniform_samples, sampling_info.top_ks, sampling_info.top_ps
                 )
         else:
@@ -110,7 +109,7 @@ class Sampler(CustomOp):
         if isinstance(logits, LogitsProcessorOutput):
             logits = logits.next_token_logits

-        probs = self._get_probs(logits, sampling_info
+        probs = self._get_probs(logits, sampling_info)

         batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch(
             probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
@@ -119,6 +118,31 @@ class Sampler(CustomOp):
         return SampleOutput(success, probs, batch_next_token_ids)


+@torch_custom_op("my_lib::flashinfer_top_k_top_p", mutates_args={})
+def flashinfer_top_k_top_p(
+    probs: torch.Tensor,
+    uniform_samples: torch.Tensor,
+    top_ks: torch.Tensor,
+    top_ps: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # NOTE: we do not use min_p neither in CUDA nor in torch.compile
+    return top_k_top_p_sampling_from_probs(probs, uniform_samples, top_ks, top_ps)
+
+
+@flashinfer_top_k_top_p.register_fake
+def _(
+    probs: torch.Tensor,
+    uniform_samples: torch.Tensor,
+    top_ks: torch.Tensor,
+    top_ps: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    bs = probs.shape[0]
+    return (
+        torch.ones(bs, dtype=torch.bool, device=probs.device),
+        torch.zeros(bs, dtype=torch.int32, device=probs.device),
+    )
+
+
 def top_k_top_p_min_p_sampling_from_probs_torch(
     probs: torch.Tensor,
     top_ks: torch.Tensor,
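The new registration wraps the flashinfer sampling kernel as a torch custom op with a "fake" (meta) implementation, so torch.compile can trace shapes without running the kernel. A minimal standalone sketch of the same pattern, assuming PyTorch 2.4+ (the op name "demo::scaled_add" and its body are invented for illustration):

```python
# Sketch of the custom_op + register_fake pattern; names are assumptions.
import torch
from torch.library import custom_op


@custom_op("demo::scaled_add", mutates_args=())
def scaled_add(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    # Real implementation; in sglang this is where the flashinfer kernel is called.
    return x + alpha * y


@scaled_add.register_fake
def _(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    # Only shapes and dtypes matter here; torch.compile uses this while tracing.
    return torch.empty_like(x)


@torch.compile
def f(a, b):
    return scaled_add(a, b, 0.5)
```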
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -178,19 +178,22 @@ class Req:
     def adjust_max_prefix_ids(self):
         self.fill_ids = self.origin_input_ids + self.output_ids
         input_len = len(self.fill_ids)
-
+
+        # FIXME: To work around some bugs in logprob computation, we need to ensure each
+        # request has at least one token. Later, we can relax this requirement and use `input_len`.
+        max_prefix_len = input_len - 1

         if self.sampling_params.max_new_tokens > 0:
             # Need at least one token to compute logits
             max_prefix_len = min(max_prefix_len, input_len - 1)

         if self.return_logprob:
-            max_prefix_len = min(max_prefix_len, self.logprob_start_len)
-
             if self.normalized_prompt_logprob is None:
                 # Need at least two tokens to compute normalized logprob
                 max_prefix_len = min(max_prefix_len, input_len - 2)
+            max_prefix_len = min(max_prefix_len, self.logprob_start_len)

+        max_prefix_len = max(max_prefix_len, 0)
         return self.fill_ids[:max_prefix_len]

     # Based on https://github.com/vllm-project/vllm/blob/7a64d24aad69e4d2548aa0bf528d9fe63428ab01/vllm/transformers_utils/detokenizer.py#L194-L313
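To see why the new `max(max_prefix_len, 0)` floor matters: for a very short request with `return_logprob` set and no cached normalized logprob, the chain of `min(...)` terms can drive the value negative, and a negative slice bound would silently return the wrong prefix. A standalone sketch of the clamping logic with made-up values:

```python
# Standalone sketch of the prefix clamping; the numbers are illustrative only.
def adjust_max_prefix_len(input_len, max_new_tokens, return_logprob,
                          normalized_prompt_logprob, logprob_start_len):
    max_prefix_len = input_len - 1
    if max_new_tokens > 0:
        max_prefix_len = min(max_prefix_len, input_len - 1)
    if return_logprob:
        if normalized_prompt_logprob is None:
            max_prefix_len = min(max_prefix_len, input_len - 2)
        max_prefix_len = min(max_prefix_len, logprob_start_len)
    return max(max_prefix_len, 0)  # never return a negative slice bound


assert adjust_max_prefix_len(1, 16, True, None, 0) == 0  # would be -1 without the floor
```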
sglang/srt/managers/tokenizer_manager.py
CHANGED
@@ -86,8 +86,8 @@ class TokenizerManager:
         self.recv_from_detokenizer = context.socket(zmq.PULL)
         self.recv_from_detokenizer.bind(f"tcp://127.0.0.1:{port_args.tokenizer_port}")

-        self.
-        self.
+        self.send_to_controller = context.socket(zmq.PUSH)
+        self.send_to_controller.connect(f"tcp://127.0.0.1:{port_args.controller_port}")

         # Read model args
         self.model_path = server_args.model_path
@@ -271,7 +271,7 @@ class TokenizerManager:
             input_ids,
             sampling_params,
         )
-        self.
+        self.send_to_controller.send_pyobj(tokenized_obj)

         # Recv results
         event = asyncio.Event()
@@ -367,7 +367,7 @@ class TokenizerManager:
             input_ids,
             sampling_params,
         )
-        self.
+        self.send_to_controller.send_pyobj(tokenized_obj)

         event = asyncio.Event()
         state = ReqState([], False, event)
@@ -500,14 +500,14 @@ class TokenizerManager:

     def flush_cache(self):
         req = FlushCacheReq()
-        self.
+        self.send_to_controller.send_pyobj(req)

     def abort_request(self, rid: str):
         if rid not in self.rid_to_state:
             return
         del self.rid_to_state[rid]
         req = AbortReq(rid)
-        self.
+        self.send_to_controller.send_pyobj(req)

     async def update_weights(
         self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
@@ -524,7 +524,7 @@ class TokenizerManager:
         # wait for the previous generation requests to finish
         while len(self.rid_to_state) > 0:
             await asyncio.sleep(0)
-        self.
+        self.send_to_controller.send_pyobj(obj)
         self.model_update_result = asyncio.Future()
         result = await self.model_update_result
         if result.success:
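The renamed socket keeps the same one-directional ZeroMQ PUSH/PULL pattern: the tokenizer pushes pickled request objects and the controller pulls them. A minimal sketch of that pattern outside sglang (the port number and payload are arbitrary):

```python
# Minimal PUSH/PULL sketch mirroring send_to_controller; port and payload are arbitrary.
import zmq

context = zmq.Context()

controller = context.socket(zmq.PULL)   # controller side
controller.bind("tcp://127.0.0.1:5557")

tokenizer = context.socket(zmq.PUSH)    # tokenizer side
tokenizer.connect("tcp://127.0.0.1:5557")

tokenizer.send_pyobj({"rid": "req-1", "input_ids": [1, 2, 3]})
print(controller.recv_pyobj())          # {'rid': 'req-1', 'input_ids': [1, 2, 3]}
```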
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -46,8 +46,10 @@ def _to_torch(model: torch.nn.Module, reverse: bool = False):
         if isinstance(sub, CustomOp):
             if reverse:
                 sub._forward_method = sub.forward_cuda
+                setattr(sub, "is_torch_compile", False)
             else:
                 sub._forward_method = sub.forward_native
+                setattr(sub, "is_torch_compile", True)
         if isinstance(sub, torch.nn.Module):
             _to_torch(sub, reverse)

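`_to_torch` walks the module tree and swaps each CustomOp between its native and CUDA forward paths; the new `setattr` calls additionally tell ops such as the Sampler above whether they are currently running under torch.compile. A toy sketch of the same swap on a plain nn.Module (class and method names are invented for illustration):

```python
# Toy sketch of swapping forward implementations on submodules; names are invented.
import torch
from torch import nn


class ToyOp(nn.Module):
    def __init__(self):
        super().__init__()
        self.is_torch_compile = False
        self._forward_method = self.forward_cuda

    def forward_cuda(self, x):
        return x * 2

    def forward_native(self, x):
        return x + x  # same math, but a torch.compile-friendly path

    def forward(self, x):
        return self._forward_method(x)


def to_torch(model: nn.Module, reverse: bool = False):
    for sub in model.modules():
        if isinstance(sub, ToyOp):
            sub._forward_method = sub.forward_cuda if reverse else sub.forward_native
            setattr(sub, "is_torch_compile", not reverse)
```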
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -162,6 +162,7 @@ class ModelRunner:
         return min_per_gpu_memory

     def load_model(self):
+        torch.set_num_threads(1)
         logger.info(
             f"Load weight begin. avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
@@ -523,7 +524,7 @@ class ModelRunner:
         if (
             self.cuda_graph_runner
             and self.cuda_graph_runner.can_run(len(batch.reqs))
-            and
+            and batch.sampling_info.can_run_in_cuda_graph()
         ):
             return self.cuda_graph_runner.replay(batch)

@@ -606,16 +607,6 @@ def import_model_classes():
             assert entry.__name__ not in model_arch_name_to_cls
             model_arch_name_to_cls[entry.__name__] = entry

-        # compat: some models such as chatglm has incorrect class set in config.json
-        # usage: [ tuple("From_Entry_Class_Name": EntryClass), ]
-        if hasattr(module, "EntryClassRemapping") and isinstance(
-            module.EntryClassRemapping, list
-        ):
-            for remap in module.EntryClassRemapping:
-                if isinstance(remap, tuple) and len(remap) == 2:
-                    assert remap[0] not in model_arch_name_to_cls
-                    model_arch_name_to_cls[remap[0]] = remap[1]
-
     return model_arch_name_to_cls

sglang/srt/models/chatglm.py
CHANGED
@@ -402,6 +402,8 @@ class ChatGLMForCausalLM(nn.Module):
             weight_loader(param, loaded_weight)


-
-
-
+class ChatGLMModel(ChatGLMForCausalLM):
+    pass
+
+
+EntryClass = [ChatGLMForCausalLM, ChatGLMModel]
sglang/srt/models/exaone.py
CHANGED
@@ -297,7 +297,6 @@ class ExaoneForCausalLM(nn.Module):
         config,
         quant_config: Optional[QuantizationConfig] = None,
         cache_config: Optional[CacheConfig] = None,
-        efficient_weight_load=False,
     ) -> None:
         super().__init__()
         self.config = config
@@ -324,30 +323,7 @@ class ExaoneForCausalLM(nn.Module):
         sample_output = self.sampler(logits_output, input_metadata.sampling_info)
         return sample_output, logits_output

-    def
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id, num_shard)
-            ("qkv_proj", "q_proj", "q", 3),
-            ("qkv_proj", "k_proj", "k", 3),
-            ("qkv_proj", "v_proj", "v", 3),
-            ("gate_up_proj", "c_fc_0", 0, 2),
-            ("gate_up_proj", "c_fc_1", 1, 2),
-        ]
-        for param_name, weight_name, shard_id, num_shard in stacked_params_mapping:
-            if weight_name in name:
-                return (
-                    name.replace(weight_name, param_name)[: -len(".weight")],
-                    num_shard,
-                )
-        return name[: -len(".weight")], 1
-
-    def get_num_params(self):
-        params_dict = dict(self.named_parameters())
-        return len(params_dict)
-
-    def load_weights(
-        self, weights: Iterable[Tuple[str, torch.Tensor]], name=None, loaded_weight=None
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -358,16 +334,17 @@ class ExaoneForCausalLM(nn.Module):
         ]
         params_dict = dict(self.named_parameters())

-
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
-
+                continue
             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
-
+                continue
             if name.startswith("model.vision_tower") and name not in params_dict:
-
+                continue

+            name = name.replace("attn.attention", "self_attn")
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
@@ -382,18 +359,10 @@ class ExaoneForCausalLM(nn.Module):
             else:
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
-
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)

-        if name is None or loaded_weight is None:
-            for name, loaded_weight in weights:
-                name = name.replace("attn.attention", "self_attn")
-                load_weights_per_param(name, loaded_weight)
-        else:
-            name = name.replace("attn.attention", "self_attn")
-            load_weights_per_param(name, loaded_weight)
-

 EntryClass = ExaoneForCausalLM
sglang/srt/models/{llama2.py → llama.py}
RENAMED
@@ -295,7 +295,6 @@ class LlamaForCausalLM(nn.Module):
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
         cache_config: Optional[CacheConfig] = None,
-        efficient_weight_load=False,
     ) -> None:
         super().__init__()
         self.config = config
@@ -305,6 +304,8 @@ class LlamaForCausalLM(nn.Module):
         self.logits_processor = LogitsProcessor(config)
         self.sampler = Sampler()

+        self.param_dict = dict(self.named_parameters())
+
     @torch.no_grad()
     def forward(
         self,
@@ -320,49 +321,26 @@ class LlamaForCausalLM(nn.Module):
         sample_output = self.sampler(logits_output, input_metadata.sampling_info)
         return sample_output, logits_output

-    def
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id, num_shard)
-            ("qkv_proj", "q_proj", "q", 3),
-            ("qkv_proj", "k_proj", "k", 3),
-            ("qkv_proj", "v_proj", "v", 3),
-            ("gate_up_proj", "gate_proj", 0, 2),
-            ("gate_up_proj", "up_proj", 1, 2),
-        ]
-        for param_name, weight_name, shard_id, num_shard in stacked_params_mapping:
-            if weight_name in name:
-                return (
-                    name.replace(weight_name, param_name)[: -len(".weight")],
-                    num_shard,
-                )
-        return name[: -len(".weight")], 1
-
-    def get_num_params(self):
-        params_dict = dict(self.named_parameters())
-        return len(params_dict)
-
-    def load_weights(
-        self, weights: Iterable[Tuple[str, torch.Tensor]], name=None, loaded_weight=None
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
         ]
-        params_dict =
+        params_dict = self.param_dict

-
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
-
+                continue
             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
-
+                continue
             if name.startswith("model.vision_tower") and name not in params_dict:
-
+                continue

             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
@@ -378,16 +356,14 @@ class LlamaForCausalLM(nn.Module):
             else:
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
-
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)

-
-
-
-            else:
-                load_weights_per_param(name, loaded_weight)
+
+class Phi3ForCausalLM(LlamaForCausalLM):
+    pass


-EntryClass = LlamaForCausalLM
+EntryClass = [LlamaForCausalLM, Phi3ForCausalLM]
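The weight loader folds separate q/k/v and gate/up checkpoint tensors into the fused `qkv_proj` and `gate_up_proj` parameters; the leading dots in the new mapping anchor the match at a module-name boundary, so a longer identifier that merely ends in, say, `q_proj` is not rewritten by accident. A small sketch of the renaming step only (the checkpoint name is an example):

```python
# Sketch of the name-folding step only; the example checkpoint name is assumed.
stacked_params_mapping = [
    # (param_name, shard_name, shard_id)
    (".qkv_proj", ".q_proj", "q"),
    (".qkv_proj", ".k_proj", "k"),
    (".qkv_proj", ".v_proj", "v"),
    (".gate_up_proj", ".gate_proj", 0),
    (".gate_up_proj", ".up_proj", 1),
]


def fold_name(name):
    for param_name, weight_name, shard_id in stacked_params_mapping:
        if weight_name in name:
            return name.replace(weight_name, param_name), shard_id
    return name, None


print(fold_name("model.layers.0.self_attn.q_proj.weight"))
# ('model.layers.0.self_attn.qkv_proj.weight', 'q')
```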
sglang/srt/models/llama_classification.py
CHANGED
@@ -16,17 +16,16 @@ limitations under the License.
 from typing import Iterable, Optional, Tuple

 import torch
-import tqdm
 from torch import nn
 from transformers import LlamaConfig
 from vllm.config import CacheConfig
-from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.sampler import SampleOutput
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
-from sglang.srt.models.
+from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel


 class LlamaForClassification(nn.Module):
@@ -42,10 +41,12 @@ class LlamaForClassification(nn.Module):
         self.model = LlamaModel(config, quant_config=quant_config)

         self.classification_head = nn.Linear(
-            config.hidden_size, config.classification_out_size
+            config.hidden_size, config.classification_out_size, bias=False
         )
         self.eos_token_id = config.eos_token_id

+        self.param_dict = dict(self.named_parameters())
+
     @torch.no_grad()
     def forward(
         self,
@@ -65,7 +66,7 @@ class LlamaForClassification(nn.Module):
             (input_metadata.batch_size, self.config.classification_out_size)
         ).to(input_ids.device)

-
+        logits_output = LogitsProcessorOutput(
             next_token_logits=scores,
             next_token_logprobs=scores,
             normalized_prompt_logprobs=scores,
@@ -74,46 +75,38 @@ class LlamaForClassification(nn.Module):
             output_top_logprobs=None,
         )

+        # A dummy to make this work
+        sample_output = SampleOutput(
+            success=torch.full(
+                size=(scores.shape[0],),
+                fill_value=True,
+                dtype=torch.bool,
+            ),
+            probs=torch.full(
+                size=(scores.shape[0], 1),
+                fill_value=1.0,
+                dtype=torch.float16,
+            ),
+            batch_next_token_ids=torch.full(
+                size=(scores.shape[0],),
+                fill_value=0,
+                dtype=torch.long,
+            ),
+        )
+        return sample_output, logits_output
+
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        if get_tensor_model_parallel_rank() == 0:
-            weights = tqdm.tqdm(weights, total=int(len(params_dict) * 1.5))
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name or "projector" in name:
-                continue
-            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                continue
-            if "lm_head" in name:
-                continue
+        params_dict = self.param_dict

-
-
-                continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
+        for name, loaded_weight in weights:
+            if "classification_head" in name:
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
+            elif "lm_head" in name:
+                continue
+            else:
+                LlamaForCausalLM.load_weights(self, [(name, loaded_weight)])


 EntryClass = LlamaForClassification
sglang/srt/models/llama_embedding.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Iterable,
+from typing import Iterable, Tuple

 import torch
 from torch import nn
@@ -7,7 +7,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
 from sglang.srt.model_executor.model_runner import InputMetadata
-from sglang.srt.models.
+from sglang.srt.models.llama import LlamaModel


 class LlamaEmbeddingModel(nn.Module):
@@ -16,7 +16,6 @@ class LlamaEmbeddingModel(nn.Module):
         config: LlamaConfig,
         quant_config=None,
         cache_config=None,
-        efficient_weight_load=False,
     ) -> None:
         super().__init__()
         self.model = LlamaModel(config, quant_config=quant_config)
@@ -86,6 +85,8 @@ class LlamaEmbeddingModel(nn.Module):
             load_weights_per_param(name, loaded_weight)


-
-
-
+class MistralModel(LlamaEmbeddingModel):
+    pass
+
+
+EntryClass = [LlamaEmbeddingModel, MistralModel]
sglang/srt/models/llava.py
CHANGED
@@ -41,7 +41,7 @@ from sglang.srt.mm_utils import (
     unpad_image_shape,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
-from sglang.srt.models.
+from sglang.srt.models.llama import LlamaForCausalLM
 from sglang.srt.models.mistral import MistralForCausalLM
 from sglang.srt.models.qwen2 import Qwen2ForCausalLM

@@ -395,21 +395,19 @@ class LlavaBaseForCausalLM(nn.Module):
             "model.mm_projector.0": "multi_modal_projector.linear_1",
             "model.mm_projector.2": "multi_modal_projector.linear_2",
             "model.vision_tower.vision_tower": "vision_tower",  # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
+            "model.image_newline": "language_model.model.image_newline",
         }
         params_dict = dict(self.named_parameters())
-        weights = list(weights)
         for name, loaded_weight in weights:
-
-            if "projector" in name or "vision_tower" in name:
+            if "projector" in name or "vision_tower" in name or "image_newline" in name:
                 for weight_name, param_name in projector_weights.items():
                     if weight_name in name:
                         name = name.replace(weight_name, param_name)
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
-
-
-        self.language_model.load_weights(weights)
+            else:
+                self.language_model.load_weights([(name, loaded_weight)])

     @property
     def num_patches_per_side(self):
@@ -429,6 +427,7 @@ class LlavaLlamaForCausalLM(LlavaBaseForCausalLM):
         self.vision_tower = None
         self.config.vision_config.hidden_size = config.mm_hidden_size
         self.config.text_config.hidden_size = config.hidden_size
+
         self.multi_modal_projector = LlavaMultiModalProjector(config)
         self.language_model = LlamaForCausalLM(config, quant_config=quant_config)
         if "unpad" in getattr(config, "mm_patch_merge_type", ""):
@@ -448,9 +447,9 @@ class LlavaQwenForCausalLM(LlavaBaseForCausalLM):

         self.config = config
         self.vision_tower = None
+
         if getattr(self.config, "vision_config", None) is None:
             self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
-
         if getattr(self.config, "text_config", None) is None:
             self.config.text_config = Qwen2Config(self.config._name_or_path)

@@ -459,7 +458,6 @@ class LlavaQwenForCausalLM(LlavaBaseForCausalLM):

         if getattr(self.config, "projector_hidden_act", None) is None:
             self.config.projector_hidden_act = "gelu"
-
         if getattr(self.config, "image_token_index", None) is None:
             self.config.image_token_index = 151646

@@ -482,9 +480,9 @@ class LlavaMistralForCausalLM(LlavaBaseForCausalLM):

         self.config = config
         self.vision_tower = None
+
         if getattr(self.config, "vision_config", None) is None:
             self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
-
         if getattr(self.config, "text_config", None) is None:
             self.config.text_config = MistralConfig(self.config._name_or_path)

@@ -493,7 +491,6 @@ class LlavaMistralForCausalLM(LlavaBaseForCausalLM):

         if getattr(self.config, "projector_hidden_act", None) is None:
             self.config.projector_hidden_act = "gelu"
-
         if getattr(self.config, "image_token_index", None) is None:
             self.config.image_token_index = 32000

sglang/srt/models/llavavid.py
CHANGED
@@ -27,7 +27,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
-from sglang.srt.models.
+from sglang.srt.models.llama import LlamaForCausalLM


 class LlavaVidForCausalLM(nn.Module):
@@ -239,12 +239,12 @@ class LlavaVidForCausalLM(nn.Module):
             "model.vision_resampler.mm_projector.0": "multi_modal_projector.linear_1",
             "model.vision_resampler.mm_projector.2": "multi_modal_projector.linear_2",
             "model.vision_tower.vision_tower": "vision_tower",  # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
+            "model.image_newline": "language_model.model.image_newline",
         }
         params_dict = dict(self.named_parameters())
-        weights = list(weights)
         for name, loaded_weight in weights:
             # FIXME: why projector weights read two times?
-            if "projector" in name or "vision_tower" in name:
+            if "projector" in name or "vision_tower" in name or "image_newline" in name:
                 for weight_name, param_name in projector_weights.items():
                     if weight_name in name:
                         name = name.replace(weight_name, param_name)
@@ -255,9 +255,8 @@ class LlavaVidForCausalLM(nn.Module):
                         continue
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
-
-
-        self.language_model.load_weights(weights)
+            else:
+                self.language_model.load_weights([(name, loaded_weight)])

     @property
     def num_patches_per_side(self):
sglang/srt/models/mistral.py
CHANGED
@@ -15,12 +15,11 @@ limitations under the License.

 """Inference-only Mistral model."""

-from sglang.srt.models.
+from sglang.srt.models.llama import LlamaForCausalLM


 class MistralForCausalLM(LlamaForCausalLM):
-
-        super().__init__(*args, **kwargs)
+    pass


 EntryClass = MistralForCausalLM
|
@@ -34,12 +34,14 @@ class SamplingBatchInfo:
|
|
34
34
|
linear_penalties: torch.Tensor = None
|
35
35
|
scaling_penalties: torch.Tensor = None
|
36
36
|
|
37
|
-
def
|
37
|
+
def can_run_in_cuda_graph(self):
|
38
|
+
# Vocab bias and min_ps are not supported in CUDA graph
|
38
39
|
return (
|
39
|
-
self.logit_bias is
|
40
|
-
|
41
|
-
|
42
|
-
|
40
|
+
self.logit_bias is None
|
41
|
+
and self.vocab_mask is None
|
42
|
+
and self.linear_penalties is None
|
43
|
+
and self.scaling_penalties is None
|
44
|
+
and not self.need_min_p_sampling
|
43
45
|
)
|
44
46
|
|
45
47
|
@classmethod
|
@@ -48,35 +50,29 @@ class SamplingBatchInfo:
|
|
48
50
|
ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
|
49
51
|
ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
|
50
52
|
ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
|
51
|
-
ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
|
52
53
|
return ret
|
53
54
|
|
54
55
|
def __getitem__(self, key):
|
55
56
|
if isinstance(key, slice):
|
56
|
-
# NOTE:
|
57
|
-
assert
|
57
|
+
# NOTE:This method is only used in CUDA graph
|
58
|
+
assert self.can_run_in_cuda_graph()
|
58
59
|
return SamplingBatchInfo(
|
59
60
|
vocab_size=self.vocab_size,
|
60
61
|
temperatures=self.temperatures[key],
|
61
62
|
top_ps=self.top_ps[key],
|
62
63
|
top_ks=self.top_ks[key],
|
63
|
-
min_ps=self.min_ps[key],
|
64
|
-
need_min_p_sampling=self.need_min_p_sampling,
|
65
64
|
)
|
66
65
|
else:
|
67
66
|
raise NotImplementedError
|
68
67
|
|
69
68
|
def inplace_assign(self, bs: int, other: SamplingBatchInfo):
|
70
|
-
# NOTE:
|
71
|
-
assert
|
69
|
+
# NOTE:This method is only used in CUDA graph
|
70
|
+
assert self.can_run_in_cuda_graph()
|
72
71
|
|
73
72
|
self.vocab_size = other.vocab_size
|
74
|
-
self.need_min_p_sampling = other.need_min_p_sampling
|
75
|
-
|
76
73
|
self.temperatures[:bs] = other.temperatures
|
77
74
|
self.top_ps[:bs] = other.top_ps
|
78
75
|
self.top_ks[:bs] = other.top_ks
|
79
|
-
self.min_ps[:bs] = other.min_ps
|
80
76
|
|
81
77
|
@classmethod
|
82
78
|
def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
|
sglang/test/test_programs.py
CHANGED
@@ -2,8 +2,12 @@

 import json
 import re
+import time
+
+import numpy as np

 import sglang as sgl
+from sglang.utils import fetch_and_cache_jsonl


 def test_few_shot_qa():
@@ -447,3 +451,67 @@ def test_chat_completion_speculative():
     )

     gen_character_spec().sync()
+
+
+def test_hellaswag_select():
+    """Benchmark the accuracy of sgl.select on the HellaSwag dataset."""
+
+    url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
+    lines = fetch_and_cache_jsonl(url)
+
+    # Construct prompts
+    def get_one_example(lines, i, include_answer):
+        ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
+        if include_answer:
+            ret += lines[i]["endings"][lines[i]["label"]]
+        return ret
+
+    def get_few_shot_examples(lines, k):
+        ret = ""
+        for i in range(k):
+            ret += get_one_example(lines, i, True) + "\n\n"
+        return ret
+
+    num_questions = 200
+    num_shots = 20
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    choices = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        choices.append(lines[i]["endings"])
+        labels.append(lines[i]["label"])
+    arguments = [{"question": q, "choices": c} for q, c in zip(questions, choices)]
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_hellaswag(s, question, choices):
+        s += few_shot_examples + question
+        s += sgl.select("answer", choices=choices)
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Run requests
+    tic = time.time()
+    rets = few_shot_hellaswag.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=64,
+        progress_bar=True,
+    )
+    preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
+    latency = time.time() - tic
+
+    # Compute accuracy
+    accuracy = np.mean(np.array(preds) == np.array(labels))
+
+    return accuracy, latency
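The new benchmark assumes a default backend has already been registered (for example a local RuntimeEndpoint). A hedged sketch of invoking it directly, with the server address as an assumption:

```python
# Hypothetical invocation; the server address is an assumption.
import sglang as sgl
from sglang.test.test_programs import test_hellaswag_select

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
accuracy, latency = test_hellaswag_select()
print(f"accuracy={accuracy:.3f}, latency={latency:.1f}s")
```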
sglang/test/test_utils.py
CHANGED
@@ -23,6 +23,10 @@ from sglang.utils import get_exception_traceback
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"

 if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
sglang/utils.py
CHANGED
@@ -4,6 +4,7 @@ import base64
 import importlib
 import json
 import logging
+import os
 import signal
 import sys
 import traceback
@@ -15,6 +16,7 @@ from typing import Union

 import numpy as np
 import requests
+from tqdm import tqdm

 logger = logging.getLogger(__name__)

@@ -260,3 +262,40 @@ class LazyImport:
     def __call__(self, *args, **kwargs):
         module = self._load()
         return module(*args, **kwargs)
+
+
+def fetch_and_cache_jsonl(url, cache_file="cached_data.jsonl"):
+    """Read and cache a jsonl file from a url."""
+
+    # Check if the cache file already exists
+    if os.path.exists(cache_file):
+        print("Loading data from cache...")
+        with open(cache_file, "r") as f:
+            data = [json.loads(line) for line in f]
+    else:
+        print("Downloading data from URL...")
+        # Stream the response to show the progress bar
+        response = requests.get(url, stream=True)
+        response.raise_for_status()  # Check for request errors
+
+        # Total size of the file in bytes
+        total_size = int(response.headers.get("content-length", 0))
+        chunk_size = 1024  # Download in chunks of 1KB
+
+        # Use tqdm to display the progress bar
+        with open(cache_file, "wb") as f, tqdm(
+            desc=cache_file,
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+        ) as bar:
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                f.write(chunk)
+                bar.update(len(chunk))
+
+        # Convert the data to a list of dictionaries
+        with open(cache_file, "r") as f:
+            data = [json.loads(line) for line in f]
+
+    return data
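A quick usage sketch of the new helper; the URL is the one used by `test_hellaswag_select` above, and a second call with the same cache file is served from disk:

```python
# Usage sketch; the cache file name is an arbitrary choice.
from sglang.utils import fetch_and_cache_jsonl

url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
lines = fetch_and_cache_jsonl(url, cache_file="hellaswag_val.jsonl")
print(len(lines), list(lines[0].keys()))
```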
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.15"
+__version__ = "0.3.0"
{sglang-0.2.15.dist-info → sglang-0.3.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.15
+Version: 0.3.0
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.15 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.0 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -461,7 +461,7 @@ It supports streaming, vision, and most features of the Chat/Completions/Models/
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
 ```
-- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism.
+- Add `--dp 2` to enable multi-GPU data parallelism. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
@@ -495,6 +495,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - LLaVA 1.5 / 1.6 / NeXT
{sglang-0.2.15.dist-info → sglang-0.3.0.dist-info}/RECORD
CHANGED
@@ -6,8 +6,8 @@ sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
 sglang/global_config.py,sha256=nwOjUflwqLQySPUMvk8Hk63TIS6mknh_ODSW3CZ1rJw,1704
 sglang/launch_server.py,sha256=FODfO0DW546dh-u1qDlWtrhsmj6hxkarXXv3cIdgkj8,549
 sglang/launch_server_llavavid.py,sha256=xnpSILJxsrbvqkERav5P26bErCQnhoTFmoKeScJltUA,1034
-sglang/utils.py,sha256=
-sglang/version.py,sha256=
+sglang/utils.py,sha256=zxHwQhVxW_lWf-IH0wUw_pBTRLHLPypdRiU5M4XosMM,9669
+sglang/version.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -20,10 +20,10 @@ sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtx
 sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
 sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
 sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
-sglang/lang/backend/runtime_endpoint.py,sha256=
+sglang/lang/backend/runtime_endpoint.py,sha256=hpezro0H6vG9KzLeKfYpPMwb4TaE0UanCIM0uG8Kdjw,9746
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/conversation.py,sha256=2KDNe1suUPy6xqSkCx2xcO3pDPxTwqx5FaUxaqwCJ-M,19525
-sglang/srt/hf_transformers_utils.py,sha256=
+sglang/srt/hf_transformers_utils.py,sha256=5UXJ-LdP92Sk_T843M9BHdnxRrcyiYfWH2IEg3dWgKI,6085
 sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
 sglang/srt/model_config.py,sha256=68QQ8iUWQHPv01RBeH23mvay6iJg9DWmCogC_vUgFLk,6371
 sglang/srt/server.py,sha256=yi8prs9_M0P0dOInrQLkHKiZ-oTigk_uzW8otEHImbU,19846
@@ -43,7 +43,7 @@ sglang/srt/layers/logits_processor.py,sha256=Zx4eFAkFlThPrmz_-HuCN9SqGLanARm0wdZ
 sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
 sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
 sglang/srt/layers/radix_attention.py,sha256=o5a8r3XQ-oRwaxBlAgzJGv7p3dMbu0LrYsDc4uvpPgA,8338
-sglang/srt/layers/sampler.py,sha256=
+sglang/srt/layers/sampler.py,sha256=zPVa3PHc-tjDM_oP-1XFeHSRIErx844SLoe6MG8Qef0,6418
 sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
 sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
 sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
@@ -52,35 +52,35 @@ sglang/srt/managers/controller_single.py,sha256=5brrZ8vZxjvrSJHWrm5H3qGEZShN4ERO
 sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
 sglang/srt/managers/io_struct.py,sha256=Bd91cydX9_960NNP2xngqK-lsIaDB3oMYd56QddN4_Q,10722
 sglang/srt/managers/policy_scheduler.py,sha256=7HNUxBKJE444s_bHcPpbnHCygsnH-NIXYNSC2q6mRmc,8584
-sglang/srt/managers/schedule_batch.py,sha256=
-sglang/srt/managers/tokenizer_manager.py,sha256=
+sglang/srt/managers/schedule_batch.py,sha256=i68O-e9I_gDlme96xSBDjA2xDF1p-XBKvJRiJ9CsgcY,26423
+sglang/srt/managers/tokenizer_manager.py,sha256=8aHR5h9nYZsfdZE80uBc9egDFOQgKvjxmp-30Ha4ELk,29463
 sglang/srt/managers/tp_worker.py,sha256=4UuaBLzV6NMsG4XEIcpa4xMcOKIFvTan51ynKz85HXg,36842
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
 sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
 sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
 sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
 sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
-sglang/srt/model_executor/cuda_graph_runner.py,sha256=
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=4vIUqVQpnHNhwWrokMVmGM4Dp5JFPHyXIvpEQsi2pNU,12862
 sglang/srt/model_executor/forward_batch_info.py,sha256=fSLhatN8vCgxn0Mft9D-r0pNi3SN0EQSTJmgaOtrqJc,16471
-sglang/srt/model_executor/model_runner.py,sha256=
-sglang/srt/models/chatglm.py,sha256=
+sglang/srt/model_executor/model_runner.py,sha256=93YCStmZfdZlY0r-GGIVi0Xw66VwF77dEtGVmQf1VfU,23893
+sglang/srt/models/chatglm.py,sha256=PPOaeqipbkcsTUhMPbLb1HItWgW7KntefUfjEoMSxUM,13585
 sglang/srt/models/commandr.py,sha256=k86ykwWOlxLGaBbGUoMSaXngUxCbMVRbY5AoMOWpbU8,14377
 sglang/srt/models/dbrx.py,sha256=goLJ9Yt-9vxkwhCUFBidvP41H_dYTFsvrMZ4xm4FqGA,14875
 sglang/srt/models/deepseek.py,sha256=aYP6HUgxQbhcQGQEF4vX0ronBF8AirqIFG98EQn0YzY,16220
 sglang/srt/models/deepseek_v2.py,sha256=Htw_HDju9huYU5gBu2dqq6bKVao-AsifxfkGl2xRx-8,28521
-sglang/srt/models/exaone.py,sha256=
+sglang/srt/models/exaone.py,sha256=ZFr0G0WITxg3dDfV_-vWqZpK_wMmiZi4r0vOT0gO9V4,13301
 sglang/srt/models/gemma.py,sha256=Ya_u2lKPKAc9iHEsW_HAEfCDgYTbxUOCzBI0LDuoOYs,12489
 sglang/srt/models/gemma2.py,sha256=MCmzzRAAafEQuQj6aGtB-TF4jH0RWrXcOPxSz6LRsXs,15137
 sglang/srt/models/gpt_bigcode.py,sha256=HEhMRO1Y37JfZtP7mDp0MexWj5h6XT9rKvxorOMKoQA,10409
 sglang/srt/models/grok.py,sha256=ZcJ4E11rKh-xo4k_j-H1XRreJWWv8yii-bMYC1lO2R8,15143
 sglang/srt/models/internlm2.py,sha256=VtWATs2eLIqbadYXTPY_vycFIstVk4zg3kxycA9H0Qw,12416
-sglang/srt/models/
-sglang/srt/models/llama_classification.py,sha256=
-sglang/srt/models/llama_embedding.py,sha256=
-sglang/srt/models/llava.py,sha256=
-sglang/srt/models/llavavid.py,sha256=
+sglang/srt/models/llama.py,sha256=MfDnlVWoJUG9DxgGYPiwhoU-0ZeRbhp6UmBR2ZAJSNk,13402
+sglang/srt/models/llama_classification.py,sha256=oSeROs633Gnak8vrbnWnCWDxfgP_zmKGO1A_43ukEQ4,4029
+sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
+sglang/srt/models/llava.py,sha256=OXmlOVIjFnMRKGwLweYB1N-xlfpZlTlZpqhsbwUCY6Y,23471
+sglang/srt/models/llavavid.py,sha256=4R2t8BZJKN85IrTLsLFb4yZuKVI2Cwp7kY8AJ-nEVoE,12012
 sglang/srt/models/minicpm.py,sha256=7RZEJ2TCqBL1JmMFVJ3J9DmZHRw0q90st49Wkh-sdL4,14039
-sglang/srt/models/mistral.py,sha256=
+sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
 sglang/srt/models/mixtral.py,sha256=KIsvruhXNq3Fwrs4_YE7J6fx54ObfnMuRNxgScE3Bmo,13830
 sglang/srt/models/mixtral_quant.py,sha256=O_97UKDYZokFhIBnamWfw0HLhln9_BUk_KfQ-sQnd8s,14286
 sglang/srt/models/qwen.py,sha256=geK88AyEyPbbDvMHJNY8XMSNpsCeu8g9kxnKyiJBpK4,10168
@@ -90,7 +90,7 @@ sglang/srt/models/stablelm.py,sha256=9feHoiDEXSIe0WCrt4AfWXqxliJwRvr8w4XSnk6ipSI
 sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
 sglang/srt/openai_api/adapter.py,sha256=3EeqASZXogpUkOP4xj7Rg_LfOLiIMUrZ9uFdeAy_pcc,50144
 sglang/srt/openai_api/protocol.py,sha256=onhnCjXpXCysvx_dLgOEmXz5XHHYB1t772cvHcK1GlY,9538
-sglang/srt/sampling/sampling_batch_info.py,sha256=
+sglang/srt/sampling/sampling_batch_info.py,sha256=CIoD0SzHSWCe7Wc4jkJj5vIPHGnOdfbgkC6fG5KQxOw,7551
 sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
 sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
 sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
@@ -108,11 +108,11 @@ sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2
 sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
 sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
 sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
-sglang/test/test_programs.py,sha256=
-sglang/test/test_utils.py,sha256=
+sglang/test/test_programs.py,sha256=l21J8N91QTMO9TOvXPWNvPZVT0DgxYxOPHh1pOoFV_k,16927
+sglang/test/test_utils.py,sha256=3tt-BBv-lx7BT3whbVTMyRz6sh5jIbdBEbLZ08m2Ms8,15132
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
-sglang-0.
-sglang-0.
-sglang-0.
-sglang-0.
-sglang-0.
+sglang-0.3.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.3.0.dist-info/METADATA,sha256=muukBuN4kq_4mCG_r_RFY94pQliDcVh-WuXNMApXoak,37383
+sglang-0.3.0.dist-info/WHEEL,sha256=uCRv0ZEik_232NlR4YDw4Pv3Ajt5bKvMH13NUU7hFuI,91
+sglang-0.3.0.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.0.dist-info/RECORD,,
{sglang-0.2.15.dist-info → sglang-0.3.0.dist-info}/LICENSE: file without changes
{sglang-0.2.15.dist-info → sglang-0.3.0.dist-info}/top_level.txt: file without changes