sglang 0.2.14.post2__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +2 -0
- sglang/bench_latency.py +39 -28
- sglang/lang/interpreter.py +3 -0
- sglang/lang/ir.py +5 -0
- sglang/launch_server_llavavid.py +12 -12
- sglang/srt/configs/__init__.py +5 -0
- sglang/srt/configs/exaone.py +195 -0
- sglang/srt/constrained/fsm_cache.py +1 -1
- sglang/srt/conversation.py +24 -2
- sglang/srt/hf_transformers_utils.py +11 -11
- sglang/srt/layers/extend_attention.py +13 -8
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/sampler.py +69 -16
- sglang/srt/managers/controller_multi.py +5 -5
- sglang/srt/managers/controller_single.py +5 -5
- sglang/srt/managers/io_struct.py +6 -1
- sglang/srt/managers/schedule_batch.py +20 -8
- sglang/srt/managers/tokenizer_manager.py +2 -2
- sglang/srt/managers/tp_worker.py +38 -26
- sglang/srt/model_config.py +3 -3
- sglang/srt/model_executor/cuda_graph_runner.py +24 -9
- sglang/srt/model_executor/forward_batch_info.py +68 -23
- sglang/srt/model_executor/model_runner.py +14 -12
- sglang/srt/models/chatglm.py +4 -12
- sglang/srt/models/commandr.py +5 -1
- sglang/srt/models/dbrx.py +5 -1
- sglang/srt/models/deepseek.py +5 -1
- sglang/srt/models/deepseek_v2.py +57 -25
- sglang/srt/models/exaone.py +399 -0
- sglang/srt/models/gemma.py +5 -1
- sglang/srt/models/gemma2.py +5 -1
- sglang/srt/models/gpt_bigcode.py +5 -1
- sglang/srt/models/grok.py +5 -1
- sglang/srt/models/internlm2.py +5 -1
- sglang/srt/models/llama2.py +7 -3
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/minicpm.py +5 -1
- sglang/srt/models/mixtral.py +6 -2
- sglang/srt/models/mixtral_quant.py +5 -1
- sglang/srt/models/qwen.py +5 -2
- sglang/srt/models/qwen2.py +6 -2
- sglang/srt/models/qwen2_moe.py +5 -14
- sglang/srt/models/stablelm.py +5 -1
- sglang/srt/openai_api/adapter.py +16 -1
- sglang/srt/openai_api/protocol.py +5 -5
- sglang/srt/sampling/sampling_batch_info.py +79 -6
- sglang/srt/server.py +6 -6
- sglang/srt/utils.py +0 -3
- sglang/test/runners.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/METADATA +7 -7
- {sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/RECORD +55 -52
- {sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/LICENSE +0 -0
- {sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/WHEEL +0 -0
- {sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/top_level.txt +0 -0
@@ -45,6 +45,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -333,6 +334,7 @@ class QuantMixtralForCausalLM(nn.Module):
         self.model = MixtralModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
+        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -343,9 +345,11 @@ class QuantMixtralForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        return self.logits_processor(
+        logits_output = self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
+        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
+        return sample_output, logits_output

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
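
The same refactor repeats across the model implementations in this release: each causal-LM class constructs an sglang `Sampler` next to its `LogitsProcessor`, and `forward` now returns a `(sample_output, logits_output)` pair instead of bare logits. The toy module below only illustrates that return-shape convention with stand-in layers; `TinyLM` and its internals are illustrative and are not sglang APIs.

```python
# Illustrative sketch (not sglang code): compute logits and sample inside forward,
# returning both results, as the 0.2.15 model files now do.
import torch
import torch.nn as nn


class TinyLM(nn.Module):
    def __init__(self, vocab_size: int = 128, hidden: int = 32):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden)
        self.lm_head = nn.Linear(hidden, vocab_size, bias=False)

    @torch.no_grad()
    def forward(self, input_ids: torch.Tensor, temperature: float = 1.0):
        hidden_states = self.embed(input_ids).mean(dim=1)       # stand-in for the transformer stack
        logits_output = self.lm_head(hidden_states)              # stand-in for LogitsProcessor
        probs = torch.softmax(logits_output / temperature, dim=-1)
        sample_output = torch.multinomial(probs, num_samples=1)  # stand-in for Sampler
        return sample_output, logits_output                      # new return shape


if __name__ == "__main__":
    model = TinyLM()
    tokens, logits = model(torch.randint(0, 128, (2, 8)))
    print(tokens.shape, logits.shape)  # torch.Size([2, 1]) torch.Size([2, 128])
```

Callers that previously consumed the raw logits returned by `forward` need to unpack the tuple after this change.
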
sglang/srt/models/qwen.py
CHANGED
@@ -39,6 +39,7 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -251,6 +252,7 @@ class QWenLMHeadModel(nn.Module):
         vocab_size = ((config.vocab_size + 63) // 64) * 64
         self.lm_head = ParallelLMHead(vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
+        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -260,10 +262,11 @@ class QWenLMHeadModel(nn.Module):
         input_metadata: InputMetadata,
     ):
         hidden_states = self.transformer(input_ids, positions, input_metadata)
-
+        logits_output = self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-
+        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
+        return sample_output, logits_output

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/qwen2.py
CHANGED
@@ -38,8 +38,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.pooler import
+from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata

 Qwen2Config = None
@@ -276,6 +277,7 @@ class Qwen2ForCausalLM(nn.Module):
         self.model = Qwen2Model(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
+        self.sampler = Sampler()
         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)

     @torch.no_grad()
@@ -289,9 +291,11 @@ class Qwen2ForCausalLM(nn.Module):
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
         if not get_embedding:
-            return self.logits_processor(
+            logits_output = self.logits_processor(
                 input_ids, hidden_states, self.lm_head.weight, input_metadata
             )
+            sample_output = self.sampler(logits_output, input_metadata.sampling_info)
+            return sample_output, logits_output
         else:
             return self.pooler(hidden_states, input_metadata)

sglang/srt/models/qwen2_moe.py
CHANGED
@@ -35,10 +35,8 @@ from vllm.model_executor.layers.linear import (
     ReplicatedLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -49,6 +47,7 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -366,6 +365,7 @@ class Qwen2MoeForCausalLM(nn.Module):
             config.vocab_size, config.hidden_size, quant_config=quant_config
         )
         self.logits_processor = LogitsProcessor(config)
+        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -376,20 +376,11 @@ class Qwen2MoeForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        return self.logits_processor(
+        logits_output = self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-
-
-        self,
-        input_ids: torch.Tensor,
-        hidden_states: torch.Tensor,
-        input_metadata: InputMetadata,
-    ) -> torch.Tensor:
-        logits = self.logits_processor(
-            input_ids, hidden_states, self.lm_head.weight, input_metadata
-        )
-        return logits
+        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
+        return sample_output, logits_output

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/stablelm.py
CHANGED
@@ -40,6 +40,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -249,6 +250,7 @@ class StableLmForCausalLM(nn.Module):
         self.model = StableLMEpochModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
+        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -259,9 +261,11 @@ class StableLmForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        return self.logits_processor(
+        logits_output = self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
+        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
+        return sample_output, logits_output

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/openai_api/adapter.py
CHANGED
@@ -844,8 +844,23 @@ def v1_chat_generate_request(
         if not isinstance(request.messages, str):
             # Apply chat template and its stop strings.
             if chat_template_name is None:
+                openai_compatible_messages = []
+                for message in request.messages:
+                    if isinstance(message.content, str):
+                        openai_compatible_messages.append(
+                            {"role": message.role, "content": message.content}
+                        )
+                    else:
+                        content_list = message.dict()["content"]
+                        for content in content_list:
+                            if content["type"] == "text":
+                                openai_compatible_messages.append(
+                                    {"role": message.role, "content": content["text"]}
+                                )
                 prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
-
+                    openai_compatible_messages,
+                    tokenize=True,
+                    add_generation_prompt=True,
                 )
                 stop = request.stop
                 image_data = None
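
The new branch flattens structured message content into plain `{"role", "content"}` dicts before handing the conversation to the tokenizer's chat template. The standalone sketch below mirrors that flattening on plain dicts; it uses no sglang or pydantic imports and is purely illustrative.

```python
# Illustrative sketch of the flattening above: multi-part message content is reduced
# to its text parts before being passed to apply_chat_template.
from typing import Dict, List, Union

Message = Dict[str, Union[str, List[Dict[str, str]]]]


def to_openai_compatible(messages: List[Message]) -> List[Dict[str, str]]:
    flat = []
    for message in messages:
        content = message["content"]
        if isinstance(content, str):
            flat.append({"role": message["role"], "content": content})
        else:
            # Keep only the text parts; non-text parts (e.g. images) are handled elsewhere.
            for part in content:
                if part.get("type") == "text":
                    flat.append({"role": message["role"], "content": part["text"]})
    return flat


print(to_openai_compatible([
    {"role": "system", "content": "You are helpful."},
    {"role": "user", "content": [
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": "http://example.com/x.png"},
    ]},
]))
```
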
sglang/srt/openai_api/protocol.py
CHANGED
@@ -200,11 +200,6 @@ class CompletionStreamResponse(BaseModel):
     usage: Optional[UsageInfo] = None


-class ChatCompletionMessageGenericParam(BaseModel):
-    role: Literal["system", "assistant"]
-    content: str
-
-
 class ChatCompletionMessageContentTextPart(BaseModel):
     type: Literal["text"]
     text: str
@@ -225,6 +220,11 @@ ChatCompletionMessageContentPart = Union[
 ]


+class ChatCompletionMessageGenericParam(BaseModel):
+    role: Literal["system", "assistant"]
+    content: Union[str, List[ChatCompletionMessageContentTextPart]]
+
+
 class ChatCompletionMessageUserParam(BaseModel):
     role: Literal["user"]
     content: Union[str, List[ChatCompletionMessageContentPart]]
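
With this protocol change, system and assistant messages may carry either a plain string or a list of text parts, mirroring the shape already allowed for user messages. The small pydantic sketch below reproduces the relaxed schema with standalone models for illustration; it is not imported from sglang and its validation details may differ from the package's.

```python
# Illustrative pydantic sketch of the relaxed message schema.
from typing import List, Literal, Union

from pydantic import BaseModel


class TextPart(BaseModel):
    type: Literal["text"]
    text: str


class GenericMessage(BaseModel):
    role: Literal["system", "assistant"]
    content: Union[str, List[TextPart]]


# Both forms now validate:
print(GenericMessage(role="system", content="You are a helpful assistant."))
print(GenericMessage(role="system", content=[{"type": "text", "text": "You are terse."}]))
```
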
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -21,10 +21,63 @@ class SamplingBatchInfo:
     top_ps: torch.Tensor = None
     top_ks: torch.Tensor = None
     min_ps: torch.Tensor = None
-
+
+    # Dispatch in CUDA graph
+    need_min_p_sampling: bool = False
+
+    # Bias Tensors
     logit_bias: torch.Tensor = None
     vocab_mask: torch.Tensor = None

+    # Penalizer
+    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
+    linear_penalties: torch.Tensor = None
+    scaling_penalties: torch.Tensor = None
+
+    def has_bias(self):
+        return (
+            self.logit_bias is not None
+            or self.vocab_mask is not None
+            or self.linear_penalties is not None
+            or self.scaling_penalties is not None
+        )
+
+    @classmethod
+    def dummy_one(cls, max_bs: int, vocab_size: int):
+        ret = cls(vocab_size=vocab_size)
+        ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
+        ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
+        ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
+        ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
+        return ret
+
+    def __getitem__(self, key):
+        if isinstance(key, slice):
+            # NOTE: We do not use cuda graph when there is bias tensors
+            assert not self.has_bias()
+            return SamplingBatchInfo(
+                vocab_size=self.vocab_size,
+                temperatures=self.temperatures[key],
+                top_ps=self.top_ps[key],
+                top_ks=self.top_ks[key],
+                min_ps=self.min_ps[key],
+                need_min_p_sampling=self.need_min_p_sampling,
+            )
+        else:
+            raise NotImplementedError
+
+    def inplace_assign(self, bs: int, other: SamplingBatchInfo):
+        # NOTE: We do not use cuda graph when there is bias tensors
+        assert not self.has_bias()
+
+        self.vocab_size = other.vocab_size
+        self.need_min_p_sampling = other.need_min_p_sampling
+
+        self.temperatures[:bs] = other.temperatures
+        self.top_ps[:bs] = other.top_ps
+        self.top_ks[:bs] = other.top_ks
+        self.min_ps[:bs] = other.min_ps
+
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         device = "cuda"
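
These helpers exist so the CUDA-graph path (see the `cuda_graph_runner.py` entry in the file list) can allocate padded sampling tensors once at capture time and then copy live per-request values into those captured buffers at replay time. The toy, CPU-only sketch below mimics that mechanism with plain tensors; `ToySamplingInfo` is an illustrative stand-in, not the real `SamplingBatchInfo`.

```python
# Illustrative capture/replay sketch: allocate padded buffers once (dummy_one),
# slice a view for capture (__getitem__), and copy live values in place at replay
# time (inplace_assign) so the captured storage never changes address.
from dataclasses import dataclass

import torch


@dataclass
class ToySamplingInfo:
    temperatures: torch.Tensor = None
    top_ps: torch.Tensor = None

    @classmethod
    def dummy_one(cls, max_bs: int):
        return cls(
            temperatures=torch.ones((max_bs, 1)),
            top_ps=torch.ones((max_bs,)),
        )

    def __getitem__(self, key):
        # Slicing returns views over the same storage, like the real helper.
        assert isinstance(key, slice)
        return ToySamplingInfo(self.temperatures[key], self.top_ps[key])

    def inplace_assign(self, bs: int, other: "ToySamplingInfo"):
        # Refresh the captured buffers without reallocating them.
        self.temperatures[:bs] = other.temperatures
        self.top_ps[:bs] = other.top_ps


MAX_BS = 8
padded = ToySamplingInfo.dummy_one(MAX_BS)   # allocated once at capture time
captured_view = padded[:4]                   # what a bs=4 graph would capture

live = ToySamplingInfo(torch.full((4, 1), 0.7), torch.full((4,), 0.9))
padded.inplace_assign(4, live)               # copy live values in before replay
print(captured_view.temperatures.flatten(), captured_view.top_ps)
```
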
@@ -45,6 +98,7 @@ class SamplingBatchInfo:
         ret.min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
         )
+        ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)

         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
         # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
@@ -72,6 +126,25 @@ class SamplingBatchInfo:

         return ret

+    def prepare_penalties(self):
+        self.scaling_penalties = None
+        self.linear_penalties = None
+
+        for penalizer in self.penalizer_orchestrator.penalizers.values():
+            if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
+                if penalizer.is_prepared():
+                    self.scaling_penalties = penalizer.cumulated_repetition_penalties
+            else:
+                if penalizer.is_prepared():
+                    if self.linear_penalties is None:
+                        bs = self.penalizer_orchestrator.batch.batch_size()
+                        self.linear_penalties = torch.zeros(
+                            (bs, self.vocab_size),
+                            dtype=torch.float32,
+                            device="cuda",
+                        )
+                    self.linear_penalties = penalizer.apply(self.linear_penalties)
+
     def update_regex_vocab_mask(self, batch: ScheduleBatch):
         bs, reqs = batch.batch_size(), batch.reqs
         device = "cuda"
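
`prepare_penalties` splits the batched penalizers into two dense tensors the sampler can apply cheaply: `scaling_penalties` from the repetition penalizer and `linear_penalties` accumulated from the remaining penalizers. How the updated sampler folds these into the logits is not shown in this section, so the sketch below is an assumption based on the common additive/multiplicative penalty formulas, not sglang's actual code.

```python
# Toy sketch (assumption, not sglang's Sampler): apply additive and multiplicative
# penalty tensors to a batch of logits before sampling.
import torch


def apply_penalties(logits: torch.Tensor,
                    linear_penalties: torch.Tensor = None,
                    scaling_penalties: torch.Tensor = None) -> torch.Tensor:
    if linear_penalties is not None:
        # Additive penalties (e.g. presence/frequency style), accumulated per token id.
        logits = logits + linear_penalties
    if scaling_penalties is not None:
        # Multiplicative repetition penalty: shrink positive logits, grow negative ones.
        logits = torch.where(logits > 0, logits / scaling_penalties, logits * scaling_penalties)
    return logits


bs, vocab = 2, 10
logits = torch.randn(bs, vocab)
linear = torch.zeros(bs, vocab)
linear[0, 3] = -2.0            # penalize token 3 for request 0
scaling = torch.ones(bs, vocab)
scaling[1, 7] = 1.3            # repetition penalty on token 7 for request 1
print(apply_penalties(logits, linear, scaling).shape)
```
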
@@ -81,15 +154,15 @@ class SamplingBatchInfo:
         self.vocab_mask = None

         if has_regex:
+            self.vocab_mask = torch.zeros(
+                bs, self.vocab_size, dtype=torch.bool, device=device
+            )
             for i, req in enumerate(reqs):
                 if req.regex_fsm is not None:
-
-                    self.vocab_mask = torch.zeros(
-                        bs, self.vocab_size, dtype=torch.bool, device=device
-                    )
+                    self.vocab_mask[i].fill_(1)
                     self.vocab_mask[i][
                         req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens
-                    ] =
+                    ] = 0

     def filter(self, unfinished_indices: List[int], new_indices: torch.Tensor):
         self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
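
Note the mask semantics after this change: the boolean `vocab_mask` is allocated once for the whole batch, a regex-constrained row is first filled with 1 (banned), and the FSM's allowed tokens are reset to 0. Where the mask is applied is not shown in this section (presumably the updated `sampler.py` from the file list), so the sketch below is an assumption about how such a ban-mask would typically be used.

```python
# Toy sketch (assumption): a boolean ban-mask where True marks tokens the regex FSM
# disallows, applied to logits before sampling.
import torch

bs, vocab_size = 2, 8
logits = torch.randn(bs, vocab_size)

vocab_mask = torch.zeros(bs, vocab_size, dtype=torch.bool)
allowed_for_req0 = torch.tensor([1, 4, 5])  # tokens the FSM allows for request 0
vocab_mask[0].fill_(True)                   # ban everything for the constrained request...
vocab_mask[0][allowed_for_req0] = False     # ...then re-allow the FSM's tokens
# Request 1 has no regex constraint, so its row stays all False (nothing banned).

masked_logits = logits.masked_fill(vocab_mask, float("-inf"))
probs = torch.softmax(masked_logits, dim=-1)
print(probs[0].nonzero(as_tuple=True)[0])   # only tokens 1, 4, 5 remain sampleable
```
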
sglang/srt/server.py
CHANGED
@@ -272,7 +272,7 @@ async def retrieve_file_content(file_id: str):

 def launch_server(
     server_args: ServerArgs,
-
+    model_override_args: Optional[dict] = None,
     pipe_finish_writer: Optional[mp.connection.Connection] = None,
 ):
     """Launch an HTTP server."""
@@ -317,7 +317,7 @@ def launch_server(
             tp_rank_range,
             server_args,
             ports[3],
-
+            model_override_args,
         )

     try:
@@ -328,7 +328,7 @@ def launch_server(
         return

     # Launch processes
-    tokenizer_manager = TokenizerManager(server_args, port_args,
+    tokenizer_manager = TokenizerManager(server_args, port_args, model_override_args)
     if server_args.chat_template:
         load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
     pipe_controller_reader, pipe_controller_writer = mp.Pipe(duplex=False)
@@ -341,7 +341,7 @@ def launch_server(

     proc_controller = mp.Process(
         target=start_controller_process,
-        args=(server_args, port_args, pipe_controller_writer,
+        args=(server_args, port_args, pipe_controller_writer, model_override_args),
     )
     proc_controller.start()

@@ -501,7 +501,7 @@ class Runtime:
     def __init__(
         self,
         log_level: str = "error",
-
+        model_override_args: Optional[dict] = None,
         *args,
         **kwargs,
     ):
@@ -525,7 +525,7 @@ class Runtime:

         proc = mp.Process(
             target=launch_server,
-            args=(self.server_args,
+            args=(self.server_args, model_override_args, pipe_writer),
         )
         proc.start()
         pipe_writer.close()
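
The `model_override_args` parameter threads a dict of model-config overrides from `Runtime`/`launch_server` down to the tokenizer manager and controller processes. A hedged usage sketch follows; the model path and the `rope_scaling` override shown are illustrative assumptions, and which override keys are honored depends on the model's config, not on this diff.

```python
# Usage sketch for the model_override_args keyword visible in the diff above.
# Requires a working sglang install and a GPU; the override contents are illustrative.
import sglang as sgl

runtime = sgl.Runtime(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",
    model_override_args={"rope_scaling": {"type": "dynamic", "factor": 2.0}},
)
sgl.set_default_backend(runtime)
# ... run sgl.function programs here ...
runtime.shutdown()
```
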
sglang/srt/utils.py
CHANGED
@@ -407,7 +407,6 @@ def monkey_patch_vllm_dummy_weight_loader():
         DummyModelLoader,
         LoRAConfig,
         ModelConfig,
-        MultiModalConfig,
         ParallelConfig,
         SchedulerConfig,
         _initialize_model,
@@ -422,7 +421,6 @@ def monkey_patch_vllm_dummy_weight_loader():
         model_config: ModelConfig,
         device_config: DeviceConfig,
         lora_config: Optional[LoRAConfig],
-        multimodal_config: Optional[MultiModalConfig],
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
         cache_config: CacheConfig,
@@ -433,7 +431,6 @@ def monkey_patch_vllm_dummy_weight_loader():
             model_config,
             self.load_config,
             lora_config,
-            multimodal_config,
             cache_config,
         )

sglang/test/runners.py
CHANGED
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.14.post2"
+__version__ = "0.2.15"
{sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.14.post2
+Version: 0.2.15
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.14.post2 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.15 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -489,7 +489,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ### Supported Models

 **Generative Models**
-
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
@@ -509,6 +508,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Exaone 3

 **Embedding Models**

@@ -636,7 +636,7 @@ print(state["answer_1"])
 #### More Examples

 Anthropic and VertexAI (Gemini) models are also supported.
-You can find more examples at [examples/quick_start](examples/quick_start).
+You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

 ### Language Feature
 To begin with, import sglang.
@@ -649,7 +649,7 @@ You can implement your prompt flow in a function decorated by `sgl.function`.
 You can then invoke the function with `run` or `run_batch`.
 The system will manage the state, chat template, parallelism and batching for you.

-The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)
+The complete code for the examples below can be found at [readme_examples.py](examples/frontend_language/usage/readme_examples.py)

 #### Control Flow
 You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
@@ -698,7 +698,7 @@ def image_qa(s, image_file, question):
 s += sgl.assistant(sgl.gen("answer", max_tokens=256)
 ```

-See also [srt_example_llava.py](examples/quick_start/
+See also [srt_example_llava.py](examples/frontend_language/quick_start/local_example_llava_next.py).

 #### Constrained Decoding
 Use `regex` to specify a regular expression as a decoding constraint.
@@ -742,7 +742,7 @@ def character_gen(s, name):
 s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```

-See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
+See also [json_decode.py](examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.