sglang 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- sglang/__init__.py +8 -0
- sglang/api.py +10 -2
- sglang/bench_latency.py +151 -40
- sglang/bench_serving.py +46 -22
- sglang/check_env.py +24 -2
- sglang/global_config.py +0 -1
- sglang/lang/backend/base_backend.py +3 -1
- sglang/lang/backend/openai.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +46 -29
- sglang/lang/choices.py +164 -0
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +6 -13
- sglang/lang/ir.py +14 -5
- sglang/srt/constrained/base_tool_cache.py +1 -1
- sglang/srt/constrained/fsm_cache.py +12 -2
- sglang/srt/layers/activation.py +33 -0
- sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
- sglang/srt/layers/extend_attention.py +6 -1
- sglang/srt/layers/layernorm.py +65 -0
- sglang/srt/layers/logits_processor.py +6 -1
- sglang/srt/layers/pooler.py +50 -0
- sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
- sglang/srt/layers/radix_attention.py +4 -7
- sglang/srt/managers/detokenizer_manager.py +31 -9
- sglang/srt/managers/io_struct.py +63 -0
- sglang/srt/managers/policy_scheduler.py +173 -25
- sglang/srt/managers/schedule_batch.py +174 -380
- sglang/srt/managers/tokenizer_manager.py +197 -112
- sglang/srt/managers/tp_worker.py +299 -364
- sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
- sglang/srt/mem_cache/chunk_cache.py +43 -20
- sglang/srt/mem_cache/memory_pool.py +10 -15
- sglang/srt/mem_cache/radix_cache.py +74 -40
- sglang/srt/model_executor/cuda_graph_runner.py +27 -12
- sglang/srt/model_executor/forward_batch_info.py +319 -0
- sglang/srt/model_executor/model_runner.py +30 -47
- sglang/srt/models/chatglm.py +1 -1
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_v2.py +1 -1
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +1 -2
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +1 -1
- sglang/srt/models/internlm2.py +3 -8
- sglang/srt/models/llama2.py +5 -5
- sglang/srt/models/llama_classification.py +1 -1
- sglang/srt/models/llama_embedding.py +88 -0
- sglang/srt/models/llava.py +1 -2
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +1 -1
- sglang/srt/models/mixtral.py +1 -1
- sglang/srt/models/mixtral_quant.py +1 -1
- sglang/srt/models/qwen.py +1 -1
- sglang/srt/models/qwen2.py +1 -1
- sglang/srt/models/qwen2_moe.py +1 -12
- sglang/srt/models/stablelm.py +1 -1
- sglang/srt/openai_api/adapter.py +189 -39
- sglang/srt/openai_api/protocol.py +43 -1
- sglang/srt/sampling/penaltylib/__init__.py +13 -0
- sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
- sglang/srt/sampling_params.py +31 -4
- sglang/srt/server.py +93 -21
- sglang/srt/server_args.py +30 -19
- sglang/srt/utils.py +31 -13
- sglang/test/run_eval.py +10 -1
- sglang/test/runners.py +63 -63
- sglang/test/simple_eval_humaneval.py +2 -8
- sglang/test/simple_eval_mgsm.py +203 -0
- sglang/test/srt/sampling/penaltylib/utils.py +337 -0
- sglang/test/test_layernorm.py +60 -0
- sglang/test/test_programs.py +4 -2
- sglang/test/test_utils.py +21 -3
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/METADATA +50 -31
- sglang-0.2.12.dist-info/RECORD +112 -0
- sglang/srt/layers/linear.py +0 -884
- sglang/srt/layers/quantization/__init__.py +0 -64
- sglang/srt/layers/quantization/fp8.py +0 -677
- sglang-0.2.10.dist-info/RECORD +0 -100
- {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/LICENSE +0 -0
- {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/WHEEL +0 -0
- {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/top_level.txt +0 -0
sglang/test/srt/sampling/penaltylib/utils.py
ADDED
@@ -0,0 +1,337 @@
+import dataclasses
+import enum
+import typing
+import unittest
+
+import torch
+
+from sglang.srt.sampling.penaltylib.orchestrator import (
+    BatchedPenalizerOrchestrator,
+    _BatchedPenalizer,
+    _BatchLike,
+)
+
+
+@dataclasses.dataclass
+class MockSamplingParams:
+    frequency_penalty: float = 0.0
+    min_new_tokens: int = 0
+    stop_token_ids: typing.List[int] = None
+    presence_penalty: float = 0.0
+    repetition_penalty: float = 1.0
+
+
+@dataclasses.dataclass
+class MockTokenizer:
+    eos_token_id: int
+
+
+@dataclasses.dataclass
+class MockReq:
+    origin_input_ids: typing.List[int]
+    sampling_params: MockSamplingParams
+    tokenizer: MockTokenizer
+
+
+class StepType(enum.Enum):
+    INPUT = "input"
+    OUTPUT = "output"
+
+
+@dataclasses.dataclass
+class Step:
+    type: StepType
+    token_ids: typing.List[int]
+    expected_tensors: typing.Dict[str, torch.Tensor]
+    # assume initial logits are all 1
+    expected_logits: torch.Tensor
+
+
+@dataclasses.dataclass
+class Subject:
+    sampling_params: MockSamplingParams
+    # first step must be input, which will be converted to Req
+    steps: typing.List[Step]
+    eos_token_id: int = -1
+
+    def __post_init__(self):
+        if self.steps[0].type != StepType.INPUT:
+            raise ValueError("First step must be input")
+
+        # each steps should have the same expected_tensors.keys()
+        for i in range(1, len(self.steps)):
+            if self.tensor_keys(i) != self.tensor_keys():
+                raise ValueError(
+                    f"Expected tensors keys must be the same for all steps. Got {self.steps[i].expected_tensors.keys()} for key={i} and {self.steps[0].expected_tensors.keys()}"
+                )
+
+    def tensor_keys(self, i: int = 0) -> typing.Set[str]:
+        return set(self.steps[i].expected_tensors.keys())
+
+    def to_req(self) -> MockReq:
+        return MockReq(
+            origin_input_ids=self.steps[0].token_ids,
+            sampling_params=self.sampling_params,
+            tokenizer=MockTokenizer(eos_token_id=self.eos_token_id),
+        )
+
+
+@dataclasses.dataclass
+class Case:
+    enabled: bool
+    test_subjects: typing.List[Subject]
+
+    def __post_init__(self):
+        # each test_subjects.steps should have the same expected_tensors.keys()
+        for i in range(1, len(self.test_subjects)):
+            if self.tensor_keys(i) != self.tensor_keys():
+                raise ValueError(
+                    f"Expected tensors keys must be the same for all test_subjects. Got {self.test_subjects[i].tensor_keys()} for key={i} and {self.test_subjects[0].tensor_keys()}"
+                )
+
+    def tensor_keys(self, i: int = 0) -> typing.List[str]:
+        return set(self.test_subjects[i].tensor_keys())
+
+
+class BaseBatchedPenalizerTest(unittest.TestCase):
+    Penalizer: typing.Type[_BatchedPenalizer]
+    device = "cuda"
+    vocab_size = 5
+
+    enabled: Subject = None
+    disabled: Subject = None
+
+    def setUp(self):
+        if self.__class__ == BaseBatchedPenalizerTest:
+            self.skipTest("Base class for penalizer tests")
+
+        self.create_test_subjects()
+        self.create_test_cases()
+
+    def tensor(self, data, **kwargs) -> torch.Tensor:
+        """
+        Shortcut to create a tensor with device=self.device.
+        """
+        return torch.tensor(data, **kwargs, device=self.device)
+
+    def create_test_subjects(self) -> typing.List[Subject]:
+        raise NotImplementedError()
+
+    def create_test_cases(self):
+        self.test_cases = [
+            Case(enabled=True, test_subjects=[self.enabled]),
+            Case(enabled=False, test_subjects=[self.disabled]),
+            Case(enabled=True, test_subjects=[self.enabled, self.disabled]),
+        ]
+
+    def _create_penalizer(
+        self, case: Case
+    ) -> typing.Tuple[BatchedPenalizerOrchestrator, _BatchedPenalizer]:
+        orchestrator = BatchedPenalizerOrchestrator(
+            vocab_size=self.vocab_size,
+            batch=_BatchLike(reqs=[subject.to_req() for subject in case.test_subjects]),
+            device=self.device,
+            Penalizers={self.Penalizer},
+        )
+
+        return orchestrator, orchestrator.penalizers[self.Penalizer]
+
+    def test_is_required(self):
+        for case in self.test_cases:
+            with self.subTest(case=case):
+                _, penalizer = self._create_penalizer(case)
+                self.assertEqual(case.enabled, penalizer.is_required())
+
+    def test_prepare(self):
+        for case in self.test_cases:
+            with self.subTest(case=case):
+                orchestrator, penalizer = self._create_penalizer(case)
+                self.assertEqual(case.enabled, penalizer.is_prepared())
+
+                if case.enabled:
+                    for key, tensor in {
+                        key: torch.cat(
+                            tensors=[
+                                subject.steps[0].expected_tensors[key]
+                                for subject in case.test_subjects
+                            ],
+                        )
+                        for key in case.tensor_keys()
+                    }.items():
+                        torch.testing.assert_close(
+                            actual=getattr(penalizer, key),
+                            expected=tensor,
+                            msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+                        )
+
+                actual = orchestrator.apply(
+                    torch.ones(
+                        size=(len(case.test_subjects), self.vocab_size),
+                        dtype=torch.float32,
+                        device=self.device,
+                    )
+                )
+                expected = torch.cat(
+                    tensors=[
+                        subject.steps[0].expected_logits
+                        for subject in case.test_subjects
+                    ],
+                )
+                torch.testing.assert_close(
+                    actual=actual,
+                    expected=expected,
+                    msg=f"logits\nactual={actual}\nexpected={expected}",
+                )
+
+    def test_teardown(self):
+        for case in self.test_cases:
+            with self.subTest(case=case):
+                _, penalizer = self._create_penalizer(case)
+                penalizer.teardown()
+
+                for key in case.test_subjects[0].steps[0].expected_tensors.keys():
+                    self.assertIsNone(getattr(penalizer, key, None))
+
+    def test_filter(self):
+        for case in self.test_cases:
+            with self.subTest(case=case):
+                orchestrator, penalizer = self._create_penalizer(case)
+
+                indices_to_keep = [0]
+                orchestrator.filter(indices_to_keep=indices_to_keep)
+
+                filtered_subjects = [case.test_subjects[i] for i in indices_to_keep]
+
+                if penalizer.is_required():
+                    self.assertTrue(penalizer.is_prepared())
+                    for key, tensor in {
+                        key: torch.cat(
+                            tensors=[
+                                subject.steps[0].expected_tensors[key]
+                                for subject in filtered_subjects
+                            ],
+                        )
+                        for key in case.tensor_keys()
+                    }.items():
+                        torch.testing.assert_close(
+                            actual=getattr(penalizer, key),
+                            expected=tensor,
+                            msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+                        )
+
+                actual_logits = orchestrator.apply(
+                    torch.ones(
+                        size=(len(filtered_subjects), self.vocab_size),
+                        dtype=torch.float32,
+                        device=self.device,
+                    )
+                )
+                filtered_expected_logits = torch.cat(
+                    tensors=[
+                        subject.steps[0].expected_logits
+                        for subject in filtered_subjects
+                    ],
+                )
+                torch.testing.assert_close(
+                    actual=actual_logits,
+                    expected=filtered_expected_logits,
+                    msg=f"logits\nactual={actual_logits}\nexpected={filtered_expected_logits}",
+                )
+
+    def test_merge_enabled_with_disabled(self):
+        enabled_test_case = self.test_cases[0]
+        disabled_test_case = self.test_cases[1]
+
+        orchestrator, penalizer = self._create_penalizer(enabled_test_case)
+        theirs, _ = self._create_penalizer(disabled_test_case)
+
+        orchestrator.merge(theirs)
+
+        for key, tensor in {
+            key: torch.cat(
+                tensors=[
+                    enabled_test_case.test_subjects[0].steps[0].expected_tensors[key],
+                    disabled_test_case.test_subjects[0].steps[0].expected_tensors[key],
+                ],
+            )
+            for key in enabled_test_case.tensor_keys()
+        }.items():
+            torch.testing.assert_close(
+                actual=getattr(penalizer, key),
+                expected=tensor,
+                msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+            )
+
+    def test_cumulate_apply_repeat(self):
+        for case in self.test_cases:
+            with self.subTest(case=case):
+                orchestrator, penalizer = self._create_penalizer(case)
+
+                max_step = max(len(subject.steps) for subject in case.test_subjects)
+                for i in range(1, max_step):
+                    orchestrator.filter(
+                        indices_to_keep=[
+                            j
+                            for j, subject in enumerate(case.test_subjects)
+                            if i < len(subject.steps)
+                        ]
+                    )
+
+                    filtered_subjects = [
+                        subject
+                        for subject in case.test_subjects
+                        if i < len(subject.steps)
+                    ]
+
+                    inputs: typing.List[typing.List[int]] = []
+                    outputs: typing.List[typing.List[int]] = []
+                    for subject in filtered_subjects:
+                        step = subject.steps[i]
+                        if step.type == StepType.INPUT:
+                            inputs.append(step.token_ids)
+                            outputs.append([])
+                        else:
+                            inputs.append([])
+                            outputs.append(step.token_ids)
+
+                    if any(inputs):
+                        orchestrator.cumulate_input_tokens(inputs)
+
+                    if any(outputs):
+                        orchestrator.cumulate_output_tokens(outputs)
+
+                    if penalizer.is_required():
+                        self.assertTrue(penalizer.is_prepared())
+                        for key, tensor in {
+                            key: torch.cat(
+                                tensors=[
+                                    subject.steps[i].expected_tensors[key]
+                                    for subject in filtered_subjects
+                                ],
+                            )
+                            for key in case.tensor_keys()
+                        }.items():
+                            torch.testing.assert_close(
+                                actual=getattr(penalizer, key),
+                                expected=tensor,
+                                msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+                            )
+
+                    actual_logits = orchestrator.apply(
+                        torch.ones(
+                            size=(len(filtered_subjects), self.vocab_size),
+                            dtype=torch.float32,
+                            device=self.device,
+                        )
+                    )
+                    filtered_expected_logits = torch.cat(
+                        tensors=[
+                            subject.steps[i].expected_logits
+                            for subject in filtered_subjects
+                        ],
+                    )
+                    torch.testing.assert_close(
+                        actual=actual_logits,
+                        expected=filtered_expected_logits,
+                        msg=f"logits\nactual={actual_logits}\nexpected={filtered_expected_logits}",
+                    )
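For orientation, a concrete test is expected to subclass `BaseBatchedPenalizerTest`, point `Penalizer` at one of the new penalizers, and build an enabled and a disabled `Subject` in `create_test_subjects`. The sketch below is illustrative only: the `BatchedFrequencyPenalizer` import path and the `frequency_penalties` / `cumulated_frequency_penalties` tensor keys are assumptions about that penalizer's internals, not something this diff shows.

```python
# Illustrative sketch only -- penalizer attribute names and expected values are assumptions.
import torch

from sglang.srt.sampling.penaltylib.penalizers.frequency_penalty import (
    BatchedFrequencyPenalizer,
)
from sglang.test.srt.sampling.penaltylib.utils import (
    BaseBatchedPenalizerTest,
    MockSamplingParams,
    Step,
    StepType,
    Subject,
)


class TestBatchedFrequencyPenalizer(BaseBatchedPenalizerTest):
    Penalizer = BatchedFrequencyPenalizer

    def _create_subject(self, frequency_penalty: float) -> Subject:
        return Subject(
            sampling_params=MockSamplingParams(frequency_penalty=frequency_penalty),
            steps=[
                Step(
                    type=StepType.INPUT,
                    token_ids=[0, 1, 2],
                    # Keys must match the tensors the penalizer keeps internally;
                    # the names used here are assumed, not confirmed by the diff.
                    expected_tensors={
                        "frequency_penalties": self.tensor(
                            [[frequency_penalty] * self.vocab_size], dtype=torch.float32
                        ),
                        "cumulated_frequency_penalties": self.tensor(
                            [[0.0] * self.vocab_size], dtype=torch.float32
                        ),
                    },
                    # Prompt tokens are not penalized yet, so logits stay at the all-ones input.
                    expected_logits=self.tensor(
                        [[1.0] * self.vocab_size], dtype=torch.float32
                    ),
                ),
            ],
        )

    def create_test_subjects(self):
        self.enabled = self._create_subject(frequency_penalty=0.12)
        self.disabled = self._create_subject(frequency_penalty=0.0)
```

The harness only verifies that the declared expectations hold; what a given penalizer actually stores, and whether a zero penalty makes `is_required()` return `False`, depends on that penalizer's implementation.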
sglang/test/test_layernorm.py
ADDED
@@ -0,0 +1,60 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.layernorm import RMSNorm
+
+
+class TestRMSNorm(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 4096]
+    HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
+    ADD_RESIDUAL = [False, True]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_rms_norm_test(self, num_tokens, hidden_size, add_residual, dtype, seed):
+        torch.manual_seed(seed)
+
+        layer = RMSNorm(hidden_size).to(dtype=dtype)
+        layer.weight.data.normal_(mean=1.0, std=0.1)
+        scale = 1 / (2 * hidden_size)
+        x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
+        residual = torch.randn_like(x) * scale if add_residual else None
+
+        with torch.inference_mode():
+            ref_out = layer.forward_native(x, residual)
+            out = layer(x, residual)
+
+        if add_residual:
+            self.assertTrue(torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2))
+            self.assertTrue(torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2))
+
+    def test_rms_norm(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.HIDDEN_SIZES,
+            self.ADD_RESIDUAL,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                hidden_size=params[1],
+                add_residual=params[2],
+                dtype=params[3],
+                seed=params[4],
+            ):
+                self._run_rms_norm_test(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
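As a reminder of what the test above compares, RMSNorm scales each hidden vector by the reciprocal root-mean-square of its elements and then applies a learned per-channel weight; when a residual is passed, it is added to the input first and the updated residual is returned alongside the normalized output. Below is a minimal reference sketch of that computation, not sglang's implementation; the `eps` value is an assumption, and the authoritative reference in the test is `RMSNorm.forward_native`.

```python
import torch


def rms_norm_reference(x, weight, residual=None, eps=1e-6):
    """Minimal RMSNorm sketch: y = x / sqrt(mean(x^2) + eps) * weight."""
    if residual is not None:
        x = x + residual  # fused residual add
        residual = x      # the updated residual is also returned
    x_f32 = x.float()     # normalize in float32 for numerical stability
    inv_rms = torch.rsqrt(x_f32.pow(2).mean(dim=-1, keepdim=True) + eps)
    out = (x_f32 * inv_rms).to(x.dtype) * weight
    return out if residual is None else (out, residual)
```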
sglang/test/test_programs.py
CHANGED
@@ -149,7 +149,7 @@ def test_decode_json():
     assert isinstance(js_obj["population"], int)
 
 
-def test_expert_answer():
+def test_expert_answer(check_answer=True):
     @sgl.function
     def expert_answer(s, question):
         s += "Question: " + question + "\n"
@@ -167,7 +167,9 @@ def test_expert_answer():
     )
 
     ret = expert_answer.run(question="What is the capital of France?", temperature=0.1)
-
+
+    if check_answer:
+        assert "paris" in ret.text().lower(), f"Answer: {ret.text()}"
 
 
 def test_tool_use():
sglang/test/test_utils.py
CHANGED
@@ -12,13 +12,16 @@ from typing import Callable, List, Optional
 
 import numpy as np
 import requests
+import torch
+import torch.nn.functional as F
 
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import get_exception_traceback
 
-
+DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+DEFAULT_URL_FOR_TEST = "http://127.0.0.1:8157"
 
 
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -396,6 +399,8 @@ def popen_launch_server(
     timeout: float,
     api_key: Optional[str] = None,
     other_args: tuple = (),
+    env: Optional[dict] = None,
+    return_stdout_stderr: bool = False,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -415,7 +420,16 @@ def popen_launch_server(
     if api_key:
         command += ["--api-key", api_key]
 
-
+    if return_stdout_stderr:
+        process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env,
+            text=True,
+        )
+    else:
+        process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
 
     start_time = time.time()
     while time.time() - start_time < timeout:
@@ -482,7 +496,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
             p.terminate()
             time.sleep(5)
             print(
-                "\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
             )
             return False
 
@@ -492,3 +506,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
         print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
 
     return 0 if success else -1
+
+
+def get_similarities(vec1, vec2):
+    return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
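A rough sketch of how the new `env` and `return_stdout_stderr` options might be used from a test. Only the keyword parameters shown in the hunk above come from this diff; the leading model/URL positional arguments, the timeout value, and the extra flags are assumptions for illustration.

```python
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)

# Launch a test server and capture its stdout/stderr so the test can inspect logs.
process = popen_launch_server(
    DEFAULT_MODEL_NAME_FOR_TEST,   # assumed first positional parameter (model path)
    DEFAULT_URL_FOR_TEST,          # assumed second positional parameter (base URL)
    timeout=300,
    other_args=("--mem-fraction-static", "0.7"),
    env=None,                      # None inherits the parent environment
    return_stdout_stderr=True,     # pipe logs instead of passing them through
)
```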
sglang/utils.py
CHANGED
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.10"
+__version__ = "0.2.12"
{sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.10
+Version: 0.2.12
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -221,6 +221,9 @@ Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
+Provides-Extra: dev
+Requires-Dist: sglang[all]; extra == "dev"
+Requires-Dist: sglang[test]; extra == "dev"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: openai
@@ -232,7 +235,6 @@ Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf-transfer; extra == "srt"
 Requires-Dist: huggingface-hub; extra == "srt"
 Requires-Dist: interegular; extra == "srt"
-Requires-Dist: jsonlines; extra == "srt"
 Requires-Dist: packaging; extra == "srt"
 Requires-Dist: pillow; extra == "srt"
 Requires-Dist: psutil; extra == "srt"
@@ -242,8 +244,12 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.4; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
+Provides-Extra: test
+Requires-Dist: jsonlines; extra == "test"
+Requires-Dist: matplotlib; extra == "test"
+Requires-Dist: pandas; extra == "test"
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -296,20 +302,20 @@ pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.
+git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
@@ -383,22 +389,26 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
-- Add `--tp 2` to enable tensor parallelism. If it
+- Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
 ```
-- Add `--dp 2` to enable data parallelism. It can also be used together with
+- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving,
+- If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
+```
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
+```
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
@@ -408,29 +418,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
-
-
-```bash
-## Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-## Run 405B (fp16) on two nodes
-# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-```
-
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
-- Mistral / Mixtral
+- Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
@@ -448,10 +442,35 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 - Grok
 - ChatGLM
 - InternLM 2
-- Mistral NeMo
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
+#### Use Models From ModelScope
+To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+```
+export SGLANG_USE_MODELSCOPE=true
+```
+Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+```
+SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+```
+
+#### Run Llama 3.1 405B
+
+```bash
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+```
+
 ### Benchmark Performance
 
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -464,7 +483,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 ```
 
 ## Frontend: Structured Generation Language (SGLang)
-The frontend language can be used with local models or API models.
+The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.
 
 ### Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.