sglang 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. sglang/api.py +7 -1
  2. sglang/bench_latency.py +9 -6
  3. sglang/bench_serving.py +46 -22
  4. sglang/global_config.py +1 -1
  5. sglang/lang/backend/runtime_endpoint.py +60 -49
  6. sglang/lang/compiler.py +2 -2
  7. sglang/lang/interpreter.py +4 -2
  8. sglang/lang/ir.py +16 -7
  9. sglang/srt/constrained/base_tool_cache.py +1 -1
  10. sglang/srt/constrained/fsm_cache.py +12 -2
  11. sglang/srt/constrained/jump_forward.py +13 -2
  12. sglang/srt/layers/activation.py +32 -0
  13. sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
  14. sglang/srt/layers/extend_attention.py +9 -2
  15. sglang/srt/layers/fused_moe/__init__.py +1 -0
  16. sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
  17. sglang/srt/layers/fused_moe/layer.py +587 -0
  18. sglang/srt/layers/layernorm.py +65 -0
  19. sglang/srt/layers/logits_processor.py +7 -2
  20. sglang/srt/layers/pooler.py +50 -0
  21. sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
  22. sglang/srt/layers/radix_attention.py +40 -16
  23. sglang/srt/managers/detokenizer_manager.py +31 -9
  24. sglang/srt/managers/io_struct.py +63 -0
  25. sglang/srt/managers/policy_scheduler.py +173 -25
  26. sglang/srt/managers/schedule_batch.py +115 -97
  27. sglang/srt/managers/tokenizer_manager.py +194 -112
  28. sglang/srt/managers/tp_worker.py +290 -359
  29. sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
  30. sglang/srt/mem_cache/chunk_cache.py +43 -20
  31. sglang/srt/mem_cache/memory_pool.py +2 -2
  32. sglang/srt/mem_cache/radix_cache.py +74 -40
  33. sglang/srt/model_executor/cuda_graph_runner.py +71 -25
  34. sglang/srt/model_executor/forward_batch_info.py +293 -156
  35. sglang/srt/model_executor/model_runner.py +77 -57
  36. sglang/srt/models/chatglm.py +2 -2
  37. sglang/srt/models/commandr.py +1 -1
  38. sglang/srt/models/deepseek.py +2 -2
  39. sglang/srt/models/deepseek_v2.py +7 -6
  40. sglang/srt/models/gemma.py +1 -1
  41. sglang/srt/models/gemma2.py +11 -6
  42. sglang/srt/models/grok.py +50 -396
  43. sglang/srt/models/internlm2.py +2 -7
  44. sglang/srt/models/llama2.py +4 -4
  45. sglang/srt/models/llama_embedding.py +88 -0
  46. sglang/srt/models/minicpm.py +2 -2
  47. sglang/srt/models/mixtral.py +56 -254
  48. sglang/srt/models/mixtral_quant.py +1 -4
  49. sglang/srt/models/qwen.py +2 -2
  50. sglang/srt/models/qwen2.py +2 -2
  51. sglang/srt/models/qwen2_moe.py +2 -13
  52. sglang/srt/models/stablelm.py +1 -1
  53. sglang/srt/openai_api/adapter.py +187 -48
  54. sglang/srt/openai_api/protocol.py +37 -1
  55. sglang/srt/sampling/penaltylib/__init__.py +13 -0
  56. sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  57. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  58. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  59. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  60. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  61. sglang/srt/sampling_params.py +31 -8
  62. sglang/srt/server.py +91 -29
  63. sglang/srt/server_args.py +32 -19
  64. sglang/srt/utils.py +32 -15
  65. sglang/test/run_eval.py +10 -1
  66. sglang/test/runners.py +81 -73
  67. sglang/test/simple_eval_humaneval.py +2 -8
  68. sglang/test/simple_eval_mgsm.py +203 -0
  69. sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  70. sglang/test/test_layernorm.py +60 -0
  71. sglang/test/test_programs.py +36 -7
  72. sglang/test/test_utils.py +24 -2
  73. sglang/utils.py +0 -1
  74. sglang/version.py +1 -1
  75. {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/METADATA +33 -16
  76. sglang-0.2.13.dist-info/RECORD +112 -0
  77. {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/WHEEL +1 -1
  78. sglang/srt/layers/linear.py +0 -884
  79. sglang/srt/layers/quantization/__init__.py +0 -64
  80. sglang/srt/layers/quantization/fp8.py +0 -677
  81. sglang/srt/model_loader/model_loader.py +0 -292
  82. sglang/srt/model_loader/utils.py +0 -275
  83. sglang-0.2.11.dist-info/RECORD +0 -102
  84. {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/LICENSE +0 -0
  85. {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/top_level.txt +0 -0
sglang/test/srt/sampling/penaltylib/utils.py ADDED
@@ -0,0 +1,337 @@
+ import dataclasses
+ import enum
+ import typing
+ import unittest
+
+ import torch
+
+ from sglang.srt.sampling.penaltylib.orchestrator import (
+     BatchedPenalizerOrchestrator,
+     _BatchedPenalizer,
+     _BatchLike,
+ )
+
+
+ @dataclasses.dataclass
+ class MockSamplingParams:
+     frequency_penalty: float = 0.0
+     min_new_tokens: int = 0
+     stop_token_ids: typing.List[int] = None
+     presence_penalty: float = 0.0
+     repetition_penalty: float = 1.0
+
+
+ @dataclasses.dataclass
+ class MockTokenizer:
+     eos_token_id: int
+
+
+ @dataclasses.dataclass
+ class MockReq:
+     origin_input_ids: typing.List[int]
+     sampling_params: MockSamplingParams
+     tokenizer: MockTokenizer
+
+
+ class StepType(enum.Enum):
+     INPUT = "input"
+     OUTPUT = "output"
+
+
+ @dataclasses.dataclass
+ class Step:
+     type: StepType
+     token_ids: typing.List[int]
+     expected_tensors: typing.Dict[str, torch.Tensor]
+     # assume initial logits are all 1
+     expected_logits: torch.Tensor
+
+
+ @dataclasses.dataclass
+ class Subject:
+     sampling_params: MockSamplingParams
+     # first step must be input, which will be converted to Req
+     steps: typing.List[Step]
+     eos_token_id: int = -1
+
+     def __post_init__(self):
+         if self.steps[0].type != StepType.INPUT:
+             raise ValueError("First step must be input")
+
+         # each steps should have the same expected_tensors.keys()
+         for i in range(1, len(self.steps)):
+             if self.tensor_keys(i) != self.tensor_keys():
+                 raise ValueError(
+                     f"Expected tensors keys must be the same for all steps. Got {self.steps[i].expected_tensors.keys()} for key={i} and {self.steps[0].expected_tensors.keys()}"
+                 )
+
+     def tensor_keys(self, i: int = 0) -> typing.Set[str]:
+         return set(self.steps[i].expected_tensors.keys())
+
+     def to_req(self) -> MockReq:
+         return MockReq(
+             origin_input_ids=self.steps[0].token_ids,
+             sampling_params=self.sampling_params,
+             tokenizer=MockTokenizer(eos_token_id=self.eos_token_id),
+         )
+
+
+ @dataclasses.dataclass
+ class Case:
+     enabled: bool
+     test_subjects: typing.List[Subject]
+
+     def __post_init__(self):
+         # each test_subjects.steps should have the same expected_tensors.keys()
+         for i in range(1, len(self.test_subjects)):
+             if self.tensor_keys(i) != self.tensor_keys():
+                 raise ValueError(
+                     f"Expected tensors keys must be the same for all test_subjects. Got {self.test_subjects[i].tensor_keys()} for key={i} and {self.test_subjects[0].tensor_keys()}"
+                 )
+
+     def tensor_keys(self, i: int = 0) -> typing.List[str]:
+         return set(self.test_subjects[i].tensor_keys())
+
+
+ class BaseBatchedPenalizerTest(unittest.TestCase):
+     Penalizer: typing.Type[_BatchedPenalizer]
+     device = "cuda"
+     vocab_size = 5
+
+     enabled: Subject = None
+     disabled: Subject = None
+
+     def setUp(self):
+         if self.__class__ == BaseBatchedPenalizerTest:
+             self.skipTest("Base class for penalizer tests")
+
+         self.create_test_subjects()
+         self.create_test_cases()
+
+     def tensor(self, data, **kwargs) -> torch.Tensor:
+         """
+         Shortcut to create a tensor with device=self.device.
+         """
+         return torch.tensor(data, **kwargs, device=self.device)
+
+     def create_test_subjects(self) -> typing.List[Subject]:
+         raise NotImplementedError()
+
+     def create_test_cases(self):
+         self.test_cases = [
+             Case(enabled=True, test_subjects=[self.enabled]),
+             Case(enabled=False, test_subjects=[self.disabled]),
+             Case(enabled=True, test_subjects=[self.enabled, self.disabled]),
+         ]
+
+     def _create_penalizer(
+         self, case: Case
+     ) -> typing.Tuple[BatchedPenalizerOrchestrator, _BatchedPenalizer]:
+         orchestrator = BatchedPenalizerOrchestrator(
+             vocab_size=self.vocab_size,
+             batch=_BatchLike(reqs=[subject.to_req() for subject in case.test_subjects]),
+             device=self.device,
+             Penalizers={self.Penalizer},
+         )
+
+         return orchestrator, orchestrator.penalizers[self.Penalizer]
+
+     def test_is_required(self):
+         for case in self.test_cases:
+             with self.subTest(case=case):
+                 _, penalizer = self._create_penalizer(case)
+                 self.assertEqual(case.enabled, penalizer.is_required())
+
+     def test_prepare(self):
+         for case in self.test_cases:
+             with self.subTest(case=case):
+                 orchestrator, penalizer = self._create_penalizer(case)
+                 self.assertEqual(case.enabled, penalizer.is_prepared())
+
+                 if case.enabled:
+                     for key, tensor in {
+                         key: torch.cat(
+                             tensors=[
+                                 subject.steps[0].expected_tensors[key]
+                                 for subject in case.test_subjects
+                             ],
+                         )
+                         for key in case.tensor_keys()
+                     }.items():
+                         torch.testing.assert_close(
+                             actual=getattr(penalizer, key),
+                             expected=tensor,
+                             msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+                         )
+
+                 actual = orchestrator.apply(
+                     torch.ones(
+                         size=(len(case.test_subjects), self.vocab_size),
+                         dtype=torch.float32,
+                         device=self.device,
+                     )
+                 )
+                 expected = torch.cat(
+                     tensors=[
+                         subject.steps[0].expected_logits
+                         for subject in case.test_subjects
+                     ],
+                 )
+                 torch.testing.assert_close(
+                     actual=actual,
+                     expected=expected,
+                     msg=f"logits\nactual={actual}\nexpected={expected}",
+                 )
+
+     def test_teardown(self):
+         for case in self.test_cases:
+             with self.subTest(case=case):
+                 _, penalizer = self._create_penalizer(case)
+                 penalizer.teardown()
+
+                 for key in case.test_subjects[0].steps[0].expected_tensors.keys():
+                     self.assertIsNone(getattr(penalizer, key, None))
+
+     def test_filter(self):
+         for case in self.test_cases:
+             with self.subTest(case=case):
+                 orchestrator, penalizer = self._create_penalizer(case)
+
+                 indices_to_keep = [0]
+                 orchestrator.filter(indices_to_keep=indices_to_keep)
+
+                 filtered_subjects = [case.test_subjects[i] for i in indices_to_keep]
+
+                 if penalizer.is_required():
+                     self.assertTrue(penalizer.is_prepared())
+                     for key, tensor in {
+                         key: torch.cat(
+                             tensors=[
+                                 subject.steps[0].expected_tensors[key]
+                                 for subject in filtered_subjects
+                             ],
+                         )
+                         for key in case.tensor_keys()
+                     }.items():
+                         torch.testing.assert_close(
+                             actual=getattr(penalizer, key),
+                             expected=tensor,
+                             msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+                         )
+
+                 actual_logits = orchestrator.apply(
+                     torch.ones(
+                         size=(len(filtered_subjects), self.vocab_size),
+                         dtype=torch.float32,
+                         device=self.device,
+                     )
+                 )
+                 filtered_expected_logits = torch.cat(
+                     tensors=[
+                         subject.steps[0].expected_logits
+                         for subject in filtered_subjects
+                     ],
+                 )
+                 torch.testing.assert_close(
+                     actual=actual_logits,
+                     expected=filtered_expected_logits,
+                     msg=f"logits\nactual={actual_logits}\nexpected={filtered_expected_logits}",
+                 )
+
+     def test_merge_enabled_with_disabled(self):
+         enabled_test_case = self.test_cases[0]
+         disabled_test_case = self.test_cases[1]
+
+         orchestrator, penalizer = self._create_penalizer(enabled_test_case)
+         theirs, _ = self._create_penalizer(disabled_test_case)
+
+         orchestrator.merge(theirs)
+
+         for key, tensor in {
+             key: torch.cat(
+                 tensors=[
+                     enabled_test_case.test_subjects[0].steps[0].expected_tensors[key],
+                     disabled_test_case.test_subjects[0].steps[0].expected_tensors[key],
+                 ],
+             )
+             for key in enabled_test_case.tensor_keys()
+         }.items():
+             torch.testing.assert_close(
+                 actual=getattr(penalizer, key),
+                 expected=tensor,
+                 msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+             )
+
+     def test_cumulate_apply_repeat(self):
+         for case in self.test_cases:
+             with self.subTest(case=case):
+                 orchestrator, penalizer = self._create_penalizer(case)
+
+                 max_step = max(len(subject.steps) for subject in case.test_subjects)
+                 for i in range(1, max_step):
+                     orchestrator.filter(
+                         indices_to_keep=[
+                             j
+                             for j, subject in enumerate(case.test_subjects)
+                             if i < len(subject.steps)
+                         ]
+                     )
+
+                     filtered_subjects = [
+                         subject
+                         for subject in case.test_subjects
+                         if i < len(subject.steps)
+                     ]
+
+                     inputs: typing.List[typing.List[int]] = []
+                     outputs: typing.List[typing.List[int]] = []
+                     for subject in filtered_subjects:
+                         step = subject.steps[i]
+                         if step.type == StepType.INPUT:
+                             inputs.append(step.token_ids)
+                             outputs.append([])
+                         else:
+                             inputs.append([])
+                             outputs.append(step.token_ids)
+
+                     if any(inputs):
+                         orchestrator.cumulate_input_tokens(inputs)
+
+                     if any(outputs):
+                         orchestrator.cumulate_output_tokens(outputs)
+
+                     if penalizer.is_required():
+                         self.assertTrue(penalizer.is_prepared())
+                         for key, tensor in {
+                             key: torch.cat(
+                                 tensors=[
+                                     subject.steps[i].expected_tensors[key]
+                                     for subject in filtered_subjects
+                                 ],
+                             )
+                             for key in case.tensor_keys()
+                         }.items():
+                             torch.testing.assert_close(
+                                 actual=getattr(penalizer, key),
+                                 expected=tensor,
+                                 msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+                             )
+
+                     actual_logits = orchestrator.apply(
+                         torch.ones(
+                             size=(len(filtered_subjects), self.vocab_size),
+                             dtype=torch.float32,
+                             device=self.device,
+                         )
+                     )
+                     filtered_expected_logits = torch.cat(
+                         tensors=[
+                             subject.steps[i].expected_logits
+                             for subject in filtered_subjects
+                         ],
+                     )
+                     torch.testing.assert_close(
+                         actual=actual_logits,
+                         expected=filtered_expected_logits,
+                         msg=f"logits\nactual={actual_logits}\nexpected={filtered_expected_logits}"
+                     )
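For orientation, the harness above drives `BatchedPenalizerOrchestrator` roughly as in the sketch below. It reuses the `MockReq` / `MockSamplingParams` / `MockTokenizer` dataclasses defined in this utility (as if written in the same module); the concrete penalizer name `BatchedFrequencyPenalizer` and its import path are assumptions based on the `frequency_penalty.py` module added in this release, not code taken from the diff.

```python
import torch

from sglang.srt.sampling.penaltylib.orchestrator import (
    BatchedPenalizerOrchestrator,
    _BatchLike,
)
# Assumed class name/location; see sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py
from sglang.srt.sampling.penaltylib.penalizers.frequency_penalty import (
    BatchedFrequencyPenalizer,
)

# One mock request with a non-zero frequency penalty (mock classes from the file above).
req = MockReq(
    origin_input_ids=[0, 1, 2],
    sampling_params=MockSamplingParams(frequency_penalty=0.1),
    tokenizer=MockTokenizer(eos_token_id=4),
)

orchestrator = BatchedPenalizerOrchestrator(
    vocab_size=5,
    batch=_BatchLike(reqs=[req]),
    device="cuda",
    Penalizers={BatchedFrequencyPenalizer},
)

# Feed decoded token ids per request, then apply the penalizers to a batch of logits.
orchestrator.cumulate_output_tokens([[1, 1, 3]])
logits = torch.ones(1, 5, dtype=torch.float32, device="cuda")
penalized = orchestrator.apply(logits)
```

Each concrete penalizer test (frequency, presence, repetition, min-new-tokens) subclasses `BaseBatchedPenalizerTest`, sets `Penalizer`, and supplies enabled/disabled `Subject`s via `create_test_subjects`.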
sglang/test/test_layernorm.py ADDED
@@ -0,0 +1,60 @@
+ import itertools
+ import unittest
+
+ import torch
+
+ from sglang.srt.layers.layernorm import RMSNorm
+
+
+ class TestRMSNorm(unittest.TestCase):
+     DTYPES = [torch.half, torch.bfloat16]
+     NUM_TOKENS = [7, 83, 4096]
+     HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
+     ADD_RESIDUAL = [False, True]
+     SEEDS = [0]
+
+     @classmethod
+     def setUpClass(cls):
+         if not torch.cuda.is_available():
+             raise unittest.SkipTest("CUDA is not available")
+         torch.set_default_device("cuda")
+
+     def _run_rms_norm_test(self, num_tokens, hidden_size, add_residual, dtype, seed):
+         torch.manual_seed(seed)
+
+         layer = RMSNorm(hidden_size).to(dtype=dtype)
+         layer.weight.data.normal_(mean=1.0, std=0.1)
+         scale = 1 / (2 * hidden_size)
+         x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
+         residual = torch.randn_like(x) * scale if add_residual else None
+
+         with torch.inference_mode():
+             ref_out = layer.forward_native(x, residual)
+             out = layer(x, residual)
+
+         if add_residual:
+             self.assertTrue(torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2))
+             self.assertTrue(torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2))
+         else:
+             self.assertTrue(torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2))
+
+     def test_rms_norm(self):
+         for params in itertools.product(
+             self.NUM_TOKENS,
+             self.HIDDEN_SIZES,
+             self.ADD_RESIDUAL,
+             self.DTYPES,
+             self.SEEDS,
+         ):
+             with self.subTest(
+                 num_tokens=params[0],
+                 hidden_size=params[1],
+                 add_residual=params[2],
+                 dtype=params[3],
+                 seed=params[4],
+             ):
+                 self._run_rms_norm_test(*params)
+
+
+ if __name__ == "__main__":
+     unittest.main(verbosity=2)
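For context, `RMSNorm.forward_native` in `sglang.srt.layers.layernorm` is the unfused reference path this test compares the layer's forward against. A pure-PyTorch sketch of the computation (an illustration of the math and the optional fused residual-add checked above, not the library's exact implementation) might look like:

```python
import torch


def rms_norm_reference(x, weight, eps=1e-6, residual=None):
    # Optional residual-add: normalize (x + residual) and also return the
    # updated residual, matching the tuple output the test checks.
    if residual is not None:
        x = x + residual
        residual = x
    # Scale by the reciprocal root-mean-square over the hidden dimension.
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    out = (x.float() * torch.rsqrt(variance + eps)).to(x.dtype) * weight
    return out if residual is None else (out, residual)
```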
sglang/test/test_programs.py CHANGED
@@ -103,16 +103,19 @@ def test_decode_int():
  def test_decode_json_regex():
      @sgl.function
      def decode_json(s):
-         from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STRING
+         from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR
 
          s += "Generate a JSON object to describe the basic city information of Paris.\n"
+         s += "Here are the JSON object:\n"
+
+         # NOTE: we recommend using dtype gen or whole regex string to control the output
 
          with s.var_scope("json_output"):
              s += "{\n"
-             s += '  "name": ' + sgl.gen(regex=REGEX_STRING + ",") + "\n"
-             s += '  "population": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
-             s += '  "area": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
-             s += '  "latitude": ' + sgl.gen(regex=REGEX_FLOAT) + "\n"
+             s += '  "name": ' + sgl.gen(regex=REGEX_STR) + ",\n"
+             s += '  "population": ' + sgl.gen(regex=REGEX_INT, stop=[" ", "\n"]) + ",\n"
+             s += '  "area": ' + sgl.gen(regex=REGEX_INT, stop=[" ", "\n"]) + ",\n"
+             s += '  "latitude": ' + sgl.gen(regex=REGEX_FLOAT, stop=[" ", "\n"]) + "\n"
              s += "}"
 
      ret = decode_json.run(temperature=0.0)
@@ -149,7 +152,7 @@ def test_decode_json():
      assert isinstance(js_obj["population"], int)
 
 
- def test_expert_answer():
+ def test_expert_answer(check_answer=True):
      @sgl.function
      def expert_answer(s, question):
          s += "Question: " + question + "\n"
@@ -167,7 +170,9 @@ def test_expert_answer():
          )
 
      ret = expert_answer.run(question="What is the capital of France?", temperature=0.1)
-     assert "paris" in ret.text().lower()
+
+     if check_answer:
+         assert "paris" in ret.text().lower(), f"Answer: {ret.text()}"
 
 
  def test_tool_use():
@@ -357,6 +362,30 @@ def test_regex():
      assert re.match(regex, answer)
 
 
+ def test_dtype_gen():
+     @sgl.function
+     def dtype_gen(s):
+         s += "Q: What is the full name of DNS?\n"
+         s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
+         s += "Q: Which year was DNS invented?\n"
+         s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
+         s += "Q: What is the value of pi?\n"
+         s += "A: " + sgl.gen("float_res", dtype=float) + "\n"
+         s += "Q: Is the sky blue?\n"
+         s += "A: " + sgl.gen("bool_res", dtype=bool) + "\n"
+
+     state = dtype_gen.run()
+
+     try:
+         state["int_res"] = int(state["int_res"])
+         state["float_res"] = float(state["float_res"])
+         state["bool_res"] = bool(state["bool_res"])
+         # assert state["str_res"].startswith('"') and state["str_res"].endswith('"')
+     except ValueError:
+         print(state)
+         raise
+
+
  def test_completion_speculative():
      @sgl.function(num_api_spec_tokens=64)
      def gen_character_spec(s):
sglang/test/test_utils.py CHANGED
@@ -12,6 +12,8 @@ from typing import Callable, List, Optional
 
  import numpy as np
  import requests
+ import torch
+ import torch.nn.functional as F
 
  from sglang.global_config import global_config
  from sglang.lang.backend.openai import OpenAI
@@ -19,6 +21,11 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.utils import get_exception_traceback
 
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+ DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+ DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157"
+ DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157"
+ DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157"
+ DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157"
 
 
  def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -396,6 +403,8 @@ def popen_launch_server(
      timeout: float,
      api_key: Optional[str] = None,
      other_args: tuple = (),
+     env: Optional[dict] = None,
+     return_stdout_stderr: bool = False,
  ):
      _, host, port = base_url.split(":")
      host = host[2:]
@@ -415,7 +424,16 @@ def popen_launch_server(
      if api_key:
          command += ["--api-key", api_key]
 
-     process = subprocess.Popen(command, stdout=None, stderr=None)
+     if return_stdout_stderr:
+         process = subprocess.Popen(
+             command,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             env=env,
+             text=True,
+         )
+     else:
+         process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
 
      start_time = time.time()
      while time.time() - start_time < timeout:
@@ -482,7 +500,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
              p.terminate()
              time.sleep(5)
              print(
-                 "\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+                 f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
              )
              return False
 
@@ -492,3 +510,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
          print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
 
      return 0 if success else -1
+
+
+ def get_similarities(vec1, vec2):
+     return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
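The new `env` and `return_stdout_stderr` parameters make `popen_launch_server` usable by tests that need a custom environment or want to capture server logs. A hypothetical call could look like the sketch below; the leading positional arguments (a model path and a base URL) are assumed from the hunk header and are not shown in this diff.

```python
# Hypothetical usage sketch; parameter names outside this diff are assumptions.
process = popen_launch_server(
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_UNIT_TEST,
    timeout=300,
    other_args=("--tp", "2"),
    env=None,                   # None inherits the parent process environment
    return_stdout_stderr=True,  # capture logs via subprocess.PIPE instead of inheriting stdio
)
```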
sglang/utils.py CHANGED
@@ -6,7 +6,6 @@ import json
  import logging
  import signal
  import sys
- import threading
  import traceback
  import urllib.request
  from concurrent.futures import ThreadPoolExecutor
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.11"
+ __version__ = "0.2.13"
{sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.11
+ Version: 0.2.13
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -308,7 +308,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
  cd sglang
 
  pip install --upgrade pip
@@ -329,11 +329,19 @@ docker run --gpus all \
      --env "HF_TOKEN=<secret>" \
      --ipc=host \
      lmsysorg/sglang:latest \
-     python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+     python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
  ```
 
+ ### Method 4: Using docker compose
+
+ > This method is recommended if you plan to serve it as a service.
+ > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+
+ 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+ 2. Execute the command `docker compose up -d` in your terminal.
+
  ### Common Notes
- - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
  ## Backend: SGLang Runtime (SRT)
@@ -392,23 +400,23 @@ print(response)
  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
  ### Additional Server Arguments
- - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
+ - Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
  ```
- - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ - Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
+ - If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
- - If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+ - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
  ```
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
  ```
- - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -418,13 +426,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
  - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
+ - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
 
  ### Supported Models
 
  - Llama / Llama 2 / Llama 3 / Llama 3.1
- - Mistral / Mixtral
+ - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
@@ -442,11 +450,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Grok
  - ChatGLM
  - InternLM 2
- - Mistral NeMo
 
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
- ### Run Llama 3.1 405B
+ #### Use Models From ModelScope
+ To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ ```
+ export SGLANG_USE_MODELSCOPE=true
+ ```
+ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+ ```
+ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+ ```
+
+ #### Run Llama 3.1 405B
 
  ```bash
  ## Run 405B (fp8) on a single node
@@ -474,7 +491,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  ```
 
  ## Frontend: Structured Generation Language (SGLang)
- The frontend language can be used with local models or API models.
+ The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.
 
  ### Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.