sglang 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. sglang/__init__.py +8 -0
  2. sglang/api.py +10 -2
  3. sglang/bench_latency.py +151 -40
  4. sglang/bench_serving.py +46 -22
  5. sglang/check_env.py +24 -2
  6. sglang/global_config.py +0 -1
  7. sglang/lang/backend/base_backend.py +3 -1
  8. sglang/lang/backend/openai.py +8 -3
  9. sglang/lang/backend/runtime_endpoint.py +46 -29
  10. sglang/lang/choices.py +164 -0
  11. sglang/lang/compiler.py +2 -2
  12. sglang/lang/interpreter.py +6 -13
  13. sglang/lang/ir.py +14 -5
  14. sglang/srt/constrained/base_tool_cache.py +1 -1
  15. sglang/srt/constrained/fsm_cache.py +12 -2
  16. sglang/srt/layers/activation.py +33 -0
  17. sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
  18. sglang/srt/layers/extend_attention.py +6 -1
  19. sglang/srt/layers/layernorm.py +65 -0
  20. sglang/srt/layers/logits_processor.py +6 -1
  21. sglang/srt/layers/pooler.py +50 -0
  22. sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
  23. sglang/srt/layers/radix_attention.py +4 -7
  24. sglang/srt/managers/detokenizer_manager.py +31 -9
  25. sglang/srt/managers/io_struct.py +63 -0
  26. sglang/srt/managers/policy_scheduler.py +173 -25
  27. sglang/srt/managers/schedule_batch.py +174 -380
  28. sglang/srt/managers/tokenizer_manager.py +197 -112
  29. sglang/srt/managers/tp_worker.py +299 -364
  30. sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
  31. sglang/srt/mem_cache/chunk_cache.py +43 -20
  32. sglang/srt/mem_cache/memory_pool.py +10 -15
  33. sglang/srt/mem_cache/radix_cache.py +74 -40
  34. sglang/srt/model_executor/cuda_graph_runner.py +27 -12
  35. sglang/srt/model_executor/forward_batch_info.py +319 -0
  36. sglang/srt/model_executor/model_runner.py +30 -47
  37. sglang/srt/models/chatglm.py +1 -1
  38. sglang/srt/models/commandr.py +1 -1
  39. sglang/srt/models/dbrx.py +1 -1
  40. sglang/srt/models/deepseek.py +1 -1
  41. sglang/srt/models/deepseek_v2.py +1 -1
  42. sglang/srt/models/gemma.py +1 -1
  43. sglang/srt/models/gemma2.py +1 -2
  44. sglang/srt/models/gpt_bigcode.py +1 -1
  45. sglang/srt/models/grok.py +1 -1
  46. sglang/srt/models/internlm2.py +3 -8
  47. sglang/srt/models/llama2.py +5 -5
  48. sglang/srt/models/llama_classification.py +1 -1
  49. sglang/srt/models/llama_embedding.py +88 -0
  50. sglang/srt/models/llava.py +1 -2
  51. sglang/srt/models/llavavid.py +1 -2
  52. sglang/srt/models/minicpm.py +1 -1
  53. sglang/srt/models/mixtral.py +1 -1
  54. sglang/srt/models/mixtral_quant.py +1 -1
  55. sglang/srt/models/qwen.py +1 -1
  56. sglang/srt/models/qwen2.py +1 -1
  57. sglang/srt/models/qwen2_moe.py +1 -12
  58. sglang/srt/models/stablelm.py +1 -1
  59. sglang/srt/openai_api/adapter.py +189 -39
  60. sglang/srt/openai_api/protocol.py +43 -1
  61. sglang/srt/sampling/penaltylib/__init__.py +13 -0
  62. sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  63. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  64. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  65. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  66. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  67. sglang/srt/sampling_params.py +31 -4
  68. sglang/srt/server.py +93 -21
  69. sglang/srt/server_args.py +30 -19
  70. sglang/srt/utils.py +31 -13
  71. sglang/test/run_eval.py +10 -1
  72. sglang/test/runners.py +63 -63
  73. sglang/test/simple_eval_humaneval.py +2 -8
  74. sglang/test/simple_eval_mgsm.py +203 -0
  75. sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  76. sglang/test/test_layernorm.py +60 -0
  77. sglang/test/test_programs.py +4 -2
  78. sglang/test/test_utils.py +21 -3
  79. sglang/utils.py +0 -1
  80. sglang/version.py +1 -1
  81. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/METADATA +50 -31
  82. sglang-0.2.12.dist-info/RECORD +112 -0
  83. sglang/srt/layers/linear.py +0 -884
  84. sglang/srt/layers/quantization/__init__.py +0 -64
  85. sglang/srt/layers/quantization/fp8.py +0 -677
  86. sglang-0.2.10.dist-info/RECORD +0 -100
  87. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/LICENSE +0 -0
  88. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/WHEEL +0 -0
  89. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/top_level.txt +0 -0
sglang/test/srt/sampling/penaltylib/utils.py ADDED
@@ -0,0 +1,337 @@
+ import dataclasses
+ import enum
+ import typing
+ import unittest
+
+ import torch
+
+ from sglang.srt.sampling.penaltylib.orchestrator import (
+     BatchedPenalizerOrchestrator,
+     _BatchedPenalizer,
+     _BatchLike,
+ )
+
+
+ @dataclasses.dataclass
+ class MockSamplingParams:
+     frequency_penalty: float = 0.0
+     min_new_tokens: int = 0
+     stop_token_ids: typing.List[int] = None
+     presence_penalty: float = 0.0
+     repetition_penalty: float = 1.0
+
+
+ @dataclasses.dataclass
+ class MockTokenizer:
+     eos_token_id: int
+
+
+ @dataclasses.dataclass
+ class MockReq:
+     origin_input_ids: typing.List[int]
+     sampling_params: MockSamplingParams
+     tokenizer: MockTokenizer
+
+
+ class StepType(enum.Enum):
+     INPUT = "input"
+     OUTPUT = "output"
+
+
+ @dataclasses.dataclass
+ class Step:
+     type: StepType
+     token_ids: typing.List[int]
+     expected_tensors: typing.Dict[str, torch.Tensor]
+     # assume initial logits are all 1
+     expected_logits: torch.Tensor
+
+
+ @dataclasses.dataclass
+ class Subject:
+     sampling_params: MockSamplingParams
+     # first step must be input, which will be converted to Req
+     steps: typing.List[Step]
+     eos_token_id: int = -1
+
+     def __post_init__(self):
+         if self.steps[0].type != StepType.INPUT:
+             raise ValueError("First step must be input")
+
+         # each steps should have the same expected_tensors.keys()
+         for i in range(1, len(self.steps)):
+             if self.tensor_keys(i) != self.tensor_keys():
+                 raise ValueError(
+                     f"Expected tensors keys must be the same for all steps. Got {self.steps[i].expected_tensors.keys()} for key={i} and {self.steps[0].expected_tensors.keys()}"
+                 )
+
+     def tensor_keys(self, i: int = 0) -> typing.Set[str]:
+         return set(self.steps[i].expected_tensors.keys())
+
+     def to_req(self) -> MockReq:
+         return MockReq(
+             origin_input_ids=self.steps[0].token_ids,
+             sampling_params=self.sampling_params,
+             tokenizer=MockTokenizer(eos_token_id=self.eos_token_id),
+         )
+
+
+ @dataclasses.dataclass
+ class Case:
+     enabled: bool
+     test_subjects: typing.List[Subject]
+
+     def __post_init__(self):
+         # each test_subjects.steps should have the same expected_tensors.keys()
+         for i in range(1, len(self.test_subjects)):
+             if self.tensor_keys(i) != self.tensor_keys():
+                 raise ValueError(
+                     f"Expected tensors keys must be the same for all test_subjects. Got {self.test_subjects[i].tensor_keys()} for key={i} and {self.test_subjects[0].tensor_keys()}"
+                 )
+
+     def tensor_keys(self, i: int = 0) -> typing.List[str]:
+         return set(self.test_subjects[i].tensor_keys())
+
+
+ class BaseBatchedPenalizerTest(unittest.TestCase):
+     Penalizer: typing.Type[_BatchedPenalizer]
+     device = "cuda"
+     vocab_size = 5
+
+     enabled: Subject = None
+     disabled: Subject = None
+
+     def setUp(self):
+         if self.__class__ == BaseBatchedPenalizerTest:
+             self.skipTest("Base class for penalizer tests")
+
+         self.create_test_subjects()
+         self.create_test_cases()
+
+     def tensor(self, data, **kwargs) -> torch.Tensor:
+         """
+         Shortcut to create a tensor with device=self.device.
+         """
+         return torch.tensor(data, **kwargs, device=self.device)
+
+     def create_test_subjects(self) -> typing.List[Subject]:
+         raise NotImplementedError()
+
+     def create_test_cases(self):
+         self.test_cases = [
+             Case(enabled=True, test_subjects=[self.enabled]),
+             Case(enabled=False, test_subjects=[self.disabled]),
+             Case(enabled=True, test_subjects=[self.enabled, self.disabled]),
+         ]
+
+     def _create_penalizer(
+         self, case: Case
+     ) -> typing.Tuple[BatchedPenalizerOrchestrator, _BatchedPenalizer]:
+         orchestrator = BatchedPenalizerOrchestrator(
+             vocab_size=self.vocab_size,
+             batch=_BatchLike(reqs=[subject.to_req() for subject in case.test_subjects]),
+             device=self.device,
+             Penalizers={self.Penalizer},
+         )
+
+         return orchestrator, orchestrator.penalizers[self.Penalizer]
+
+     def test_is_required(self):
+         for case in self.test_cases:
+             with self.subTest(case=case):
+                 _, penalizer = self._create_penalizer(case)
+                 self.assertEqual(case.enabled, penalizer.is_required())
+
+     def test_prepare(self):
+         for case in self.test_cases:
+             with self.subTest(case=case):
+                 orchestrator, penalizer = self._create_penalizer(case)
+                 self.assertEqual(case.enabled, penalizer.is_prepared())
+
+                 if case.enabled:
+                     for key, tensor in {
+                         key: torch.cat(
+                             tensors=[
+                                 subject.steps[0].expected_tensors[key]
+                                 for subject in case.test_subjects
+                             ],
+                         )
+                         for key in case.tensor_keys()
+                     }.items():
+                         torch.testing.assert_close(
+                             actual=getattr(penalizer, key),
+                             expected=tensor,
+                             msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+                         )
+
+                 actual = orchestrator.apply(
+                     torch.ones(
+                         size=(len(case.test_subjects), self.vocab_size),
+                         dtype=torch.float32,
+                         device=self.device,
+                     )
+                 )
+                 expected = torch.cat(
+                     tensors=[
+                         subject.steps[0].expected_logits
+                         for subject in case.test_subjects
+                     ],
+                 )
+                 torch.testing.assert_close(
+                     actual=actual,
+                     expected=expected,
+                     msg=f"logits\nactual={actual}\nexpected={expected}",
+                 )
+
+     def test_teardown(self):
+         for case in self.test_cases:
+             with self.subTest(case=case):
+                 _, penalizer = self._create_penalizer(case)
+                 penalizer.teardown()
+
+                 for key in case.test_subjects[0].steps[0].expected_tensors.keys():
+                     self.assertIsNone(getattr(penalizer, key, None))
+
+     def test_filter(self):
+         for case in self.test_cases:
+             with self.subTest(case=case):
+                 orchestrator, penalizer = self._create_penalizer(case)
+
+                 indices_to_keep = [0]
+                 orchestrator.filter(indices_to_keep=indices_to_keep)
+
+                 filtered_subjects = [case.test_subjects[i] for i in indices_to_keep]
+
+                 if penalizer.is_required():
+                     self.assertTrue(penalizer.is_prepared())
+                     for key, tensor in {
+                         key: torch.cat(
+                             tensors=[
+                                 subject.steps[0].expected_tensors[key]
+                                 for subject in filtered_subjects
+                             ],
+                         )
+                         for key in case.tensor_keys()
+                     }.items():
+                         torch.testing.assert_close(
+                             actual=getattr(penalizer, key),
+                             expected=tensor,
+                             msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+                         )
+
+                 actual_logits = orchestrator.apply(
+                     torch.ones(
+                         size=(len(filtered_subjects), self.vocab_size),
+                         dtype=torch.float32,
+                         device=self.device,
+                     )
+                 )
+                 filtered_expected_logits = torch.cat(
+                     tensors=[
+                         subject.steps[0].expected_logits
+                         for subject in filtered_subjects
+                     ],
+                 )
+                 torch.testing.assert_close(
+                     actual=actual_logits,
+                     expected=filtered_expected_logits,
+                     msg=f"logits\nactual={actual_logits}\nexpected={filtered_expected_logits}",
+                 )
+
+     def test_merge_enabled_with_disabled(self):
+         enabled_test_case = self.test_cases[0]
+         disabled_test_case = self.test_cases[1]
+
+         orchestrator, penalizer = self._create_penalizer(enabled_test_case)
+         theirs, _ = self._create_penalizer(disabled_test_case)
+
+         orchestrator.merge(theirs)
+
+         for key, tensor in {
+             key: torch.cat(
+                 tensors=[
+                     enabled_test_case.test_subjects[0].steps[0].expected_tensors[key],
+                     disabled_test_case.test_subjects[0].steps[0].expected_tensors[key],
+                 ],
+             )
+             for key in enabled_test_case.tensor_keys()
+         }.items():
+             torch.testing.assert_close(
+                 actual=getattr(penalizer, key),
+                 expected=tensor,
+                 msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+             )
+
+     def test_cumulate_apply_repeat(self):
+         for case in self.test_cases:
+             with self.subTest(case=case):
+                 orchestrator, penalizer = self._create_penalizer(case)
+
+                 max_step = max(len(subject.steps) for subject in case.test_subjects)
+                 for i in range(1, max_step):
+                     orchestrator.filter(
+                         indices_to_keep=[
+                             j
+                             for j, subject in enumerate(case.test_subjects)
+                             if i < len(subject.steps)
+                         ]
+                     )
+
+                     filtered_subjects = [
+                         subject
+                         for subject in case.test_subjects
+                         if i < len(subject.steps)
+                     ]
+
+                     inputs: typing.List[typing.List[int]] = []
+                     outputs: typing.List[typing.List[int]] = []
+                     for subject in filtered_subjects:
+                         step = subject.steps[i]
+                         if step.type == StepType.INPUT:
+                             inputs.append(step.token_ids)
+                             outputs.append([])
+                         else:
+                             inputs.append([])
+                             outputs.append(step.token_ids)
+
+                     if any(inputs):
+                         orchestrator.cumulate_input_tokens(inputs)
+
+                     if any(outputs):
+                         orchestrator.cumulate_output_tokens(outputs)
+
+                     if penalizer.is_required():
+                         self.assertTrue(penalizer.is_prepared())
+                         for key, tensor in {
+                             key: torch.cat(
+                                 tensors=[
+                                     subject.steps[i].expected_tensors[key]
+                                     for subject in filtered_subjects
+                                 ],
+                             )
+                             for key in case.tensor_keys()
+                         }.items():
+                             torch.testing.assert_close(
+                                 actual=getattr(penalizer, key),
+                                 expected=tensor,
+                                 msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
+                             )
+
+                     actual_logits = orchestrator.apply(
+                         torch.ones(
+                             size=(len(filtered_subjects), self.vocab_size),
+                             dtype=torch.float32,
+                             device=self.device,
+                         )
+                     )
+                     filtered_expected_logits = torch.cat(
+                         tensors=[
+                             subject.steps[i].expected_logits
+                             for subject in filtered_subjects
+                         ],
+                     )
+                     torch.testing.assert_close(
+                         actual=actual_logits,
+                         expected=filtered_expected_logits,
+                         msg=f"logits\nactual={actual_logits}\nexpected={filtered_expected_logits}",
+                     )
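
Note: `BaseBatchedPenalizerTest` is a shared harness intended to be subclassed once per penalizer; the wheel ships only this base class. The sketch below illustrates the intended subclassing pattern only. The imported penalizer class name and the `"frequency_penalties"` tensor key and expected values are assumptions for illustration and are not taken from this diff.

```python
# Hypothetical subclass sketch (not part of the diff). The penalizer class name
# and the "frequency_penalties" tensor key are assumptions for illustration.
import torch

from sglang.srt.sampling.penaltylib.penalizers.frequency_penalty import (
    BatchedFrequencyPenalizer,  # assumed class name
)
from sglang.test.srt.sampling.penaltylib.utils import (
    BaseBatchedPenalizerTest,
    MockSamplingParams,
    Step,
    StepType,
    Subject,
)


class TestBatchedFrequencyPenalizer(BaseBatchedPenalizerTest):
    Penalizer = BatchedFrequencyPenalizer  # class under test (assumed)

    def create_test_subjects(self):
        # Subject whose sampling params activate the penalizer.
        self.enabled = Subject(
            sampling_params=MockSamplingParams(frequency_penalty=0.5),
            steps=[
                Step(
                    type=StepType.INPUT,
                    token_ids=[0, 1, 2],
                    expected_tensors={
                        # one row per request; keys must match the penalizer's tensors
                        "frequency_penalties": self.tensor([[0.5]], dtype=torch.float32),
                    },
                    # logits start at 1.0 and are untouched before any output tokens
                    expected_logits=self.tensor(
                        [[1.0] * self.vocab_size], dtype=torch.float32
                    ),
                ),
            ],
        )
        # Subject whose sampling params leave the penalizer disabled (no-op).
        self.disabled = Subject(
            sampling_params=MockSamplingParams(frequency_penalty=0.0),
            steps=[
                Step(
                    type=StepType.INPUT,
                    token_ids=[0, 1, 2],
                    expected_tensors={
                        "frequency_penalties": self.tensor([[0.0]], dtype=torch.float32),
                    },
                    expected_logits=self.tensor(
                        [[1.0] * self.vocab_size], dtype=torch.float32
                    ),
                ),
            ],
        )
```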
sglang/test/test_layernorm.py ADDED
@@ -0,0 +1,60 @@
+ import itertools
+ import unittest
+
+ import torch
+
+ from sglang.srt.layers.layernorm import RMSNorm
+
+
+ class TestRMSNorm(unittest.TestCase):
+     DTYPES = [torch.half, torch.bfloat16]
+     NUM_TOKENS = [7, 83, 4096]
+     HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
+     ADD_RESIDUAL = [False, True]
+     SEEDS = [0]
+
+     @classmethod
+     def setUpClass(cls):
+         if not torch.cuda.is_available():
+             raise unittest.SkipTest("CUDA is not available")
+         torch.set_default_device("cuda")
+
+     def _run_rms_norm_test(self, num_tokens, hidden_size, add_residual, dtype, seed):
+         torch.manual_seed(seed)
+
+         layer = RMSNorm(hidden_size).to(dtype=dtype)
+         layer.weight.data.normal_(mean=1.0, std=0.1)
+         scale = 1 / (2 * hidden_size)
+         x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
+         residual = torch.randn_like(x) * scale if add_residual else None
+
+         with torch.inference_mode():
+             ref_out = layer.forward_native(x, residual)
+             out = layer(x, residual)
+
+         if add_residual:
+             self.assertTrue(torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2))
+             self.assertTrue(torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2))
+         else:
+             self.assertTrue(torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2))
+
+     def test_rms_norm(self):
+         for params in itertools.product(
+             self.NUM_TOKENS,
+             self.HIDDEN_SIZES,
+             self.ADD_RESIDUAL,
+             self.DTYPES,
+             self.SEEDS,
+         ):
+             with self.subTest(
+                 num_tokens=params[0],
+                 hidden_size=params[1],
+                 add_residual=params[2],
+                 dtype=params[3],
+                 seed=params[4],
+             ):
+                 self._run_rms_norm_test(*params)
+
+
+ if __name__ == "__main__":
+     unittest.main(verbosity=2)
sglang/test/test_programs.py CHANGED
@@ -149,7 +149,7 @@ def test_decode_json():
      assert isinstance(js_obj["population"], int)


- def test_expert_answer():
+ def test_expert_answer(check_answer=True):
      @sgl.function
      def expert_answer(s, question):
          s += "Question: " + question + "\n"
@@ -167,7 +167,9 @@ def test_expert_answer():
      )

      ret = expert_answer.run(question="What is the capital of France?", temperature=0.1)
-     assert "paris" in ret.text().lower()
+
+     if check_answer:
+         assert "paris" in ret.text().lower(), f"Answer: {ret.text()}"


  def test_tool_use():
sglang/test/test_utils.py CHANGED
@@ -12,13 +12,16 @@ from typing import Callable, List, Optional

  import numpy as np
  import requests
+ import torch
+ import torch.nn.functional as F

  from sglang.global_config import global_config
  from sglang.lang.backend.openai import OpenAI
  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.utils import get_exception_traceback

- MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+ DEFAULT_URL_FOR_TEST = "http://127.0.0.1:8157"


  def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -396,6 +399,8 @@ def popen_launch_server(
      timeout: float,
      api_key: Optional[str] = None,
      other_args: tuple = (),
+     env: Optional[dict] = None,
+     return_stdout_stderr: bool = False,
  ):
      _, host, port = base_url.split(":")
      host = host[2:]
@@ -415,7 +420,16 @@
      if api_key:
          command += ["--api-key", api_key]

-     process = subprocess.Popen(command, stdout=None, stderr=None)
+     if return_stdout_stderr:
+         process = subprocess.Popen(
+             command,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             env=env,
+             text=True,
+         )
+     else:
+         process = subprocess.Popen(command, stdout=None, stderr=None, env=env)

      start_time = time.time()
      while time.time() - start_time < timeout:
@@ -482,7 +496,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
          p.terminate()
          time.sleep(5)
          print(
-             "\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+             f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
          )
          return False

@@ -492,3 +506,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
          print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")

      return 0 if success else -1
+
+
+ def get_similarities(vec1, vec2):
+     return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
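
For reference, a minimal usage sketch of the new `get_similarities` helper defined just above; the input vectors below are made up for illustration.

```python
# Minimal usage sketch for the new helper added above; the vectors are illustrative.
from sglang.test.test_utils import get_similarities

# Returns the cosine similarity between two embedding-like vectors as a 0-dim tensor.
score = get_similarities([0.1, 0.2, 0.3], [0.1, 0.2, 0.25])
print(float(score))  # close to 1.0 for nearly parallel vectors
```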
sglang/utils.py CHANGED
@@ -6,7 +6,6 @@ import json
  import logging
  import signal
  import sys
- import threading
  import traceback
  import urllib.request
  from concurrent.futures import ThreadPoolExecutor
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.10"
+ __version__ = "0.2.12"
{sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.10
+ Version: 0.2.12
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -221,6 +221,9 @@ Requires-Dist: sglang[anthropic]; extra == "all"
  Requires-Dist: sglang[litellm]; extra == "all"
  Provides-Extra: anthropic
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
+ Provides-Extra: dev
+ Requires-Dist: sglang[all]; extra == "dev"
+ Requires-Dist: sglang[test]; extra == "dev"
  Provides-Extra: litellm
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
  Provides-Extra: openai
@@ -232,7 +235,6 @@ Requires-Dist: fastapi; extra == "srt"
  Requires-Dist: hf-transfer; extra == "srt"
  Requires-Dist: huggingface-hub; extra == "srt"
  Requires-Dist: interegular; extra == "srt"
- Requires-Dist: jsonlines; extra == "srt"
  Requires-Dist: packaging; extra == "srt"
  Requires-Dist: pillow; extra == "srt"
  Requires-Dist: psutil; extra == "srt"
@@ -242,8 +244,12 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.5.3.post1; extra == "srt"
+ Requires-Dist: vllm==0.5.4; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
+ Provides-Extra: test
+ Requires-Dist: jsonlines; extra == "test"
+ Requires-Dist: matplotlib; extra == "test"
+ Requires-Dist: pandas; extra == "test"

  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -296,20 +302,20 @@ pip install --upgrade pip
  pip install "sglang[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.10 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 3: Using docker
@@ -383,22 +389,26 @@ response = client.chat.completions.create(
  print(response)
  ```

- It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
- - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
+ - Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
  ```
- - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ - Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+ - If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
  - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
+ ```
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -408,29 +418,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
  - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
- ### Run Llama 3.1 405B
-
- ```bash
- ## Run 405B (fp8) on a single node
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
- ```
-
+ - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
  ### Supported Models

  - Llama / Llama 2 / Llama 3 / Llama 3.1
- - Mistral / Mixtral
+ - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
@@ -448,10 +442,35 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  - Grok
  - ChatGLM
  - InternLM 2
- - Mistral NeMo

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

+ #### Use Models From ModelScope
+ To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ ```
+ export SGLANG_USE_MODELSCOPE=true
+ ```
+ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+ ```
+ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+ ```
+
+ #### Run Llama 3.1 405B
+
+ ```bash
+ ## Run 405B (fp8) on a single node
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+ ## Run 405B (fp16) on two nodes
+ # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+ # on the first node
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+ # on the second
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ```
+
  ### Benchmark Performance

  - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -464,7 +483,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  ```

  ## Frontend: Structured Generation Language (SGLang)
- The frontend language can be used with local models or API models.
+ The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

  ### Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.