sglang 0.2.14.post2__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. sglang/api.py +2 -0
  2. sglang/bench_latency.py +39 -28
  3. sglang/lang/interpreter.py +3 -0
  4. sglang/lang/ir.py +5 -0
  5. sglang/launch_server_llavavid.py +12 -12
  6. sglang/srt/configs/__init__.py +5 -0
  7. sglang/srt/configs/exaone.py +195 -0
  8. sglang/srt/constrained/fsm_cache.py +1 -1
  9. sglang/srt/conversation.py +24 -2
  10. sglang/srt/hf_transformers_utils.py +11 -11
  11. sglang/srt/layers/extend_attention.py +13 -8
  12. sglang/srt/layers/logits_processor.py +4 -4
  13. sglang/srt/layers/sampler.py +69 -16
  14. sglang/srt/managers/controller_multi.py +5 -5
  15. sglang/srt/managers/controller_single.py +5 -5
  16. sglang/srt/managers/io_struct.py +6 -1
  17. sglang/srt/managers/schedule_batch.py +20 -8
  18. sglang/srt/managers/tokenizer_manager.py +2 -2
  19. sglang/srt/managers/tp_worker.py +38 -26
  20. sglang/srt/model_config.py +3 -3
  21. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  22. sglang/srt/model_executor/forward_batch_info.py +68 -23
  23. sglang/srt/model_executor/model_runner.py +14 -12
  24. sglang/srt/models/chatglm.py +4 -12
  25. sglang/srt/models/commandr.py +5 -1
  26. sglang/srt/models/dbrx.py +5 -1
  27. sglang/srt/models/deepseek.py +5 -1
  28. sglang/srt/models/deepseek_v2.py +57 -25
  29. sglang/srt/models/exaone.py +399 -0
  30. sglang/srt/models/gemma.py +5 -1
  31. sglang/srt/models/gemma2.py +5 -1
  32. sglang/srt/models/gpt_bigcode.py +5 -1
  33. sglang/srt/models/grok.py +5 -1
  34. sglang/srt/models/internlm2.py +5 -1
  35. sglang/srt/models/llama2.py +7 -3
  36. sglang/srt/models/llama_classification.py +2 -2
  37. sglang/srt/models/minicpm.py +5 -1
  38. sglang/srt/models/mixtral.py +6 -2
  39. sglang/srt/models/mixtral_quant.py +5 -1
  40. sglang/srt/models/qwen.py +5 -2
  41. sglang/srt/models/qwen2.py +6 -2
  42. sglang/srt/models/qwen2_moe.py +5 -14
  43. sglang/srt/models/stablelm.py +5 -1
  44. sglang/srt/openai_api/adapter.py +16 -1
  45. sglang/srt/openai_api/protocol.py +5 -5
  46. sglang/srt/sampling/sampling_batch_info.py +79 -6
  47. sglang/srt/server.py +6 -6
  48. sglang/srt/utils.py +0 -3
  49. sglang/test/runners.py +1 -1
  50. sglang/version.py +1 -1
  51. {sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/METADATA +7 -7
  52. {sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/RECORD +55 -52
  53. {sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/LICENSE +0 -0
  54. {sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/WHEEL +0 -0
  55. {sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/top_level.txt +0 -0
sglang/srt/models/mixtral_quant.py CHANGED
@@ -45,6 +45,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
+ from sglang.srt.layers.sampler import Sampler
  from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -333,6 +334,7 @@ class QuantMixtralForCausalLM(nn.Module):
  self.model = MixtralModel(config, quant_config=quant_config)
  self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
  self.logits_processor = LogitsProcessor(config)
+ self.sampler = Sampler()

  @torch.no_grad()
  def forward(
@@ -343,9 +345,11 @@ class QuantMixtralForCausalLM(nn.Module):
  input_embeds: torch.Tensor = None,
  ) -> torch.Tensor:
  hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
- return self.logits_processor(
+ logits_output = self.logits_processor(
  input_ids, hidden_states, self.lm_head.weight, input_metadata
  )
+ sample_output = self.sampler(logits_output, input_metadata.sampling_info)
+ return sample_output, logits_output

  def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  stacked_params_mapping = [
sglang/srt/models/qwen.py CHANGED
@@ -39,6 +39,7 @@ from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
+ from sglang.srt.layers.sampler import Sampler
  from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -251,6 +252,7 @@ class QWenLMHeadModel(nn.Module):
  vocab_size = ((config.vocab_size + 63) // 64) * 64
  self.lm_head = ParallelLMHead(vocab_size, config.hidden_size)
  self.logits_processor = LogitsProcessor(config)
+ self.sampler = Sampler()

  @torch.no_grad()
  def forward(
@@ -260,10 +262,11 @@ class QWenLMHeadModel(nn.Module):
  input_metadata: InputMetadata,
  ):
  hidden_states = self.transformer(input_ids, positions, input_metadata)
- next_tokens = self.logits_processor(
+ logits_output = self.logits_processor(
  input_ids, hidden_states, self.lm_head.weight, input_metadata
  )
- return next_tokens
+ sample_output = self.sampler(logits_output, input_metadata.sampling_info)
+ return sample_output, logits_output

  def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  stacked_params_mapping = [
sglang/srt/models/qwen2.py CHANGED
@@ -38,8 +38,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
- from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+ from sglang.srt.layers.pooler import Pooler, PoolingType
  from sglang.srt.layers.radix_attention import RadixAttention
+ from sglang.srt.layers.sampler import Sampler
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

  Qwen2Config = None
@@ -276,6 +277,7 @@ class Qwen2ForCausalLM(nn.Module):
  self.model = Qwen2Model(config, quant_config=quant_config)
  self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
  self.logits_processor = LogitsProcessor(config)
+ self.sampler = Sampler()
  self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)

  @torch.no_grad()
@@ -289,9 +291,11 @@ class Qwen2ForCausalLM(nn.Module):
  ) -> torch.Tensor:
  hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
  if not get_embedding:
- return self.logits_processor(
+ logits_output = self.logits_processor(
  input_ids, hidden_states, self.lm_head.weight, input_metadata
  )
+ sample_output = self.sampler(logits_output, input_metadata.sampling_info)
+ return sample_output, logits_output
  else:
  return self.pooler(hidden_states, input_metadata)

sglang/srt/models/qwen2_moe.py CHANGED
@@ -35,10 +35,8 @@ from vllm.model_executor.layers.linear import (
  ReplicatedLinear,
  RowParallelLinear,
  )
- from vllm.model_executor.layers.logits_processor import LogitsProcessor
  from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
- from vllm.model_executor.layers.sampler import Sampler
  from vllm.model_executor.layers.vocab_parallel_embedding import (
  ParallelLMHead,
  VocabParallelEmbedding,
@@ -49,6 +47,7 @@ from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
+ from sglang.srt.layers.sampler import Sampler
  from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -366,6 +365,7 @@ class Qwen2MoeForCausalLM(nn.Module):
  config.vocab_size, config.hidden_size, quant_config=quant_config
  )
  self.logits_processor = LogitsProcessor(config)
+ self.sampler = Sampler()

  @torch.no_grad()
  def forward(
@@ -376,20 +376,11 @@ class Qwen2MoeForCausalLM(nn.Module):
  input_embeds: torch.Tensor = None,
  ) -> torch.Tensor:
  hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
- return self.logits_processor(
+ logits_output = self.logits_processor(
  input_ids, hidden_states, self.lm_head.weight, input_metadata
  )
-
- def compute_logits(
-     self,
-     input_ids: torch.Tensor,
-     hidden_states: torch.Tensor,
-     input_metadata: InputMetadata,
- ) -> torch.Tensor:
-     logits = self.logits_processor(
-         input_ids, hidden_states, self.lm_head.weight, input_metadata
-     )
-     return logits
+ sample_output = self.sampler(logits_output, input_metadata.sampling_info)
+ return sample_output, logits_output

  def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  stacked_params_mapping = [
sglang/srt/models/stablelm.py CHANGED
@@ -40,6 +40,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
+ from sglang.srt.layers.sampler import Sampler
  from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -249,6 +250,7 @@ class StableLmForCausalLM(nn.Module):
  self.model = StableLMEpochModel(config, quant_config=quant_config)
  self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
  self.logits_processor = LogitsProcessor(config)
+ self.sampler = Sampler()

  @torch.no_grad()
  def forward(
@@ -259,9 +261,11 @@ class StableLmForCausalLM(nn.Module):
  input_embeds: torch.Tensor = None,
  ) -> torch.Tensor:
  hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
- return self.logits_processor(
+ logits_output = self.logits_processor(
  input_ids, hidden_states, self.lm_head.weight, input_metadata
  )
+ sample_output = self.sampler(logits_output, input_metadata.sampling_info)
+ return sample_output, logits_output

  def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  stacked_params_mapping = [
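Note: the model hunks above (mixtral_quant, qwen, qwen2, qwen2_moe, stablelm, and per the file list most other model files in this release) follow one pattern: each model gains a `Sampler` from `sglang.srt.layers.sampler`, and `forward()` now returns a `(sample_output, logits_output)` pair instead of raw logits. A minimal sketch of the new calling contract, assuming a loaded model and a populated `InputMetadata`; the `batch_next_token_ids` field name on the sample output is an assumption, not confirmed by this diff:

```python
# Sketch only: how a caller consumes the new forward() return value in 0.2.15.
# `model`, `input_ids`, `positions`, and `input_metadata` are assumed to be
# prepared by the runtime (e.g., the TP worker / model runner).
import torch

@torch.no_grad()
def decode_step(model, input_ids, positions, input_metadata):
    # 0.2.14.post2: forward() returned only the logits-processor output.
    # 0.2.15: sampling happens inside the model, driven by
    # input_metadata.sampling_info (a SamplingBatchInfo).
    sample_output, logits_output = model(input_ids, positions, input_metadata)
    next_token_ids = sample_output.batch_next_token_ids  # assumed field name
    return next_token_ids, logits_output
```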
sglang/srt/openai_api/adapter.py CHANGED
@@ -844,8 +844,23 @@ def v1_chat_generate_request(
  if not isinstance(request.messages, str):
  # Apply chat template and its stop strings.
  if chat_template_name is None:
+     openai_compatible_messages = []
+     for message in request.messages:
+         if isinstance(message.content, str):
+             openai_compatible_messages.append(
+                 {"role": message.role, "content": message.content}
+             )
+         else:
+             content_list = message.dict()["content"]
+             for content in content_list:
+                 if content["type"] == "text":
+                     openai_compatible_messages.append(
+                         {"role": message.role, "content": content["text"]}
+                     )
  prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
-     request.messages, tokenize=True, add_generation_prompt=True
+     openai_compatible_messages,
+     tokenize=True,
+     add_generation_prompt=True,
  )
  stop = request.stop
  image_data = None
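The hunk above normalizes incoming chat messages before they reach `apply_chat_template`: list-style content is flattened so that only its text parts are forwarded to the chat template. A standalone illustration of that transformation with hypothetical request data (plain dicts stand in for the Pydantic message objects used in the adapter):

```python
# Illustration of the flattening performed above (hypothetical data).
request_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": {"url": "http://example.com/x.png"}},
    ]},
]

openai_compatible_messages = []
for message in request_messages:
    if isinstance(message["content"], str):
        openai_compatible_messages.append(message)
    else:
        # Only text parts are forwarded to the chat template.
        for part in message["content"]:
            if part["type"] == "text":
                openai_compatible_messages.append(
                    {"role": message["role"], "content": part["text"]}
                )

# Result:
# [{"role": "system", "content": "You are a helpful assistant."},
#  {"role": "user", "content": "Describe this image."}]
```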
sglang/srt/openai_api/protocol.py CHANGED
@@ -200,11 +200,6 @@ class CompletionStreamResponse(BaseModel):
  usage: Optional[UsageInfo] = None


- class ChatCompletionMessageGenericParam(BaseModel):
-     role: Literal["system", "assistant"]
-     content: str
-
-
  class ChatCompletionMessageContentTextPart(BaseModel):
  type: Literal["text"]
  text: str
@@ -225,6 +220,11 @@ ChatCompletionMessageContentPart = Union[
  ]


+ class ChatCompletionMessageGenericParam(BaseModel):
+     role: Literal["system", "assistant"]
+     content: Union[str, List[ChatCompletionMessageContentTextPart]]
+
+
  class ChatCompletionMessageUserParam(BaseModel):
  role: Literal["user"]
  content: Union[str, List[ChatCompletionMessageContentPart]]
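In other words, `ChatCompletionMessageGenericParam` is moved below `ChatCompletionMessageContentTextPart` so it can reference it, and its `content` field is widened so system/assistant messages may also carry a list of text parts. A small validation sketch under those definitions (the example input is illustrative):

```python
# Sketch: system/assistant content may now be a list of text parts,
# mirroring the widened Union type above.
from typing import List, Literal, Union
from pydantic import BaseModel

class ChatCompletionMessageContentTextPart(BaseModel):
    type: Literal["text"]
    text: str

class ChatCompletionMessageGenericParam(BaseModel):
    role: Literal["system", "assistant"]
    content: Union[str, List[ChatCompletionMessageContentTextPart]]

# Previously this raised a validation error because content had to be a str.
msg = ChatCompletionMessageGenericParam(
    role="system",
    content=[{"type": "text", "text": "You are a helpful assistant."}],
)
```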
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -21,10 +21,63 @@ class SamplingBatchInfo:
  top_ps: torch.Tensor = None
  top_ks: torch.Tensor = None
  min_ps: torch.Tensor = None
- penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
+
+ # Dispatch in CUDA graph
+ need_min_p_sampling: bool = False
+
+ # Bias Tensors
  logit_bias: torch.Tensor = None
  vocab_mask: torch.Tensor = None

+ # Penalizer
+ penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
+ linear_penalties: torch.Tensor = None
+ scaling_penalties: torch.Tensor = None
+
+ def has_bias(self):
+     return (
+         self.logit_bias is not None
+         or self.vocab_mask is not None
+         or self.linear_penalties is not None
+         or self.scaling_penalties is not None
+     )
+
+ @classmethod
+ def dummy_one(cls, max_bs: int, vocab_size: int):
+     ret = cls(vocab_size=vocab_size)
+     ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
+     ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
+     ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
+     ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
+     return ret
+
+ def __getitem__(self, key):
+     if isinstance(key, slice):
+         # NOTE: We do not use cuda graph when there is bias tensors
+         assert not self.has_bias()
+         return SamplingBatchInfo(
+             vocab_size=self.vocab_size,
+             temperatures=self.temperatures[key],
+             top_ps=self.top_ps[key],
+             top_ks=self.top_ks[key],
+             min_ps=self.min_ps[key],
+             need_min_p_sampling=self.need_min_p_sampling,
+         )
+     else:
+         raise NotImplementedError
+
+ def inplace_assign(self, bs: int, other: SamplingBatchInfo):
+     # NOTE: We do not use cuda graph when there is bias tensors
+     assert not self.has_bias()
+
+     self.vocab_size = other.vocab_size
+     self.need_min_p_sampling = other.need_min_p_sampling
+
+     self.temperatures[:bs] = other.temperatures
+     self.top_ps[:bs] = other.top_ps
+     self.top_ks[:bs] = other.top_ks
+     self.min_ps[:bs] = other.min_ps
+
  @classmethod
  def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
  device = "cuda"
@@ -45,6 +98,7 @@ class SamplingBatchInfo:
  ret.min_ps = torch.tensor(
      [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
  )
+ ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)

  # Each penalizers will do nothing if they evaluate themselves as not required by looking at
  # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
@@ -72,6 +126,25 @@ class SamplingBatchInfo:

  return ret

+ def prepare_penalties(self):
+     self.scaling_penalties = None
+     self.linear_penalties = None
+
+     for penalizer in self.penalizer_orchestrator.penalizers.values():
+         if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
+             if penalizer.is_prepared():
+                 self.scaling_penalties = penalizer.cumulated_repetition_penalties
+         else:
+             if penalizer.is_prepared():
+                 if self.linear_penalties is None:
+                     bs = self.penalizer_orchestrator.batch.batch_size()
+                     self.linear_penalties = torch.zeros(
+                         (bs, self.vocab_size),
+                         dtype=torch.float32,
+                         device="cuda",
+                     )
+                 self.linear_penalties = penalizer.apply(self.linear_penalties)
+
  def update_regex_vocab_mask(self, batch: ScheduleBatch):
  bs, reqs = batch.batch_size(), batch.reqs
  device = "cuda"
@@ -81,15 +154,15 @@
  self.vocab_mask = None

  if has_regex:
+     self.vocab_mask = torch.zeros(
+         bs, self.vocab_size, dtype=torch.bool, device=device
+     )
  for i, req in enumerate(reqs):
      if req.regex_fsm is not None:
-         if self.vocab_mask is None:
-             self.vocab_mask = torch.zeros(
-                 bs, self.vocab_size, dtype=torch.bool, device=device
-             )
+         self.vocab_mask[i].fill_(1)
          self.vocab_mask[i][
              req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens
-         ] = 1
+         ] = 0

  def filter(self, unfinished_indices: List[int], new_indices: torch.Tensor):
  self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
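Two behavioral notes on the hunks above: `need_min_p_sampling`, `dummy_one`, `__getitem__`, and `inplace_assign` let a CUDA-graph runner capture sampling once at a padded batch size and copy real per-request values in later, and the regex `vocab_mask` semantics are inverted: the mask is now a full `(bs, vocab_size)` bool tensor in which rows with an FSM are filled with 1 and the currently allowed token ids are reset to 0, i.e. it marks disallowed tokens. A sketch of how a sampler might consume these fields (not the library's actual sampler code; the mask and penalty application shown here is an assumption consistent with the new tensor semantics):

```python
import torch

def apply_sampling_bias(logits: torch.Tensor, info) -> torch.Tensor:
    """Sketch: fold SamplingBatchInfo bias tensors into the logits."""
    if info.logit_bias is not None:
        logits = logits + info.logit_bias
    if info.linear_penalties is not None:
        # Presence/frequency-style penalties are additive.
        logits = logits + info.linear_penalties
    if info.scaling_penalties is not None:
        # Repetition penalty divides positive logits and multiplies negative ones.
        logits = torch.where(
            logits > 0,
            logits / info.scaling_penalties,
            logits * info.scaling_penalties,
        )
    if info.vocab_mask is not None:
        # vocab_mask now marks disallowed tokens (True = blocked).
        logits = logits.masked_fill(info.vocab_mask, float("-inf"))
    return logits
```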
sglang/srt/server.py CHANGED
@@ -272,7 +272,7 @@ async def retrieve_file_content(file_id: str):

  def launch_server(
  server_args: ServerArgs,
- model_overide_args: Optional[dict] = None,
+ model_override_args: Optional[dict] = None,
  pipe_finish_writer: Optional[mp.connection.Connection] = None,
  ):
  """Launch an HTTP server."""
@@ -317,7 +317,7 @@
  tp_rank_range,
  server_args,
  ports[3],
- model_overide_args,
+ model_override_args,
  )

  try:
@@ -328,7 +328,7 @@
  return

  # Launch processes
- tokenizer_manager = TokenizerManager(server_args, port_args, model_overide_args)
+ tokenizer_manager = TokenizerManager(server_args, port_args, model_override_args)
  if server_args.chat_template:
  load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
  pipe_controller_reader, pipe_controller_writer = mp.Pipe(duplex=False)
@@ -341,7 +341,7 @@

  proc_controller = mp.Process(
  target=start_controller_process,
- args=(server_args, port_args, pipe_controller_writer, model_overide_args),
+ args=(server_args, port_args, pipe_controller_writer, model_override_args),
  )
  proc_controller.start()

@@ -501,7 +501,7 @@
  def __init__(
  self,
  log_level: str = "error",
- model_overide_args: Optional[dict] = None,
+ model_override_args: Optional[dict] = None,
  *args,
  **kwargs,
  ):
@@ -525,7 +525,7 @@

  proc = mp.Process(
  target=launch_server,
- args=(self.server_args, model_overide_args, pipe_writer),
+ args=(self.server_args, model_override_args, pipe_writer),
  )
  proc.start()
  pipe_writer.close()
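The server.py hunks are a spelling fix: the keyword argument `model_overide_args` becomes `model_override_args` throughout `launch_server` and `Runtime`, so callers passing the old name must update. A usage sketch (the model path and override dict are illustrative, not taken from this diff):

```python
# Sketch: callers must use the corrected keyword in 0.2.15.
import sglang as sgl

runtime = sgl.Runtime(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",      # illustrative model
    model_override_args={"max_position_embeddings": 8192},  # illustrative config override
)
# 0.2.14.post2 accepted model_overide_args=...; that spelling no longer exists.
```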
sglang/srt/utils.py CHANGED
@@ -407,7 +407,6 @@ def monkey_patch_vllm_dummy_weight_loader():
  DummyModelLoader,
  LoRAConfig,
  ModelConfig,
- MultiModalConfig,
  ParallelConfig,
  SchedulerConfig,
  _initialize_model,
@@ -422,7 +421,6 @@ def monkey_patch_vllm_dummy_weight_loader():
  model_config: ModelConfig,
  device_config: DeviceConfig,
  lora_config: Optional[LoRAConfig],
- multimodal_config: Optional[MultiModalConfig],
  parallel_config: ParallelConfig,
  scheduler_config: SchedulerConfig,
  cache_config: CacheConfig,
@@ -433,7 +431,6 @@ def monkey_patch_vllm_dummy_weight_loader():
  model_config,
  self.load_config,
  lora_config,
- multimodal_config,
  cache_config,
  )

sglang/test/runners.py CHANGED
@@ -180,7 +180,7 @@ class SRTRunner:
  tp_size=tp_size,
  dtype=get_dtype_str(torch_dtype),
  port=port,
- mem_fraction_static=0.7,
+ mem_fraction_static=0.69,
  trust_remote_code=False,
  is_embedding=not self.is_generation,
  )
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.14.post2"
+ __version__ = "0.2.15"
{sglang-0.2.14.post2.dist-info → sglang-0.2.15.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.14.post2
+ Version: 0.2.15
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.14.post2 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.15 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -489,7 +489,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ### Supported Models

  **Generative Models**
-
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
@@ -509,6 +508,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Grok
  - ChatGLM
  - InternLM 2
+ - Exaone 3

  **Embedding Models**

@@ -636,7 +636,7 @@ print(state["answer_1"])
  #### More Examples

  Anthropic and VertexAI (Gemini) models are also supported.
- You can find more examples at [examples/quick_start](examples/quick_start).
+ You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

  ### Language Feature
  To begin with, import sglang.
@@ -649,7 +649,7 @@ You can implement your prompt flow in a function decorated by `sgl.function`.
  You can then invoke the function with `run` or `run_batch`.
  The system will manage the state, chat template, parallelism and batching for you.

- The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)
+ The complete code for the examples below can be found at [readme_examples.py](examples/frontend_language/usage/readme_examples.py)

  #### Control Flow
  You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
@@ -698,7 +698,7 @@ def image_qa(s, image_file, question):
  s += sgl.assistant(sgl.gen("answer", max_tokens=256)
  ```

- See also [srt_example_llava.py](examples/quick_start/srt_example_llava.py).
+ See also [srt_example_llava.py](examples/frontend_language/quick_start/local_example_llava_next.py).

  #### Constrained Decoding
  Use `regex` to specify a regular expression as a decoding constraint.
@@ -742,7 +742,7 @@ def character_gen(s, name):
  s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
+ See also [json_decode.py](examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.