sglang 0.2.14.post1__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sglang/api.py +2 -0
  2. sglang/bench_latency.py +39 -28
  3. sglang/lang/interpreter.py +3 -0
  4. sglang/lang/ir.py +5 -0
  5. sglang/launch_server_llavavid.py +26 -0
  6. sglang/srt/configs/__init__.py +5 -0
  7. sglang/srt/configs/exaone.py +195 -0
  8. sglang/srt/constrained/fsm_cache.py +1 -1
  9. sglang/srt/conversation.py +24 -2
  10. sglang/srt/hf_transformers_utils.py +11 -160
  11. sglang/srt/layers/activation.py +10 -4
  12. sglang/srt/layers/extend_attention.py +13 -8
  13. sglang/srt/layers/layernorm.py +47 -1
  14. sglang/srt/layers/logits_processor.py +4 -4
  15. sglang/srt/layers/sampler.py +69 -16
  16. sglang/srt/managers/controller_multi.py +5 -5
  17. sglang/srt/managers/controller_single.py +5 -5
  18. sglang/srt/managers/io_struct.py +11 -5
  19. sglang/srt/managers/schedule_batch.py +25 -13
  20. sglang/srt/managers/tokenizer_manager.py +76 -63
  21. sglang/srt/managers/tp_worker.py +47 -36
  22. sglang/srt/model_config.py +3 -3
  23. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  24. sglang/srt/model_executor/forward_batch_info.py +78 -43
  25. sglang/srt/model_executor/model_runner.py +29 -18
  26. sglang/srt/models/chatglm.py +5 -13
  27. sglang/srt/models/commandr.py +5 -1
  28. sglang/srt/models/dbrx.py +5 -1
  29. sglang/srt/models/deepseek.py +5 -1
  30. sglang/srt/models/deepseek_v2.py +57 -25
  31. sglang/srt/models/exaone.py +399 -0
  32. sglang/srt/models/gemma.py +7 -3
  33. sglang/srt/models/gemma2.py +6 -52
  34. sglang/srt/models/gpt_bigcode.py +5 -1
  35. sglang/srt/models/grok.py +14 -4
  36. sglang/srt/models/internlm2.py +5 -1
  37. sglang/srt/models/llama2.py +10 -7
  38. sglang/srt/models/llama_classification.py +2 -6
  39. sglang/srt/models/llama_embedding.py +3 -4
  40. sglang/srt/models/llava.py +69 -91
  41. sglang/srt/models/llavavid.py +40 -86
  42. sglang/srt/models/minicpm.py +5 -1
  43. sglang/srt/models/mixtral.py +6 -2
  44. sglang/srt/models/mixtral_quant.py +5 -1
  45. sglang/srt/models/qwen.py +5 -2
  46. sglang/srt/models/qwen2.py +9 -6
  47. sglang/srt/models/qwen2_moe.py +12 -33
  48. sglang/srt/models/stablelm.py +5 -1
  49. sglang/srt/models/yivl.py +2 -7
  50. sglang/srt/openai_api/adapter.py +16 -1
  51. sglang/srt/openai_api/protocol.py +5 -5
  52. sglang/srt/sampling/sampling_batch_info.py +79 -6
  53. sglang/srt/server.py +9 -9
  54. sglang/srt/utils.py +18 -36
  55. sglang/test/runners.py +2 -2
  56. sglang/test/test_layernorm.py +53 -1
  57. sglang/version.py +1 -1
  58. {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/METADATA +8 -8
  59. sglang-0.2.15.dist-info/RECORD +118 -0
  60. sglang-0.2.14.post1.dist-info/RECORD +0 -114
  61. {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/LICENSE +0 -0
  62. {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/WHEEL +0 -0
  63. {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/top_level.txt +0 -0
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -21,10 +21,63 @@ class SamplingBatchInfo:
      top_ps: torch.Tensor = None
      top_ks: torch.Tensor = None
      min_ps: torch.Tensor = None
-     penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
+
+     # Dispatch in CUDA graph
+     need_min_p_sampling: bool = False
+
+     # Bias Tensors
      logit_bias: torch.Tensor = None
      vocab_mask: torch.Tensor = None

+     # Penalizer
+     penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
+     linear_penalties: torch.Tensor = None
+     scaling_penalties: torch.Tensor = None
+
+     def has_bias(self):
+         return (
+             self.logit_bias is not None
+             or self.vocab_mask is not None
+             or self.linear_penalties is not None
+             or self.scaling_penalties is not None
+         )
+
+     @classmethod
+     def dummy_one(cls, max_bs: int, vocab_size: int):
+         ret = cls(vocab_size=vocab_size)
+         ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
+         ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
+         ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
+         ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
+         return ret
+
+     def __getitem__(self, key):
+         if isinstance(key, slice):
+             # NOTE: We do not use cuda graph when there is bias tensors
+             assert not self.has_bias()
+             return SamplingBatchInfo(
+                 vocab_size=self.vocab_size,
+                 temperatures=self.temperatures[key],
+                 top_ps=self.top_ps[key],
+                 top_ks=self.top_ks[key],
+                 min_ps=self.min_ps[key],
+                 need_min_p_sampling=self.need_min_p_sampling,
+             )
+         else:
+             raise NotImplementedError
+
+     def inplace_assign(self, bs: int, other: SamplingBatchInfo):
+         # NOTE: We do not use cuda graph when there is bias tensors
+         assert not self.has_bias()
+
+         self.vocab_size = other.vocab_size
+         self.need_min_p_sampling = other.need_min_p_sampling
+
+         self.temperatures[:bs] = other.temperatures
+         self.top_ps[:bs] = other.top_ps
+         self.top_ks[:bs] = other.top_ks
+         self.min_ps[:bs] = other.min_ps
+
      @classmethod
      def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
          device = "cuda"
@@ -45,6 +98,7 @@ class SamplingBatchInfo:
          ret.min_ps = torch.tensor(
              [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
          )
+         ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)

          # Each penalizers will do nothing if they evaluate themselves as not required by looking at
          # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
@@ -72,6 +126,25 @@ class SamplingBatchInfo:

          return ret

+     def prepare_penalties(self):
+         self.scaling_penalties = None
+         self.linear_penalties = None
+
+         for penalizer in self.penalizer_orchestrator.penalizers.values():
+             if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
+                 if penalizer.is_prepared():
+                     self.scaling_penalties = penalizer.cumulated_repetition_penalties
+             else:
+                 if penalizer.is_prepared():
+                     if self.linear_penalties is None:
+                         bs = self.penalizer_orchestrator.batch.batch_size()
+                         self.linear_penalties = torch.zeros(
+                             (bs, self.vocab_size),
+                             dtype=torch.float32,
+                             device="cuda",
+                         )
+                     self.linear_penalties = penalizer.apply(self.linear_penalties)
+
      def update_regex_vocab_mask(self, batch: ScheduleBatch):
          bs, reqs = batch.batch_size(), batch.reqs
          device = "cuda"
@@ -81,15 +154,15 @@ class SamplingBatchInfo:
          self.vocab_mask = None

          if has_regex:
+             self.vocab_mask = torch.zeros(
+                 bs, self.vocab_size, dtype=torch.bool, device=device
+             )
              for i, req in enumerate(reqs):
                  if req.regex_fsm is not None:
-                     if self.vocab_mask is None:
-                         self.vocab_mask = torch.zeros(
-                             bs, self.vocab_size, dtype=torch.bool, device=device
-                         )
+                     self.vocab_mask[i].fill_(1)
                      self.vocab_mask[i][
                          req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens
-                     ] = 1
+                     ] = 0

      def filter(self, unfinished_indices: List[int], new_indices: torch.Tensor):
          self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
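Note: the new `dummy_one`, `__getitem__`, and `inplace_assign` helpers let the CUDA graph runner capture sampling metadata once at the maximum batch size and then overwrite it in place for each real batch, while `has_bias()` guards the cases (logit bias, vocab masks, penalties) that skip CUDA graphs. The sketch below illustrates that capture-then-overwrite pattern; it assumes a CUDA device, uses made-up sizes, and is not the exact call sequence in `cuda_graph_runner.py`.

```python
# Illustrative sketch only; MAX_BS and VOCAB_SIZE are made-up values and the
# helper function below is not part of sglang, it just mirrors how the new methods compose.
from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo

MAX_BS, VOCAB_SIZE = 8, 32000

# At capture time: placeholder tensors sized for the largest batch the graph supports.
captured = SamplingBatchInfo.dummy_one(MAX_BS, VOCAB_SIZE)

def sampling_info_for_replay(live: SamplingBatchInfo, bs: int) -> SamplingBatchInfo:
    # Bias tensors disable CUDA graphs entirely, which is what the has_bias() asserts protect.
    captured.inplace_assign(bs, live)  # copy the live batch's values into the captured buffers
    return captured[:bs]               # sliced view matching the real batch size
```

Separately, `update_regex_vocab_mask` now allocates the mask up front and inverts the earlier allow-list semantics: each row is filled with 1 and the FSM-allowed token ids are cleared to 0.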
sglang/srt/server.py CHANGED
@@ -272,7 +272,7 @@ async def retrieve_file_content(file_id: str):

  def launch_server(
      server_args: ServerArgs,
-     model_overide_args: Optional[dict] = None,
+     model_override_args: Optional[dict] = None,
      pipe_finish_writer: Optional[mp.connection.Connection] = None,
  ):
      """Launch an HTTP server."""
@@ -317,7 +317,7 @@ def launch_server(
              tp_rank_range,
              server_args,
              ports[3],
-             model_overide_args,
+             model_override_args,
          )

          try:
@@ -328,20 +328,20 @@ def launch_server(
              return

      # Launch processes
-     tokenizer_manager = TokenizerManager(server_args, port_args, model_overide_args)
+     tokenizer_manager = TokenizerManager(server_args, port_args, model_override_args)
      if server_args.chat_template:
          load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
      pipe_controller_reader, pipe_controller_writer = mp.Pipe(duplex=False)
      pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)

      if server_args.dp_size == 1:
-         start_process = start_controller_process_single
+         start_controller_process = start_controller_process_single
      else:
-         start_process = start_controller_process_multi
+         start_controller_process = start_controller_process_multi

      proc_controller = mp.Process(
-         target=start_process,
-         args=(server_args, port_args, pipe_controller_writer, model_overide_args),
+         target=start_controller_process,
+         args=(server_args, port_args, pipe_controller_writer, model_override_args),
      )
      proc_controller.start()

@@ -501,7 +501,7 @@ class Runtime:
      def __init__(
          self,
          log_level: str = "error",
-         model_overide_args: Optional[dict] = None,
+         model_override_args: Optional[dict] = None,
          *args,
          **kwargs,
      ):
@@ -525,7 +525,7 @@ class Runtime:

          proc = mp.Process(
              target=launch_server,
-             args=(self.server_args, model_overide_args, pipe_writer),
+             args=(self.server_args, model_override_args, pipe_writer),
          )
          proc.start()
          pipe_writer.close()
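The only functional change here is the spelling fix of the keyword from `model_overide_args` to `model_override_args`, which ripples through `launch_server`, the controller processes, and `Runtime`. Below is a hedged usage sketch with the new spelling; the model path and the override dict are placeholders, not values taken from this release.

```python
# Placeholder model path and override dict; the keyword spelling is the only point here.
import sglang as sgl

runtime = sgl.Runtime(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",
    model_override_args={"max_position_embeddings": 8192},  # forwarded through launch_server
)
sgl.set_default_backend(runtime)
# ... run sgl programs against the runtime ...
runtime.shutdown()
```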
sglang/srt/utils.py CHANGED
@@ -26,7 +26,7 @@ import struct
  import time
  from importlib.metadata import PackageNotFoundError, version
  from io import BytesIO
- from typing import List, Optional
+ from typing import List, Optional, Union

  import numpy as np
  import psutil
@@ -193,35 +193,16 @@ def allocate_init_ports(
      return ret_ports[0], ret_ports[1:num_ports_needed]


- def get_int_token_logit_bias(tokenizer, vocab_size):
-     """Get the logit bias for integer-only tokens."""
-     # a bug when model's vocab size > tokenizer.vocab_size
-     if tokenizer == None:
-         return [-1e5] * vocab_size
-     vocab_size = tokenizer.vocab_size
-     logit_bias = np.zeros(vocab_size, dtype=np.float32)
-     for t_id in range(vocab_size):
-         ss = tokenizer.decode([t_id]).strip()
-         if not (ss.isdigit() or len(ss) == 0 or t_id == tokenizer.eos_token_id):
-             logit_bias[t_id] = -1e5
-
-     return logit_bias
-
-
- def is_multimodal_model(model):
-     from sglang.srt.model_config import ModelConfig
-
-     if isinstance(model, str):
-         model = model.lower()
-         return "llava" in model or "yi-vl" in model or "llava-next" in model
-
-     if isinstance(model, ModelConfig):
-         model_path = model.path.lower()
-         return (
-             "llava" in model_path or "yi-vl" in model_path or "llava-next" in model_path
-         )
-
-     raise ValueError("unrecognized type")
+ def is_multimodal_model(model_architectures):
+     if (
+         "LlavaLlamaForCausalLM" in model_architectures
+         or "LlavaQwenForCausalLM" in model_architectures
+         or "LlavaMistralForCausalLM" in model_architectures
+         or "LlavaVidForCausalLM" in model_architectures
+     ):
+         return True
+     else:
+         return False


  def is_generation_model(model_architectures, is_embedding: bool = False):
@@ -317,12 +298,14 @@ def decode_video_base64(video_base64):
      )  # Return an empty array and size tuple if no frames were found


- def load_image(image_file):
+ def load_image(image_file: Union[str, bytes]):
      from PIL import Image

      image = image_size = None

-     if image_file.startswith("http://") or image_file.startswith("https://"):
+     if isinstance(image_file, bytes):
+         image = Image.open(BytesIO(image_file))
+     elif image_file.startswith("http://") or image_file.startswith("https://"):
          timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
          response = requests.get(image_file, timeout=timeout)
          image = Image.open(BytesIO(response.content))
@@ -334,8 +317,10 @@ def load_image(image_file):
      elif image_file.startswith("video:"):
          image_file = image_file.replace("video:", "")
          image, image_size = decode_video_base64(image_file)
-     else:
+     elif isinstance(image_file, str):
          image = Image.open(BytesIO(base64.b64decode(image_file)))
+     else:
+         raise ValueError(f"Invalid image: {image}")

      return image, image_size

@@ -422,7 +407,6 @@ def monkey_patch_vllm_dummy_weight_loader():
          DummyModelLoader,
          LoRAConfig,
          ModelConfig,
-         MultiModalConfig,
          ParallelConfig,
          SchedulerConfig,
          _initialize_model,
@@ -437,7 +421,6 @@ def monkey_patch_vllm_dummy_weight_loader():
          model_config: ModelConfig,
          device_config: DeviceConfig,
          lora_config: Optional[LoRAConfig],
-         multimodal_config: Optional[MultiModalConfig],
          parallel_config: ParallelConfig,
          scheduler_config: SchedulerConfig,
          cache_config: CacheConfig,
@@ -448,7 +431,6 @@ def monkey_patch_vllm_dummy_weight_loader():
              model_config,
              self.load_config,
              lora_config,
-             multimodal_config,
              cache_config,
          )

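`load_image` now accepts raw `bytes` in addition to the existing string forms (such as an http(s) URL, a base64 payload, or a `video:`-prefixed base64 string). A quick hedged sketch; the file name and URL below are placeholders.

```python
# Hedged sketch of the widened load_image signature; "photo.jpg" and the URL are placeholders.
from sglang.srt.utils import load_image

# New in this release: raw bytes are decoded directly with PIL.
with open("photo.jpg", "rb") as f:
    image, image_size = load_image(f.read())

# Strings keep working as before, e.g. an http(s) URL.
image, image_size = load_image("https://example.com/photo.jpg")
```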
sglang/test/runners.py CHANGED
@@ -30,7 +30,7 @@ DEFAULT_PROMPTS = [
      # the output of gemma-2-2b from SRT is unstable on the commented prompt
      # "The capital of France is",
      "Apple is red. Banana is Yellow. " * 800 + "Apple is",
-     "The capital of the United Kindom is",
+     "The capital of the United Kingdom is",
      "Today is a sunny day and I like",
      "AI is a field of computer science focused on",
  ]
@@ -180,7 +180,7 @@ class SRTRunner:
              tp_size=tp_size,
              dtype=get_dtype_str(torch_dtype),
              port=port,
-             mem_fraction_static=0.7,
+             mem_fraction_static=0.69,
              trust_remote_code=False,
              is_embedding=not self.is_generation,
          )
sglang/test/test_layernorm.py CHANGED
@@ -3,7 +3,7 @@ import unittest

  import torch

- from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm


  class TestRMSNorm(unittest.TestCase):
@@ -56,5 +56,57 @@ class TestRMSNorm(unittest.TestCase):
                  self._run_rms_norm_test(*params)


+ class TestGemmaRMSNorm(unittest.TestCase):
+     DTYPES = [torch.half, torch.bfloat16]
+     NUM_TOKENS = [7, 83, 4096]
+     HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
+     ADD_RESIDUAL = [False, True]
+     SEEDS = [0]
+
+     @classmethod
+     def setUpClass(cls):
+         if not torch.cuda.is_available():
+             raise unittest.SkipTest("CUDA is not available")
+         torch.set_default_device("cuda")
+
+     def _run_gemma_rms_norm_test(
+         self, num_tokens, hidden_size, add_residual, dtype, seed
+     ):
+         torch.manual_seed(seed)
+
+         layer = GemmaRMSNorm(hidden_size).to(dtype=dtype)
+         layer.weight.data.normal_(mean=1.0, std=0.1)
+         scale = 1 / (2 * hidden_size)
+         x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
+         residual = torch.randn_like(x) * scale if add_residual else None
+
+         with torch.inference_mode():
+             ref_out = layer.forward_native(x, residual)
+             out = layer(x, residual)
+
+         if add_residual:
+             self.assertTrue(torch.allclose(out[0], ref_out[0], atol=1e-3, rtol=1e-3))
+             self.assertTrue(torch.allclose(out[1], ref_out[1], atol=1e-3, rtol=1e-3))
+         else:
+             self.assertTrue(torch.allclose(out, ref_out, atol=1e-3, rtol=1e-3))
+
+     def test_gemma_rms_norm(self):
+         for params in itertools.product(
+             self.NUM_TOKENS,
+             self.HIDDEN_SIZES,
+             self.ADD_RESIDUAL,
+             self.DTYPES,
+             self.SEEDS,
+         ):
+             with self.subTest(
+                 num_tokens=params[0],
+                 hidden_size=params[1],
+                 add_residual=params[2],
+                 dtype=params[3],
+                 seed=params[4],
+             ):
+                 self._run_gemma_rms_norm_test(*params)
+
+
  if __name__ == "__main__":
      unittest.main(verbosity=2)
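The new `TestGemmaRMSNorm` case compares the fused `GemmaRMSNorm` kernel against its `forward_native` reference across dtypes, hidden sizes, and the residual path, and skips itself when no CUDA device is available. A hedged sketch of running just this case programmatically (the standard `python -m unittest` entry point works as well):

```python
# Runs only the new Gemma RMSNorm case; requires a CUDA device, per setUpClass.
import unittest

from sglang.test.test_layernorm import TestGemmaRMSNorm

suite = unittest.defaultTestLoader.loadTestsFromTestCase(TestGemmaRMSNorm)
unittest.TextTestRunner(verbosity=2).run(suite)
```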
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.14.post1"
+ __version__ = "0.2.15"
{sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.14.post1
+ Version: 0.2.15
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.15 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -489,14 +489,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ### Supported Models

  **Generative Models**
-
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
  - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
-   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
    - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
  - LLaVA 1.5 / 1.6 / NeXT
    - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
@@ -509,6 +508,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Grok
  - ChatGLM
  - InternLM 2
+ - Exaone 3

  **Embedding Models**

@@ -636,7 +636,7 @@ print(state["answer_1"])
  #### More Examples

  Anthropic and VertexAI (Gemini) models are also supported.
- You can find more examples at [examples/quick_start](examples/quick_start).
+ You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

  ### Language Feature
  To begin with, import sglang.
@@ -649,7 +649,7 @@ You can implement your prompt flow in a function decorated by `sgl.function`.
  You can then invoke the function with `run` or `run_batch`.
  The system will manage the state, chat template, parallelism and batching for you.

- The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)
+ The complete code for the examples below can be found at [readme_examples.py](examples/frontend_language/usage/readme_examples.py)

  #### Control Flow
  You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
@@ -698,7 +698,7 @@ def image_qa(s, image_file, question):
      s += sgl.assistant(sgl.gen("answer", max_tokens=256)
  ```

- See also [srt_example_llava.py](examples/quick_start/srt_example_llava.py).
+ See also [srt_example_llava.py](examples/frontend_language/quick_start/local_example_llava_next.py).

  #### Constrained Decoding
  Use `regex` to specify a regular expression as a decoding constraint.
@@ -742,7 +742,7 @@ def character_gen(s, name):
      s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
+ See also [json_decode.py](examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.
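The README hunk above keeps pointing readers at the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision) for querying the LLaVA servers launched with the commands shown. Below is a hedged sketch of such a request against a locally launched server on port 30000; the model name `"default"` and the image URL are assumptions, not values taken from this diff.

```python
# Assumes a vision model is serving on http://127.0.0.1:30000 as in the README commands;
# the model name and image URL below are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="default",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
    max_tokens=64,
)
print(response.choices[0].message.content)
```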
sglang-0.2.15.dist-info/RECORD ADDED
@@ -0,0 +1,118 @@
+ sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
+ sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
+ sglang/bench_latency.py,sha256=F7jMfKqMf1XFKJgkpR_yE33VJpsIhSr_SOJeRbngkb0,16758
+ sglang/bench_serving.py,sha256=J_mMwnmDn0Jt07mzdGAuYOxpockHPLYJFL-kwoaqASY,36527
+ sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
+ sglang/global_config.py,sha256=nwOjUflwqLQySPUMvk8Hk63TIS6mknh_ODSW3CZ1rJw,1704
+ sglang/launch_server.py,sha256=FODfO0DW546dh-u1qDlWtrhsmj6hxkarXXv3cIdgkj8,549
+ sglang/launch_server_llavavid.py,sha256=xnpSILJxsrbvqkERav5P26bErCQnhoTFmoKeScJltUA,1034
+ sglang/utils.py,sha256=zFYGkC4vOUR3sTv1TmQXcsOLZDtDBR3wnjqnDp3xMIs,8352
+ sglang/version.py,sha256=ogr0x4sazo5ruMrKOQDYO_YrTwtaXZTE8fKnwCajH7I,23
+ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
+ sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
+ sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
+ sglang/lang/interpreter.py,sha256=AC3tNNDwYfiu87jCldBWXYpFicCv6NMPJACMFEfCXu4,30331
+ sglang/lang/ir.py,sha256=W3UfZikcGeT86PDDjDjw-yNzrKY2e2UYO4DTatMCfm0,17704
+ sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
+ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
+ sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
+ sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
+ sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+ sglang/lang/backend/runtime_endpoint.py,sha256=SDlp03EuQEK1eGK4_IaFySWgxlp4wCs3EPewZ6O640E,9549
+ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
+ sglang/srt/conversation.py,sha256=2KDNe1suUPy6xqSkCx2xcO3pDPxTwqx5FaUxaqwCJ-M,19525
+ sglang/srt/hf_transformers_utils.py,sha256=kNGJ5OfAth7dZrWfhpKpt7s2LQWvLH2d-v0GtcEs3R0,6078
+ sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
+ sglang/srt/model_config.py,sha256=68QQ8iUWQHPv01RBeH23mvay6iJg9DWmCogC_vUgFLk,6371
+ sglang/srt/server.py,sha256=yi8prs9_M0P0dOInrQLkHKiZ-oTigk_uzW8otEHImbU,19846
+ sglang/srt/server_args.py,sha256=GiDyPWCvYA_98mSE9LuvUoEodo9gRnNPPIPn0nFkxUs,18259
+ sglang/srt/utils.py,sha256=JJOlqRPbN_tSSNWj63syQpfz4v7hUwNvzWvOUpBh9SM,23746
+ sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
+ sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
+ sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
+ sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
+ sglang/srt/constrained/fsm_cache.py,sha256=wigJs9PeTt-vYPJQEeUZwEKl6MFIfb5xy8uIg18bDbM,3132
+ sglang/srt/constrained/jump_forward.py,sha256=LWRsmGPQcH6KT87wXwCRqtblU3pcAVCEzO0nWPxevs0,6636
+ sglang/srt/layers/activation.py,sha256=JEXNTgqxoiU4N-gVm4XMjobhft4JKDcMrgTkfpsRUzM,4856
+ sglang/srt/layers/decode_attention.py,sha256=TPD_608ZX9fQ_HDImifkxG_qcEYmimbEYY8lCBIjFuM,16628
+ sglang/srt/layers/extend_attention.py,sha256=XIXm3p2cvKrDg10Po4qYGaEkXJOJBtCIhTB_lTyjAFE,14390
+ sglang/srt/layers/layernorm.py,sha256=RXuS4UyksatqTF6lSK7VYyEiUEnBiNIBlEn8q4w84UA,3404
+ sglang/srt/layers/logits_processor.py,sha256=Zx4eFAkFlThPrmz_-HuCN9SqGLanARm0wdZSVDyASAc,13085
+ sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
+ sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
+ sglang/srt/layers/radix_attention.py,sha256=o5a8r3XQ-oRwaxBlAgzJGv7p3dMbu0LrYsDc4uvpPgA,8338
+ sglang/srt/layers/sampler.py,sha256=YEDZrwzshX-fZZ5tkW57yBBIJRu2SPAUZzXhhrpQs4Q,5543
+ sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
+ sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
+ sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
+ sglang/srt/managers/controller_multi.py,sha256=z3rguY1YYlSvVqLjKuurgJW1h0dxwPgIdPCQdJsVzYs,6478
+ sglang/srt/managers/controller_single.py,sha256=5brrZ8vZxjvrSJHWrm5H3qGEZShN4EROG5r1o3pSjps,5124
+ sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
+ sglang/srt/managers/io_struct.py,sha256=Bd91cydX9_960NNP2xngqK-lsIaDB3oMYd56QddN4_Q,10722
+ sglang/srt/managers/policy_scheduler.py,sha256=7HNUxBKJE444s_bHcPpbnHCygsnH-NIXYNSC2q6mRmc,8584
+ sglang/srt/managers/schedule_batch.py,sha256=D3NBNi_6_KEMfBTn_8XPrtCbXHjnUki0sOVhQ7kgqqM,26182
+ sglang/srt/managers/tokenizer_manager.py,sha256=ung-uQrvtPn-vzpQMjpYW_jKWDJR_B8NL88WW3OWyy0,29435
+ sglang/srt/managers/tp_worker.py,sha256=4UuaBLzV6NMsG4XEIcpa4xMcOKIFvTan51ynKz85HXg,36842
+ sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
+ sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
+ sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
+ sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
+ sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
+ sglang/srt/model_executor/cuda_graph_runner.py,sha256=qyKjW9TjSjZ-NZI3aspJwnmuKSKT6DX1MMTFwqJtNE8,12751
+ sglang/srt/model_executor/forward_batch_info.py,sha256=fSLhatN8vCgxn0Mft9D-r0pNi3SN0EQSTJmgaOtrqJc,16471
+ sglang/srt/model_executor/model_runner.py,sha256=9ard4FLjb_rz0EUS3KMrlDkos0zNGh5TQ6wlHSIsev4,24408
+ sglang/srt/models/chatglm.py,sha256=BzLtDK_CsD1Pmn-sHnJuLulJCUuSbNm1q1fqCShRdQ8,13628
+ sglang/srt/models/commandr.py,sha256=k86ykwWOlxLGaBbGUoMSaXngUxCbMVRbY5AoMOWpbU8,14377
+ sglang/srt/models/dbrx.py,sha256=goLJ9Yt-9vxkwhCUFBidvP41H_dYTFsvrMZ4xm4FqGA,14875
+ sglang/srt/models/deepseek.py,sha256=aYP6HUgxQbhcQGQEF4vX0ronBF8AirqIFG98EQn0YzY,16220
+ sglang/srt/models/deepseek_v2.py,sha256=Htw_HDju9huYU5gBu2dqq6bKVao-AsifxfkGl2xRx-8,28521
+ sglang/srt/models/exaone.py,sha256=58JELgg-dZl6CUNd2PEWR0ok9u4osOuE5QKSfX6MzhE,14480
+ sglang/srt/models/gemma.py,sha256=Ya_u2lKPKAc9iHEsW_HAEfCDgYTbxUOCzBI0LDuoOYs,12489
+ sglang/srt/models/gemma2.py,sha256=MCmzzRAAafEQuQj6aGtB-TF4jH0RWrXcOPxSz6LRsXs,15137
+ sglang/srt/models/gpt_bigcode.py,sha256=HEhMRO1Y37JfZtP7mDp0MexWj5h6XT9rKvxorOMKoQA,10409
+ sglang/srt/models/grok.py,sha256=ZcJ4E11rKh-xo4k_j-H1XRreJWWv8yii-bMYC1lO2R8,15143
+ sglang/srt/models/internlm2.py,sha256=VtWATs2eLIqbadYXTPY_vycFIstVk4zg3kxycA9H0Qw,12416
+ sglang/srt/models/llama2.py,sha256=NriIElOdhhsiJFmNPc4bDXjxU_FgqfqdtoagSuIcnnc,14394
+ sglang/srt/models/llama_classification.py,sha256=ClNlaLi3Z0ME1ETOwGxl8DtJy8VJu8kobVRFX9jKJqM,4704
+ sglang/srt/models/llama_embedding.py,sha256=Z3FWGNEWrperMxnVqOhxv6vApNpChh-AaahlEqeYOrk,3574
+ sglang/srt/models/llava.py,sha256=ypq0hWprqN73P-VuYfSAZ1_Otm48qDqEPA2YO583goM,23453
+ sglang/srt/models/llavavid.py,sha256=Dx_wED6stC8lTASUrGt6B3c8wQ9lVrX-76-dNyyuVVg,11934
+ sglang/srt/models/minicpm.py,sha256=7RZEJ2TCqBL1JmMFVJ3J9DmZHRw0q90st49Wkh-sdL4,14039
+ sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
+ sglang/srt/models/mixtral.py,sha256=KIsvruhXNq3Fwrs4_YE7J6fx54ObfnMuRNxgScE3Bmo,13830
+ sglang/srt/models/mixtral_quant.py,sha256=O_97UKDYZokFhIBnamWfw0HLhln9_BUk_KfQ-sQnd8s,14286
+ sglang/srt/models/qwen.py,sha256=geK88AyEyPbbDvMHJNY8XMSNpsCeu8g9kxnKyiJBpK4,10168
+ sglang/srt/models/qwen2.py,sha256=WGYy3wcRY3f8Drd9I8GblXfv0bbHluRKVhnnhEZf584,12654
+ sglang/srt/models/qwen2_moe.py,sha256=b0gd42GBWyvDmUu8BZbD9ZJO_ExbXBLQZRvu61UuXOA,17086
+ sglang/srt/models/stablelm.py,sha256=9feHoiDEXSIe0WCrt4AfWXqxliJwRvr8w4XSnk6ipSI,11573
+ sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
+ sglang/srt/openai_api/adapter.py,sha256=3EeqASZXogpUkOP4xj7Rg_LfOLiIMUrZ9uFdeAy_pcc,50144
+ sglang/srt/openai_api/protocol.py,sha256=onhnCjXpXCysvx_dLgOEmXz5XHHYB1t772cvHcK1GlY,9538
+ sglang/srt/sampling/sampling_batch_info.py,sha256=WO7fgURK7XqXU3jORWpkz7Tyx3FC34r--hPMKvkt4Iw,7735
+ sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
+ sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
+ sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
+ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
+ sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
+ sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
+ sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
+ sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
+ sglang/test/runners.py,sha256=7N2g4vyqN98o6F0Lem5LUNAlW9ShEVxZxZuzSjmc0i4,7688
+ sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
+ sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
+ sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
+ sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
+ sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
+ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
+ sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
+ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
+ sglang/test/test_programs.py,sha256=V_-Bx3lLkw37P6gDyA7mZCqxlyNMaFLBkRrPMQQQqn4,14909
+ sglang/test/test_utils.py,sha256=HD-9rcj7EFS_NX1GQFU5613ITQlZaTK2l9RmqA0F7x4,14380
+ sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
+ sglang-0.2.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sglang-0.2.15.dist-info/METADATA,sha256=bmiMZPX1vW_NYDBk92pG1u9_PZRcXanJ2KXtxBmaiF4,37211
+ sglang-0.2.15.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
+ sglang-0.2.15.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.2.15.dist-info/RECORD,,