sglang 0.2.14.post2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. sglang/api.py +2 -0
  2. sglang/bench_latency.py +39 -28
  3. sglang/lang/backend/runtime_endpoint.py +8 -4
  4. sglang/lang/interpreter.py +3 -0
  5. sglang/lang/ir.py +5 -0
  6. sglang/launch_server_llavavid.py +12 -12
  7. sglang/srt/configs/__init__.py +5 -0
  8. sglang/srt/configs/exaone.py +195 -0
  9. sglang/srt/constrained/fsm_cache.py +1 -1
  10. sglang/srt/conversation.py +24 -2
  11. sglang/srt/hf_transformers_utils.py +12 -12
  12. sglang/srt/layers/extend_attention.py +13 -8
  13. sglang/srt/layers/logits_processor.py +4 -4
  14. sglang/srt/layers/sampler.py +94 -17
  15. sglang/srt/managers/controller_multi.py +5 -5
  16. sglang/srt/managers/controller_single.py +5 -5
  17. sglang/srt/managers/io_struct.py +6 -1
  18. sglang/srt/managers/schedule_batch.py +26 -11
  19. sglang/srt/managers/tokenizer_manager.py +9 -9
  20. sglang/srt/managers/tp_worker.py +38 -26
  21. sglang/srt/model_config.py +3 -3
  22. sglang/srt/model_executor/cuda_graph_runner.py +26 -9
  23. sglang/srt/model_executor/forward_batch_info.py +68 -23
  24. sglang/srt/model_executor/model_runner.py +15 -22
  25. sglang/srt/models/chatglm.py +9 -15
  26. sglang/srt/models/commandr.py +5 -1
  27. sglang/srt/models/dbrx.py +5 -1
  28. sglang/srt/models/deepseek.py +5 -1
  29. sglang/srt/models/deepseek_v2.py +57 -25
  30. sglang/srt/models/exaone.py +368 -0
  31. sglang/srt/models/gemma.py +5 -1
  32. sglang/srt/models/gemma2.py +5 -1
  33. sglang/srt/models/gpt_bigcode.py +5 -1
  34. sglang/srt/models/grok.py +5 -1
  35. sglang/srt/models/internlm2.py +5 -1
  36. sglang/srt/models/{llama2.py → llama.py} +25 -45
  37. sglang/srt/models/llama_classification.py +34 -41
  38. sglang/srt/models/llama_embedding.py +7 -6
  39. sglang/srt/models/llava.py +8 -11
  40. sglang/srt/models/llavavid.py +5 -6
  41. sglang/srt/models/minicpm.py +5 -1
  42. sglang/srt/models/mistral.py +2 -3
  43. sglang/srt/models/mixtral.py +6 -2
  44. sglang/srt/models/mixtral_quant.py +5 -1
  45. sglang/srt/models/qwen.py +5 -2
  46. sglang/srt/models/qwen2.py +6 -2
  47. sglang/srt/models/qwen2_moe.py +5 -14
  48. sglang/srt/models/stablelm.py +5 -1
  49. sglang/srt/openai_api/adapter.py +16 -1
  50. sglang/srt/openai_api/protocol.py +5 -5
  51. sglang/srt/sampling/sampling_batch_info.py +75 -6
  52. sglang/srt/server.py +6 -6
  53. sglang/srt/utils.py +0 -3
  54. sglang/test/runners.py +1 -1
  55. sglang/test/test_programs.py +68 -0
  56. sglang/test/test_utils.py +4 -0
  57. sglang/utils.py +39 -0
  58. sglang/version.py +1 -1
  59. {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/METADATA +9 -8
  60. sglang-0.3.0.dist-info/RECORD +118 -0
  61. {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/WHEEL +1 -1
  62. sglang-0.2.14.post2.dist-info/RECORD +0 -115
  63. {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/LICENSE +0 -0
  64. {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/top_level.txt +0 -0
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -21,10 +21,59 @@ class SamplingBatchInfo:
     top_ps: torch.Tensor = None
     top_ks: torch.Tensor = None
     min_ps: torch.Tensor = None
-    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
+
+    # Dispatch in CUDA graph
+    need_min_p_sampling: bool = False
+
+    # Bias Tensors
     logit_bias: torch.Tensor = None
     vocab_mask: torch.Tensor = None
 
+    # Penalizer
+    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
+    linear_penalties: torch.Tensor = None
+    scaling_penalties: torch.Tensor = None
+
+    def can_run_in_cuda_graph(self):
+        # Vocab bias and min_ps are not supported in CUDA graph
+        return (
+            self.logit_bias is None
+            and self.vocab_mask is None
+            and self.linear_penalties is None
+            and self.scaling_penalties is None
+            and not self.need_min_p_sampling
+        )
+
+    @classmethod
+    def dummy_one(cls, max_bs: int, vocab_size: int):
+        ret = cls(vocab_size=vocab_size)
+        ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
+        ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
+        ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
+        return ret
+
+    def __getitem__(self, key):
+        if isinstance(key, slice):
+            # NOTE:This method is only used in CUDA graph
+            assert self.can_run_in_cuda_graph()
+            return SamplingBatchInfo(
+                vocab_size=self.vocab_size,
+                temperatures=self.temperatures[key],
+                top_ps=self.top_ps[key],
+                top_ks=self.top_ks[key],
+            )
+        else:
+            raise NotImplementedError
+
+    def inplace_assign(self, bs: int, other: SamplingBatchInfo):
+        # NOTE:This method is only used in CUDA graph
+        assert self.can_run_in_cuda_graph()
+
+        self.vocab_size = other.vocab_size
+        self.temperatures[:bs] = other.temperatures
+        self.top_ps[:bs] = other.top_ps
+        self.top_ks[:bs] = other.top_ks
+
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         device = "cuda"
@@ -45,6 +94,7 @@ class SamplingBatchInfo:
         ret.min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
         )
+        ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)
 
         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
         # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
@@ -72,6 +122,25 @@ class SamplingBatchInfo:
 
         return ret
 
+    def prepare_penalties(self):
+        self.scaling_penalties = None
+        self.linear_penalties = None
+
+        for penalizer in self.penalizer_orchestrator.penalizers.values():
+            if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
+                if penalizer.is_prepared():
+                    self.scaling_penalties = penalizer.cumulated_repetition_penalties
+            else:
+                if penalizer.is_prepared():
+                    if self.linear_penalties is None:
+                        bs = self.penalizer_orchestrator.batch.batch_size()
+                        self.linear_penalties = torch.zeros(
+                            (bs, self.vocab_size),
+                            dtype=torch.float32,
+                            device="cuda",
+                        )
+                    self.linear_penalties = penalizer.apply(self.linear_penalties)
+
     def update_regex_vocab_mask(self, batch: ScheduleBatch):
         bs, reqs = batch.batch_size(), batch.reqs
         device = "cuda"
@@ -81,15 +150,15 @@ class SamplingBatchInfo:
         self.vocab_mask = None
 
         if has_regex:
+            self.vocab_mask = torch.zeros(
+                bs, self.vocab_size, dtype=torch.bool, device=device
+            )
             for i, req in enumerate(reqs):
                 if req.regex_fsm is not None:
-                    if self.vocab_mask is None:
-                        self.vocab_mask = torch.zeros(
-                            bs, self.vocab_size, dtype=torch.bool, device=device
-                        )
+                    self.vocab_mask[i].fill_(1)
                     self.vocab_mask[i][
                         req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens
-                    ] = 1
+                    ] = 0
 
     def filter(self, unfinished_indices: List[int], new_indices: torch.Tensor):
         self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
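With these changes, `SamplingBatchInfo` carries enough state for the sampler to decide per batch whether the CUDA-graph fast path applies (`can_run_in_cuda_graph`), and the regex `vocab_mask` flips meaning from marking allowed tokens to marking blocked tokens, so it can be applied in one shot. Below is a minimal sketch of how such fields would be consumed when post-processing logits; the helper name is hypothetical (not sglang's actual sampler), and the penalty formulas follow the usual additive and repetition-penalty conventions:

```python
import torch

def apply_sampling_info(logits: torch.Tensor, info) -> torch.Tensor:
    # Hypothetical consumer of the fields added above, for illustration only.
    if info.linear_penalties is not None:
        # Additive penalties (e.g. frequency/presence), pre-summed per token id.
        logits = logits + info.linear_penalties
    if info.scaling_penalties is not None:
        # Conventional repetition penalty: shrink positive logits, grow negative ones.
        logits = torch.where(
            logits > 0,
            logits / info.scaling_penalties,
            logits * info.scaling_penalties,
        )
    if info.vocab_mask is not None:
        # vocab_mask is now True for *disallowed* tokens, so one masked_fill suffices.
        logits = logits.masked_fill(info.vocab_mask, float("-inf"))
    return logits
```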
sglang/srt/server.py CHANGED
@@ -272,7 +272,7 @@ async def retrieve_file_content(file_id: str):
 
 def launch_server(
     server_args: ServerArgs,
-    model_overide_args: Optional[dict] = None,
+    model_override_args: Optional[dict] = None,
     pipe_finish_writer: Optional[mp.connection.Connection] = None,
 ):
     """Launch an HTTP server."""
@@ -317,7 +317,7 @@ def launch_server(
             tp_rank_range,
             server_args,
             ports[3],
-            model_overide_args,
+            model_override_args,
         )
 
     try:
@@ -328,7 +328,7 @@ def launch_server(
         return
 
     # Launch processes
-    tokenizer_manager = TokenizerManager(server_args, port_args, model_overide_args)
+    tokenizer_manager = TokenizerManager(server_args, port_args, model_override_args)
     if server_args.chat_template:
         load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
     pipe_controller_reader, pipe_controller_writer = mp.Pipe(duplex=False)
@@ -341,7 +341,7 @@ def launch_server(
 
     proc_controller = mp.Process(
         target=start_controller_process,
-        args=(server_args, port_args, pipe_controller_writer, model_overide_args),
+        args=(server_args, port_args, pipe_controller_writer, model_override_args),
     )
     proc_controller.start()
 
@@ -501,7 +501,7 @@ class Runtime:
     def __init__(
         self,
        log_level: str = "error",
-        model_overide_args: Optional[dict] = None,
+        model_override_args: Optional[dict] = None,
        *args,
        **kwargs,
    ):
@@ -525,7 +525,7 @@ class Runtime:
 
        proc = mp.Process(
            target=launch_server,
-            args=(self.server_args, model_overide_args, pipe_writer),
+            args=(self.server_args, model_override_args, pipe_writer),
        )
        proc.start()
        pipe_writer.close()
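The `model_overide_args` → `model_override_args` rename fixes a long-standing typo but breaks callers that passed the keyword by name. A before/after sketch for user code (the override dict shown is illustrative, not taken from the diff):

```python
from sglang.srt.server import Runtime

# 0.2.14.post2 spelling (no longer accepted):
# runtime = Runtime(model_path="...", model_overide_args={"num_hidden_layers": 1})

# 0.3.0 spelling; extra kwargs such as model_path are forwarded to ServerArgs.
runtime = Runtime(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",
    model_override_args={"num_hidden_layers": 1},  # illustrative override
)
runtime.shutdown()
```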
sglang/srt/utils.py CHANGED
@@ -407,7 +407,6 @@ def monkey_patch_vllm_dummy_weight_loader():
407
407
  DummyModelLoader,
408
408
  LoRAConfig,
409
409
  ModelConfig,
410
- MultiModalConfig,
411
410
  ParallelConfig,
412
411
  SchedulerConfig,
413
412
  _initialize_model,
@@ -422,7 +421,6 @@ def monkey_patch_vllm_dummy_weight_loader():
422
421
  model_config: ModelConfig,
423
422
  device_config: DeviceConfig,
424
423
  lora_config: Optional[LoRAConfig],
425
- multimodal_config: Optional[MultiModalConfig],
426
424
  parallel_config: ParallelConfig,
427
425
  scheduler_config: SchedulerConfig,
428
426
  cache_config: CacheConfig,
@@ -433,7 +431,6 @@ def monkey_patch_vllm_dummy_weight_loader():
433
431
  model_config,
434
432
  self.load_config,
435
433
  lora_config,
436
- multimodal_config,
437
434
  cache_config,
438
435
  )
439
436
 
sglang/test/runners.py CHANGED
@@ -180,7 +180,7 @@ class SRTRunner:
            tp_size=tp_size,
            dtype=get_dtype_str(torch_dtype),
            port=port,
-            mem_fraction_static=0.7,
+            mem_fraction_static=0.69,
            trust_remote_code=False,
            is_embedding=not self.is_generation,
        )
sglang/test/test_programs.py CHANGED
@@ -2,8 +2,12 @@
 
 import json
 import re
+import time
+
+import numpy as np
 
 import sglang as sgl
+from sglang.utils import fetch_and_cache_jsonl
 
 
 def test_few_shot_qa():
@@ -447,3 +451,67 @@ def test_chat_completion_speculative():
     )
 
     gen_character_spec().sync()
+
+
+def test_hellaswag_select():
+    """Benchmark the accuracy of sgl.select on the HellaSwag dataset."""
+
+    url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
+    lines = fetch_and_cache_jsonl(url)
+
+    # Construct prompts
+    def get_one_example(lines, i, include_answer):
+        ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
+        if include_answer:
+            ret += lines[i]["endings"][lines[i]["label"]]
+        return ret
+
+    def get_few_shot_examples(lines, k):
+        ret = ""
+        for i in range(k):
+            ret += get_one_example(lines, i, True) + "\n\n"
+        return ret
+
+    num_questions = 200
+    num_shots = 20
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    choices = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        choices.append(lines[i]["endings"])
+        labels.append(lines[i]["label"])
+    arguments = [{"question": q, "choices": c} for q, c in zip(questions, choices)]
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_hellaswag(s, question, choices):
+        s += few_shot_examples + question
+        s += sgl.select("answer", choices=choices)
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Run requests
+    tic = time.time()
+    rets = few_shot_hellaswag.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=64,
+        progress_bar=True,
+    )
+    preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
+    latency = time.time() - tic
+
+    # Compute accuracy
+    accuracy = np.mean(np.array(preds) == np.array(labels))
+
+    return accuracy, latency
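Since `test_hellaswag_select` returns `(accuracy, latency)` instead of asserting, it doubles as a small benchmark. A sketch of driving it against a running server; the endpoint URL and port are assumptions:

```python
import sglang as sgl
from sglang.test.test_programs import test_hellaswag_select

# Assumes a server was started elsewhere, e.g.:
#   python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

accuracy, latency = test_hellaswag_select()
print(f"accuracy={accuracy:.3f} latency={latency:.1f}s")
```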
sglang/test/test_utils.py CHANGED
@@ -23,6 +23,10 @@ from sglang.utils import get_exception_traceback
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
 
 if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
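Each nightly-eval constant packs several Hugging Face model IDs into one comma-separated string, so a harness has to split it before launching servers. A small sketch of that split (the loop body is hypothetical):

```python
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1

for model_path in DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1.split(","):
    # Hypothetical harness step: launch one tp=1 server per model, then run the evals.
    print("would evaluate:", model_path)
```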
sglang/utils.py CHANGED
@@ -4,6 +4,7 @@ import base64
 import importlib
 import json
 import logging
+import os
 import signal
 import sys
 import traceback
@@ -15,6 +16,7 @@ from typing import Union
 
 import numpy as np
 import requests
+from tqdm import tqdm
 
 logger = logging.getLogger(__name__)
 
@@ -260,3 +262,40 @@ class LazyImport:
     def __call__(self, *args, **kwargs):
         module = self._load()
         return module(*args, **kwargs)
+
+
+def fetch_and_cache_jsonl(url, cache_file="cached_data.jsonl"):
+    """Read and cache a jsonl file from a url."""
+
+    # Check if the cache file already exists
+    if os.path.exists(cache_file):
+        print("Loading data from cache...")
+        with open(cache_file, "r") as f:
+            data = [json.loads(line) for line in f]
+    else:
+        print("Downloading data from URL...")
+        # Stream the response to show the progress bar
+        response = requests.get(url, stream=True)
+        response.raise_for_status()  # Check for request errors
+
+        # Total size of the file in bytes
+        total_size = int(response.headers.get("content-length", 0))
+        chunk_size = 1024  # Download in chunks of 1KB
+
+        # Use tqdm to display the progress bar
+        with open(cache_file, "wb") as f, tqdm(
+            desc=cache_file,
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+        ) as bar:
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                f.write(chunk)
+                bar.update(len(chunk))
+
+        # Convert the data to a list of dictionaries
+        with open(cache_file, "r") as f:
+            data = [json.loads(line) for line in f]
+
+    return data
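Usage of the new helper is straightforward; the default `cache_file` is resolved against the current working directory, so repeated runs in the same directory skip the download. For example, fetching the HellaSwag split used by the new test (the cache filename here is an arbitrary choice):

```python
from sglang.utils import fetch_and_cache_jsonl

url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
rows = fetch_and_cache_jsonl(url, cache_file="hellaswag_val.jsonl")
print(len(rows), "records; first keys:", sorted(rows[0]))
```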
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.2.14.post2"
+__version__ = "0.3.0"
{sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.14.post2
+Version: 0.3.0
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.14.post2 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.0 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -461,7 +461,7 @@ It supports streaming, vision, and most features of the Chat/Completions/Models/
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
 ```
-- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
+- Add `--dp 2` to enable multi-GPU data parallelism. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
@@ -489,13 +489,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ### Supported Models
 
 **Generative Models**
-
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - LLaVA 1.5 / 1.6 / NeXT
@@ -509,6 +509,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Exaone 3
 
 **Embedding Models**
 
@@ -636,7 +637,7 @@ print(state["answer_1"])
 #### More Examples
 
 Anthropic and VertexAI (Gemini) models are also supported.
-You can find more examples at [examples/quick_start](examples/quick_start).
+You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).
 
 ### Language Feature
 To begin with, import sglang.
@@ -649,7 +650,7 @@ You can implement your prompt flow in a function decorated by `sgl.function`.
 You can then invoke the function with `run` or `run_batch`.
 The system will manage the state, chat template, parallelism and batching for you.
 
-The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)
+The complete code for the examples below can be found at [readme_examples.py](examples/frontend_language/usage/readme_examples.py)
 
 #### Control Flow
 You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
@@ -698,7 +699,7 @@ def image_qa(s, image_file, question):
     s += sgl.assistant(sgl.gen("answer", max_tokens=256)
 ```
 
-See also [srt_example_llava.py](examples/quick_start/srt_example_llava.py).
+See also [srt_example_llava.py](examples/frontend_language/quick_start/local_example_llava_next.py).
 
 #### Constrained Decoding
 Use `regex` to specify a regular expression as a decoding constraint.
@@ -742,7 +743,7 @@ def character_gen(s, name):
     s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```
 
-See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
+See also [json_decode.py](examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
 
 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.
sglang-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,118 @@
+sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
+sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
+sglang/bench_latency.py,sha256=F7jMfKqMf1XFKJgkpR_yE33VJpsIhSr_SOJeRbngkb0,16758
+sglang/bench_serving.py,sha256=J_mMwnmDn0Jt07mzdGAuYOxpockHPLYJFL-kwoaqASY,36527
+sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
+sglang/global_config.py,sha256=nwOjUflwqLQySPUMvk8Hk63TIS6mknh_ODSW3CZ1rJw,1704
+sglang/launch_server.py,sha256=FODfO0DW546dh-u1qDlWtrhsmj6hxkarXXv3cIdgkj8,549
+sglang/launch_server_llavavid.py,sha256=xnpSILJxsrbvqkERav5P26bErCQnhoTFmoKeScJltUA,1034
+sglang/utils.py,sha256=zxHwQhVxW_lWf-IH0wUw_pBTRLHLPypdRiU5M4XosMM,9669
+sglang/version.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
+sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
+sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
+sglang/lang/interpreter.py,sha256=AC3tNNDwYfiu87jCldBWXYpFicCv6NMPJACMFEfCXu4,30331
+sglang/lang/ir.py,sha256=W3UfZikcGeT86PDDjDjw-yNzrKY2e2UYO4DTatMCfm0,17704
+sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
+sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
+sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
+sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
+sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+sglang/lang/backend/runtime_endpoint.py,sha256=hpezro0H6vG9KzLeKfYpPMwb4TaE0UanCIM0uG8Kdjw,9746
+sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
+sglang/srt/conversation.py,sha256=2KDNe1suUPy6xqSkCx2xcO3pDPxTwqx5FaUxaqwCJ-M,19525
+sglang/srt/hf_transformers_utils.py,sha256=5UXJ-LdP92Sk_T843M9BHdnxRrcyiYfWH2IEg3dWgKI,6085
+sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
+sglang/srt/model_config.py,sha256=68QQ8iUWQHPv01RBeH23mvay6iJg9DWmCogC_vUgFLk,6371
+sglang/srt/server.py,sha256=yi8prs9_M0P0dOInrQLkHKiZ-oTigk_uzW8otEHImbU,19846
+sglang/srt/server_args.py,sha256=GiDyPWCvYA_98mSE9LuvUoEodo9gRnNPPIPn0nFkxUs,18259
+sglang/srt/utils.py,sha256=JJOlqRPbN_tSSNWj63syQpfz4v7hUwNvzWvOUpBh9SM,23746
+sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
+sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
+sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
+sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
+sglang/srt/constrained/fsm_cache.py,sha256=wigJs9PeTt-vYPJQEeUZwEKl6MFIfb5xy8uIg18bDbM,3132
+sglang/srt/constrained/jump_forward.py,sha256=LWRsmGPQcH6KT87wXwCRqtblU3pcAVCEzO0nWPxevs0,6636
+sglang/srt/layers/activation.py,sha256=JEXNTgqxoiU4N-gVm4XMjobhft4JKDcMrgTkfpsRUzM,4856
+sglang/srt/layers/decode_attention.py,sha256=TPD_608ZX9fQ_HDImifkxG_qcEYmimbEYY8lCBIjFuM,16628
+sglang/srt/layers/extend_attention.py,sha256=XIXm3p2cvKrDg10Po4qYGaEkXJOJBtCIhTB_lTyjAFE,14390
+sglang/srt/layers/layernorm.py,sha256=RXuS4UyksatqTF6lSK7VYyEiUEnBiNIBlEn8q4w84UA,3404
+sglang/srt/layers/logits_processor.py,sha256=Zx4eFAkFlThPrmz_-HuCN9SqGLanARm0wdZSVDyASAc,13085
+sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
+sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
+sglang/srt/layers/radix_attention.py,sha256=o5a8r3XQ-oRwaxBlAgzJGv7p3dMbu0LrYsDc4uvpPgA,8338
+sglang/srt/layers/sampler.py,sha256=zPVa3PHc-tjDM_oP-1XFeHSRIErx844SLoe6MG8Qef0,6418
+sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
+sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
+sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
+sglang/srt/managers/controller_multi.py,sha256=z3rguY1YYlSvVqLjKuurgJW1h0dxwPgIdPCQdJsVzYs,6478
+sglang/srt/managers/controller_single.py,sha256=5brrZ8vZxjvrSJHWrm5H3qGEZShN4EROG5r1o3pSjps,5124
+sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
+sglang/srt/managers/io_struct.py,sha256=Bd91cydX9_960NNP2xngqK-lsIaDB3oMYd56QddN4_Q,10722
+sglang/srt/managers/policy_scheduler.py,sha256=7HNUxBKJE444s_bHcPpbnHCygsnH-NIXYNSC2q6mRmc,8584
+sglang/srt/managers/schedule_batch.py,sha256=i68O-e9I_gDlme96xSBDjA2xDF1p-XBKvJRiJ9CsgcY,26423
+sglang/srt/managers/tokenizer_manager.py,sha256=8aHR5h9nYZsfdZE80uBc9egDFOQgKvjxmp-30Ha4ELk,29463
+sglang/srt/managers/tp_worker.py,sha256=4UuaBLzV6NMsG4XEIcpa4xMcOKIFvTan51ynKz85HXg,36842
+sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
+sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
+sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
+sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
+sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=4vIUqVQpnHNhwWrokMVmGM4Dp5JFPHyXIvpEQsi2pNU,12862
+sglang/srt/model_executor/forward_batch_info.py,sha256=fSLhatN8vCgxn0Mft9D-r0pNi3SN0EQSTJmgaOtrqJc,16471
+sglang/srt/model_executor/model_runner.py,sha256=93YCStmZfdZlY0r-GGIVi0Xw66VwF77dEtGVmQf1VfU,23893
+sglang/srt/models/chatglm.py,sha256=PPOaeqipbkcsTUhMPbLb1HItWgW7KntefUfjEoMSxUM,13585
+sglang/srt/models/commandr.py,sha256=k86ykwWOlxLGaBbGUoMSaXngUxCbMVRbY5AoMOWpbU8,14377
+sglang/srt/models/dbrx.py,sha256=goLJ9Yt-9vxkwhCUFBidvP41H_dYTFsvrMZ4xm4FqGA,14875
+sglang/srt/models/deepseek.py,sha256=aYP6HUgxQbhcQGQEF4vX0ronBF8AirqIFG98EQn0YzY,16220
+sglang/srt/models/deepseek_v2.py,sha256=Htw_HDju9huYU5gBu2dqq6bKVao-AsifxfkGl2xRx-8,28521
+sglang/srt/models/exaone.py,sha256=ZFr0G0WITxg3dDfV_-vWqZpK_wMmiZi4r0vOT0gO9V4,13301
+sglang/srt/models/gemma.py,sha256=Ya_u2lKPKAc9iHEsW_HAEfCDgYTbxUOCzBI0LDuoOYs,12489
+sglang/srt/models/gemma2.py,sha256=MCmzzRAAafEQuQj6aGtB-TF4jH0RWrXcOPxSz6LRsXs,15137
+sglang/srt/models/gpt_bigcode.py,sha256=HEhMRO1Y37JfZtP7mDp0MexWj5h6XT9rKvxorOMKoQA,10409
+sglang/srt/models/grok.py,sha256=ZcJ4E11rKh-xo4k_j-H1XRreJWWv8yii-bMYC1lO2R8,15143
+sglang/srt/models/internlm2.py,sha256=VtWATs2eLIqbadYXTPY_vycFIstVk4zg3kxycA9H0Qw,12416
+sglang/srt/models/llama.py,sha256=MfDnlVWoJUG9DxgGYPiwhoU-0ZeRbhp6UmBR2ZAJSNk,13402
+sglang/srt/models/llama_classification.py,sha256=oSeROs633Gnak8vrbnWnCWDxfgP_zmKGO1A_43ukEQ4,4029
+sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
+sglang/srt/models/llava.py,sha256=OXmlOVIjFnMRKGwLweYB1N-xlfpZlTlZpqhsbwUCY6Y,23471
+sglang/srt/models/llavavid.py,sha256=4R2t8BZJKN85IrTLsLFb4yZuKVI2Cwp7kY8AJ-nEVoE,12012
+sglang/srt/models/minicpm.py,sha256=7RZEJ2TCqBL1JmMFVJ3J9DmZHRw0q90st49Wkh-sdL4,14039
+sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
+sglang/srt/models/mixtral.py,sha256=KIsvruhXNq3Fwrs4_YE7J6fx54ObfnMuRNxgScE3Bmo,13830
+sglang/srt/models/mixtral_quant.py,sha256=O_97UKDYZokFhIBnamWfw0HLhln9_BUk_KfQ-sQnd8s,14286
+sglang/srt/models/qwen.py,sha256=geK88AyEyPbbDvMHJNY8XMSNpsCeu8g9kxnKyiJBpK4,10168
+sglang/srt/models/qwen2.py,sha256=WGYy3wcRY3f8Drd9I8GblXfv0bbHluRKVhnnhEZf584,12654
+sglang/srt/models/qwen2_moe.py,sha256=b0gd42GBWyvDmUu8BZbD9ZJO_ExbXBLQZRvu61UuXOA,17086
+sglang/srt/models/stablelm.py,sha256=9feHoiDEXSIe0WCrt4AfWXqxliJwRvr8w4XSnk6ipSI,11573
+sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
+sglang/srt/openai_api/adapter.py,sha256=3EeqASZXogpUkOP4xj7Rg_LfOLiIMUrZ9uFdeAy_pcc,50144
+sglang/srt/openai_api/protocol.py,sha256=onhnCjXpXCysvx_dLgOEmXz5XHHYB1t772cvHcK1GlY,9538
+sglang/srt/sampling/sampling_batch_info.py,sha256=CIoD0SzHSWCe7Wc4jkJj5vIPHGnOdfbgkC6fG5KQxOw,7551
+sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
+sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
+sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
+sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
+sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
+sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
+sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
+sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
+sglang/test/runners.py,sha256=7N2g4vyqN98o6F0Lem5LUNAlW9ShEVxZxZuzSjmc0i4,7688
+sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
+sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
+sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
+sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
+sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
+sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
+sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
+sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
+sglang/test/test_programs.py,sha256=l21J8N91QTMO9TOvXPWNvPZVT0DgxYxOPHh1pOoFV_k,16927
+sglang/test/test_utils.py,sha256=3tt-BBv-lx7BT3whbVTMyRz6sh5jIbdBEbLZ08m2Ms8,15132
+sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
+sglang-0.3.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.3.0.dist-info/METADATA,sha256=muukBuN4kq_4mCG_r_RFY94pQliDcVh-WuXNMApXoak,37383
+sglang-0.3.0.dist-info/WHEEL,sha256=uCRv0ZEik_232NlR4YDw4Pv3Ajt5bKvMH13NUU7hFuI,91
+sglang-0.3.0.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.0.dist-info/RECORD,,
{sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (74.0.0)
+Generator: setuptools (74.1.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 