sglang 0.3.6__py3-none-any.whl → 0.3.6.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_one_batch.py +4 -7
- sglang/bench_one_batch_server.py +2 -2
- sglang/bench_serving.py +75 -26
- sglang/check_env.py +7 -1
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/lang/tracer.py +1 -1
- sglang/launch_server.py +0 -3
- sglang/srt/configs/model_config.py +15 -20
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +13 -15
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +38 -57
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +13 -13
- sglang/srt/layers/attention/flashinfer_backend.py +14 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +13 -14
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +13 -15
- sglang/srt/layers/logits_processor.py +13 -15
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +25 -19
- sglang/srt/managers/detokenizer_manager.py +13 -18
- sglang/srt/managers/image_processor.py +6 -9
- sglang/srt/managers/io_struct.py +43 -28
- sglang/srt/managers/schedule_batch.py +92 -27
- sglang/srt/managers/schedule_policy.py +13 -15
- sglang/srt/managers/scheduler.py +94 -72
- sglang/srt/managers/session_controller.py +29 -19
- sglang/srt/managers/tokenizer_manager.py +29 -22
- sglang/srt/managers/tp_worker.py +13 -15
- sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +20 -19
- sglang/srt/model_executor/forward_batch_info.py +19 -17
- sglang/srt/model_executor/model_runner.py +42 -30
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +15 -15
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +24 -19
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +20 -16
- sglang/srt/models/llavavid.py +13 -15
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +21 -19
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +15 -17
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +13 -15
- sglang/srt/openai_api/protocol.py +13 -15
- sglang/srt/sampling/sampling_batch_info.py +4 -1
- sglang/srt/sampling/sampling_params.py +13 -15
- sglang/srt/server.py +60 -34
- sglang/srt/server_args.py +22 -22
- sglang/srt/utils.py +208 -19
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +13 -14
- sglang/test/test_utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/LICENSE +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/METADATA +25 -15
- sglang-0.3.6.post2.dist-info/RECORD +164 -0
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.6.dist-info/RECORD +0 -161
- /sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -11,7 +11,7 @@ from sglang.api import (
     gen,
     gen_int,
     gen_string,
-    get_server_args,
+    get_server_info,
     image,
     select,
     set_default_backend,
@@ -41,7 +41,7 @@ __all__ = [
    "gen",
    "gen_int",
    "gen_string",
-    "get_server_args",
+    "get_server_info",
    "image",
    "select",
    "set_default_backend",
sglang/api.py
CHANGED
@@ -65,7 +65,7 @@ def flush_cache(backend: Optional[BaseBackend] = None):
     return backend.flush_cache()
 
 
-def get_server_args(backend: Optional[BaseBackend] = None):
+def get_server_info(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
@@ -73,7 +73,7 @@ def get_server_args(backend: Optional[BaseBackend] = None):
     # If backend is Runtime
     if hasattr(backend, "endpoint"):
         backend = backend.endpoint
-    return backend.get_server_args()
+    return backend.get_server_info()
 
 
 def gen(
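For orientation, the renamed helper is the client-side entry point for the server's /get_server_info route; a minimal usage sketch (assuming a sglang server is already listening on localhost:30000):

import sglang as sgl

# Minimal sketch: point the frontend at a running server, then query it.
backend = sgl.RuntimeEndpoint("http://localhost:30000")
sgl.set_default_backend(backend)

# Formerly get_server_args(); now proxies to the /get_server_info endpoint.
print(sgl.get_server_info())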
sglang/bench_one_batch.py
CHANGED
@@ -212,6 +212,7 @@ def extend(reqs, model_runner):
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
         model_config=model_runner.model_config,
+        enable_overlap=False,
     )
     batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
@@ -278,10 +279,7 @@ def correctness_test(
 
 
 def synchronize(device):
-    if device == "cuda":
-        torch.cuda.synchronize()
-    elif device == "xpu":
-        torch.xpu.synchronize()
+    torch.get_device_module(device).synchronize()
 
 
 def latency_test_run_once(
@@ -468,7 +466,6 @@ if __name__ == "__main__":
 
     try:
         main(server_args, bench_args)
-    except Exception as e:
-        raise e
     finally:
-        kill_child_process()
+        if server_args.tp_size != 1:
+            kill_child_process()
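The synchronize() rewrite relies on torch.get_device_module, which resolves the backend module (torch.cuda, torch.xpu, ...) from a device string, so new device types need no extra branches. A sketch of the equivalence (assumes a PyTorch version that ships torch.get_device_module):

import torch

def synchronize_old(device: str) -> None:
    # Pre-change behavior: one hard-coded branch per device type.
    if device == "cuda":
        torch.cuda.synchronize()
    elif device == "xpu":
        torch.xpu.synchronize()

def synchronize_new(device: str) -> None:
    # Post-change behavior: resolve torch.cuda / torch.xpu / ... dynamically.
    torch.get_device_module(device).synchronize()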
sglang/bench_one_batch_server.py
CHANGED
@@ -5,9 +5,9 @@ This script launches a server and uses the HTTP interface.
 It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 Usage:
-python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 
-python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
+python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
 """
 
 import argparse
sglang/bench_serving.py
CHANGED
@@ -25,6 +25,7 @@ import warnings
 from argparse import ArgumentParser
 from dataclasses import dataclass, field
 from datetime import datetime
+from pathlib import Path
 from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 
 import aiohttp
@@ -407,7 +408,7 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
 
 
 def get_model(pretrained_model_name_or_path: str) -> str:
-    if os.getenv("SGLANG_USE_MODELSCOPE", "False") == "True":
+    if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
         import huggingface_hub.constants
         from modelscope import snapshot_download
 
@@ -693,6 +694,19 @@ def gen_prompt(tokenizer, token_num):
     return tokenizer.decode(selected_tokens)
 
 
+def get_gen_prefix_cache_path(args, tokenizer):
+    """Create cache directory under ~/.cache/sglang/benchmark"""
+    cache_dir = Path.home() / ".cache" / "sglang" / "benchmark"
+
+    # Create a unique cache filename based on the generation parameters
+    cache_key = (
+        f"gen_prefix_{args.gen_num_groups}_{args.gen_prompts_per_group}_"
+        f"{args.gen_system_prompt_len}_{args.gen_question_len}_{args.gen_output_len}_"
+        f"{tokenizer.__class__.__name__}.pkl"
+    )
+    return cache_dir / cache_key
+
+
 def sample_generated_shared_prefix_requests(
     num_groups: int,
     prompts_per_group: int,
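To illustrate the cache layout (all parameter values below are hypothetical), the helper maps each combination of generation parameters and tokenizer class to one deterministic pickle path:

from pathlib import Path

# Hypothetical values, mirroring the key format in the hunk above.
num_groups, prompts_per_group = 64, 16
system_prompt_len, question_len, output_len = 2048, 128, 256
tokenizer_cls = "LlamaTokenizerFast"

cache_key = (
    f"gen_prefix_{num_groups}_{prompts_per_group}_"
    f"{system_prompt_len}_{question_len}_{output_len}_{tokenizer_cls}.pkl"
)
print(Path.home() / ".cache" / "sglang" / "benchmark" / cache_key)
# e.g. /home/user/.cache/sglang/benchmark/gen_prefix_64_16_2048_128_256_LlamaTokenizerFast.pkl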
@@ -701,12 +715,17 @@ def sample_generated_shared_prefix_requests(
     output_len: int,
     tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, int, int]]:
-    if args.generated_input_path:
-        print(f"\nLoading generated input data from {args.generated_input_path}")
-        with open(args.generated_input_path, "rb") as f:
+    """Generate benchmark requests with shared system prompts using random tokens and caching."""
+    cache_path = get_gen_prefix_cache_path(args, tokenizer)
+
+    # Try to load from cache first
+    if cache_path.exists():
+        print(f"\nLoading cached generated input data from {cache_path}")
+        with open(cache_path, "rb") as f:
             return pickle.load(f)
 
-    """Generate benchmark requests with shared system prompts using random tokens."""
+    print("\nGenerating new input data...")
+
     # Generate system prompts for each group
     system_prompts = []
     for _ in range(num_groups):
@@ -719,17 +738,16 @@ def sample_generated_shared_prefix_requests(
         question = gen_prompt(tokenizer, question_len)
         questions.append(question)
 
-    # Shuffle questions
-    random.shuffle(questions)
-
     # Combine system prompts with questions
     input_requests = []
     total_input_tokens = 0
     total_output_tokens = 0
 
-    for group_idx in range(num_groups):
+    for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
         system_prompt = system_prompts[group_idx]
-        for prompt_idx in range(prompts_per_group):
+        for prompt_idx in tqdm(
+            range(prompts_per_group), desc="Generating questions", leave=False
+        ):
             question = questions[group_idx * prompts_per_group + prompt_idx]
             full_prompt = f"{system_prompt}\n\n{question}"
             prompt_len = len(tokenizer.encode(full_prompt))
@@ -738,6 +756,10 @@ def sample_generated_shared_prefix_requests(
         total_input_tokens += prompt_len
         total_output_tokens += output_len
 
+    # Shuffle questions
+    random.shuffle(input_requests)
+
+    # Print statistics
     print(f"\nGenerated shared prefix dataset statistics:")
     print(f"Number of groups: {num_groups}")
     print(f"Prompts per group: {prompts_per_group}")
@@ -750,11 +772,12 @@ def sample_generated_shared_prefix_requests(
     print(
         f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
     )
-
-    if args.generated_input_save_path:
-        print(f"Saving generated input data to {args.generated_input_save_path}")
-        with open(args.generated_input_save_path, "wb") as f:
-            pickle.dump(input_requests, f)
+
+    # Save to cache
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"Caching generated input data to {cache_path}")
+    with open(cache_path, "wb") as f:
+        pickle.dump(input_requests, f)
 
     return input_requests
 
@@ -859,6 +882,7 @@ async def benchmark(
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
+    max_concurrency: Optional[int],
     disable_tqdm: bool,
     extra_request_body: Dict[str, Any],
     profile: bool,
@@ -868,6 +892,15 @@ async def benchmark(
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
+    # From https://github.com/vllm-project/vllm/pull/9390
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
     test_input = RequestFuncInput(
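The concurrency cap follows the pattern from the referenced vLLM PR: arrival times are still drawn from the --request-rate schedule, but an asyncio.Semaphore bounds how many requests are in flight at once. A self-contained sketch of the same pattern:

import asyncio
from typing import Optional

async def fake_request(i: int) -> int:
    await asyncio.sleep(0.1)  # stand-in for the real HTTP request
    return i

async def main(max_concurrency: Optional[int] = 8) -> None:
    # None means unlimited, matching the benchmark's default.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited(i: int) -> int:
        if semaphore is None:
            return await fake_request(i)
        async with semaphore:  # at most max_concurrency bodies run at once
            return await fake_request(i)

    results = await asyncio.gather(*(limited(i) for i in range(100)))
    print(f"completed {len(results)} requests")

asyncio.run(main())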
@@ -913,7 +946,7 @@ async def benchmark(
         )
         tasks.append(
             asyncio.create_task(
-                request_func(request_func_input=request_func_input, pbar=pbar)
+                limited_request_func(request_func_input=request_func_input, pbar=pbar)
             )
         )
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -940,6 +973,12 @@ async def benchmark(
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Backend:", backend))
     print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print(
+        "{:<40} {:<10}".format(
+            "Max reqeuest concurrency:",
+            max_concurrency if max_concurrency else "not set",
+        )
+    )
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
@@ -1003,6 +1042,7 @@ async def benchmark(
         "backend": args.backend,
         "dataset_name": args.dataset_name,
         "request_rate": request_rate,
+        "max_concurrency": max_concurrency,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
         "total_output_tokens_retokenized": metrics.total_output_retokenized,
@@ -1090,6 +1130,10 @@ def run_benchmark(args_: argparse.Namespace):
     global args
     args = args_
 
+    # Set default value for max_concurrency if not present
+    if not hasattr(args, "max_concurrency"):
+        args.max_concurrency = None
+
     # Set global environments
     set_ulimit()
     random.seed(args.seed)
@@ -1201,6 +1245,7 @@ def run_benchmark(args_: argparse.Namespace):
             tokenizer=tokenizer,
             input_requests=input_requests,
             request_rate=args.request_rate,
+            max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
             extra_request_body=extra_request_body,
             profile=args.profile,
@@ -1220,6 +1265,7 @@ def run_benchmark(args_: argparse.Namespace):
                 tokenizer=tokenizer,
                 input_requests=input_requests,
                 request_rate=rate,
+                max_concurrency=args.max_concurrency,
                 disable_tqdm=args.disable_tqdm,
                 extra_request_body=extra_request_body,
                 profile=args.profile,
@@ -1319,6 +1365,19 @@ if __name__ == "__main__":
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
     )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.",
+    )
     parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
         "--multi",
@@ -1386,16 +1445,6 @@ if __name__ == "__main__":
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
-    parser.add_argument(
-        "--generated-input-save-path",
-        type=str,
-        help="Path to save generated input data",
-    )
-    parser.add_argument(
-        "--generated-input-path",
-        type=str,
-        help="Path to load previously generated input data",
-    )
     parser.add_argument(
         "--profile",
         action="store_true",
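An illustrative invocation combining both knobs (the flag names come from this diff; the numbers are placeholders): requests arrive at roughly 16/s, but at most 32 execute concurrently:

python3 -m sglang.bench_serving --backend sglang --num-prompts 1000 --request-rate 16 --max-concurrency 32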
sglang/check_env.py
CHANGED
@@ -22,18 +22,24 @@ PACKAGE_LIST = [
     "hf_transfer",
     "huggingface_hub",
     "interegular",
+    "modelscope",
+    "orjson",
+    "outlines",
+    "packaging",
     "psutil",
     "pydantic",
     "multipart",
     "zmq",
+    "torchao",
     "uvicorn",
     "uvloop",
     "vllm",
-    "outlines",
+    "xgrammar",
     "openai",
     "tiktoken",
     "anthropic",
     "litellm",
+    "decord",
 ]
 
 
sglang/lang/backend/runtime_endpoint.py
CHANGED
@@ -58,9 +58,9 @@ class RuntimeEndpoint(BaseBackend):
         )
         self._assert_success(res)
 
-    def get_server_args(self):
+    def get_server_info(self):
         res = http_request(
-            self.base_url + "/get_server_args",
+            self.base_url + "/get_server_info",
             api_key=self.api_key,
             verify=self.verify,
         )
sglang/lang/tracer.py
CHANGED
sglang/launch_server.py
CHANGED
@@ -1,6 +1,5 @@
 """Launch the inference server."""
 
-import os
 import sys
 
 from sglang.srt.server import launch_server
@@ -12,7 +11,5 @@ if __name__ == "__main__":
 
     try:
         launch_server(server_args)
-    except Exception as e:
-        raise e
     finally:
         kill_child_process()
sglang/srt/configs/model_config.py
CHANGED
@@ -1,27 +1,26 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 import json
 import logging
-import os
 from enum import IntEnum, auto
 from typing import List, Optional
 
 from transformers import PretrainedConfig
 
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
+from sglang.srt.utils import get_bool_env_var
 
 logger = logging.getLogger(__name__)
 
@@ -60,13 +59,9 @@ class ModelConfig:
 
         # Derive context length
         derived_context_len = get_context_length(self.hf_text_config)
-        allow_long_context = os.environ.get(
-            "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
-        )
-
         if context_length is not None:
             if context_length > derived_context_len:
-                if allow_long_context:
+                if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
                     logger.warning(
                         f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
                         f"This may lead to incorrect model outputs or CUDA errors."
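Besides removing the direct os.environ access, this fixes a truthiness bug: os.environ.get returns a string, so any non-empty value, including "false", previously enabled the override. A sketch of the intended semantics (an illustrative reimplementation, not the actual sglang.srt.utils.get_bool_env_var source):

import os

def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Assumed semantics: only "true" / "1" (case-insensitive) enable the flag.
    return os.getenv(name, default).lower() in ("true", "1")

os.environ["SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"] = "false"
# Old check: any non-empty string is truthy, so "false" enabled the override.
assert bool(os.environ.get("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"))
# New check: "false" is correctly treated as disabled.
assert not get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN")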
sglang/srt/constrained/__init__.py
CHANGED
@@ -1,17 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # TODO(lmzheng): make this an optional dependency
 from sglang.srt.constrained.outlines_backend import build_regex_from_object
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """The baseclass of a backend for grammar-guided constrained decoding."""
 
 from concurrent.futures import Future, ThreadPoolExecutor
sglang/srt/constrained/outlines_backend.py
CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Constrained decoding with outlines backend."""
 
 import json
sglang/srt/constrained/outlines_jump_forward.py
CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """
 Faster constrained decoding with jump forward decoding / compressed finite state machine.
 Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/