PyPI - sglang - Versions diffs - 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl - Mend

sglang 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85)

sglang/api.py +7 -1
sglang/bench_latency.py +9 -6
sglang/bench_serving.py +46 -22
sglang/global_config.py +1 -1
sglang/lang/backend/runtime_endpoint.py +60 -49
sglang/lang/compiler.py +2 -2
sglang/lang/interpreter.py +4 -2
sglang/lang/ir.py +16 -7
sglang/srt/constrained/base_tool_cache.py +1 -1
sglang/srt/constrained/fsm_cache.py +12 -2
sglang/srt/constrained/jump_forward.py +13 -2
sglang/srt/layers/activation.py +32 -0
sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
sglang/srt/layers/extend_attention.py +9 -2
sglang/srt/layers/fused_moe/__init__.py +1 -0
sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
sglang/srt/layers/fused_moe/layer.py +587 -0
sglang/srt/layers/layernorm.py +65 -0
sglang/srt/layers/logits_processor.py +7 -2
sglang/srt/layers/pooler.py +50 -0
sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
sglang/srt/layers/radix_attention.py +40 -16
sglang/srt/managers/detokenizer_manager.py +31 -9
sglang/srt/managers/io_struct.py +63 -0
sglang/srt/managers/policy_scheduler.py +173 -25
sglang/srt/managers/schedule_batch.py +115 -97
sglang/srt/managers/tokenizer_manager.py +194 -112
sglang/srt/managers/tp_worker.py +290 -359
sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
sglang/srt/mem_cache/chunk_cache.py +43 -20
sglang/srt/mem_cache/memory_pool.py +2 -2
sglang/srt/mem_cache/radix_cache.py +74 -40
sglang/srt/model_executor/cuda_graph_runner.py +71 -25
sglang/srt/model_executor/forward_batch_info.py +293 -156
sglang/srt/model_executor/model_runner.py +77 -57
sglang/srt/models/chatglm.py +2 -2
sglang/srt/models/commandr.py +1 -1
sglang/srt/models/deepseek.py +2 -2
sglang/srt/models/deepseek_v2.py +7 -6
sglang/srt/models/gemma.py +1 -1
sglang/srt/models/gemma2.py +11 -6
sglang/srt/models/grok.py +50 -396
sglang/srt/models/internlm2.py +2 -7
sglang/srt/models/llama2.py +4 -4
sglang/srt/models/llama_embedding.py +88 -0
sglang/srt/models/minicpm.py +2 -2
sglang/srt/models/mixtral.py +56 -254
sglang/srt/models/mixtral_quant.py +1 -4
sglang/srt/models/qwen.py +2 -2
sglang/srt/models/qwen2.py +2 -2
sglang/srt/models/qwen2_moe.py +2 -13
sglang/srt/models/stablelm.py +1 -1
sglang/srt/openai_api/adapter.py +187 -48
sglang/srt/openai_api/protocol.py +37 -1
sglang/srt/sampling/penaltylib/__init__.py +13 -0
sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
sglang/srt/sampling_params.py +31 -8
sglang/srt/server.py +91 -29
sglang/srt/server_args.py +32 -19
sglang/srt/utils.py +32 -15
sglang/test/run_eval.py +10 -1
sglang/test/runners.py +81 -73
sglang/test/simple_eval_humaneval.py +2 -8
sglang/test/simple_eval_mgsm.py +203 -0
sglang/test/srt/sampling/penaltylib/utils.py +337 -0
sglang/test/test_layernorm.py +60 -0
sglang/test/test_programs.py +36 -7
sglang/test/test_utils.py +24 -2
sglang/utils.py +0 -1
sglang/version.py +1 -1
{sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/METADATA +33 -16
sglang-0.2.13.dist-info/RECORD +112 -0
{sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/WHEEL +1 -1
sglang/srt/layers/linear.py +0 -884
sglang/srt/layers/quantization/__init__.py +0 -64
sglang/srt/layers/quantization/fp8.py +0 -677
sglang/srt/model_loader/model_loader.py +0 -292
sglang/srt/model_loader/utils.py +0 -275
sglang-0.2.11.dist-info/RECORD +0 -102
{sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/LICENSE +0 -0
{sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/top_level.txt +0 -0

sglang/api.py CHANGED Viewed

@@ -62,6 +62,7 @@ def gen(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
@@ -72,7 +73,7 @@ def gen(
     logprob_start_len: Optional[int] = None,
     top_logprobs_num: Optional[int] = None,
     return_text_in_logprobs: Optional[bool] = None,
-    dtype: Optional[type] = None,
+    dtype: Optional[Union[type, str]] = None,
     choices: Optional[List[str]] = None,
     choices_method: Optional[ChoicesSamplingMethod] = None,
     regex: Optional[str] = None,
@@ -98,6 +99,7 @@ def gen(
         name,
         max_tokens,
         stop,
+        stop_token_ids,
         temperature,
         top_p,
         top_k,
@@ -117,6 +119,7 @@ def gen_int(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
@@ -132,6 +135,7 @@ def gen_int(
         name,
         max_tokens,
         stop,
+        stop_token_ids,
         temperature,
         top_p,
         top_k,
@@ -151,6 +155,7 @@ def gen_string(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
@@ -166,6 +171,7 @@ def gen_string(
         name,
         max_tokens,
         stop,
+        stop_token_ids,
         temperature,
         top_p,
         top_k,

sglang/bench_latency.py CHANGED Viewed

@@ -64,7 +64,7 @@ class BenchArgs:
     run_name: str = "before"
     batch_size: Tuple[int] = (1,)
     input_len: Tuple[int] = (1024,)
-    output_len: Tuple[int] = (4,)
+    output_len: Tuple[int] = (16,)
     result_filename: str = ""
     correctness_test: bool = False
     # This is only used for correctness test
@@ -152,7 +152,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
         req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
         req.prefix_indices = []
         req.sampling_params = sampling_params
-        req.input_ids = req.origin_input_ids
+        req.fill_ids = req.origin_input_ids
         reqs.append(req)
     return input_ids, reqs
@@ -163,7 +163,7 @@ def prepare_extend_inputs_for_correctness_test(
 ):
     for i in range(len(reqs)):
         req = reqs[i]
-        req.input_ids += input_ids[i][bench_args.cut_len :]
+        req.fill_ids += input_ids[i][bench_args.cut_len :]
         req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
             i, : bench_args.cut_len
         ]
@@ -182,7 +182,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
         req = Req(rid=i, origin_input_text="", origin_input_ids=list(input_ids[i]))
         req.prefix_indices = []
         req.sampling_params = sampling_params
-        req.input_ids = req.origin_input_ids
+        req.fill_ids = req.origin_input_ids
         reqs.append(req)
     return reqs
@@ -195,7 +195,7 @@ def extend(reqs, model_runner):
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
     )
-    batch.prepare_for_extend(model_runner.model_config.vocab_size, None)
+    batch.prepare_for_extend(model_runner.model_config.vocab_size)
     output = model_runner.forward(batch, ForwardMode.EXTEND)
     next_token_ids = batch.sample(output.next_token_logits)
     return next_token_ids, output.next_token_logits, batch
@@ -221,6 +221,7 @@ def correctness_test(
     # Prepare inputs
     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
+    rank_print(f"{input_ids=}")
     if bench_args.cut_len > 0:
         # Prefill
@@ -238,7 +239,7 @@ def correctness_test(
     # Decode
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
-    for _ in range(bench_args.output_len):
+    for _ in range(bench_args.output_len[0]):
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
         for i in range(len(reqs)):
             output_ids[i].append(next_token_ids[i])
@@ -332,6 +333,7 @@ def latency_test(
     )
     # Warm up
+    rank_print("Warmup ...")
     latency_test_run_once(
         bench_args.run_name,
         model_runner,
@@ -341,6 +343,7 @@ def latency_test(
         bench_args.input_len[0],
         4,  # shorter decoding to speed up the warmup
     )
+    rank_print("Benchmark ...")
     # Run the sweep
     result_list = []

sglang/bench_serving.py CHANGED Viewed

@@ -24,7 +24,7 @@ import warnings
 from argparse import ArgumentParser
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import AsyncGenerator, List, Optional, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 import aiohttp
 import numpy as np
@@ -39,6 +39,8 @@ from transformers import (
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+global args
 @dataclass
 class RequestFuncInput:
@@ -47,6 +49,7 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
+    extra_request_body: Dict[str, Any]
 @dataclass
@@ -84,6 +87,7 @@ async def async_request_trt_llm(
             "stream": True,
             "min_length": request_func_input.output_len,
             "end_id": 1048576,
+            **request_func_input.extra_request_body,
         }
         if args.disable_ignore_eos:
             del payload["min_length"]
@@ -154,6 +158,7 @@ async def async_request_openai_completions(
             "max_tokens": request_func_input.output_len,
             "stream": not args.disable_stream,
             "ignore_eos": not args.disable_ignore_eos,
+            **request_func_input.extra_request_body,
         }
         headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
@@ -192,7 +197,8 @@ async def async_request_openai_completions(
                                     output.ttft = ttft
                                 # Decoding phase
-                                output.itl.append(timestamp - most_recent_timestamp)
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
                                 most_recent_timestamp = timestamp
                                 generated_text += data["choices"][0]["text"]
@@ -542,6 +548,7 @@ async def benchmark(
     request_rate: float,
     disable_tqdm: bool,
     enable_multi: bool,
+    extra_request_body: Dict[str, Any],
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -556,6 +563,7 @@ async def benchmark(
         api_url=api_url,
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        extra_request_body=extra_request_body,
     )
     test_output = await request_func(request_func_input=test_input)
     if not test_output.success:
@@ -578,6 +586,7 @@ async def benchmark(
             api_url=api_url,
             prompt_len=prompt_len,
             output_len=output_len,
+            extra_request_body=extra_request_body,
         )
         tasks.append(
             asyncio.create_task(
@@ -660,19 +669,20 @@ async def benchmark(
             "backend": args.backend,
             "dataset_name": args.dataset_name,
             "request_rate": request_rate,
-            "total_input": metrics.total_input,
-            "total_output": metrics.total_output,
-            "total_output_retokenized": metrics.total_output_retokenized,
-            "mean_e2e_latency": metrics.mean_e2e_latency_ms,
-            "median_e2e_latency": metrics.median_e2e_latency_ms,
-            "median_ttft": metrics.median_ttft_ms,
-            "median_itl": metrics.median_itl_ms,
-            "output_token_throughput": metrics.output_throughput,
+            "total_input_tokens": metrics.total_input,
+            "total_output_tokens": metrics.total_output,
+            "total_output_tokens_retokenized": metrics.total_output_retokenized,
+            "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+            "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+            "median_ttft_ms": metrics.median_ttft_ms,
+            "median_itl_ms": metrics.median_itl_ms,
+            "output_throughput": metrics.output_throughput,
             "sharegpt_output_len": args.sharegpt_output_len,
             "random_input_len": args.random_input_len,
             "random_output_len": args.random_output_len,
             "random_range_ratio": args.random_range_ratio,
-            "benchmark_duration": benchmark_duration,
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
         }
     else:
         print(f"Error running benchmark for request rate: {request_rate}")
@@ -742,10 +752,18 @@ def check_chat_template(model_path):
         return False
-def fire(args: argparse.Namespace):
+def run_benchmark(args_: argparse.Namespace):
+    global args
+    args = args_
+    set_ulimit()
     random.seed(args.seed)
     np.random.seed(args.seed)
+    extra_request_body = {}
+    if args.extra_request_body:
+        extra_request_body = json.loads(args.extra_request_body)
     if args.port is None:
         args.port = {
             "sglang": 30000,
@@ -838,10 +856,11 @@ def fire(args: argparse.Namespace):
                     request_rate=rate,
                     disable_tqdm=args.disable_tqdm,
                     enable_multi=args.multi,
+                    extra_request_body=extra_request_body,
                 )
             )
     else:
-        asyncio.run(
+        return asyncio.run(
             benchmark(
                 backend=backend,
                 api_url=api_url,
@@ -851,6 +870,7 @@ def fire(args: argparse.Namespace):
                 request_rate=args.request_rate,
                 disable_tqdm=args.disable_tqdm,
                 enable_multi=args.multi,
+                extra_request_body=extra_request_body,
             )
         )
@@ -949,11 +969,6 @@ if __name__ == "__main__":
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.",
     )
     parser.add_argument("--seed", type=int, default=0, help="Default is 0.")
-    parser.add_argument(
-        "--disable-tqdm",
-        action="store_true",
-        help="Specify to disable tqdm progress bar.",
-    )
     parser.add_argument(
         "--multi",
         action="store_true",
@@ -966,6 +981,11 @@ if __name__ == "__main__":
         help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
     )
     parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
     parser.add_argument(
         "--disable-stream",
         action="store_true",
@@ -976,8 +996,12 @@ if __name__ == "__main__":
         action="store_true",
         help="Disable ignoring EOS.",
     )
-    set_ulimit()
+    parser.add_argument(
+        "--extra-request-body",
+        metavar='{"key1": "value1", "key2": "value2"}',
+        type=str,
+        help="Append given JSON object to the request payload. You can use this to specify"
+        "additional generate params like sampling params.",
+    )
     args = parser.parse_args()
-    fire(args)
+    run_benchmark(args)

sglang/global_config.py CHANGED Viewed

@@ -27,7 +27,7 @@ class GlobalConfig:
         # Runtime constants: others
         self.num_continue_decode_steps = 10
         self.retract_decode_steps = 20
-        self.flashinfer_workspace_size = 192 * 1024 * 1024
+        self.flashinfer_workspace_size = 384 * 1024 * 1024
         # Output tokenization configs
         self.skip_special_tokens_in_output = True

sglang/lang/backend/runtime_endpoint.py CHANGED Viewed

@@ -1,21 +1,23 @@
 import json
+import warnings
 from typing import List, Optional
 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template_by_model_path
-from sglang.lang.choices import (
-    ChoicesDecision,
-    ChoicesSamplingMethod,
-    token_length_normalized,
-)
+from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
 from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import SglSamplingParams
+from sglang.lang.ir import (
+    REGEX_BOOL,
+    REGEX_FLOAT,
+    REGEX_INT,
+    REGEX_STR,
+    SglSamplingParams,
+)
 from sglang.utils import http_request
 class RuntimeEndpoint(BaseBackend):
     def __init__(
         self,
         base_url: str,
@@ -95,32 +97,52 @@ class RuntimeEndpoint(BaseBackend):
         )
         self._assert_success(res)
+    def _handle_dtype_to_regex(self, sampling_params: SglSamplingParams):
+        if sampling_params.dtype is None:
+            return
+        if sampling_params.stop == ():
+            sampling_params.stop = []
+        dtype_regex = None
+        if sampling_params.dtype in ["int", int]:
+            dtype_regex = REGEX_INT
+            sampling_params.stop.extend([" ", "\n"])
+        elif sampling_params.dtype in ["float", float]:
+            dtype_regex = REGEX_FLOAT
+            sampling_params.stop.extend([" ", "\n"])
+        elif sampling_params.dtype in ["str", str]:
+            dtype_regex = REGEX_STR
+        elif sampling_params.dtype in ["bool", bool]:
+            dtype_regex = REGEX_BOOL
+        else:
+            raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
+        if dtype_regex is not None and sampling_params.regex is not None:
+            warnings.warn(
+                f"Both dtype and regex are set. Only dtype will be used. dtype: {sampling_params.dtype}, regex: {sampling_params.regex}"
+            )
+        sampling_params.regex = dtype_regex
     def generate(
         self,
         s: StreamExecutor,
         sampling_params: SglSamplingParams,
     ):
-        if sampling_params.dtype is None:
-            data = {
-                "text": s.text_,
-                "sampling_params": {
-                    "skip_special_tokens": global_config.skip_special_tokens_in_output,
-                    "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
-                    **sampling_params.to_srt_kwargs(),
-                },
-            }
-        elif sampling_params.dtype in [int, "int"]:
-            data = {
-                "text": s.text_,
-                "sampling_params": {
-                    "skip_special_tokens": global_config.skip_special_tokens_in_output,
-                    "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
-                    "dtype": "int",
-                    **sampling_params.to_srt_kwargs(),
-                },
-            }
-        else:
-            raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
+        self._handle_dtype_to_regex(sampling_params)
+        data = {
+            "text": s.text_,
+            "sampling_params": {
+                "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
+                **sampling_params.to_srt_kwargs(),
+            },
+        }
         for item in [
             "return_logprob",
@@ -151,27 +173,16 @@ class RuntimeEndpoint(BaseBackend):
         s: StreamExecutor,
         sampling_params: SglSamplingParams,
     ):
-        if sampling_params.dtype is None:
-            data = {
-                "text": s.text_,
-                "sampling_params": {
-                    "skip_special_tokens": global_config.skip_special_tokens_in_output,
-                    "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
-                    **sampling_params.to_srt_kwargs(),
-                },
-            }
-        elif sampling_params.dtype in [int, "int"]:
-            data = {
-                "text": s.text_,
-                "sampling_params": {
-                    "skip_special_tokens": global_config.skip_special_tokens_in_output,
-                    "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
-                    "dtype": "int",
-                    **sampling_params.to_srt_kwargs(),
-                },
-            }
-        else:
-            raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
+        self._handle_dtype_to_regex(sampling_params)
+        data = {
+            "text": s.text_,
+            "sampling_params": {
+                "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
+                **sampling_params.to_srt_kwargs(),
+            },
+        }
         for item in [
             "return_logprob",

sglang/lang/compiler.py CHANGED Viewed

@@ -125,7 +125,7 @@ class CompiledFunction:
     def run(
         self,
         *,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Union[str, List[str]] = (),
         temperature: float = 1.0,
         top_p: float = 1.0,
@@ -155,7 +155,7 @@ class CompiledFunction:
         self,
         batch_kwargs,
         *,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Union[str, List[str]] = (),
         temperature: float = 1.0,
         top_p: float = 1.0,

sglang/lang/interpreter.py CHANGED Viewed

@@ -20,7 +20,6 @@ from sglang.lang.ir import (
     SglConstantText,
     SglExpr,
     SglExprList,
-    SglFunction,
     SglGen,
     SglImage,
     SglRoleBegin,
@@ -181,8 +180,10 @@ class StreamExecutor:
         num_api_spec_tokens=None,
         use_thread=True,
     ):
+        from sglang.lang.backend.base_backend import BaseBackend
         self.sid = uuid.uuid4().hex
-        self.backend = backend
+        self.backend: BaseBackend = backend
         self.arguments: Dict[str, Any] = arguments
         self.default_sampling_para = default_sampling_para
         self.stream = stream
@@ -658,6 +659,7 @@ class StreamExecutor:
         for item in [
             "max_new_tokens",
             "stop",
+            "stop_token_ids",
             "temperature",
             "top_p",
             "top_k",

sglang/lang/ir.py CHANGED Viewed

@@ -8,16 +8,17 @@ from typing import List, Optional, Union
 from sglang.global_config import global_config
 from sglang.lang.choices import ChoicesSamplingMethod
-REGEX_INT = r"[-+]?[0-9]+"
-REGEX_FLOAT = r"[-+]?[0-9]*\.?[0-9]+"
+REGEX_INT = r"[-+]?[0-9]+[ \n]*"
+REGEX_FLOAT = r"[-+]?[0-9]*\.?[0-9]+[ \n]*"
 REGEX_BOOL = r"(True|False)"
-REGEX_STRING = r"\"[\w\d\s]*\""  # bugs with regex r"\".*\"" in interegular pkg
+REGEX_STR = r"\"[\w\d\s]*\""  # bugs with regex r"\".*\"" in interegular pkg
 @dataclasses.dataclass
 class SglSamplingParams:
-    max_new_tokens: int = 16
+    max_new_tokens: int = 128
     stop: Union[str, List[str]] = ()
+    stop_token_ids: Optional[List[int]] = ()
     temperature: float = 1.0
     top_p: float = 1.0
     top_k: int = -1  # -1 means disable
@@ -37,6 +38,7 @@ class SglSamplingParams:
         return SglSamplingParams(
             self.max_new_tokens,
             self.stop,
+            self.stop_token_ids,
             self.temperature,
             self.top_p,
             self.top_k,
@@ -108,6 +110,7 @@ class SglSamplingParams:
         return {
             "max_new_tokens": self.max_new_tokens,
             "stop": self.stop,
+            "stop_token_ids": self.stop_token_ids,
             "temperature": self.temperature,
             "top_p": self.top_p,
             "top_k": self.top_k,
@@ -140,8 +143,9 @@ class SglFunction:
     def run(
         self,
         *args,
-        max_new_tokens: int = 16,
-        stop: Union[str, List[str]] = (),
+        max_new_tokens: int = 128,
+        stop: Union[str, List[str]] = [],
+        stop_token_ids: Optional[List[int]] = [],
         temperature: float = 1.0,
         top_p: float = 1.0,
         top_k: int = -1,
@@ -161,6 +165,7 @@ class SglFunction:
         default_sampling_para = SglSamplingParams(
             max_new_tokens=max_new_tokens,
             stop=stop,
+            stop_token_ids=stop_token_ids,
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
@@ -179,8 +184,9 @@ class SglFunction:
         self,
         batch_kwargs,
         *,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Union[str, List[str]] = (),
+        stop_token_ids: Optional[List[int]] = [],
         temperature: float = 1.0,
         top_p: float = 1.0,
         top_k: int = -1,
@@ -218,6 +224,7 @@ class SglFunction:
         default_sampling_para = SglSamplingParams(
             max_new_tokens=max_new_tokens,
             stop=stop,
+            stop_token_ids=stop_token_ids,
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
@@ -397,6 +404,7 @@ class SglGen(SglExpr):
         name: Optional[str] = None,
         max_new_tokens: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
+        stop_token_ids: Optional[List[int]] = None,
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
         top_k: Optional[int] = None,
@@ -416,6 +424,7 @@ class SglGen(SglExpr):
         self.sampling_params = SglSamplingParams(
             max_new_tokens=max_new_tokens,
             stop=stop,
+            stop_token_ids=stop_token_ids,
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,

sglang/srt/constrained/base_tool_cache.py CHANGED Viewed

@@ -54,7 +54,7 @@ class BaseToolCache:
         return val
     def init_value(self, key):
-        raise NotImplementedError
+        raise NotImplementedError()
     def get_cache_hit_rate(self):
         if self.metrics["total"] == 0:

sglang/srt/constrained/fsm_cache.py CHANGED Viewed

@@ -20,10 +20,20 @@ from sglang.srt.constrained.base_tool_cache import BaseToolCache
 class FSMCache(BaseToolCache):
-    def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
+    def __init__(
+        self,
+        tokenizer_path,
+        tokenizer_args_dict,
+        enable=True,
+        skip_tokenizer_init=False,
+    ):
         super().__init__(enable=enable)
-        if tokenizer_path.endswith(".json") or tokenizer_path.endswith(".model"):
+        if (
+            skip_tokenizer_init
+            or tokenizer_path.endswith(".json")
+            or tokenizer_path.endswith(".model")
+        ):
             # Do not support TiktokenTokenizer or SentencePieceTokenizer
             return

sglang/srt/constrained/jump_forward.py CHANGED Viewed

@@ -62,16 +62,22 @@ class JumpForwardMap:
                 id_to_symbol.setdefault(id_, []).append(symbol)
             transitions = fsm_info.transitions
             outgoings_ct = defaultdict(int)
-            state_to_jump_forward = {}
+            # NOTE(lsyin): Final states can lead to terminate, so they have one outgoing edge naturally
+            for s in fsm_info.finals:
+                outgoings_ct[s] = 1
+            state_to_jump_forward = {}
             for (state, id_), next_state in transitions.items():
                 if id_ == fsm_info.alphabet_anything_value:
+                    # Arbitrarily symbol cannot be recognized as jump forward
                     continue
                 symbols = id_to_symbol[id_]
                 for c in symbols:
                     if len(c) > 1:
-                        # Skip byte level transitions
+                        # Skip byte level transitions like c = "5E"
                         continue
                     outgoings_ct[state] += 1
@@ -87,6 +93,9 @@ class JumpForwardMap:
             # Process the byte level jump forward
             outgoings_ct = defaultdict(int)
+            for s in fsm_info.finals:
+                outgoings_ct[s] = 1
             for (state, id_), next_state in transitions.items():
                 if id_ == fsm_info.alphabet_anything_value:
                     continue
@@ -177,3 +186,5 @@ if __name__ == "__main__":
     test_main(r"霍格沃茨特快列车|霍比特人比尔博")
     # 霍格: \xe9\x9c\x8d \xe6\xa0\xbc ...
     # 霍比: \xe9\x9c\x8d \xe6\xaf\x94 ...
+    test_main(r"[-+]?[0-9]+[ ]*")

sglang 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl

sglang 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl