sglang 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +3 -1
- sglang/api.py +7 -7
- sglang/backend/anthropic.py +1 -1
- sglang/backend/litellm.py +90 -0
- sglang/backend/openai.py +158 -11
- sglang/backend/runtime_endpoint.py +18 -10
- sglang/bench_latency.py +299 -0
- sglang/global_config.py +12 -2
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +114 -67
- sglang/lang/ir.py +28 -3
- sglang/launch_server.py +4 -1
- sglang/launch_server_llavavid.py +2 -1
- sglang/srt/constrained/__init__.py +13 -6
- sglang/srt/constrained/fsm_cache.py +8 -2
- sglang/srt/constrained/jump_forward.py +113 -25
- sglang/srt/conversation.py +2 -0
- sglang/srt/flush_cache.py +3 -1
- sglang/srt/hf_transformers_utils.py +130 -1
- sglang/srt/layers/extend_attention.py +17 -0
- sglang/srt/layers/fused_moe.py +582 -0
- sglang/srt/layers/logits_processor.py +65 -32
- sglang/srt/layers/radix_attention.py +41 -7
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/controller/dp_worker.py +113 -0
- sglang/srt/managers/{router → controller}/infer_batch.py +242 -100
- sglang/srt/managers/controller/manager_multi.py +191 -0
- sglang/srt/managers/{router/manager.py → controller/manager_single.py} +34 -14
- sglang/srt/managers/{router → controller}/model_runner.py +262 -158
- sglang/srt/managers/{router → controller}/radix_cache.py +11 -1
- sglang/srt/managers/{router/scheduler.py → controller/schedule_heuristic.py} +9 -7
- sglang/srt/managers/{router/model_rpc.py → controller/tp_worker.py} +298 -267
- sglang/srt/managers/detokenizer_manager.py +42 -46
- sglang/srt/managers/io_struct.py +22 -12
- sglang/srt/managers/tokenizer_manager.py +151 -87
- sglang/srt/model_config.py +83 -5
- sglang/srt/models/chatglm.py +399 -0
- sglang/srt/models/commandr.py +10 -13
- sglang/srt/models/dbrx.py +9 -15
- sglang/srt/models/gemma.py +12 -15
- sglang/srt/models/grok.py +738 -0
- sglang/srt/models/llama2.py +26 -15
- sglang/srt/models/llama_classification.py +104 -0
- sglang/srt/models/llava.py +86 -19
- sglang/srt/models/llavavid.py +11 -20
- sglang/srt/models/mixtral.py +282 -103
- sglang/srt/models/mixtral_quant.py +372 -0
- sglang/srt/models/qwen.py +9 -13
- sglang/srt/models/qwen2.py +11 -13
- sglang/srt/models/stablelm.py +9 -15
- sglang/srt/models/yivl.py +17 -22
- sglang/srt/openai_api_adapter.py +150 -95
- sglang/srt/openai_protocol.py +11 -2
- sglang/srt/server.py +124 -48
- sglang/srt/server_args.py +128 -48
- sglang/srt/utils.py +234 -67
- sglang/test/test_programs.py +65 -3
- sglang/test/test_utils.py +32 -1
- sglang/utils.py +23 -4
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/METADATA +40 -27
- sglang-0.1.18.dist-info/RECORD +78 -0
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/WHEEL +1 -1
- sglang/srt/backend_config.py +0 -13
- sglang/srt/models/dbrx_config.py +0 -281
- sglang/srt/weight_utils.py +0 -417
- sglang-0.1.16.dist-info/RECORD +0 -72
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/LICENSE +0 -0
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
```diff
@@ -2,6 +2,7 @@
 
 import argparse
 import dataclasses
+import random
 from typing import List, Optional, Union
 
 
@@ -10,11 +11,13 @@ class ServerArgs:
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
-    load_format: str = "auto"
     tokenizer_mode: str = "auto"
-
+    load_format: str = "auto"
+    dtype: str = "auto"
     trust_remote_code: bool = True
     context_length: Optional[int] = None
+    quantization: Optional[str] = None
+    chat_template: Optional[str] = None
 
     # Port
     host: str = "127.0.0.1"
@@ -23,31 +26,40 @@ class ServerArgs:
 
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
-
+    max_prefill_tokens: Optional[int] = None
+    max_running_requests: Optional[int] = None
     schedule_heuristic: str = "lpm"
     schedule_conservativeness: float = 1.0
 
     # Other runtime options
     tp_size: int = 1
     stream_interval: int = 8
-    random_seed: int =
+    random_seed: Optional[int] = None
 
     # Logging
     log_level: str = "info"
+    log_level_http: Optional[str] = None
     log_requests: bool = False
-    disable_log_stats: bool = False
-    log_stats_interval: int = 10
     show_time_cost: bool = False
 
     # Other
    api_key: str = ""
 
+    # Data parallelism
+    dp_size: int = 1
+    load_balance_method: str = "round_robin"
+
     # Optimization/debug options
-
-    attention_reduce_in_fp32: bool = False
+    disable_flashinfer: bool = False
     disable_radix_cache: bool = False
     disable_regex_jump_forward: bool = False
     disable_disk_cache: bool = False
+    attention_reduce_in_fp32: bool = False
+
+    # Distributed args
+    nccl_init_addr: Optional[str] = None
+    nnodes: int = 1
+    node_rank: Optional[int] = None
 
     def __post_init__(self):
         if self.tokenizer_path is None:
@@ -66,6 +78,9 @@ class ServerArgs:
         elif self.additional_ports is None:
             self.additional_ports = []
 
+        if self.random_seed is None:
+            self.random_seed = random.randint(0, 1 << 30)
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         parser.add_argument(
```
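The seed change is the main behavioral shift here: `random_seed` is now `Optional` and, when left unset, `__post_init__` draws one via `random.randint(0, 1 << 30)`. A minimal sketch of the resulting behavior, assuming an installed sglang at this version (the model path is an illustrative placeholder):

```python
from sglang.srt.server_args import ServerArgs

# Illustrative model path; any local or Hugging Face model id would do.
args = ServerArgs(model_path="meta-llama/Llama-2-7b-hf")

# The new fields take the defaults declared above.
assert args.dtype == "auto" and args.quantization is None

# The previously fixed default seed is now randomized per instance.
assert args.random_seed is not None
assert 0 <= args.random_seed <= (1 << 30)  # randint is inclusive on both ends
```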
```diff
@@ -91,7 +106,16 @@ class ServerArgs:
             type=int,
             nargs="*",
             default=[],
-            help="
+            help="The additional ports specified for the server.",
+        )
+        parser.add_argument(
+            "--tokenizer-mode",
+            type=str,
+            default=ServerArgs.tokenizer_mode,
+            choices=["auto", "slow"],
+            help="Tokenizer mode. 'auto' will use the fast "
+            "tokenizer if available, and 'slow' will "
+            "always use the slow tokenizer.",
         )
         parser.add_argument(
             "--load-format",
@@ -110,20 +134,20 @@ class ServerArgs:
             "which is mainly for profiling.",
         )
         parser.add_argument(
-            "--tokenizer-mode",
-            type=str,
-            default=ServerArgs.tokenizer_mode,
-            choices=["auto", "slow"],
-            help="Tokenizer mode. 'auto' will use the fast "
-            "tokenizer if available, and 'slow' will "
-            "always use the slow tokenizer.",
-        )
-        parser.add_argument(
-            "--chat-template",
+            "--dtype",
             type=str,
-            default=ServerArgs.
-
-
+            default=ServerArgs.dtype,
+            choices=[
+                "auto", "half", "float16", "bfloat16", "float", "float32"
+            ],
+            help='Data type for model weights and activations.\n\n'
+            '* "auto" will use FP16 precision for FP32 and FP16 models, and '
+            'BF16 precision for BF16 models.\n'
+            '* "half" for FP16. Recommended for AWQ quantization.\n'
+            '* "float16" is the same as "half".\n'
+            '* "bfloat16" for a balance between precision and range.\n'
+            '* "float" is shorthand for FP32 precision.\n'
+            '* "float32" for FP32 precision.')
         parser.add_argument(
             "--trust-remote-code",
             action="store_true",
```
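As a rough reference for the new `--dtype` choices, the strings correspond to the usual torch dtypes. The mapping below is an illustration only, not sglang's actual resolution code (which happens downstream at model-load time):

```python
import torch

# Illustrative correspondence only. "auto" has no fixed entry: per the help
# text it resolves to FP16 for FP32/FP16 checkpoints and BF16 for BF16 ones.
DTYPE_ALIASES = {
    "half": torch.float16,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
    "float": torch.float32,
    "float32": torch.float32,
}
```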
```diff
@@ -135,6 +159,18 @@ class ServerArgs:
             default=ServerArgs.context_length,
             help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
         )
+        parser.add_argument(
+            "--quantization",
+            type=str,
+            default=ServerArgs.quantization,
+            help="The quantization method.",
+        )
+        parser.add_argument(
+            "--chat-template",
+            type=str,
+            default=ServerArgs.chat_template,
+            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
+        )
         parser.add_argument(
             "--mem-fraction-static",
             type=float,
@@ -142,17 +178,23 @@ class ServerArgs:
             help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
         )
         parser.add_argument(
-            "--max-prefill-
+            "--max-prefill-tokens",
             type=int,
-            default=ServerArgs.
+            default=ServerArgs.max_prefill_tokens,
             help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
         )
+        parser.add_argument(
+            "--max-running-requests",
+            type=int,
+            default=ServerArgs.max_running_requests,
+            help="The maximum number of running requests.",
+        )
         parser.add_argument(
             "--schedule-heuristic",
             type=str,
             default=ServerArgs.schedule_heuristic,
             choices=["lpm", "random", "fcfs", "dfs-weight"],
-            help="
+            help="The scheduling heuristic.",
         )
         parser.add_argument(
             "--schedule-conservativeness",
@@ -164,7 +206,7 @@ class ServerArgs:
             "--tp-size",
             type=int,
             default=ServerArgs.tp_size,
-            help="
+            help="The tensor parallelism size.",
         )
         parser.add_argument(
             "--stream-interval",
@@ -176,29 +218,24 @@ class ServerArgs:
             "--random-seed",
             type=int,
             default=ServerArgs.random_seed,
-            help="
+            help="The random seed.",
         )
         parser.add_argument(
             "--log-level",
             type=str,
             default=ServerArgs.log_level,
-            help="
+            help="The logging level of all loggers.",
         )
         parser.add_argument(
-            "--log-
-
-
+            "--log-level-http",
+            type=str,
+            default=ServerArgs.log_level_http,
+            help="The logging level of HTTP server. If not set, reuse --log-level by default.",
         )
         parser.add_argument(
-            "--
+            "--log-requests",
             action="store_true",
-            help="
-        )
-        parser.add_argument(
-            "--log-stats-interval",
-            type=int,
-            default=ServerArgs.log_stats_interval,
-            help="Log stats interval in second.",
+            help="Log the inputs and outputs of all requests.",
         )
         parser.add_argument(
             "--show-time-cost",
@@ -212,16 +249,47 @@ class ServerArgs:
             help="Set API key of the server",
         )
 
-        #
+        # Data parallelism
         parser.add_argument(
-            "--
-
-
+            "--dp-size",
+            type=int,
+            default=ServerArgs.dp_size,
+            help="The data parallelism size.",
         )
         parser.add_argument(
-            "--
+            "--load-balance-method",
+            type=str,
+            default=ServerArgs.load_balance_method,
+            help="The load balancing strategy for data parallelism.",
+            choices=[
+                "round_robin",
+                "shortest_queue",
+            ],
+        )
+
+        # Multi-node distributed serving args
+        parser.add_argument(
+            "--nccl-init-addr",
+            type=str,
+            help="The nccl init address of multi-node server."
+        )
+        parser.add_argument(
+            "--nnodes",
+            type=int,
+            default=1,
+            help="The number of nodes."
+        )
+        parser.add_argument(
+            "--node-rank",
+            type=int,
+            help="The node rank."
+        )
+
+        # Optimization/debug options
+        parser.add_argument(
+            "--disable-flashinfer",
             action="store_true",
-            help="
+            help="Disable flashinfer inference kernels",
         )
         parser.add_argument(
             "--disable-radix-cache",
```
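Taken together, the new data-parallel, multi-node, and debug flags flow through the existing `add_cli_args`/`from_cli_args` pair. A hedged sketch of how they would be exercised (the flag values are illustrative, and `--model-path` is assumed to be the flag registered for the required `model_path` field, which this diff does not show):

```python
import argparse
from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)

# Parse an illustrative command line exercising the new options.
args = parser.parse_args([
    "--model-path", "meta-llama/Llama-2-7b-hf",
    "--dp-size", "2",
    "--load-balance-method", "shortest_queue",
    "--nnodes", "1",
    "--disable-flashinfer",
])
server_args = ServerArgs.from_cli_args(args)
assert server_args.dp_size == 2 and server_args.disable_flashinfer
```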
```diff
@@ -238,6 +306,12 @@ class ServerArgs:
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
+        parser.add_argument(
+            "--attention-reduce-in-fp32",
+            action="store_true",
+            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
+            "This only affects Triton attention kernels",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -249,7 +323,7 @@ class ServerArgs:
 
     def print_mode_args(self):
         return (
-            f"
+            f"disable_flashinfer={self.disable_flashinfer}, "
             f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
             f"disable_radix_cache={self.disable_radix_cache}, "
             f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
@@ -257,10 +331,16 @@ class ServerArgs:
         )
 
 
+@dataclasses.dataclass
+class ModelPortArgs:
+    nccl_port: int
+    model_tp_ips: List[str]
+    model_tp_ports: List[int]
+
+
 @dataclasses.dataclass
 class PortArgs:
     tokenizer_port: int
     router_port: int
     detokenizer_port: int
-
-    model_rpc_ports: List[int]
+    model_port_args: List[ModelPortArgs]
```