sglang 0.3.5__py3-none-any.whl → 0.3.5.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +113 -3
- sglang/srt/configs/model_config.py +5 -2
- sglang/srt/constrained/__init__.py +2 -66
- sglang/srt/constrained/base_grammar_backend.py +72 -0
- sglang/srt/constrained/outlines_backend.py +165 -0
- sglang/srt/constrained/outlines_jump_forward.py +182 -0
- sglang/srt/constrained/xgrammar_backend.py +114 -0
- sglang/srt/layers/attention/triton_ops/decode_attention.py +7 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +6 -0
- sglang/srt/layers/fused_moe/fused_moe.py +23 -7
- sglang/srt/layers/quantization/base_config.py +4 -6
- sglang/srt/layers/vocab_parallel_embedding.py +216 -150
- sglang/srt/managers/io_struct.py +5 -3
- sglang/srt/managers/schedule_batch.py +14 -20
- sglang/srt/managers/scheduler.py +153 -94
- sglang/srt/managers/tokenizer_manager.py +81 -17
- sglang/srt/metrics/collector.py +211 -0
- sglang/srt/metrics/func_timer.py +108 -0
- sglang/srt/mm_utils.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- sglang/srt/model_executor/forward_batch_info.py +7 -3
- sglang/srt/model_executor/model_runner.py +2 -1
- sglang/srt/models/gemma2_reward.py +69 -0
- sglang/srt/models/gpt2.py +31 -37
- sglang/srt/models/internlm2_reward.py +62 -0
- sglang/srt/models/llama.py +11 -6
- sglang/srt/models/llama_reward.py +5 -26
- sglang/srt/models/qwen2_vl.py +5 -7
- sglang/srt/openai_api/adapter.py +6 -2
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/sampling/sampling_params.py +0 -14
- sglang/srt/server.py +58 -16
- sglang/srt/server_args.py +42 -22
- sglang/srt/utils.py +87 -0
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_mgsm.py +2 -2
- sglang/test/test_utils.py +18 -4
- sglang/utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/METADATA +11 -7
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/RECORD +45 -42
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/WHEEL +1 -1
- sglang/srt/constrained/base_tool_cache.py +0 -65
- sglang/srt/constrained/bnf_cache.py +0 -61
- sglang/srt/constrained/fsm_cache.py +0 -95
- sglang/srt/constrained/grammar.py +0 -190
- sglang/srt/constrained/jump_forward.py +0 -203
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/LICENSE +0 -0
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server.py
CHANGED
```diff
@@ -30,12 +30,11 @@ import time
 from http import HTTPStatus
 from typing import AsyncIterator, Dict, List, Optional, Union
 
-import orjson
-
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
 
 import aiohttp
+import orjson
 import requests
 import uvicorn
 import uvloop
@@ -57,6 +56,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.metrics.func_timer import enable_func_timer, time_func_latency
 from sglang.srt.openai_api.adapter import (
     load_chat_template_for_openai_api,
     v1_batches,
@@ -74,12 +74,15 @@ from sglang.srt.openai_api.protocol import ModelCard, ModelList
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
+    add_prometheus_middleware,
     assert_pkg_version,
     configure_logger,
+    delete_directory,
     is_port_available,
     kill_child_process,
     maybe_set_triton_cache_manager,
     prepare_model_and_tokenizer,
+    set_prometheus_multiproc_dir,
     set_ulimit,
 )
 from sglang.utils import get_exception_traceback
@@ -90,8 +93,6 @@ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
 
 app = FastAPI()
-tokenizer_manager: TokenizerManager = None
-
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -100,6 +101,10 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+tokenizer_manager: TokenizerManager = None
+
+##### Native API endpoints #####
+
 
 @app.get("/health")
 async def health() -> Response:
@@ -110,9 +115,16 @@ async def health() -> Response:
 @app.get("/health_generate")
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""
-
-
-
+
+    if tokenizer_manager.is_generation:
+        gri = GenerateReqInput(
+            input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
+        )
+    else:
+        gri = EmbeddingReqInput(
+            input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
+        )
+
     try:
         async for _ in tokenizer_manager.generate_request(gri, request):
             break
@@ -185,6 +197,7 @@ async def get_memory_pool_size():
 
 
 @app.post("/update_weights")
+@time_func_latency
 async def update_weights(obj: UpdateWeightReqInput, request: Request):
     """Update the weights inplace without re-launching the server."""
     success, message = await tokenizer_manager.update_weights(obj, request)
@@ -201,7 +214,7 @@ async def update_weights(obj: UpdateWeightReqInput, request: Request):
     )
 
 
-
+@time_func_latency
 async def generate_request(obj: GenerateReqInput, request: Request):
     """Handle a generate request."""
     if obj.stream:
@@ -234,10 +247,12 @@ async def generate_request(obj: GenerateReqInput, request: Request):
     )
 
 
+# fastapi implicitly converts json in the request to obj (dataclass)
 app.post("/generate")(generate_request)
 app.put("/generate")(generate_request)
 
 
+@time_func_latency
 async def encode_request(obj: EmbeddingReqInput, request: Request):
     """Handle an embedding request."""
     try:
@@ -253,7 +268,8 @@ app.post("/encode")(encode_request)
 app.put("/encode")(encode_request)
 
 
-async def judge_request(obj: EmbeddingReqInput, request: Request):
+@time_func_latency
+async def classify_request(obj: EmbeddingReqInput, request: Request):
     """Handle a reward model request. Now the arguments and return values are the same as embedding models."""
     try:
         ret = await tokenizer_manager.generate_request(obj, request).__anext__()
@@ -264,21 +280,27 @@ async def judge_request(obj: EmbeddingReqInput, request: Request):
     )
 
 
-app.post("/
-app.put("/
+app.post("/classify")(classify_request)
+app.put("/classify")(classify_request)
+
+
+##### OpenAI-compatible API endpoints #####
 
 
 @app.post("/v1/completions")
+@time_func_latency
 async def openai_v1_completions(raw_request: Request):
     return await v1_completions(tokenizer_manager, raw_request)
 
 
 @app.post("/v1/chat/completions")
+@time_func_latency
 async def openai_v1_chat_completions(raw_request: Request):
     return await v1_chat_completions(tokenizer_manager, raw_request)
 
 
 @app.post("/v1/embeddings", response_class=ORJSONResponse)
+@time_func_latency
 async def openai_v1_embeddings(raw_request: Request):
     response = await v1_embeddings(tokenizer_manager, raw_request)
     return response
@@ -432,13 +454,17 @@ def launch_server(
     1. The HTTP server and Tokenizer Manager both run in the main process.
     2. Inter-process communication is done through ICP (each process uses a different port) via the ZMQ library.
     """
-
     launch_engine(server_args=server_args)
 
     # Add api key authorization
     if server_args.api_key:
         add_api_key_middleware(app, server_args.api_key)
 
+    # add prometheus middleware
+    if server_args.enable_metrics:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+
     # Send a warmup request
     t = threading.Thread(
         target=_wait_and_warmup, args=(server_args, pipe_finish_writer)
@@ -475,6 +501,10 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
 
+    # Set prometheus env vars
+    if server_args.enable_metrics:
+        set_prometheus_multiproc_dir()
+
     # Set ulimit
     set_ulimit()
 
@@ -523,6 +553,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
         return
 
     model_info = res.json()
+
     # Send a warmup request
     request_name = "/generate" if model_info["is_generation"] else "/encode"
     max_new_tokens = 8 if model_info["is_generation"] else 1
@@ -560,6 +591,9 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
     if pipe_finish_writer is not None:
         pipe_finish_writer.send("ready")
 
+    if server_args.delete_ckpt_after_loading:
+        delete_directory(server_args.model_path)
+
 
 class Runtime:
     """
@@ -720,12 +754,12 @@ class Engine:
 
         # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
         atexit.register(self.shutdown)
-
+
         # runtime server default log level is log
         # offline engine works in scripts, so we set it to error
 
-        if
-        kwargs[
+        if "log_level" not in kwargs:
+            kwargs["log_level"] = "error"
 
         server_args = ServerArgs(*args, **kwargs)
         launch_engine(server_args=server_args)
@@ -840,4 +874,12 @@ class Engine:
         else:
            return tokenizer_manager.tokenizer
 
-
+    def encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+    ):
+        obj = EmbeddingReqInput(text=prompt)
+
+        # get the current event loop
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(encode_request(obj, None))
```
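Taken together, the changes above add an offline `Engine.encode()` helper and register the reward-model handler at `/classify`. A minimal usage sketch under stated assumptions: the model name, the default local port 30000, and the JSON payload shape are illustrative guesses based on `EmbeddingReqInput(text=...)` in the hunk, not something this diff pins down.

```python
import requests

from sglang.srt.server import Engine

# Offline engine: Engine forwards its **kwargs to ServerArgs, so model_path is accepted.
# The model name is a placeholder; an embedding/reward model may need extra ServerArgs fields.
engine = Engine(model_path="Alibaba-NLP/gte-Qwen2-1.5B-instruct")
print(engine.encode("SGLang is a fast serving framework."))

# HTTP server: the reward-model handler is now exposed at /classify (POST or PUT).
# Assumes a separately launched server on the (assumed) default local port.
resp = requests.post(
    "http://127.0.0.1:30000/classify",
    json={"text": "SGLang is a fast serving framework."},
)
print(resp.json())
```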
sglang/srt/server_args.py
CHANGED
```diff
@@ -63,25 +63,26 @@ class ServerArgs:
     stream_interval: int = 1
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
-
+    watchdog_timeout: float = 300
 
     # Logging
     log_level: str = "info"
     log_level_http: Optional[str] = None
     log_requests: bool = False
     show_time_cost: bool = False
+    enable_metrics: bool = False
+    decode_log_interval: int = 40
 
-    #
+    # API related
     api_key: Optional[str] = None
     file_storage_pth: str = "SGLang_storage"
     enable_cache_report: bool = False
-    watchdog_timeout: float = 600
 
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
 
-    #
+    # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
     nnodes: int = 1
     node_rank: int = 0
@@ -110,7 +111,7 @@ class ServerArgs:
     disable_flashinfer: bool = False
     disable_flashinfer_sampling: bool = False
     disable_radix_cache: bool = False
-
+    disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     disable_disk_cache: bool = False
@@ -127,6 +128,7 @@ class ServerArgs:
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     num_continuous_decode_steps: int = 1
+    delete_ckpt_after_loading: bool = False
 
     def __post_init__(self):
         # Set missing default values
@@ -204,6 +206,7 @@ class ServerArgs:
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
+        # Model and port args
         parser.add_argument(
             "--model-path",
             type=str,
@@ -323,6 +326,8 @@ class ServerArgs:
             action="store_true",
             help="Whether to use a CausalLM as an embedding model.",
         )
+
+        # Memory and scheduling
         parser.add_argument(
             "--mem-fraction-static",
             type=float,
@@ -367,6 +372,8 @@ class ServerArgs:
             default=ServerArgs.schedule_conservativeness,
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
+
+        # Other runtime options
         parser.add_argument(
             "--tensor-parallel-size",
             "--tp-size",
@@ -392,6 +399,14 @@ class ServerArgs:
             default=ServerArgs.constrained_json_whitespace_pattern,
             help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
         )
+        parser.add_argument(
+            "--watchdog-timeout",
+            type=float,
+            default=ServerArgs.watchdog_timeout,
+            help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
+        )
+
+        # Logging
         parser.add_argument(
             "--log-level",
             type=str,
@@ -414,6 +429,19 @@ class ServerArgs:
             action="store_true",
             help="Show time cost of custom marks.",
         )
+        parser.add_argument(
+            "--enable-metrics",
+            action="store_true",
+            help="Enable log prometheus metrics.",
+        )
+        parser.add_argument(
+            "--decode-log-interval",
+            type=int,
+            default=ServerArgs.decode_log_interval,
+            help="The log interval of decode batch",
+        )
+
+        # API related
         parser.add_argument(
             "--api-key",
             type=str,
@@ -431,18 +459,6 @@ class ServerArgs:
             action="store_true",
             help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
        )
-        parser.add_argument(
-            "--watchdog-timeout",
-            type=float,
-            default=ServerArgs.watchdog_timeout,
-            help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
-        )
-        parser.add_argument(
-            "--decode-log-interval",
-            type=int,
-            default=ServerArgs.decode_log_interval,
-            help="The log interval of decode batch"
-        )
 
         # Data parallelism
         parser.add_argument(
@@ -463,7 +479,7 @@ class ServerArgs:
             ],
         )
 
-        # Multi-node distributed serving
+        # Multi-node distributed serving
         parser.add_argument(
             "--dist-init-addr",
             "--nccl-init-addr",  # For backward compatbility. This will be removed in the future.
@@ -558,7 +574,7 @@ class ServerArgs:
             type=str,
             choices=["xgrammar", "outlines"],
             default=ServerArgs.grammar_backend,
-            help="Choose the backend for
+            help="Choose the backend for grammar-guided decoding.",
         )
 
         # Optimization/debug options
@@ -578,9 +594,9 @@ class ServerArgs:
             help="Disable RadixAttention for prefix caching.",
         )
         parser.add_argument(
-            "--disable-
+            "--disable-jump-forward",
             action="store_true",
-            help="Disable
+            help="Disable jump-forward for grammar-guided decoding.",
         )
         parser.add_argument(
             "--disable-cuda-graph",
@@ -600,7 +616,6 @@ class ServerArgs:
         parser.add_argument(
             "--disable-custom-all-reduce",
             action="store_true",
-            default=False,
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
         parser.add_argument(
@@ -670,6 +685,11 @@ class ServerArgs:
             "This can potentially increase throughput but may also increase time-to-first-token latency. "
             "The default value is 1, meaning only run one decoding step at a time.",
         )
+        parser.add_argument(
+            "--delete-ckpt-after-loading",
+            action="store_true",
+            help="Delete the model checkpoint after loading the model.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
```
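For reference, the flags introduced or moved above map one-to-one onto `ServerArgs` fields, so they can also be set programmatically. A sketch only; the model path is a placeholder, and passing the args object straight to `launch_server` is an assumption based on the hunks above rather than a documented API.

```python
from sglang.srt.server import launch_server
from sglang.srt.server_args import ServerArgs

server_args = ServerArgs(
    model_path="meta-llama/Llama-3.2-1B-Instruct",  # placeholder model
    enable_metrics=True,              # new: expose Prometheus metrics at /metrics
    decode_log_interval=40,           # new: how often decode-batch stats are logged
    watchdog_timeout=300,             # default dropped from 600 s to 300 s
    delete_ckpt_after_loading=False,  # new: remove the checkpoint dir after load
)
launch_server(server_args)
```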
sglang/srt/utils.py
CHANGED
```diff
@@ -22,8 +22,12 @@ import logging
 import os
 import pickle
 import random
+import re
 import resource
+import shutil
+import signal
 import socket
+import tempfile
 import time
 import warnings
 from importlib.metadata import PackageNotFoundError, version
@@ -35,9 +39,11 @@ import psutil
 import requests
 import torch
 import torch.distributed as dist
+import triton
 import zmq
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
+from starlette.routing import Mount
 from torch import nn
 from torch.profiler import ProfilerActivity, profile, record_function
 from triton.runtime.cache import (
@@ -379,6 +385,10 @@ def kill_child_process(pid=None, include_self=False, skip_pid=None):
     if include_self:
         try:
             itself.kill()
+
+            # Sometime processes cannot be killed with SIGKILL (e.g, PID=1 launched by kubernetes),
+            # so we send an additional signal to kill them.
+            itself.send_signal(signal.SIGINT)
         except psutil.NoSuchProcess:
             pass
 
@@ -704,3 +714,80 @@ def get_zmq_socket(context: zmq.Context, socket_type: zmq.SocketType, endpoint:
         raise ValueError(f"Unsupported socket type: {socket_type}")
 
     return socket
+
+
+def dump_to_file(dirpath, name, value):
+    from vllm.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() != 0:
+        return
+
+    os.makedirs(dirpath, exist_ok=True)
+    if value.dtype is torch.bfloat16:
+        value = value.float()
+    value = value.cpu().numpy()
+    output_filename = os.path.join(dirpath, f"pytorch_dump_{name}.npy")
+    logger.info(f"Dump a tensor to {output_filename}. Shape = {value.shape}")
+    np.save(output_filename, value)
+
+
+def is_triton_3():
+    return triton.__version__.startswith("3.")
+
+
+def maybe_torch_compile(*args, **kwargs):
+    """
+    torch.compile does not work for triton 2.2.0, which is needed in xlm1's jax.
+    Therefore, we disable it here.
+    """
+
+    def decorator(func):
+        if is_triton_3():
+            return torch.compile(*args, **kwargs)(func)
+        return func
+
+    return decorator
+
+
+def delete_directory(dirpath):
+    try:
+        # This will remove the directory and all its contents
+        shutil.rmtree(dirpath)
+    except OSError as e:
+        print(f"Warning: {dirpath} : {e.strerror}")
+
+
+# Temporary directory for prometheus multiprocess mode
+# Cleaned up automatically when this object is garbage collected
+prometheus_multiproc_dir: tempfile.TemporaryDirectory
+
+
+def set_prometheus_multiproc_dir():
+    # Set prometheus multiprocess directory
+    # sglang uses prometheus multiprocess mode
+    # we need to set this before importing prometheus_client
+    # https://prometheus.github.io/client_python/multiprocess/
+    global prometheus_multiproc_dir
+
+    if "PROMETHEUS_MULTIPROC_DIR" in os.environ:
+        logger.debug("User set PROMETHEUS_MULTIPROC_DIR detected.")
+        prometheus_multiproc_dir = tempfile.TemporaryDirectory(
+            dir=os.environ["PROMETHEUS_MULTIPROC_DIR"]
+        )
+    else:
+        prometheus_multiproc_dir = tempfile.TemporaryDirectory()
+        os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name
+    logger.debug(f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}")
+
+
+def add_prometheus_middleware(app):
+    # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
+    from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess
+
+    registry = CollectorRegistry()
+    multiprocess.MultiProcessCollector(registry)
+    metrics_route = Mount("/metrics", make_asgi_app(registry=registry))
+
+    # Workaround for 307 Redirect for /metrics
+    metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
+    app.routes.append(metrics_route)
```
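The comments in the new helpers spell out an ordering constraint: the multiprocess directory must be chosen before anything imports `prometheus_client`, and only then is the `/metrics` mount attached. A small sketch of that wiring on a bare FastAPI app, assuming a single local uvicorn process (host and port are illustrative):

```python
from fastapi import FastAPI

from sglang.srt.utils import add_prometheus_middleware, set_prometheus_multiproc_dir

# 1) Pick PROMETHEUS_MULTIPROC_DIR before prometheus_client is imported anywhere.
set_prometheus_multiproc_dir()

# 2) Mount /metrics backed by a multiprocess CollectorRegistry.
app = FastAPI()
add_prometheus_middleware(app)

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="127.0.0.1", port=8000)
```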
sglang/test/simple_eval_common.py
CHANGED
```diff
@@ -320,7 +320,7 @@ jinja_env = jinja2.Environment(
 _message_template = """
 <div class="message {{ role }}">
     <div class="role">
-    {{ role }}
+    {{ role }}
     {% if variant %}<span class="variant">({{ variant }})</span>{% endif %}
     </div>
     <div class="content">
```
sglang/test/simple_eval_humaneval.py
CHANGED
```diff
@@ -2,8 +2,8 @@
 
 """
 HumanEval: Evaluating Large Language Models Trained on Code
-Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba
-https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
+Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba
+https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
 """
 
 import random
```
sglang/test/simple_eval_mgsm.py
CHANGED
```diff
@@ -1,10 +1,10 @@
 # Adapted from https://github.com/openai/simple-evals/
 
 """
-MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems.
+MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems.
 Language Models are Multilingual Chain-of-Thought Reasoners
 Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei
-https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp
+https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp
 """
 
 import re
```
sglang/test/test_utils.py
CHANGED
```diff
@@ -27,6 +27,8 @@ from sglang.utils import get_exception_traceback
 
 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
@@ -441,7 +443,7 @@ def popen_launch_server(
                 "Content-Type": "application/json; charset=utf-8",
                 "Authorization": f"Bearer {api_key}",
             }
-            response = requests.get(f"{base_url}/
+            response = requests.get(f"{base_url}/health_generate", headers=headers)
            if response.status_code == 200:
                 return process
         except requests.RequestException:
@@ -636,8 +638,8 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
     return rouge_l_scores
 
 
-STDOUT_FILENAME = "stdout.txt"
 STDERR_FILENAME = "stderr.txt"
+STDOUT_FILENAME = "stdout.txt"
 
 
 def read_output(output_lines):
@@ -742,7 +744,13 @@ def run_mmlu_test(
     finally:
         pass
 
-    run_and_check_memory_leak(
+    run_and_check_memory_leak(
+        workload_func,
+        disable_radix_cache,
+        enable_mixed_chunk,
+        enable_overlap,
+        chunked_prefill_size,
+    )
 
 
 def run_mulit_request_test(
@@ -775,4 +783,10 @@ def run_mulit_request_test(
     with ThreadPoolExecutor(2) as executor:
         list(executor.map(run_one, list(range(4))))
 
-    run_and_check_memory_leak(
+    run_and_check_memory_leak(
+        workload_func,
+        disable_radix_cache,
+        enable_mixed_chunk,
+        enable_overlap,
+        chunked_prefill_size,
+    )
```
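The readiness probe in `popen_launch_server` now polls `/health_generate`, which makes the server actually generate (or embed) one token rather than just answer HTTP. The same check can be run by hand; a sketch, assuming a server is already listening on the default local port (the URL is illustrative):

```python
import requests

base_url = "http://127.0.0.1:30000"  # assumed default port; adjust to your server

# A 200 here means the model completed a one-token request, not merely that HTTP is up.
resp = requests.get(f"{base_url}/health_generate", timeout=30)
print(resp.status_code)
```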
sglang/utils.py
CHANGED
sglang/version.py
CHANGED
```diff
@@ -1 +1 @@
-__version__ = "0.3.5"
+__version__ = "0.3.5.post1"
```
{sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.5
+Version: 0.3.5.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -256,13 +256,14 @@ Requires-Dist: interegular; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
+Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist:
+Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
@@ -291,13 +292,14 @@ Requires-Dist: peft; extra == "test"
 [](https://github.com/sgl-project/sglang/tree/main/LICENSE)
 [](https://github.com/sgl-project/sglang/issues)
 [](https://github.com/sgl-project/sglang/issues)
+[-006BFF)](https://gurubase.io/g/sglang)
 
 </div>
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-
-[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -321,11 +323,13 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.)
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
-##
-See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+## Getting Started
+Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+
+Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
 
 ## Backend: SGLang Runtime (SRT)
 See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
```