sglang 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +13 -1
- sglang/bench_latency.py +10 -5
- sglang/bench_serving.py +50 -26
- sglang/check_env.py +15 -0
- sglang/global_config.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +60 -49
- sglang/lang/chat_template.py +10 -5
- sglang/lang/compiler.py +4 -0
- sglang/lang/interpreter.py +5 -2
- sglang/lang/ir.py +22 -4
- sglang/launch_server.py +8 -1
- sglang/srt/constrained/jump_forward.py +13 -2
- sglang/srt/conversation.py +50 -1
- sglang/srt/hf_transformers_utils.py +22 -23
- sglang/srt/layers/activation.py +24 -2
- sglang/srt/layers/decode_attention.py +338 -50
- sglang/srt/layers/extend_attention.py +3 -1
- sglang/srt/layers/fused_moe/__init__.py +1 -0
- sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
- sglang/srt/layers/fused_moe/layer.py +587 -0
- sglang/srt/layers/layernorm.py +3 -0
- sglang/srt/layers/logits_processor.py +64 -27
- sglang/srt/layers/radix_attention.py +41 -18
- sglang/srt/layers/sampler.py +154 -0
- sglang/srt/managers/controller_multi.py +2 -8
- sglang/srt/managers/controller_single.py +7 -10
- sglang/srt/managers/detokenizer_manager.py +20 -9
- sglang/srt/managers/io_struct.py +44 -11
- sglang/srt/managers/policy_scheduler.py +5 -2
- sglang/srt/managers/schedule_batch.py +59 -179
- sglang/srt/managers/tokenizer_manager.py +193 -84
- sglang/srt/managers/tp_worker.py +131 -50
- sglang/srt/mem_cache/memory_pool.py +82 -8
- sglang/srt/mm_utils.py +79 -7
- sglang/srt/model_executor/cuda_graph_runner.py +97 -28
- sglang/srt/model_executor/forward_batch_info.py +188 -82
- sglang/srt/model_executor/model_runner.py +269 -87
- sglang/srt/models/chatglm.py +6 -14
- sglang/srt/models/commandr.py +6 -2
- sglang/srt/models/dbrx.py +5 -1
- sglang/srt/models/deepseek.py +7 -3
- sglang/srt/models/deepseek_v2.py +12 -7
- sglang/srt/models/gemma.py +6 -2
- sglang/srt/models/gemma2.py +22 -8
- sglang/srt/models/gpt_bigcode.py +5 -1
- sglang/srt/models/grok.py +66 -398
- sglang/srt/models/internlm2.py +5 -1
- sglang/srt/models/llama2.py +7 -3
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/llama_embedding.py +4 -0
- sglang/srt/models/llava.py +176 -59
- sglang/srt/models/minicpm.py +7 -3
- sglang/srt/models/mixtral.py +61 -255
- sglang/srt/models/mixtral_quant.py +6 -5
- sglang/srt/models/qwen.py +7 -4
- sglang/srt/models/qwen2.py +15 -5
- sglang/srt/models/qwen2_moe.py +7 -16
- sglang/srt/models/stablelm.py +6 -2
- sglang/srt/openai_api/adapter.py +149 -58
- sglang/srt/sampling/sampling_batch_info.py +209 -0
- sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -4
- sglang/srt/server.py +107 -71
- sglang/srt/server_args.py +49 -15
- sglang/srt/utils.py +27 -18
- sglang/test/runners.py +38 -38
- sglang/test/simple_eval_common.py +9 -10
- sglang/test/simple_eval_gpqa.py +2 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_math.py +2 -1
- sglang/test/simple_eval_mmlu.py +2 -1
- sglang/test/test_activation.py +55 -0
- sglang/test/test_programs.py +32 -5
- sglang/test/test_utils.py +37 -50
- sglang/version.py +1 -1
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/METADATA +102 -27
- sglang-0.2.14.dist-info/RECORD +114 -0
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
- sglang/launch_server_llavavid.py +0 -29
- sglang/srt/model_loader/model_loader.py +0 -292
- sglang/srt/model_loader/utils.py +0 -275
- sglang-0.2.12.dist-info/RECORD +0 -112
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py
CHANGED
```diff
@@ -2,11 +2,10 @@
 
 import argparse
 import asyncio
-import
+import os
 import subprocess
 import threading
 import time
-import unittest
 from functools import partial
 from typing import Callable, List, Optional
 
@@ -18,10 +17,19 @@ import torch.nn.functional as F
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.srt.utils import kill_child_process
 from sglang.utils import get_exception_traceback
 
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-
+DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+
+if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
+else:
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 1157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:2157"
 
 
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -100,31 +108,8 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
     return pred
 
 
-def call_generate_ginfer(prompt, temperature, max_tokens, stop=None, url=None):
-
-    from ginfer import sampler_pb2, sampler_pb2_grpc
-
-    sampler_channel = grpc.insecure_channel(url.replace("http://", ""))
-    sampler = sampler_pb2_grpc.SamplerStub(sampler_channel)
-
-    if stop is None:
-        stop_strings = None
-    else:
-        stop_strings = [stop]
-
-    sample_request = sampler_pb2.SampleTextRequest(
-        prompt=prompt,
-        settings=sampler_pb2.SampleSettings(
-            max_len=max_tokens,
-            rng_seed=0,
-            temperature=max(temperature, 1e-7),
-            nucleus_p=1,
-            stop_strings=stop_strings,
-        ),
-    )
-    stream = sampler.SampleText(sample_request)
-    response = "".join([x.text for x in stream])
-    return response
+def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
+    raise NotImplementedError()
 
 
 def call_generate_guidance(
@@ -267,7 +252,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
             "vllm",
             "outlines",
             "lightllm",
-            "ginfer",
+            "gserver",
             "guidance",
             "lmql",
             "srt-raw",
@@ -288,7 +273,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
         "lightllm": 22000,
         "lmql": 23000,
         "srt-raw": 30000,
-        "ginfer": 9988,
+        "gserver": 9988,
     }
     args.port = default_port.get(args.backend, None)
     return args
@@ -324,8 +309,8 @@ def _get_call_generate(args: argparse.Namespace):
         return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "srt-raw":
         return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
-    elif args.backend == "ginfer":
-        return partial(call_generate_ginfer, url=f"{args.host}:{args.port}")
+    elif args.backend == "gserver":
+        return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
     elif args.backend == "outlines":
         return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "guidance":
@@ -476,34 +461,36 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
     success = True
 
     for filename in files:
+        global process
 
-        def
-
-
-
-
-
-
-
-            p.join()
+        def run_one_file(filename):
+            filename = os.path.join(os.getcwd(), filename)
+            print(f"\n\nRun:\npython3 {filename}\n\n", flush=True)
+            process = subprocess.Popen(
+                ["python3", filename], stdout=None, stderr=None, env=os.environ
+            )
+            process.wait()
+            return process.returncode
 
         try:
-            run_with_timeout(
-
-
-
+            ret_code = run_with_timeout(
+                run_one_file, args=(filename,), timeout=timeout_per_file
+            )
+            assert ret_code == 0
         except TimeoutError:
-
+            kill_child_process(process.pid)
             time.sleep(5)
             print(
-                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
+                flush=True,
             )
-
+            success = False
+            break
 
     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
 
     return 0 if success else -1
```
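Both the old and new versions of `run_unittest_files` call a `run_with_timeout` helper defined elsewhere in the same file and not shown in this hunk. For orientation, a minimal sketch of the contract the new code relies on, assuming a thread-based helper that returns the callable's result and raises `TimeoutError` on expiry; the actual implementation in `sglang/test/test_utils.py` may differ:

```python
import threading


def run_with_timeout(func, args=(), kwargs=None, timeout=None):
    """Illustrative stand-in for the helper used above: run func in a
    worker thread, return its result, and raise TimeoutError on timeout."""
    ret_value = []

    def _target():
        ret_value.append(func(*args, **(kwargs or {})))

    t = threading.Thread(target=_target, daemon=True)
    t.start()
    t.join(timeout)
    if t.is_alive():
        # The worker is still running; the caller handles cleanup
        # (here, kill_child_process on the spawned test process).
        raise TimeoutError()
    return ret_value[0]
```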
sglang/version.py
CHANGED
```diff
@@ -1 +1 @@
-__version__ = "0.2.12"
+__version__ = "0.2.14"
```
{sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.12
+Version: 0.2.14
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -231,6 +231,7 @@ Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
+Requires-Dist: decord; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf-transfer; extra == "srt"
 Requires-Dist: huggingface-hub; extra == "srt"
@@ -244,12 +245,14 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.5; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
+Requires-Dist: sentence-transformers; extra == "test"
+Requires-Dist: accelerate; extra == "test"
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 
 The core features include:
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism,
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -329,11 +333,63 @@ docker run --gpus all \
     --env "HF_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
+### Method 4: Using docker compose
+
+<details>
+
+> This method is recommended if you plan to serve it as a service.
+> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+
+1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+2. Execute the command `docker compose up -d` in your terminal.
+</details>
+
+### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
+
+
 ### Common Notes
--
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Backend: SGLang Runtime (SRT)
@@ -387,6 +443,13 @@ response = client.chat.completions.create(
     max_tokens=64,
 )
 print(response)
+
+# Text embedding
+response = client.embeddings.create(
+    model="default",
+    input="How are you today",
+)
+print(response)
 ```
 
 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -423,19 +486,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 ### Supported Models
 
+**Generative Models**
+
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
-- LLaVA
-  - `
-  -
-
-  -
-  -
+- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+- LLaVA 1.5 / 1.6 / NeXT
+  - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+  - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - Yi-VL
-  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
 - Command-R
 - DBRX
@@ -443,34 +508,45 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 
+**Embedding Models**
+
+- e5-mistral
+- gte-Qwen2
+  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
 #### Use Models From ModelScope
-
+<details>
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
 export SGLANG_USE_MODELSCOPE=true
 ```
 Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
+```
+
+</details>
 
 #### Run Llama 3.1 405B
+<details>
 
 ```bash
-
+# Run 405B (fp8) on a single node
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
 
-
-
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+# Run 405B (fp16) on two nodes
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph
 
-
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
 ```
 
+</details>
+
 ### Benchmark Performance
 
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -606,7 +682,7 @@ def tip_suggestion(s):
     s += "In summary" + sgl.gen("summary")
 ```
 
-#### Multi
+#### Multi-Modality
 Use `sgl.image` to pass an image as input.
 
 ```python
@@ -660,7 +736,7 @@ def character_gen(s, name):
     s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```
 
-See also [json_decode.py](examples/usage/json_decode.py) for an additional example
+See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
 
 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.
@@ -722,7 +798,6 @@ def chat_example(s):
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
-
 ## Benchmark And Performance
 
 
````
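The README additions above introduce text embeddings through the OpenAI client. As a quick cross-check without that client, a sketch using plain HTTP; it assumes a server launched as in the README on port 30000 and an OpenAI-style `/v1/embeddings` route, which the updated `sglang/srt/openai_api/adapter.py` implies but this diff does not show directly:

```python
import requests

# Assumes a local sglang server started as shown in the README above.
resp = requests.post(
    "http://127.0.0.1:30000/v1/embeddings",
    json={"model": "default", "input": "How are you today"},
)
resp.raise_for_status()
embedding = resp.json()["data"][0]["embedding"]
print(f"embedding dimension: {len(embedding)}")
```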
sglang-0.2.14.dist-info/RECORD
ADDED
```diff
@@ -0,0 +1,114 @@
+sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
+sglang/api.py,sha256=8B_ADgLN2fjo9Ej123hInfHA4wmpUkV0yyErSiRnfAA,6408
+sglang/bench_latency.py,sha256=VEdGBX5vZSngS8AeOdJJRW65BIJsZXhKwAK5z20SZoI,16344
+sglang/bench_serving.py,sha256=J_mMwnmDn0Jt07mzdGAuYOxpockHPLYJFL-kwoaqASY,36527
+sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
+sglang/global_config.py,sha256=nwOjUflwqLQySPUMvk8Hk63TIS6mknh_ODSW3CZ1rJw,1704
+sglang/launch_server.py,sha256=FODfO0DW546dh-u1qDlWtrhsmj6hxkarXXv3cIdgkj8,549
+sglang/utils.py,sha256=zFYGkC4vOUR3sTv1TmQXcsOLZDtDBR3wnjqnDp3xMIs,8352
+sglang/version.py,sha256=3fSLgeJpZq4cUgzAH_CdFzXwJEO3NH_VVDv2pQnmwN0,23
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
+sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
+sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
+sglang/lang/interpreter.py,sha256=-9VjAb5JqlxtBuQUDT08Cj2BW8VbLxTmJACe2cqza-s,30215
+sglang/lang/ir.py,sha256=GRcPsEjnR4k5q5Kf-Rb2YgDBseCTGQoasclhjmQtL8Y,17511
+sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
+sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
+sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
+sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
+sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+sglang/lang/backend/runtime_endpoint.py,sha256=SDlp03EuQEK1eGK4_IaFySWgxlp4wCs3EPewZ6O640E,9549
+sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
+sglang/srt/conversation.py,sha256=Ze2_dTHG6jc04ti7vuOEnoEe1ehvhxCJRpa4EYD0T_8,18494
+sglang/srt/hf_transformers_utils.py,sha256=OP5uBwnWiam6h9QvkBaG-nrDgkEUEwLXy1IWvW7rrRo,11737
+sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
+sglang/srt/model_config.py,sha256=k4OfRV-szWkFaJMIC40JoJGJ75AfYQ2hf4M1dS1aQ-o,6366
+sglang/srt/server.py,sha256=KM6fq1RXbnBr0nWj8IO54T-K14o0iscgdFR4z3uU5C4,19572
+sglang/srt/server_args.py,sha256=GiDyPWCvYA_98mSE9LuvUoEodo9gRnNPPIPn0nFkxUs,18259
+sglang/srt/utils.py,sha256=x9MdBu0e8HAgaNIGuxiMVL7_nh03kl_rWuMnLas_Dgo,24327
+sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
+sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
+sglang/srt/constrained/fsm_cache.py,sha256=QTrBFoZCp2FeigtIakz2MCgQLtvQFXgl2lDPQaGtu9M,2784
+sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
+sglang/srt/layers/activation.py,sha256=4RIgqvAIXPpZV4q0YVbAPVygz_YFAbpI4x47p7LcOw4,1911
+sglang/srt/layers/decode_attention.py,sha256=TPD_608ZX9fQ_HDImifkxG_qcEYmimbEYY8lCBIjFuM,16628
+sglang/srt/layers/extend_attention.py,sha256=h4O0R7PJpAVKS3Vx_583zhrFPD0vv6XqzvOcHBI3zoc,14268
+sglang/srt/layers/layernorm.py,sha256=sI_oveGW4uyFI2LOtWF2yd77wH2k5LGAvUIZuoOn2Oo,2227
+sglang/srt/layers/logits_processor.py,sha256=Zx4eFAkFlThPrmz_-HuCN9SqGLanARm0wdZSVDyASAc,13085
+sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
+sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
+sglang/srt/layers/radix_attention.py,sha256=o5a8r3XQ-oRwaxBlAgzJGv7p3dMbu0LrYsDc4uvpPgA,8338
+sglang/srt/layers/sampler.py,sha256=YVzlrXE6uJoDwFHaZcUyxgUOUdR5a5myZvrRL6qckoA,5544
+sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
+sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
+sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
+sglang/srt/managers/controller_multi.py,sha256=R45ST6oBlIwfUwuibMw0sgTk8iqphb_rFyIdW048JA4,6472
+sglang/srt/managers/controller_single.py,sha256=tnc71OTe8KDYouMdfqgwBT4lX5nZt6Rak9t2GmKtAME,5119
+sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
+sglang/srt/managers/io_struct.py,sha256=4Cs655K4n_F_usu6R3YE5_RdcE0XO9AXQNk5vl2II2c,10534
+sglang/srt/managers/policy_scheduler.py,sha256=7HNUxBKJE444s_bHcPpbnHCygsnH-NIXYNSC2q6mRmc,8584
+sglang/srt/managers/schedule_batch.py,sha256=yW7fkBi31vytfNEkFzs1Z3xzEzLMevXvoCyuoubut3M,25920
+sglang/srt/managers/tokenizer_manager.py,sha256=aaZV7G3-m35pba1meRapqO7bdPjM2Cmkue5lbR_Jv3M,28836
+sglang/srt/managers/tp_worker.py,sha256=DBrrd3QbjzAAvANvPs0zdYogsaFlusGx-IjpDVCP8RA,35976
+sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
+sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
+sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
+sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
+sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=ba4WZhBbkJyZjronzwoDJmoh7l8oz0s5oj_i_3PLzSY,12662
+sglang/srt/model_executor/forward_batch_info.py,sha256=MUcquCqmK-Jc1WNEciREmPj4iZu39tJk0axpexfyEXg,15775
+sglang/srt/model_executor/model_runner.py,sha256=9L0cvNK2ELNfE4L6Hq9-K74ltXYenkFl4UVnY9d9JkU,24205
+sglang/srt/models/chatglm.py,sha256=EaZKaRlsAbSP5rob6vUGqDuJLAY1HC2Oh-jgEUS4ZVY,13634
+sglang/srt/models/commandr.py,sha256=k86ykwWOlxLGaBbGUoMSaXngUxCbMVRbY5AoMOWpbU8,14377
+sglang/srt/models/dbrx.py,sha256=goLJ9Yt-9vxkwhCUFBidvP41H_dYTFsvrMZ4xm4FqGA,14875
+sglang/srt/models/deepseek.py,sha256=aYP6HUgxQbhcQGQEF4vX0ronBF8AirqIFG98EQn0YzY,16220
+sglang/srt/models/deepseek_v2.py,sha256=kzqfZvidRe6uydaMJI40qh_Qg7-gI0oBVH0rdWp7ONg,27218
+sglang/srt/models/gemma.py,sha256=iC424guGOdsYC43xke5_uul9UIY0j6t7lUsDcB_uqa8,12492
+sglang/srt/models/gemma2.py,sha256=JQvM6rYvjmLqdhQIQ9mRAAO1MhnIqTb32CqdL8X0o80,16798
+sglang/srt/models/gpt_bigcode.py,sha256=jaolXlRp1PRHNEQPT-ZZ_cWAQ2us5DiNheSaNQ4Es_c,10418
+sglang/srt/models/grok.py,sha256=FF_eURzXYXe1b39AbGtEPv2yYNzWarjmBsjkgutOkek,15019
+sglang/srt/models/internlm2.py,sha256=VtWATs2eLIqbadYXTPY_vycFIstVk4zg3kxycA9H0Qw,12416
+sglang/srt/models/llama2.py,sha256=JZPvaLSPiFMN-4qlOUBXZxsUsz6XtTGD-bB_fidxcfU,14516
+sglang/srt/models/llama_classification.py,sha256=2zhBJtO9uieVj4Cd94KNiA8M_IdLuILDeTv1rePVJXw,4934
+sglang/srt/models/llama_embedding.py,sha256=NQCQ3MnK3iRohL-UdY5UWxW4LlZ3RQZ7w4mlFOnpVrM,3696
+sglang/srt/models/llava.py,sha256=iuXLJVDWBiYo8zJuDPSSjt2LYqbkg2MAcOFUZO1fOX4,24353
+sglang/srt/models/llavavid.py,sha256=MX7YpqYh5J4BoOnV7vVAIfoOlBFQXYpp8Kpe7WK0ejk,13562
+sglang/srt/models/minicpm.py,sha256=7RZEJ2TCqBL1JmMFVJ3J9DmZHRw0q90st49Wkh-sdL4,14039
+sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
+sglang/srt/models/mixtral.py,sha256=StnGKdRhoweY46M2b2pv-vrfXaNqbhaVU4iKhEkMEfM,13837
+sglang/srt/models/mixtral_quant.py,sha256=O_97UKDYZokFhIBnamWfw0HLhln9_BUk_KfQ-sQnd8s,14286
+sglang/srt/models/qwen.py,sha256=geK88AyEyPbbDvMHJNY8XMSNpsCeu8g9kxnKyiJBpK4,10168
+sglang/srt/models/qwen2.py,sha256=B1qfqukSA3_02Q3tvIxqIg-6kmxdJ36Roxn0WFmnVxQ,12776
+sglang/srt/models/qwen2_moe.py,sha256=JZRd8AzvJgjVlHww1eCMPdF8rzC93X_1rgk3PEWE70M,17499
+sglang/srt/models/stablelm.py,sha256=9feHoiDEXSIe0WCrt4AfWXqxliJwRvr8w4XSnk6ipSI,11573
+sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
+sglang/srt/openai_api/adapter.py,sha256=KaIYqkeguuVNHhpfSBvL7M0wRPhcivRAtuG-DsyXExI,46654
+sglang/srt/openai_api/protocol.py,sha256=knf-nds0XO2LYg-hPM-Ho1f1y2XZIV_Gvg3xcCKLfgQ,9411
+sglang/srt/sampling/sampling_batch_info.py,sha256=encziVWrUDswoay0qfFVALHx_96Vra2mzD6_GHthZ3s,7771
+sglang/srt/sampling/sampling_params.py,sha256=dmjUlTY4VfuRtyc_sR59zMzhkjiTzHmljyTIogCFd0k,5411
+sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
+sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
+sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
+sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
+sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
+sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
+sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
+sglang/test/runners.py,sha256=IOaaNJ4y3GSbUCsnbKZrbZDoBR2_us2zWKWxccfrGlk,7687
+sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
+sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
+sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
+sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
+sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
+sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
+sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
+sglang/test/test_layernorm.py,sha256=VDdoeqGvebUa-l3rDiid6cC7wZq0Phpbm5fxxD0-cpg,1910
+sglang/test/test_programs.py,sha256=V_-Bx3lLkw37P6gDyA7mZCqxlyNMaFLBkRrPMQQQqn4,14909
+sglang/test/test_utils.py,sha256=HD-9rcj7EFS_NX1GQFU5613ITQlZaTK2l9RmqA0F7x4,14380
+sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
+sglang-0.2.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.2.14.dist-info/METADATA,sha256=V3t6L-QOiHsJYTihE9W1YeR_YyRC_ZPZwlWjw0Mymsg,37161
+sglang-0.2.14.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
+sglang-0.2.14.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.2.14.dist-info/RECORD,,
```
sglang/launch_server_llavavid.py
DELETED
```diff
@@ -1,29 +0,0 @@
-"""Launch the inference server for Llava-video model."""
-
-import argparse
-
-from sglang.srt.server import ServerArgs, launch_server
-
-if __name__ == "__main__":
-    model_overide_args = {}
-
-    model_overide_args["mm_spatial_pool_stride"] = 2
-    model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
-    model_overide_args["num_frames"] = 16
-    model_overide_args["model_type"] = "llavavid"
-    if model_overide_args["num_frames"] == 32:
-        model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
-        model_overide_args["max_sequence_length"] = 4096 * 2
-        model_overide_args["tokenizer_model_max_length"] = 4096 * 2
-        model_overide_args["model_max_length"] = 4096 * 2
-
-    parser = argparse.ArgumentParser()
-    ServerArgs.add_cli_args(parser)
-    args = parser.parse_args()
-
-    if "34b" in args.model_path.lower():
-        model_overide_args["image_token_index"] = 64002
-
-    server_args = ServerArgs.from_cli_args(args)
-
-    launch_server(server_args, model_overide_args, None)
```