sglang 0.1.22__tar.gz → 0.1.24__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.22/sglang.egg-info → sglang-0.1.24}/PKG-INFO +9 -5
- {sglang-0.1.22 → sglang-0.1.24}/README.md +7 -3
- {sglang-0.1.22 → sglang-0.1.24}/pyproject.toml +2 -2
- {sglang-0.1.22 → sglang-0.1.24}/sglang/__init__.py +1 -1
- {sglang-0.1.22 → sglang-0.1.24}/sglang/bench_serving.py +243 -25
- {sglang-0.1.22 → sglang-0.1.24}/sglang/global_config.py +3 -2
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/interpreter.py +1 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/hf_transformers_utils.py +13 -1
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/logits_processor.py +4 -5
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/radix_attention.py +38 -49
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/cuda_graph_runner.py +58 -16
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/infer_batch.py +51 -22
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/model_runner.py +7 -4
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/schedule_heuristic.py +8 -3
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/tp_worker.py +9 -11
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/memory_pool.py +13 -5
- sglang-0.1.24/sglang/srt/models/deepseek.py +430 -0
- sglang-0.1.24/sglang/srt/models/gpt_bigcode.py +282 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/llama2.py +19 -10
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/server.py +20 -1
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/server_args.py +12 -6
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/utils.py +49 -0
- {sglang-0.1.22 → sglang-0.1.24/sglang.egg-info}/PKG-INFO +9 -5
- {sglang-0.1.22 → sglang-0.1.24}/sglang.egg-info/SOURCES.txt +2 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang.egg-info/requires.txt +1 -1
- {sglang-0.1.22 → sglang-0.1.24}/LICENSE +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/setup.cfg +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/api.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/anthropic.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/litellm.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/openai.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/runtime_endpoint.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/vertexai.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/bench.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/bench_latency.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/check_env.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/chat_template.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/compiler.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/ir.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/tracer.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/launch_server.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/conversation.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/flush_cache.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/extend_attention.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/fused_moe.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/quantization/fp8.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/token_attention.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/dp_worker.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/manager_multi.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/manager_single.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/radix_cache.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/tokenizer_manager.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/model_config.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/model_loader/model_loader.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/grok.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/llava.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api/adapter.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api/api_adapter.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api/openai_api_adapter.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api/openai_protocol.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api_adapter.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_protocol.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/test/test_conversation.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/test/test_programs.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/test/test_utils.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/utils.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.22
+Version: 0.1.24
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -228,7 +228,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.3.post1; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
@@ -282,6 +282,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
+pip install --upgrade pip setuptools wheel
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -293,6 +294,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
+pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
@@ -390,15 +392,16 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
-- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
 
 # Node 1
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 
 ### Supported Models
 
@@ -420,6 +423,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Mistral NeMo
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
README.md
@@ -37,6 +37,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
+pip install --upgrade pip setuptools wheel
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -48,6 +49,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
+pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
@@ -145,15 +147,16 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
-- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
 
 # Node 1
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 
 ### Supported Models
 
@@ -175,6 +178,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Mistral NeMo
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.1.22"
+version = "0.1.24"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -21,7 +21,7 @@ dependencies = [
 
 [project.optional-dependencies]
 srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
-       "psutil", "pydantic", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.
+       "psutil", "pydantic", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.3.post1", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
sglang/bench_serving.py
@@ -5,6 +5,9 @@ Benchmark online serving.
 
 Usage:
 python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
 """
 
 import argparse
@@ -19,6 +22,7 @@ import traceback
 import warnings
 from argparse import ArgumentParser as FlexibleArgumentParser
 from dataclasses import dataclass, field
+from datetime import datetime
 from typing import AsyncGenerator, List, Optional, Tuple, Union
 
 import aiohttp
@@ -53,12 +57,80 @@ class RequestFuncOutput:
     itl: List[float] = field(default_factory=list)  # List of inter-token latencies
     prompt_len: int = 0
     error: str = ""
+    output_len: int = 0
 
 
 def remove_prefix(text: str, prefix: str) -> str:
     return text[len(prefix) :] if text.startswith(prefix) else text
 
 
+# trt llm not support ignore_eos
+# https://github.com/triton-inference-server/tensorrtllm_backend/issues/505
+async def async_request_trt_llm(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "accumulate_tokens": True,
+            "text_input": request_func_input.prompt,
+            "temperature": 0.000001,
+            "top_p": 1.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+            "min_length": request_func_input.output_len,
+            "end_id": 1048576,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data:")
+
+                        data = json.loads(chunk)
+                        output.generated_text += data["text_output"]
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+                    output.output_len = request_func_input.output_len
+
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 # set ignore_eos True by default
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
@@ -76,7 +148,7 @@ async def async_request_openai_completions(
             "temperature": 0.0,
             "best_of": 1,
             "max_tokens": request_func_input.output_len,
-            "stream":
+            "stream": not args.disable_stream,
            "ignore_eos": True,
        }
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
@@ -99,8 +171,9 @@ async def async_request_openai_completions(
                             continue
 
                         chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
                         if chunk == "[DONE]":
-
+                            pass
                         else:
                             data = json.loads(chunk)
 
@@ -123,6 +196,7 @@ async def async_request_openai_completions(
                     output.generated_text = generated_text
                     output.success = True
                     output.latency = latency
+                    output.output_len = request_func_input.output_len
                 else:
                     output.error = response.reason or ""
                     output.success = False
@@ -167,6 +241,7 @@ ASYNC_REQUEST_FUNCS = {
     "sglang": async_request_openai_completions,
     "vllm": async_request_openai_completions,
     "lmdeploy": async_request_openai_completions,
+    "trt": async_request_trt_llm,
 }
 
 
@@ -175,9 +250,11 @@ class BenchmarkMetrics:
     completed: int
     total_input: int
     total_output: int
+    total_output_retokenized: int
     request_throughput: float
     input_throughput: float
     output_throughput: float
+    output_throughput_retokenized: float
     mean_ttft_ms: float
     median_ttft_ms: float
     std_ttft_ms: float
@@ -190,6 +267,8 @@ class BenchmarkMetrics:
     median_itl_ms: float
     std_itl_ms: float
     p99_itl_ms: float
+    mean_e2e_latency_ms: float
+    median_e2e_latency_ms: float
 
 
 default_sharegpt_path = "ShareGPT_V3_unfiltered_cleaned_split.json"
@@ -384,31 +463,36 @@ def calculate_metrics(
     outputs: List[RequestFuncOutput],
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
+    backend: str,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-
+    output_lens: List[int] = []
+    retokenized_output_lens: List[int] = []
     total_input = 0
     completed = 0
     itls: List[float] = []
     tpots: List[float] = []
     ttfts: List[float] = []
+    e2e_latencies: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
-
-
-
-            # Note : this may inflate the output token count slightly
-            output_len = len(
+            output_len = outputs[i].output_len
+            output_lens.append(output_len)
+            retokenized_output_len = len(
                 tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids
             )
-
+            retokenized_output_lens.append(retokenized_output_len)
             total_input += input_requests[i][1]
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
             itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
+
+            e2e_latencies.append(outputs[i].latency)
+
             completed += 1
         else:
-
+            output_lens.append(0)
+            retokenized_output_lens.append(0)
 
     if completed == 0:
         warnings.warn(
@@ -419,10 +503,12 @@ def calculate_metrics(
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
-        total_output=sum(
+        total_output=sum(output_lens),
+        total_output_retokenized=sum(retokenized_output_lens),
         request_throughput=completed / dur_s,
         input_throughput=total_input / dur_s,
-        output_throughput=sum(
+        output_throughput=sum(output_lens) / dur_s,
+        output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
         mean_ttft_ms=np.mean(ttfts or 0)
         * 1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
@@ -436,9 +522,11 @@ def calculate_metrics(
         median_itl_ms=np.median(itls or 0) * 1000,
         std_itl_ms=np.std(itls or 0) * 1000,
         p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+        mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
+        median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
     )
 
-    return metrics,
+    return metrics, output_lens
 
 
 async def benchmark(
@@ -449,6 +537,7 @@ async def benchmark(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
     disable_tqdm: bool,
+    enable_multi: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -498,19 +587,26 @@ async def benchmark(
 
     benchmark_duration = time.perf_counter() - benchmark_start_time
 
-    metrics,
+    metrics, output_lens = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
         tokenizer=tokenizer,
+        backend=backend,
     )
 
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+    print("{:<40} {:<10}".format("Backend:", backend))
     print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
     print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+    print(
+        "{:<40} {:<10}".format(
+            "Total generated tokens (retokenized):", metrics.total_output_retokenized
+        )
+    )
     print(
         "{:<40} {:<10.2f}".format(
             "Request throughput (req/s):", metrics.request_throughput
@@ -526,6 +622,15 @@ async def benchmark(
             "Output token throughput (tok/s):", metrics.output_throughput
         )
     )
+    print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
+    print(
+        "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Median E2E Latency (ms):", metrics.median_e2e_latency_ms
+        )
+    )
     print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
     print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
     print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
@@ -542,11 +647,53 @@ async def benchmark(
     print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
     print("=" * 50)
 
+    if (
+        metrics.median_ttft_ms is not None
+        and metrics.mean_itl_ms is not None
+        and metrics.output_throughput is not None
+    ):
+        result = {
+            "backend": args.backend,
+            "dataset_name": args.dataset_name,
+            "request_rate": request_rate,
+            "total_input": metrics.total_input,
+            "total_output": metrics.total_output,
+            "total_output_retokenized": metrics.total_output_retokenized,
+            "mean_e2e_latency": metrics.mean_e2e_latency_ms,
+            "median_e2e_latency": metrics.median_e2e_latency_ms,
+            "median_ttft": metrics.median_ttft_ms,
+            "median_itl": metrics.median_itl_ms,
+            "output_token_throughput": metrics.output_throughput,
+            "sharegpt_output_len": args.sharegpt_output_len,
+            "random_input_len": args.random_input_len,
+            "random_output_len": args.random_output_len,
+            "random_range_ratio": args.random_range_ratio,
+            "benchmark_duration": benchmark_duration,
+        }
+    else:
+        print(f"Error running benchmark for request rate: {request_rate}")
+        print("-" * 30)
+
+    # Determine output file name
+    if args.output_file:
+        output_file_name = args.output_file
+    else:
+        now = datetime.now().strftime("%m%d")
+        if args.dataset_name == "random":
+            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
+        else:
+            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
+
+    # Append results to a JSONL file
+    with open(output_file_name, "a") as file:
+        file.write(json.dumps(result) + "\n")
+
     result = {
         "duration": benchmark_duration,
         "completed": metrics.completed,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
+        "total_output_tokens_retokenized": metrics.total_output_retokenized,
         "request_throughput": metrics.request_throughput,
         "input_throughput": metrics.input_throughput,
         "output_throughput": metrics.output_throughput,
@@ -563,15 +710,34 @@ async def benchmark(
         "std_itl_ms": metrics.std_itl_ms,
         "p99_itl_ms": metrics.p99_itl_ms,
         "input_lens": [output.prompt_len for output in outputs],
-        "output_lens":
+        "output_lens": output_lens,
         "ttfts": [output.ttft for output in outputs],
         "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],
         "errors": [output.error for output in outputs],
+        "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+        "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
     }
     return result
 
 
+def parse_request_rate_range(request_rate_range):
+    if len(request_rate_range.split(",")) == 3:
+        start, stop, step = map(int, request_rate_range.split(","))
+        return list(range(start, stop, step))
+    else:
+        return list(map(int, request_rate_range.split(",")))
+
+
+def check_chat_template(model_path):
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        return "chat_template" in tokenizer.init_kwargs
+    except Exception as e:
+        print(f"Fail to load tokenizer config with error={e}")
+        return False
+
+
 def fire(args: argparse.Namespace):
     random.seed(args.seed)
     np.random.seed(args.seed)
@@ -581,6 +747,7 @@ def fire(args: argparse.Namespace):
         "sglang": 30000,
         "lmdeploy": 23333,
         "vllm": 8000,
+        "trt": 8000,
     }.get(args.backend, 30000)
 
     api_url = (
@@ -594,6 +761,16 @@ def fire(args: argparse.Namespace):
         else f"http://{args.host}:{args.port}/v1/models"
     )
 
+    if args.backend == "trt":
+        api_url = (
+            f"{args.base_url}/v2/models/ensemble/generate_stream"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v2/models/ensemble/generate_stream"
+        )
+        if args.model is None:
+            print("Please provide a model using `--model` when using `trt` backend.")
+            sys.exit(1)
+
     if args.model is None:
         try:
             response = requests.get(model_url)
@@ -610,6 +787,12 @@ def fire(args: argparse.Namespace):
         print("No model specified or found. Please provide a model using `--model`.")
         sys.exit(1)
 
+    if not check_chat_template(args.model):
+        print(
+            "\nWARNING It is recommended to use the `Chat` or `Instruct` model for benchmarking.\n"
+            "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
+        )
+
     print(f"{args}\n")
 
     backend = args.backend
@@ -637,17 +820,35 @@ def fire(args: argparse.Namespace):
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
-
-
-
-
-
-
-
-
-
+    if args.multi:
+        request_rates = parse_request_rate_range(args.request_rate_range)
+
+        for rate in request_rates:
+            asyncio.run(
+                benchmark(
+                    backend=backend,
+                    api_url=api_url,
+                    model_id=model_id,
+                    tokenizer=tokenizer,
+                    input_requests=input_requests,
+                    request_rate=rate,
+                    disable_tqdm=args.disable_tqdm,
+                    enable_multi=args.multi,
+                )
+            )
+    else:
+        asyncio.run(
+            benchmark(
+                backend=backend,
+                api_url=api_url,
+                model_id=model_id,
+                tokenizer=tokenizer,
+                input_requests=input_requests,
+                request_rate=args.request_rate,
+                disable_tqdm=args.disable_tqdm,
+                enable_multi=args.multi,
+            )
         )
-    )
 
 
 # to avoid relying on SGLang's components
@@ -751,6 +952,23 @@ if __name__ == "__main__":
         action="store_true",
         help="Specify to disable tqdm progress bar.",
     )
+    parser.add_argument(
+        "--multi",
+        action="store_true",
+        help="Use request rate range rather than single value.",
+    )
+    parser.add_argument(
+        "--request-rate-range",
+        type=str,
+        default="2,34,2",
+        help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
+    )
+    parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+    parser.add_argument(
+        "--disable-stream",
+        action="store_true",
+        help="Disable streaming mode.",
+    )
 
     set_ulimit()
 
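The new `--multi` mode sweeps a set of request rates parsed by the `parse_request_rate_range` helper added above. A minimal sketch of that parsing rule, mirroring the function from the diff (the example inputs shown here are illustrative, not taken from the package):

```python
# Sketch of the request-rate parsing rule added in sglang/bench_serving.py.
# Three comma-separated values are treated as start,stop,step; any other
# count is treated as an explicit list of request rates.
def parse_request_rate_range(request_rate_range: str):
    if len(request_rate_range.split(",")) == 3:
        start, stop, step = map(int, request_rate_range.split(","))
        return list(range(start, stop, step))
    else:
        return list(map(int, request_rate_range.split(",")))


print(parse_request_rate_range("2,34,2"))         # default: [2, 4, 6, ..., 32]
print(parse_request_rate_range("1,2,4,8,16,32"))  # explicit list: [1, 2, 4, 8, 16, 32]
```

With `--multi`, the benchmark loop runs once per parsed rate and appends one JSON line per run to the output file named by `--output-file` (or the auto-generated `*.jsonl` name).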
sglang/global_config.py
@@ -16,9 +16,9 @@ class GlobalConfig:
         self.wait_for_new_request_delay = 0.0006
 
         # Runtime constants: New generation token ratio estimation
-        self.
+        self.init_new_token_ratio = 0.7
         self.base_min_new_token_ratio = 0.2
-        self.new_token_ratio_decay = 0.
+        self.new_token_ratio_decay = 0.001
         self.new_token_ratio_recovery = 0.05
 
         # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
@@ -27,6 +27,7 @@ class GlobalConfig:
 
         # Runtime constants: others
         self.num_continue_decode_steps = 10
+        self.retract_decode_steps = 20
         self.flashinfer_workspace_size = 192 * 1024 * 1024
 
         # Output tokenization configs
sglang/lang/interpreter.py
@@ -288,6 +288,7 @@ class StreamExecutor:
             exes[i].text_ = str(self.text_)
             exes[i].messages_ = list(self.messages_)
             exes[i].cur_role = self.cur_role
+            exes[i].cur_role_begin_pos = self.cur_role_begin_pos
             exes[i].fork_start_text_pos = len(self.text_)
             exes[i].images_ = list(self.images_)
 
sglang/srt/hf_transformers_utils.py
@@ -4,19 +4,26 @@ import functools
 import json
 import os
 import warnings
-from typing import AbstractSet, Collection, Literal, Optional, Union
+from typing import AbstractSet, Collection, Dict, Literal, Optional, Type, Union
 
 from huggingface_hub import snapshot_download
 from transformers import (
     AutoConfig,
     AutoProcessor,
     AutoTokenizer,
+    PretrainedConfig,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
 )
+from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig
 
 from sglang.srt.utils import is_multimodal_model
 
+_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
+    ChatGLMConfig.model_type: ChatGLMConfig,
+    DbrxConfig.model_type: DbrxConfig,
+}
+
 
 def download_from_hf(model_path: str):
     if os.path.exists(model_path):
@@ -40,6 +47,9 @@ def get_config(
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision
     )
+    if config.model_type in _CONFIG_REGISTRY:
+        config_class = _CONFIG_REGISTRY[config.model_type]
+        config = config_class.from_pretrained(model, revision=revision)
     if model_overide_args:
         config.update(model_overide_args)
     return config
@@ -63,6 +73,8 @@ def get_context_length(config):
     rope_scaling = getattr(config, "rope_scaling", None)
     if rope_scaling:
         rope_scaling_factor = config.rope_scaling["factor"]
+        if config.rope_scaling["rope_type"] == "llama3":
+            rope_scaling_factor = 1
     else:
         rope_scaling_factor = 1
 
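For readers skimming the `hf_transformers_utils.py` hunks: the new `_CONFIG_REGISTRY` lets `get_config` replace a generically loaded Hugging Face config with a model-specific class imported from vLLM when the `model_type` matches. A minimal standalone sketch of that dispatch pattern follows; it assumes `transformers` and `vllm` are installed, and the `load_config` name is illustrative rather than the packaged function itself.

```python
# Illustrative sketch of the config-registry dispatch added to get_config();
# not the packaged function itself.
from typing import Dict, Optional, Type

from transformers import AutoConfig, PretrainedConfig
from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig

_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
    ChatGLMConfig.model_type: ChatGLMConfig,
    DbrxConfig.model_type: DbrxConfig,
}


def load_config(model: str, trust_remote_code: bool = False, revision: Optional[str] = None):
    # Resolve the config generically first, then swap in the registered
    # class when the model_type is one of the special cases above.
    config = AutoConfig.from_pretrained(
        model, trust_remote_code=trust_remote_code, revision=revision
    )
    if config.model_type in _CONFIG_REGISTRY:
        config_class = _CONFIG_REGISTRY[config.model_type]
        config = config_class.from_pretrained(model, revision=revision)
    return config
```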