sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +79 -53
- sglang/bench_serving.py +186 -14
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +12 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/conversation.py +38 -5
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/launch_lb.py +0 -13
- sglang/srt/disaggregation/mini_lb.py +33 -8
- sglang/srt/disaggregation/prefill.py +1 -1
- sglang/srt/distributed/parallel_state.py +24 -14
- sglang/srt/entrypoints/engine.py +19 -12
- sglang/srt/entrypoints/http_server.py +174 -34
- sglang/srt/entrypoints/openai/protocol.py +87 -24
- sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- sglang/srt/eplb/eplb_manager.py +26 -2
- sglang/srt/eplb/expert_distribution.py +29 -2
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/harmony_parser.py +588 -0
- sglang/srt/hf_transformers_utils.py +26 -7
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention/ascend_backend.py +374 -136
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- sglang/srt/layers/communicator.py +1 -2
- sglang/srt/layers/layernorm.py +28 -3
- sglang/srt/layers/linear.py +3 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +13 -13
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/topk.py +35 -12
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- sglang/srt/layers/quantization/fp8.py +2 -1
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- sglang/srt/layers/quantization/mxfp4.py +25 -27
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- sglang/srt/layers/rotary_embedding.py +28 -1
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/managers/cache_controller.py +237 -204
- sglang/srt/managers/detokenizer_manager.py +48 -2
- sglang/srt/managers/io_struct.py +57 -0
- sglang/srt/managers/mm_utils.py +5 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
- sglang/srt/managers/scheduler.py +94 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/tokenizer_manager.py +122 -42
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +51 -23
- sglang/srt/mem_cache/hiradix_cache.py +87 -71
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +77 -14
- sglang/srt/mem_cache/memory_pool_host.py +4 -5
- sglang/srt/mem_cache/radix_cache.py +6 -4
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
- sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- sglang/srt/model_executor/model_runner.py +6 -5
- sglang/srt/model_loader/loader.py +15 -24
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/models/deepseek_v2.py +38 -13
- sglang/srt/models/gpt_oss.py +2 -15
- sglang/srt/models/llama_eagle3.py +4 -0
- sglang/srt/models/longcat_flash.py +1015 -0
- sglang/srt/models/longcat_flash_nextn.py +691 -0
- sglang/srt/models/qwen2.py +26 -3
- sglang/srt/models/qwen2_5_vl.py +66 -41
- sglang/srt/models/qwen2_moe.py +22 -2
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/reasoning_parser.py +56 -300
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/server_args.py +122 -56
- sglang/srt/speculative/eagle_worker.py +28 -8
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +73 -5
- sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py
CHANGED
@@ -61,6 +61,7 @@ from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.distributed.parallel_state import destroy_distributed_environment
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.layers.moe import initialize_moe_config
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.managers.scheduler import Scheduler
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -509,6 +510,8 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    initialize_moe_config(server_args)
+
     # Set CPU affinity
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
         set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
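The only functional change above is that `latency_test` now calls `initialize_moe_config(server_args)` before any benchmark work starts. A minimal sketch of that call order, using the real sglang imports from the diff but a hypothetical `prepare_benchmark` helper in place of the benchmark body:

```python
# Sketch only: illustrates the new setup order, not the benchmark itself.
# `prepare_benchmark` is a made-up name; the two imports are the ones referenced in the diff.
from sglang.srt.layers.moe import initialize_moe_config
from sglang.srt.server_args import ServerArgs


def prepare_benchmark(server_args: ServerArgs) -> None:
    # MoE runtime settings are derived from server_args up front,
    # before the scheduler or model runner is constructed.
    initialize_moe_config(server_args)
```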
sglang/bench_one_batch_server.py
CHANGED
@@ -18,7 +18,7 @@ import json
 import multiprocessing
 import os
 import time
-from typing import Tuple
+from typing import List, Tuple
 
 import requests
 
@@ -45,6 +45,7 @@ class BenchArgs:
     skip_warmup: bool = False
     show_report: bool = False
     profile: bool = False
+    profile_steps: int = 3
     profile_by_stage: bool = False
 
     @staticmethod
@@ -78,6 +79,9 @@ class BenchArgs:
         parser.add_argument("--skip-warmup", action="store_true")
         parser.add_argument("--show-report", action="store_true")
         parser.add_argument("--profile", action="store_true")
+        parser.add_argument(
+            "--profile-steps", type=int, default=BenchArgs.profile_steps
+        )
         parser.add_argument("--profile-by-stage", action="store_true")
 
     @classmethod
@@ -132,6 +136,7 @@ def run_one_case(
     result_filename: str,
     tokenizer,
     profile: bool = False,
+    profile_steps: int = 3,
     profile_by_stage: bool = False,
 ):
     requests.post(url + "/flush_cache")
@@ -162,7 +167,7 @@ def run_one_case(
     profile_link = None
     if profile:
         profile_link: str = run_profile(
-            url,
+            url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
         )
 
     tic = time.perf_counter()
@@ -247,6 +252,71 @@ def run_one_case(
     )
 
 
+def get_report_summary(
+    result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
+):
+    import tabulate
+
+    summary = (
+        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
+    )
+
+    headers = [
+        "batch size",
+        "latency (s)",
+        "input throughput (tok/s)",
+        "output throughput (tok/s)",
+        "acc length",
+        "ITL (ms)",
+        "input cost ($/1M)",
+        "output cost ($/1M)",
+    ]
+    if bench_args.profile:
+        headers.append("profile")
+    rows = []
+
+    for (
+        batch_size,
+        latency,
+        ttft,
+        input_throughput,
+        output_throughput,
+        _,
+        _,
+        acc_length,
+        trace_link,
+    ) in result:
+        if is_blackwell():
+            hourly_cost_per_gpu = 4  # $4/hour for one B200
+        else:
+            hourly_cost_per_gpu = 2  # $2/hour for one H100
+
+        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
+        input_util = 0.7
+        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
+        itl = 1 / (output_throughput / batch_size) * 1000
+        input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost
+        output_cost = 1e6 / output_throughput / 3600 * hourly_cost
+        row = [
+            batch_size,
+            latency,
+            input_throughput,
+            output_throughput,
+            accept_length,
+            itl,
+            input_cost,
+            output_cost,
+        ]
+        if trace_link:
+            row.append(f"[Profile]({trace_link})")
+        rows.append(row)
+
+    summary += tabulate.tabulate(
+        rows, headers=headers, tablefmt="github", floatfmt=".2f"
+    )
+    return summary
+
+
 def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if bench_args.base_url:
         proc, base_url = None, bench_args.base_url
@@ -321,6 +391,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                 result_filename=bench_args.result_filename,
                 tokenizer=tokenizer,
                 profile=bench_args.profile,
+                profile_steps=bench_args.profile_steps,
                 profile_by_stage=bench_args.profile_by_stage,
             )[-1],
         )
@@ -337,63 +408,14 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if not bench_args.show_report:
         return
 
-    summary = (
-        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
-    )
-    summary += "| batch size | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"
-
-    if bench_args.profile:
-        summary += " profile |"
-
-    summary += "\n"
-    summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"
-
-    if bench_args.profile:
-        summary += "-------------|"
-    summary += "\n"
-
-    for (
-        batch_size,
-        latency,
-        ttft,
-        input_throughput,
-        output_throughput,
-        overall_throughput,
-        last_gen_throughput,
-        acc_length,
-        trace_link,
-    ) in result:
-        if is_blackwell():
-            hourly_cost_per_gpu = 4  # $4/hour for one B200
-        else:
-            hourly_cost_per_gpu = 2  # $2/hour for one H100
-
-        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
-        input_util = 0.7
-        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
-        line = (
-            f"| {batch_size} | "
-            f"{latency:.2f} | "
-            f"{input_throughput:.2f} | "
-            f"{output_throughput:.2f} | "
-            f"{accept_length} | "
-            f"{1 / (output_throughput/batch_size) * 1000:.2f} | "
-            f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
-            f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |"
-        )
-        if trace_link:
-            line += f" [Profile]({trace_link}) |"
-        line += "\n"
-        summary += line
-
-    # print metrics table
+    summary = get_report_summary(result, server_args, bench_args)
     print(summary)
 
     if is_in_ci():
         write_github_step_summary(summary)
 
 
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
@@ -402,3 +424,7 @@ if __name__ == "__main__":
     bench_args = BenchArgs.from_cli_args(args)
 
     run_benchmark(server_args, bench_args)
+
+
+if __name__ == "__main__":
+    main()
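Beyond the new `--profile-steps` flag (default 3, forwarded to `run_profile`) and the `main()` entry point, the report is now built by `get_report_summary()` with `tabulate` instead of hand-assembled markdown. A standalone sketch of that formatting call; the row values are invented and `pip install tabulate` is assumed:

```python
# Mimics how get_report_summary() renders its table (tablefmt="github", floatfmt=".2f").
import tabulate

headers = ["batch size", "latency (s)", "input throughput (tok/s)", "output throughput (tok/s)"]
rows = [
    [1, 1.84, 4096.31, 88.72],    # illustrative numbers only
    [8, 2.57, 18731.55, 603.18],
]
print(tabulate.tabulate(rows, headers=headers, tablefmt="github", floatfmt=".2f"))
```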
sglang/bench_serving.py
CHANGED
@@ -12,6 +12,8 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 
 import argparse
 import asyncio
+import base64
+import io
 import json
 import os
 import pickle
@@ -71,7 +73,7 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
-    image_data: str
+    image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]
 
 
@@ -289,16 +291,19 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     if request_func_input.image_data:
+        # Build multi-image content: a list of image_url entries followed by the text
+        content_items = [
+            {
+                "type": "image_url",
+                "image_url": {"url": img_url},
+            }
+            for img_url in request_func_input.image_data
+        ]
+        content_items.append({"type": "text", "text": request_func_input.prompt})
        messages = [
            {
                "role": "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": request_func_input.image_data},
-                    },
-                    {"type": "text", "text": request_func_input.prompt},
-                ],
+                "content": content_items,
            },
        ]
    else:
@@ -497,7 +502,7 @@ async def async_request_sglang_generate(
         **request_func_input.extra_request_body,
     }
 
-    # Add image data if available
+    # Add image data if available (list of image urls/base64)
     if request_func_input.image_data:
         payload["image_data"] = request_func_input.image_data
 
@@ -648,7 +653,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random"):
+    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -659,6 +664,18 @@ def get_dataset(args, tokenizer):
             random_sample=args.dataset_name == "random",
             return_text=not tokenize_prompt,
         )
+    elif args.dataset_name == "random-image":
+        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
+        input_requests = sample_random_image_requests(
+            num_requests=args.num_prompts,
+            num_images=args.random_image_num_images,
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            apply_chat_template=args.apply_chat_template,
+            image_resolution=args.random_image_resolution,
+        )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
         input_requests = sample_generated_shared_prefix_requests(
@@ -790,7 +807,7 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
-    image_data: Optional[str] = None
+    image_data: Optional[List[str]] = None
 
 
 def sample_mmmu_requests(
@@ -913,7 +930,7 @@ def sample_mmmu_requests(
                 prompt=prompt,
                 prompt_len=prompt_len,
                 output_len=output_len,
-                image_data=image_data,
+                image_data=[image_data],
             )
         )
 
@@ -1113,6 +1130,132 @@ def sample_random_requests(
     return input_requests
 
 
+def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+    """Parse image resolution into (width, height).
+
+    Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
+    (e.g., '1080x1920' means height=1080, width=1920).
+    """
+    resolution_to_size = {
+        "4k": (3840, 2160),
+        "1080p": (1920, 1080),
+        "720p": (1280, 720),
+        "360p": (640, 360),
+    }
+    if image_resolution in resolution_to_size:
+        return resolution_to_size[image_resolution]
+
+    res = image_resolution.strip().lower()
+    if "x" in res:
+        parts = res.split("x")
+        if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
+            height = int(parts[0])
+            width = int(parts[1])
+            if height > 0 and width > 0:
+                return (width, height)
+
+    raise ValueError(
+        f"Unsupported random-image resolution: {image_resolution}. "
+        "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
+    )
+
+
+def sample_random_image_requests(
+    num_requests: int,
+    num_images: int,
+    input_len: int,
+    output_len: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    apply_chat_template: bool = True,
+    image_resolution: str = "1080p",
+) -> List[DatasetRow]:
+    """Generate requests with random images.
+
+    - Each request includes ``num_images`` random images.
+    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
+      or custom 'heightxwidth' (e.g., 1080x1920).
+    - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
+      only counts text tokens and excludes image data.
+    """
+    try:
+        import pybase64
+        from PIL import Image
+    except ImportError as e:
+        raise ImportError(
+            "Please install Pillow to generate random images: pip install pillow"
+        ) from e
+
+    # Parse resolution (supports presets and 'heightxwidth')
+    width, height = parse_random_image_resolution(image_resolution)
+
+    # Check for potentially problematic combinations and warn user
+    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+        warnings.warn(
+            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"may take a long time. Consider reducing resolution or image count.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # Sample text lengths
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio), output_len + 1, size=num_requests
+    )
+
+    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
+        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr, mode="RGB")
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=85)
+        encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
+        return f"data:image/jpeg;base64,{encoded}"
+
+    dataset: List[DatasetRow] = []
+    for i in range(num_requests):
+        # Generate text prompt
+        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+
+        # Generate image list
+        images = [_gen_random_image_data_uri() for _ in range(num_images)]
+
+        prompt_str = text_prompt
+        if apply_chat_template:
+            try:
+                content_items = [
+                    {"type": "image_url", "image_url": {"url": img_url}}
+                    for img_url in images
+                ]
+                content_items.append({"type": "text", "text": text_prompt})
+                prompt_str = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": content_items}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+            except Exception:
+                # Some tokenizers do not support list content; fall back to a placeholder in the text
+                prompt_str = f"<image>{text_prompt}"
+
+        prompt_token_ids = tokenizer.encode(prompt_str)
+        prompt_token_len = len(prompt_token_ids)
+
+        dataset.append(
+            DatasetRow(
+                prompt=prompt_str,
+                prompt_len=prompt_token_len,
+                output_len=int(output_lens[i]),
+                image_data=images,
+            )
+        )
+
+    print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
+    print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    return dataset
+
+
 def gen_prompt(tokenizer, token_num):
     """Generate a random prompt of specified token length using tokenizer vocabulary."""
     all_available_tokens = list(tokenizer.get_vocab().values())
@@ -1579,7 +1722,13 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name.startswith("random"):
+        if args.dataset_name == "random-image":
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
+                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
+                f"{args.random_image_resolution}.jsonl"
+            )
+        elif args.dataset_name.startswith("random"):
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
        else:
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1819,7 +1968,14 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
+        choices=[
+            "sharegpt",
+            "random",
+            "random-ids",
+            "generated-shared-prefix",
+            "mmmu",
+            "random-image",
+        ],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
@@ -1872,6 +2028,22 @@ if __name__ == "__main__":
         help="Range of sampled ratio of input/output length, "
         "used only for random dataset.",
     )
+    # random-image dataset args
+    parser.add_argument(
+        "--random-image-num-images",
+        type=int,
+        default=1,
+        help="Number of images per request (only available with the random-image dataset)",
+    )
+    parser.add_argument(
+        "--random-image-resolution",
+        type=str,
+        default="1080p",
+        help=(
+            "Resolution of random images for random-image dataset. "
+            "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
+        ),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
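The new `random-image` dataset (enabled with `--dataset-name random-image` and sized by `--random-image-num-images` / `--random-image-resolution`) attaches a list of base64 data URIs to every request. A standalone sketch of how one such image is produced, using the stdlib `base64` module instead of the optional `pybase64` that `bench_serving` prefers:

```python
# Sketch of the image generation inside sample_random_image_requests:
# a random RGB array, JPEG-encoded, wrapped in a data: URI.
import base64
import io

import numpy as np
from PIL import Image


def random_image_data_uri(width: int = 640, height: int = 360) -> str:
    arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
    buf = io.BytesIO()
    Image.fromarray(arr, mode="RGB").save(buf, format="JPEG", quality=85)
    return "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")


print(random_image_data_uri()[:60], "...")
```

Each `DatasetRow.image_data` now holds a list of such URIs, and the OpenAI chat backend sends them as `image_url` content items ahead of the text prompt.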
sglang/profiler.py
CHANGED
sglang/srt/configs/__init__.py
CHANGED
@@ -5,6 +5,7 @@ from sglang.srt.configs.exaone import ExaoneConfig
 from sglang.srt.configs.janus_pro import MultiModalityConfig
 from sglang.srt.configs.kimi_vl import KimiVLConfig
 from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+from sglang.srt.configs.longcat_flash import LongcatFlashConfig
 from sglang.srt.configs.step3_vl import (
     Step3TextConfig,
     Step3VisionEncoderConfig,
@@ -16,6 +17,7 @@ __all__ = [
     "ChatGLMConfig",
     "DbrxConfig",
     "DeepseekVL2Config",
+    "LongcatFlashConfig",
     "MultiModalityConfig",
     "KimiVLConfig",
     "MoonViTConfig",
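With the re-export in place, the new config class (added in `sglang/srt/configs/longcat_flash.py`, next diff) is importable from the package root. A quick check, assuming an installation that includes this version:

```python
# Assumes sglang >= 0.5.2rc0 is installed; prints "longcat_flash".
from sglang.srt.configs import LongcatFlashConfig

print(LongcatFlashConfig().model_type)
```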
sglang/srt/configs/longcat_flash.py
ADDED
@@ -0,0 +1,104 @@
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+FLASH_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class LongcatFlashConfig(PretrainedConfig):
+    model_type = "longcat_flash"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=131072,
+        hidden_size=6144,
+        intermediate_size=None,
+        ffn_hidden_size=12288,
+        expert_ffn_hidden_size=2048,
+        num_layers=28,
+        num_hidden_layers=None,
+        num_attention_heads=64,
+        ep_size=1,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=128,
+        qk_nope_head_dim=128,
+        v_head_dim=128,
+        n_routed_experts=512,
+        moe_topk=12,
+        norm_topk_prob=False,
+        max_position_embeddings=131072,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mla_scale_q_lora=True,
+        mla_scale_kv_lora=True,
+        torch_dtype="bfloat16",
+        params_dtype="bfloat16",
+        rounter_params_dtype="float32",
+        router_bias=False,
+        topk_method=None,
+        routed_scaling_factor=6.0,
+        zero_expert_num=256,
+        zero_expert_type="identity",
+        nextn_use_scmoe=False,
+        num_nextn_predict_layers=1,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            torch_dtype=torch_dtype,
+            params_dtype=params_dtype,
+            rounter_params_dtype=rounter_params_dtype,
+            topk_method=topk_method,
+            router_bias=router_bias,
+            nextn_use_scmoe=nextn_use_scmoe,
+            num_nextn_predict_layers=num_nextn_predict_layers,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = (
+            num_hidden_layers if num_hidden_layers is not None else num_layers
+        )
+        self.intermediate_size = (
+            intermediate_size if intermediate_size is not None else ffn_hidden_size
+        )
+        self.moe_intermediate_size = expert_ffn_hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.ep_size = ep_size
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.n_routed_experts = n_routed_experts
+        self.moe_topk = moe_topk
+        self.norm_topk_prob = norm_topk_prob
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mla_scale_q_lora = mla_scale_q_lora
+        self.mla_scale_kv_lora = mla_scale_kv_lora
+        self.zero_expert_num = zero_expert_num
+        self.zero_expert_type = zero_expert_type
+        self.routed_scaling_factor = routed_scaling_factor
+        self.hidden_act = "silu"
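A hedged instantiation sketch of the class above: it exposes DeepSeek-style MLA/MoE fields (`kv_lora_rank`, `q_lora_rank`, `moe_topk`, ...) and falls back to the checkpoint's own names (`num_layers`, `ffn_hidden_size`, `expert_ffn_hidden_size`) when the HF-style ones are not given. The values below are arbitrary:

```python
from sglang.srt.configs.longcat_flash import LongcatFlashConfig

cfg = LongcatFlashConfig(num_layers=2, ffn_hidden_size=4096, expert_ffn_hidden_size=1024)
print(cfg.num_hidden_layers)      # 2     (falls back to num_layers)
print(cfg.intermediate_size)      # 4096  (falls back to ffn_hidden_size)
print(cfg.moe_intermediate_size)  # 1024  (taken from expert_ffn_hidden_size)
```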
sglang/srt/configs/model_config.py
CHANGED
@@ -132,6 +132,13 @@ class ModelConfig:
         if is_draft_model and self.hf_config.architectures[0] == "Glm4MoeForCausalLM":
             self.hf_config.architectures[0] = "Glm4MoeForCausalLMNextN"
 
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "LongcatFlashForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "LongcatFlashForCausalLMNextN"
+            self.hf_config.num_hidden_layers = self.hf_config.num_nextn_predict_layers
+
         if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
             self.hf_config.architectures[0] = "MiMoMTP"
         if (
@@ -199,6 +206,8 @@ class ModelConfig:
             "DeepseekV2ForCausalLM" in self.hf_config.architectures
             or "DeepseekV3ForCausalLM" in self.hf_config.architectures
             or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
+            or "LongcatFlashForCausalLM" in self.hf_config.architectures
+            or "LongcatFlashForCausalLMNextN" in self.hf_config.architectures
         ):
             self.head_dim = 256
             self.attention_arch = AttentionArch.MLA
@@ -270,6 +279,9 @@ class ModelConfig:
             self.num_key_value_heads = self.num_attention_heads
         self.hidden_size = self.hf_text_config.hidden_size
         self.num_hidden_layers = self.hf_text_config.num_hidden_layers
+        self.num_attention_layers = self.num_hidden_layers
+        if "LongcatFlashForCausalLM" in self.hf_config.architectures:
+            self.num_attention_layers = self.num_hidden_layers * 2
         self.num_nextn_predict_layers = getattr(
             self.hf_text_config, "num_nextn_predict_layers", None
         )
sglang/srt/connector/__init__.py
CHANGED
@@ -20,7 +20,7 @@ class ConnectorType(str, enum.Enum):
     KV = "KV"
 
 
-def create_remote_connector(url,
+def create_remote_connector(url, **kwargs) -> BaseConnector:
     connector_type = parse_connector_type(url)
     if connector_type == "redis":
         return RedisConnector(url)
sglang/srt/connector/base_connector.py
CHANGED
@@ -20,9 +20,8 @@ class BaseConnector(ABC):
     <connector_type://<host>:<port>/<model_name>/files/<filename>
     """
 
-    def __init__(self, url: str
+    def __init__(self, url: str):
         self.url = url
-        self.device = device
         self.closed = False
         self.local_dir = tempfile.mkdtemp()
         for sig in (signal.SIGINT, signal.SIGTERM):
sglang/srt/connector/redis.py
CHANGED
@@ -15,10 +15,10 @@ logger = logging.getLogger(__name__)
 
 class RedisConnector(BaseKVConnector):
 
-    def __init__(self, url: str
+    def __init__(self, url: str):
         import redis
 
-        super().__init__(url
+        super().__init__(url)
         parsed_url = urlparse(url)
         self.connection = redis.Redis(host=parsed_url.hostname, port=parsed_url.port)
         self.model_name = parsed_url.path.lstrip("/")