sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects each version exactly as it appears in its public registry.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +3 -13
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +158 -8
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +119 -75
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +5 -2
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +18 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +71 -53
- sglang/srt/conversation.py +78 -46
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +11 -3
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +236 -138
- sglang/srt/disaggregation/nixl/conn.py +242 -71
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +51 -2
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +31 -4
- sglang/srt/entrypoints/http_server.py +45 -3
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +147 -51
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
- sglang/srt/layers/moe/ep_moe/layer.py +121 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +77 -71
- sglang/srt/layers/quantization/fp8.py +110 -97
- sglang/srt/layers/quantization/fp8_kernel.py +81 -62
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +11 -14
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +13 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +93 -23
- sglang/srt/managers/schedule_policy.py +11 -8
- sglang/srt/managers/scheduler.py +140 -100
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +157 -47
- sglang/srt/managers/tp_worker.py +21 -21
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +4 -2
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +57 -41
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +3 -3
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +77 -39
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +3 -1
- sglang/srt/models/llama4.py +58 -13
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +52 -42
- sglang/srt/openai_api/protocol.py +20 -16
- sglang/srt/reasoning_parser.py +1 -1
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +2 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +64 -10
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +41 -6
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +92 -15
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/test/test_programs.py
CHANGED

@@ -370,7 +370,7 @@ def test_dtype_gen():
     @sgl.function
     def dtype_gen(s):
         s += "Q: What is the full name of DNS?\n"
-        s += "A: The full
+        s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
         s += "Q: Which year was DNS invented?\n"
         s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
         s += "Q: What is the value of pi?\n"
@@ -503,7 +503,7 @@ def test_hellaswag_select():
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -514,13 +514,13 @@ def test_hellaswag_select():
     preds = []
     for i, ret in enumerate(rets):
         preds.append(choices[i].index(ret["answer"]))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     accuracy = np.mean(np.array(preds) == np.array(labels))

     # Test generator style of run_batch
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -531,7 +531,7 @@ def test_hellaswag_select():
     preds_gen = []
     for i, ret in enumerate(rets):
         preds_gen.append(choices[i].index(ret["answer"]))
-    latency_gen = time.time() - tic
+    latency_gen = time.perf_counter() - tic

     # Compute accuracy
     accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
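The hunks above replace `time.time()` with `time.perf_counter()` for latency measurement. A minimal, self-contained sketch of the rationale (illustrative only, not part of the diff): `perf_counter()` is a monotonic, high-resolution clock, so elapsed-time math is unaffected by wall-clock adjustments such as NTP corrections.

```python
import time

# Illustrative comparison (not from the diff): both calls measure elapsed
# time, but perf_counter() is monotonic and high-resolution, while time()
# follows the wall clock and can jump if the system clock is adjusted.
tic = time.perf_counter()
time.sleep(0.1)  # stand-in for the benchmarked work, e.g. run_batch(...)
latency = time.perf_counter() - tic
print(f"elapsed: {latency:.4f}s")
```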
sglang/test/test_utils.py
CHANGED

@@ -66,6 +66,7 @@ DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
 )
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-V3-0324"
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
@@ -78,7 +79,8 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Ins
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
-
+DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
+DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"

 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
@@ -393,12 +395,12 @@ def popen_launch_server(
     other_args: list[str] = (),
     env: Optional[dict] = None,
     return_stdout_stderr: Optional[tuple] = None,
-    pd_seperated: bool = False,
+    pd_separated: bool = False,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]

-    if pd_seperated:
+    if pd_separated:
         command = "sglang.launch_pd_server"
     else:
         command = "sglang.launch_server"
@@ -412,7 +414,7 @@ def popen_launch_server(
         *[str(x) for x in other_args],
     ]

-    if pd_seperated:
+    if pd_separated:
         command.extend(
             [
                 "--lb-host",
@@ -447,9 +449,9 @@ def popen_launch_server(
     else:
         process = subprocess.Popen(command, stdout=None, stderr=None, env=env)

-    start_time = time.time()
+    start_time = time.perf_counter()
     with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
             try:
                 headers = {
                     "Content-Type": "application/json; charset=utf-8",
@@ -476,6 +478,81 @@ def popen_launch_server(
     raise TimeoutError("Server failed to start within the timeout period.")


+def popen_launch_pd_server(
+    model: str,
+    base_url: str,
+    timeout: float,
+    api_key: Optional[str] = None,
+    other_args: list[str] = (),
+    env: Optional[dict] = None,
+    return_stdout_stderr: Optional[tuple] = None,
+):
+    _, host, port = base_url.split(":")
+    host = host[2:]
+
+    command = "sglang.launch_server"
+
+    command = [
+        "python3",
+        "-m",
+        command,
+        "--model-path",
+        model,
+        *[str(x) for x in other_args],
+    ]
+
+    command.extend(
+        [
+            "--host",
+            host,
+            "--port",
+            port,
+        ]
+    )
+
+    if api_key:
+        command += ["--api-key", api_key]
+
+    print(f"command={' '.join(command)}")
+
+    if return_stdout_stderr:
+        process = subprocess.Popen(
+            command,
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
+            env=env,
+            text=True,
+        )
+    else:
+        process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
+
+    start_time = time.time()
+    with requests.Session() as session:
+        while time.time() - start_time < timeout:
+            try:
+                headers = {
+                    "Content-Type": "application/json; charset=utf-8",
+                    "Authorization": f"Bearer {api_key}",
+                }
+                response = session.get(
+                    f"{base_url}/health",
+                    headers=headers,
+                )
+                if response.status_code == 200:
+                    return process
+            except requests.RequestException:
+                pass
+
+            return_code = process.poll()
+            if return_code is not None:
+                raise Exception(f"Server unexpectedly exits ({return_code=}).")
+
+            time.sleep(10)
+
+    kill_process_tree(process.pid)
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
 def run_with_timeout(
     func: Callable,
     args: tuple = (),
@@ -507,7 +584,7 @@ class TestFile:


 def run_unittest_files(files: List[TestFile], timeout_per_file: float):
-    tic = time.time()
+    tic = time.perf_counter()
     success = True

     for i, file in enumerate(files):
@@ -522,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
             f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
             flush=True,
         )
-        tic = time.time()
+        tic = time.perf_counter()

         process = subprocess.Popen(
             ["python3", filename], stdout=None, stderr=None, env=os.environ
         )
         process.wait()
-        elapsed = time.time() - tic
+        elapsed = time.perf_counter() - tic

         print(
             f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
@@ -554,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
             break

     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)

     return 0 if success else -1

@@ -579,7 +656,7 @@ def get_benchmark_args(
     disable_stream=False,
     disable_ignore_eos=False,
     seed: int = 0,
-    pd_seperated: bool = False,
+    pd_separated: bool = False,
 ):
     return SimpleNamespace(
         backend="sglang",
@@ -609,7 +686,7 @@ def get_benchmark_args(
         profile=None,
         lora_name=None,
         prompt_suffix="",
-        pd_seperated=pd_seperated,
+        pd_separated=pd_separated,
     )


@@ -673,7 +750,7 @@ def run_bench_serving_multi(
     other_server_args,
     benchmark_args,
     need_warmup=False,
-    pd_seperated=False,
+    pd_separated=False,
 ):
     # Launch the server
     process = popen_launch_server(
@@ -681,7 +758,7 @@ def run_bench_serving_multi(
         base_url,
         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
         other_args=other_server_args,
-        pd_seperated=pd_seperated,
+        pd_separated=pd_separated,
     )

     # run benchmark for all
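A usage sketch for the renamed pd_separated flag (hedged: the model path and URL below are placeholders, and the call shape is inferred from the hunks above, mirroring run_bench_serving_multi). With pd_separated=True, popen_launch_server runs python3 -m sglang.launch_pd_server instead of sglang.launch_server, extends the command with load-balancer flags such as --lb-host, and then polls /health until the server answers or the timeout expires.

```python
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    popen_launch_server,
)

# Hypothetical example; the model path and base URL are placeholders.
process = popen_launch_server(
    "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model path
    "http://127.0.0.1:30000",            # placeholder base URL
    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    pd_separated=True,  # launches sglang.launch_pd_server under the hood
)
try:
    ...  # run benchmarks against the prefill/decode-separated deployment
finally:
    process.terminate()  # clean up the launched server process
```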
sglang/utils.py
CHANGED

@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
             f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
         )
         if signum == signal.SIGTERM:
-            logger.info(f"{sub_module_name} recive sigterm")
+            logger.info(f"{sub_module_name} receive sigterm")

     signal.signal(signal.SIGTERM, graceful_shutdown)
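Assembled into runnable form for context (the imports and logger setup are assumed, not shown in the hunk), the handler that graceful_registry registers looks like this:

```python
import logging
import signal

logger = logging.getLogger(__name__)


def graceful_registry(sub_module_name: str):
    # Register a SIGTERM handler so the subprocess logs and shuts down cleanly.
    def graceful_shutdown(signum, frame):
        logger.info(
            f"{sub_module_name} Received signal to shutdown. "
            "Performing graceful shutdown..."
        )
        if signum == signal.SIGTERM:
            logger.info(f"{sub_module_name} receive sigterm")

    signal.signal(signal.SIGTERM, graceful_shutdown)
```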
sglang/version.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.4.6.post2"
+__version__ = "0.4.6.post4"
{sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6.post2
+Version: 0.4.6.post4
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -230,6 +230,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
+Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
@@ -242,17 +243,16 @@ Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
 Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.
+Requires-Dist: sgl-kernel==0.1.2.post1; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
-Requires-Dist: partial_json_parser; extra == "srt"
 Requires-Dist: einops; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
@@ -261,7 +261,6 @@ Requires-Dist: torch; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
-Requires-Dist: partial_json_parser; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
@@ -278,6 +277,9 @@ Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
 Requires-Dist: torch; extra == "srt-cpu"
+Provides-Extra: srt-npu
+Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -299,6 +301,7 @@ Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
+Requires-Dist: sglang[torch_memory_saver]; extra == "all"
 Provides-Extra: all-hip
 Requires-Dist: sglang[srt_hip]; extra == "all-hip"
 Requires-Dist: sglang[openai]; extra == "all-hip"
@@ -319,6 +322,11 @@ Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
 Requires-Dist: sglang[openai]; extra == "all-cpu"
 Requires-Dist: sglang[anthropic]; extra == "all-cpu"
 Requires-Dist: sglang[litellm]; extra == "all-cpu"
+Provides-Extra: all-npu
+Requires-Dist: sglang[srt_npu]; extra == "all-npu"
+Requires-Dist: sglang[openai]; extra == "all-npu"
+Requires-Dist: sglang[anthropic]; extra == "all-npu"
+Requires-Dist: sglang[litellm]; extra == "all-npu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -358,18 +366,19 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

 ## News
+- [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
-- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).

 <details>
 <summary>More</summary>

+- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
+- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -383,7 +392,7 @@ The core features include:

 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral,
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

 ## Getting Started
@@ -401,7 +410,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s

 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Google Cloud, Hyperbolic, Iflytek, InnoMatrix, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.

 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
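To verify the dependency moves after upgrading (partial_json_parser consolidated under runtime-common, the new srt-npu and all-npu extras), a sketch using only the standard library; the exact strings depend on the installed wheel:

```python
from importlib.metadata import requires

# List the installed wheel's Requires-Dist entries and group them by extra,
# e.g. to confirm partial_json_parser now rides with runtime-common and the
# new srt-npu / all-npu extras are present.
reqs = requires("sglang") or []
for extra in ("runtime-common", "srt-npu", "all-npu"):
    matches = [r.split(";")[0].strip() for r in reqs if f'extra == "{extra}"' in r]
    print(f"{extra}: {len(matches)} requirements")
```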