sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +41 -27
- sglang/bench_one_batch.py +60 -4
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +83 -71
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +46 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +452 -0
- sglang/srt/entrypoints/http_server.py +603 -0
- sglang/srt/function_call_parser.py +494 -0
- sglang/srt/layers/activation.py +8 -8
- sglang/srt/layers/attention/flashinfer_backend.py +10 -9
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +71 -0
- sglang/srt/layers/layernorm.py +5 -5
- sglang/srt/layers/linear.py +65 -14
- sglang/srt/layers/logits_processor.py +49 -64
- sglang/srt/layers/moe/ep_moe/layer.py +24 -16
- sglang/srt/layers/moe/fused_moe_native.py +84 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
- sglang/srt/layers/parameter.py +18 -8
- sglang/srt/layers/quantization/__init__.py +20 -23
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/fp8.py +10 -4
- sglang/srt/layers/quantization/modelopt_quant.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -2
- sglang/srt/layers/rotary_embedding.py +1184 -31
- sglang/srt/layers/sampler.py +64 -6
- sglang/srt/layers/torchao_utils.py +12 -6
- sglang/srt/layers/vocab_parallel_embedding.py +2 -2
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +3 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +24 -6
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +57 -3
- sglang/srt/managers/schedule_batch.py +78 -45
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +326 -201
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +210 -121
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +10 -32
- sglang/srt/metrics/collector.py +15 -6
- sglang/srt/model_executor/cuda_graph_runner.py +26 -30
- sglang/srt/model_executor/forward_batch_info.py +5 -7
- sglang/srt/model_executor/model_runner.py +44 -19
- sglang/srt/model_loader/loader.py +83 -6
- sglang/srt/model_loader/weight_utils.py +145 -6
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +17 -5
- sglang/srt/models/dbrx.py +13 -5
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +11 -11
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +15 -25
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +4 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +7 -5
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +9 -9
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +41 -4
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +20 -7
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/adapter.py +139 -37
- sglang/srt/openai_api/protocol.py +7 -4
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
- sglang/srt/sampling/sampling_batch_info.py +143 -18
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +4 -1090
- sglang/srt/server_args.py +77 -15
- sglang/srt/speculative/eagle_utils.py +37 -15
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/utils.py +164 -129
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +2 -1
- sglang/test/test_utils.py +83 -22
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py
CHANGED
```diff
@@ -452,6 +452,8 @@ def get_dataset(args, tokenizer):
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             fixed_output_len=args.sharegpt_output_len,
+            context_len=args.sharegpt_context_len,
+            apply_chat_template=args.apply_chat_template,
         )
     elif args.dataset_name == "random":
         input_requests = sample_random_requests(
@@ -464,11 +466,11 @@ def get_dataset(args, tokenizer):
         )
     elif args.dataset_name == "generated-shared-prefix":
         input_requests = sample_generated_shared_prefix_requests(
-            num_groups=args.
-            prompts_per_group=args.
-            system_prompt_len=args.
-            question_len=args.
-            output_len=args.
+            num_groups=args.gsp_num_groups,
+            prompts_per_group=args.gsp_prompts_per_group,
+            system_prompt_len=args.gsp_system_prompt_len,
+            question_len=args.gsp_question_len,
+            output_len=args.gsp_output_len,
             tokenizer=tokenizer,
         )
     else:
@@ -516,6 +518,7 @@ class BenchmarkMetrics:
     median_e2e_latency_ms: float
     std_e2e_latency_ms: float
     p99_e2e_latency_ms: float
+    concurrency: float


 SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
@@ -560,6 +563,8 @@ def sample_sharegpt_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
+    context_len: Optional[int] = None,
+    apply_chat_template=False,
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -590,6 +595,15 @@ def sample_sharegpt_requests(

         # Tokenize the prompts and completions.
         prompt = dataset[i][0]
+
+        if apply_chat_template:
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            prompt = prompt.replace(tokenizer.bos_token, "")
+
         prompt_token_ids = tokenizer.encode(prompt)
         completion = dataset[i][1]
         completion_token_ids = tokenizer.encode(completion)
@@ -597,14 +611,15 @@ def sample_sharegpt_requests(
         output_len = (
             len(completion_token_ids) if fixed_output_len is None else fixed_output_len
         )
-
+
+        if prompt_len < 2 or output_len < 2:
             # Prune too short sequences.
             continue
-
-
-        ):
+
+        if context_len and prompt_len + output_len > context_len:
             # Prune too long sequences.
             continue
+
         filtered_dataset.append((prompt, prompt_len, output_len))

     print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")
@@ -706,8 +721,8 @@ def get_gen_prefix_cache_path(args, tokenizer):

     # Create a unique cache filename based on the generation parameters
     cache_key = (
-        f"
-        f"{args.
+        f"gen_shared_prefix_{args.gsp_num_groups}_{args.gsp_prompts_per_group}_"
+        f"{args.gsp_system_prompt_len}_{args.gsp_question_len}_{args.gsp_output_len}_"
         f"{tokenizer.__class__.__name__}.pkl"
     )
     return cache_dir / cache_key
@@ -877,6 +892,7 @@ def calculate_metrics(
         median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
         std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
         p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000,
+        concurrency=np.sum(e2e_latencies) / dur_s,
     )

     return metrics, output_lens
@@ -1028,6 +1044,7 @@ async def benchmark(
             "Total token throughput (tok/s):", metrics.total_throughput
         )
     )
+    print("{:<40} {:<10.2f}".format("Concurrency:", metrics.concurrency))
     print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
     print(
         "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1059,13 +1076,24 @@ async def benchmark(
         and metrics.output_throughput is not None
     ):
         result = {
+            # Arguments
             "backend": args.backend,
             "dataset_name": args.dataset_name,
             "request_rate": request_rate,
             "max_concurrency": max_concurrency,
+            "sharegpt_output_len": args.sharegpt_output_len,
+            "random_input_len": args.random_input_len,
+            "random_output_len": args.random_output_len,
+            "random_range_ratio": args.random_range_ratio,
+            # Results
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
             "total_input_tokens": metrics.total_input,
             "total_output_tokens": metrics.total_output,
             "total_output_tokens_retokenized": metrics.total_output_retokenized,
+            "request_throughput": metrics.request_throughput,
+            "input_throughput": metrics.input_throughput,
+            "output_throughput": metrics.output_throughput,
             "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
             "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
             "std_e2e_latency_ms": metrics.std_e2e_latency_ms,
@@ -1082,14 +1110,7 @@ async def benchmark(
             "median_itl_ms": metrics.median_itl_ms,
             "std_itl_ms": metrics.std_itl_ms,
             "p99_itl_ms": metrics.p99_itl_ms,
-            "
-            "output_throughput": metrics.output_throughput,
-            "sharegpt_output_len": args.sharegpt_output_len,
-            "random_input_len": args.random_input_len,
-            "random_output_len": args.random_output_len,
-            "random_range_ratio": args.random_range_ratio,
-            "duration": benchmark_duration,
-            "completed": metrics.completed,
+            "concurrency": metrics.concurrency,
         }
     else:
         print(f"Error running benchmark for request rate: {request_rate}")
@@ -1109,36 +1130,16 @@ async def benchmark(
     with open(output_file_name, "a") as file:
         file.write(json.dumps(result) + "\n")

-    result
-
-
-
-
-
-
-
-
-
-            "median_ttft_ms": metrics.median_ttft_ms,
-            "std_ttft_ms": metrics.std_ttft_ms,
-            "p99_ttft_ms": metrics.p99_ttft_ms,
-            "mean_tpot_ms": metrics.mean_tpot_ms,
-            "median_tpot_ms": metrics.median_tpot_ms,
-            "std_tpot_ms": metrics.std_tpot_ms,
-            "p99_tpot_ms": metrics.p99_tpot_ms,
-            "mean_itl_ms": metrics.mean_itl_ms,
-            "median_itl_ms": metrics.median_itl_ms,
-            "std_itl_ms": metrics.std_itl_ms,
-            "p99_itl_ms": metrics.p99_itl_ms,
-            "input_lens": [output.prompt_len for output in outputs],
-            "output_lens": output_lens,
-            "ttfts": [output.ttft for output in outputs],
-            "itls": [output.itl for output in outputs],
-            "generated_texts": [output.generated_text for output in outputs],
-            "errors": [output.error for output in outputs],
-            "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
-            "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
-        }
+    result.update(
+        {
+            "input_lens": [output.prompt_len for output in outputs],
+            "output_lens": output_lens,
+            "ttfts": [output.ttft for output in outputs],
+            "itls": [output.itl for output in outputs],
+            "generated_texts": [output.generated_text for output in outputs],
+            "errors": [output.error for output in outputs],
+        }
+    )
     return result


@@ -1374,6 +1375,12 @@ if __name__ == "__main__":
         default=None,
         help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
     )
+    parser.add_argument(
+        "--sharegpt-context-len",
+        type=int,
+        default=None,
+        help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+    )
     parser.add_argument(
         "--random-input-len",
         type=int,
@@ -1413,7 +1420,6 @@ if __name__ == "__main__":
         "actual request rate may be lower than specified with --request-rate, "
         "if the server is not processing requests fast enough to keep up.",
     )
-    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
         "--multi",
         action="store_true",
@@ -1437,14 +1443,15 @@ if __name__ == "__main__":
         help="Disable streaming mode.",
     )
     parser.add_argument(
-        "--
+        "--return-logprob",
         action="store_true",
-        help="
+        help="Return logprob.",
     )
+    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
-        "--
+        "--disable-ignore-eos",
         action="store_true",
-        help="
+        help="Disable ignoring EOS.",
     )
     parser.add_argument(
         "--extra-request-body",
@@ -1453,49 +1460,54 @@ if __name__ == "__main__":
         help="Append given JSON object to the request payload. You can use this to specify"
         "additional generate params like sampling params.",
     )
+    parser.add_argument(
+        "--apply-chat-template",
+        action="store_true",
+        help="Apply chat template",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--lora-name",
+        type=str,
+        default=None,
+        help="The name of LoRA adapter",
+    )

     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
     group.add_argument(
-        "--
+        "--gsp-num-groups",
         type=int,
         default=64,
         help="Number of system prompt groups for generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--
+        "--gsp-prompts-per-group",
         type=int,
         default=16,
         help="Number of prompts per system prompt group for generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--
+        "--gsp-system-prompt-len",
         type=int,
         default=2048,
         help="Target length in tokens for system prompts in generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--
+        "--gsp-question-len",
         type=int,
         default=128,
         help="Target length in tokens for questions in generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--
+        "--gsp-output-len",
         type=int,
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
-    )
-    parser.add_argument(
-        "--lora-name",
-        type=str,
-        default=None,
-        help="The name of LoRA adapter",
-    )
     args = parser.parse_args()
     run_benchmark(args)
```
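The new `concurrency` metric is the time-averaged number of in-flight requests: the sum of per-request end-to-end latencies divided by the benchmark wall-clock duration. A minimal sketch with made-up numbers (not taken from the package) shows the arithmetic behind the field added above:

```python
import numpy as np

# Hypothetical per-request end-to-end latencies (seconds) and run duration.
e2e_latencies = [2.0, 2.5, 3.0, 1.5]
dur_s = 4.0

# Same expression as the new BenchmarkMetrics.concurrency field:
# total busy request-seconds divided by elapsed seconds gives the
# average number of requests in flight during the run.
concurrency = np.sum(e2e_latencies) / dur_s
print(f"{concurrency:.2f}")  # 2.25
```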
sglang/lang/backend/runtime_endpoint.py
CHANGED
```diff
@@ -1,6 +1,11 @@
+import atexit
 import json
+import multiprocessing
 import warnings
-from typing import List, Optional
+from typing import Dict, List, Optional, Union
+
+import aiohttp
+import requests

 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
@@ -251,11 +256,12 @@ class RuntimeEndpoint(BaseBackend):
         }
         obj = self._generate_http_request(s, data)

-        normalized_prompt_logprobs = [
-            r["meta_info"]["normalized_prompt_logprob"] for r in obj
-        ]
         input_token_logprobs = [r["meta_info"]["input_token_logprobs"] for r in obj]
         output_token_logprobs = [r["meta_info"]["output_token_logprobs"] for r in obj]
+        normalized_prompt_logprobs = [
+            compute_normalized_prompt_logprobs(r["meta_info"]["input_token_logprobs"])
+            for r in obj
+        ]

         # Remove extra token if no token healing occurred
         for i in range(len(input_token_logprobs)):
```
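The hunk above stops reading a server-provided `normalized_prompt_logprob` and instead recomputes it on the client from `input_token_logprobs`, using the `compute_normalized_prompt_logprobs` helper added in the next hunk. A rough illustration with synthetic values; the tuple layout is an assumption here, and only the first element (the logprob) is used:

```python
def compute_normalized_prompt_logprobs(input_logprobs):
    # Average the defined logprobs; entries whose logprob is None (or 0.0),
    # such as the first prompt token, are skipped by the truthiness check.
    values = [x[0] for x in input_logprobs if x[0]]
    return sum(values) / len(values)

# Hypothetical (logprob, token_id) pairs for a three-token prompt.
fake_input_token_logprobs = [(None, 128000), (-0.5, 791), (-1.5, 6864)]
print(compute_normalized_prompt_logprobs(fake_input_token_logprobs))  # -1.0
```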
```diff
@@ -319,3 +325,176 @@ class RuntimeEndpoint(BaseBackend):
     def _assert_success(self, res):
         if res.status_code != 200:
             raise RuntimeError(res.json())
+
+
+def compute_normalized_prompt_logprobs(input_logprobs):
+    values = [x[0] for x in input_logprobs if x[0]]
+    return sum(values) / len(values)
+
+
+class Runtime:
+    """
+    A wrapper for the HTTP server.
+    This is used for launching the server in a python program without
+    using the commond line interface.
+
+    It is mainly used for the frontend language.
+    You should use the Engine class if you want to do normal offline processing without the frontend language.
+    """
+
+    def __init__(
+        self,
+        log_level: str = "error",
+        *args,
+        **kwargs,
+    ):
+        """See the arguments in server_args.py::ServerArgs"""
+        # We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
+        # client code without installing SRT server and its dependency if they want.
+        from sglang.srt.entrypoints.http_server import launch_server
+        from sglang.srt.server_args import ServerArgs
+        from sglang.srt.utils import is_port_available
+
+        self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
+
+        # Pre-allocate ports
+        for port in range(self.server_args.port, 40000):
+            if is_port_available(port):
+                break
+        self.server_args.port = port
+
+        self.url = self.server_args.url()
+        self.generate_url = self.url + "/generate"
+
+        # NOTE: We store pid instead of proc to fix some issues during __delete__
+        self.pid = None
+        pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)
+
+        proc = multiprocessing.Process(
+            target=launch_server,
+            args=(self.server_args, pipe_writer),
+        )
+        proc.start()
+        pipe_writer.close()
+        self.pid = proc.pid
+
+        # Before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
+        atexit.register(self.shutdown)
+
+        # TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
+        try:
+            init_state = pipe_reader.recv()
+        except EOFError:
+            init_state = ""
+
+        if init_state != "ready":
+            self.shutdown()
+            raise RuntimeError(
+                "Initialization failed. Please see the error messages above."
+            )
+
+        self.endpoint = RuntimeEndpoint(self.url)
+
+    def shutdown(self):
+        from sglang.srt.utils import kill_process_tree
+
+        if self.pid is not None:
+            kill_process_tree(self.pid)
+            self.pid = None
+
+    def cache_prefix(self, prefix: str):
+        self.endpoint.cache_prefix(prefix)
+
+    def get_tokenizer(self):
+        from sglang.srt.hf_transformers_utils import get_tokenizer
+
+        return get_tokenizer(
+            self.server_args.tokenizer_path,
+            tokenizer_mode=self.server_args.tokenizer_mode,
+            trust_remote_code=self.server_args.trust_remote_code,
+            revision=self.server_args.revision,
+        )
+
+    async def async_generate(
+        self,
+        prompt: str,
+        sampling_params: Optional[Dict] = None,
+    ):
+        if self.server_args.skip_tokenizer_init:
+            json_data = {
+                "input_ids": prompt,
+                "sampling_params": sampling_params,
+                "stream": True,
+            }
+        else:
+            json_data = {
+                "text": prompt,
+                "sampling_params": sampling_params,
+                "stream": True,
+            }
+        pos = 0
+
+        timeout = aiohttp.ClientTimeout(total=3 * 3600)
+        async with aiohttp.ClientSession(timeout=timeout, trust_env=True) as session:
+            async with session.post(self.generate_url, json=json_data) as response:
+                async for chunk, _ in response.content.iter_chunks():
+                    chunk = chunk.decode("utf-8")
+                    if chunk and chunk.startswith("data:"):
+                        if chunk == "data: [DONE]\n\n":
+                            break
+                        data = json.loads(chunk[5:].strip("\n"))
+                        if "text" in data:
+                            cur = data["text"][pos:]
+                            if cur:
+                                yield cur
+                            pos += len(cur)
+                        else:
+                            yield data
+
+    add_request = async_generate
+
+    def generate(
+        self,
+        prompt: Union[str, List[str]],
+        sampling_params: Optional[Dict] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        logprob_start_len: Optional[Union[List[int], int]] = None,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+        lora_path: Optional[List[Optional[str]]] = None,
+    ):
+        json_data = {
+            "text": prompt,
+            "sampling_params": sampling_params,
+            "return_logprob": return_logprob,
+            "logprob_start_len": logprob_start_len,
+            "top_logprobs_num": top_logprobs_num,
+            "lora_path": lora_path,
+        }
+        assert not isinstance(lora_path, list) or len(lora_path) == len(prompt)
+        response = requests.post(
+            self.url + "/generate",
+            json=json_data,
+        )
+        return json.dumps(response.json())
+
+    def encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+    ):
+        json_data = {"text": prompt}
+        response = requests.post(self.url + "/encode", json=json_data)
+        return json.dumps(response.json())
+
+    async def get_server_info(self):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(f"{self.url}/get_server_info") as response:
+                if response.status == 200:
+                    return await response.json()
+                else:
+                    error_data = await response.json()
+                    raise RuntimeError(
+                        f"Failed to get server info. {error_data['error']['message']}"
+                    )
+
+    def __del__(self):
+        self.shutdown()
```
sglang/lang/chat_template.py
CHANGED
```diff
@@ -88,7 +88,6 @@ register_chat_template(
     )
 )

-
 register_chat_template(
     ChatTemplate(
         name="claude",
@@ -101,7 +100,6 @@ register_chat_template(
     )
 )

-
 register_chat_template(
     ChatTemplate(
         name="chatml",
@@ -116,7 +114,6 @@ register_chat_template(
     )
 )

-
 register_chat_template(
     ChatTemplate(
         name="chatml-llava",
@@ -132,7 +129,6 @@ register_chat_template(
     )
 )

-
 # There is default system prompt for qwen
 # reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
 # The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
@@ -219,6 +215,21 @@ register_chat_template(
     )
 )

+# https://huggingface.co/openbmb/MiniCPM-V-2_6
+register_chat_template(
+    ChatTemplate(
+        name="minicpmv",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", " "),
+            "user": ("user:", " "),
+            "assistant": ("assistant:", "</s>"),
+        },
+        stop_str=("<|im_end|>", "<|endoftext|>"),
+        image_token="(<image>./</image>)",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
@@ -343,6 +354,37 @@ register_chat_template(
 )


+register_chat_template(
+    ChatTemplate(
+        name="deepseek-v3",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "user": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+    )
+)
+
+
+@register_chat_template_matching_function
+def match_deepseek(model_path: str):
+    if (
+        "deepseek-v3" in model_path.lower() or "deepseek-r1" in model_path.lower()
+    ) and "base" not in model_path.lower():
+        return get_chat_template("deepseek-v3")
+
+
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
     if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
```
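As a rough sketch (not code from the package), the `deepseek-v3` role markers registered above combine into a prompt like this, reusing the `get_chat_template` helper that `match_deepseek` also calls:

```python
from sglang.lang.chat_template import get_chat_template

template = get_chat_template("deepseek-v3")
user_prefix, user_suffix = template.role_prefix_and_suffix["user"]
assistant_prefix, _ = template.role_prefix_and_suffix["assistant"]

# Manually concatenate the markers for a single user turn.
prompt = f"{user_prefix}What is 2 + 2?{user_suffix}{assistant_prefix}"
print(prompt)  # <|User|>What is 2 + 2?<|Assistant|>
```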
sglang/launch_server.py
CHANGED