sglang 0.4.1.post6__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. sglang/__init__.py +21 -23
  2. sglang/api.py +2 -7
  3. sglang/bench_offline_throughput.py +24 -16
  4. sglang/bench_one_batch.py +51 -3
  5. sglang/bench_one_batch_server.py +1 -1
  6. sglang/bench_serving.py +37 -28
  7. sglang/lang/backend/runtime_endpoint.py +183 -4
  8. sglang/lang/chat_template.py +15 -4
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/_custom_ops.py +80 -42
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/model_config.py +1 -0
  13. sglang/srt/constrained/base_grammar_backend.py +21 -0
  14. sglang/srt/constrained/xgrammar_backend.py +8 -4
  15. sglang/srt/conversation.py +14 -1
  16. sglang/srt/distributed/__init__.py +3 -3
  17. sglang/srt/distributed/communication_op.py +2 -1
  18. sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
  20. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  21. sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  22. sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  23. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  24. sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  25. sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  26. sglang/srt/distributed/parallel_state.py +1 -1
  27. sglang/srt/distributed/utils.py +2 -1
  28. sglang/srt/entrypoints/engine.py +449 -0
  29. sglang/srt/entrypoints/http_server.py +579 -0
  30. sglang/srt/layers/activation.py +3 -3
  31. sglang/srt/layers/attention/flashinfer_backend.py +10 -9
  32. sglang/srt/layers/attention/triton_backend.py +4 -6
  33. sglang/srt/layers/attention/vision.py +204 -0
  34. sglang/srt/layers/dp_attention.py +69 -0
  35. sglang/srt/layers/linear.py +41 -5
  36. sglang/srt/layers/logits_processor.py +48 -63
  37. sglang/srt/layers/moe/ep_moe/layer.py +4 -4
  38. sglang/srt/layers/moe/fused_moe_native.py +69 -0
  39. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
  40. sglang/srt/layers/moe/fused_moe_triton/layer.py +29 -5
  41. sglang/srt/layers/parameter.py +2 -1
  42. sglang/srt/layers/quantization/__init__.py +20 -23
  43. sglang/srt/layers/quantization/fp8.py +6 -3
  44. sglang/srt/layers/quantization/modelopt_quant.py +1 -2
  45. sglang/srt/layers/quantization/w8a8_int8.py +1 -1
  46. sglang/srt/layers/radix_attention.py +2 -2
  47. sglang/srt/layers/rotary_embedding.py +1179 -31
  48. sglang/srt/layers/sampler.py +39 -1
  49. sglang/srt/layers/vocab_parallel_embedding.py +2 -2
  50. sglang/srt/lora/lora.py +1 -9
  51. sglang/srt/managers/configure_logging.py +3 -0
  52. sglang/srt/managers/data_parallel_controller.py +79 -72
  53. sglang/srt/managers/detokenizer_manager.py +23 -6
  54. sglang/srt/managers/image_processor.py +158 -2
  55. sglang/srt/managers/io_struct.py +25 -2
  56. sglang/srt/managers/schedule_batch.py +49 -22
  57. sglang/srt/managers/schedule_policy.py +26 -12
  58. sglang/srt/managers/scheduler.py +277 -178
  59. sglang/srt/managers/session_controller.py +1 -0
  60. sglang/srt/managers/tokenizer_manager.py +206 -121
  61. sglang/srt/managers/tp_worker.py +6 -4
  62. sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  63. sglang/srt/managers/utils.py +44 -0
  64. sglang/srt/mem_cache/memory_pool.py +10 -32
  65. sglang/srt/metrics/collector.py +15 -6
  66. sglang/srt/model_executor/cuda_graph_runner.py +4 -6
  67. sglang/srt/model_executor/model_runner.py +37 -15
  68. sglang/srt/model_loader/loader.py +8 -6
  69. sglang/srt/model_loader/weight_utils.py +55 -2
  70. sglang/srt/models/baichuan.py +6 -6
  71. sglang/srt/models/chatglm.py +2 -2
  72. sglang/srt/models/commandr.py +3 -3
  73. sglang/srt/models/dbrx.py +4 -4
  74. sglang/srt/models/deepseek.py +3 -3
  75. sglang/srt/models/deepseek_v2.py +8 -8
  76. sglang/srt/models/exaone.py +2 -2
  77. sglang/srt/models/gemma.py +2 -2
  78. sglang/srt/models/gemma2.py +6 -24
  79. sglang/srt/models/gpt2.py +3 -5
  80. sglang/srt/models/gpt_bigcode.py +1 -1
  81. sglang/srt/models/granite.py +2 -2
  82. sglang/srt/models/grok.py +3 -3
  83. sglang/srt/models/internlm2.py +2 -2
  84. sglang/srt/models/llama.py +7 -5
  85. sglang/srt/models/minicpm.py +2 -2
  86. sglang/srt/models/minicpm3.py +6 -6
  87. sglang/srt/models/minicpmv.py +1238 -0
  88. sglang/srt/models/mixtral.py +3 -3
  89. sglang/srt/models/mixtral_quant.py +3 -3
  90. sglang/srt/models/mllama.py +2 -2
  91. sglang/srt/models/olmo.py +3 -3
  92. sglang/srt/models/olmo2.py +4 -4
  93. sglang/srt/models/olmoe.py +7 -13
  94. sglang/srt/models/phi3_small.py +2 -2
  95. sglang/srt/models/qwen.py +2 -2
  96. sglang/srt/models/qwen2.py +41 -4
  97. sglang/srt/models/qwen2_moe.py +3 -3
  98. sglang/srt/models/qwen2_vl.py +22 -122
  99. sglang/srt/models/stablelm.py +2 -2
  100. sglang/srt/models/torch_native_llama.py +3 -3
  101. sglang/srt/models/xverse.py +6 -6
  102. sglang/srt/models/xverse_moe.py +6 -6
  103. sglang/srt/openai_api/protocol.py +2 -0
  104. sglang/srt/sampling/custom_logit_processor.py +38 -0
  105. sglang/srt/sampling/sampling_batch_info.py +139 -4
  106. sglang/srt/sampling/sampling_params.py +3 -1
  107. sglang/srt/server.py +4 -1090
  108. sglang/srt/server_args.py +57 -14
  109. sglang/srt/utils.py +103 -65
  110. sglang/test/runners.py +8 -13
  111. sglang/test/test_programs.py +1 -1
  112. sglang/test/test_utils.py +3 -1
  113. sglang/utils.py +12 -2
  114. sglang/version.py +1 -1
  115. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +16 -5
  116. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +119 -115
  117. sglang/launch_server_llavavid.py +0 -25
  118. sglang/srt/constrained/__init__.py +0 -16
  119. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  120. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
  121. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
  122. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -1,5 +1,6 @@
- # SGL API Components
+ # SGLang public APIs

+ # Frontend Language APIs
  from sglang.api import (
  Engine,
  Runtime,
@@ -23,16 +24,26 @@ from sglang.api import (
  user_end,
  video,
  )
+ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.lang.choices import (
  greedy_token_selection,
  token_length_normalized,
  unconditional_likelihood_normalized,
  )
+ from sglang.utils import LazyImport
+
+ Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+ LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+ OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+ VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+ # Other configs
+ from sglang.global_config import global_config
+ from sglang.version import __version__

- # SGLang DSL APIs
  __all__ = [
- "Runtime",
  "Engine",
+ "Runtime",
  "assistant",
  "assistant_begin",
  "assistant_end",
@@ -52,27 +63,14 @@ __all__ = [
  "user_begin",
  "user_end",
  "video",
+ "RuntimeEndpoint",
  "greedy_token_selection",
  "token_length_normalized",
  "unconditional_likelihood_normalized",
+ "Anthropic",
+ "LiteLLM",
+ "OpenAI",
+ "VertexAI",
+ "global_config",
+ "__version__",
  ]
-
- # Global Configurations
- from sglang.global_config import global_config
-
- __all__ += ["global_config"]
-
- from sglang.version import __version__
-
- __all__ += ["__version__"]
-
- # SGLang Backends
- from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
- from sglang.utils import LazyImport
-
- Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
- LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
- OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
- VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
-
- __all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
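The reorganized `sglang/__init__.py` above routes the optional backends (Anthropic, LiteLLM, OpenAI, VertexAI) through `sglang.utils.LazyImport`, so `import sglang` no longer pulls in those client libraries until a backend is actually used. A minimal sketch of the lazy-import pattern, for orientation only (the real `LazyImport` in `sglang/utils.py` is not shown in this diff and may differ in detail):

import importlib

class LazyImport:
    """Resolve `class_name` from `module_name` on first attribute access or call."""

    def __init__(self, module_name: str, class_name: str):
        self.module_name = module_name
        self.class_name = class_name
        self._target = None

    def _load(self):
        # Import the backend module only when it is first needed.
        if self._target is None:
            module = importlib.import_module(self.module_name)
            self._target = getattr(module, self.class_name)
        return self._target

    def __getattr__(self, name):
        return getattr(self._load(), name)

    def __call__(self, *args, **kwargs):
        return self._load()(*args, **kwargs)

OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")  # imported only when first instantiated

The net effect is that the top-level import stays cheap while calls such as sgl.OpenAI(...) still work on demand.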
sglang/api.py CHANGED
@@ -1,6 +1,5 @@
  """Public APIs of the language."""

- import os
  import re
  from typing import Callable, List, Optional, Union

@@ -33,19 +32,15 @@ def function(


  def Runtime(*args, **kwargs):
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-
  # Avoid importing unnecessary dependency
- from sglang.srt.server import Runtime
+ from sglang.lang.backend.runtime_endpoint import Runtime

  return Runtime(*args, **kwargs)


  def Engine(*args, **kwargs):
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-
  # Avoid importing unnecessary dependency
- from sglang.srt.server import Engine
+ from sglang.srt.entrypoints.engine import Engine

  return Engine(*args, **kwargs)
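With `sglang.srt.server` split apart in this release, the thin wrappers in `sglang/api.py` now resolve `Runtime` from `sglang.lang.backend.runtime_endpoint` (defined later in this diff) and `Engine` from the new `sglang.srt.entrypoints.engine` module. A hedged usage sketch; the model path, sampling parameters, and the exact `generate` signature are illustrative rather than taken from this diff:

import sglang as sgl

# Offline, in-process engine (no HTTP server).
llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3-8B-Instruct")
out = llm.generate("The capital of France is", {"temperature": 0, "max_new_tokens": 8})
print(out)
llm.shutdown()

# Frontend-language runtime: launches the HTTP server in a background process.
runtime = sgl.Runtime(model_path="meta-llama/Meta-Llama-3-8B-Instruct")
print(runtime.url)  # endpoint used by the frontend language
runtime.shutdown()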
sglang/bench_offline_throughput.py CHANGED
@@ -27,7 +27,8 @@ from sglang.bench_serving import (
  sample_random_requests,
  set_ulimit,
  )
- from sglang.srt.server import Engine, Runtime
+ from sglang.lang.backend.runtime_endpoint import Runtime
+ from sglang.srt.entrypoints.engine import Engine
  from sglang.srt.server_args import ServerArgs


@@ -39,14 +40,15 @@ class BenchArgs:
  dataset_path: str = ""
  num_prompts: int = 1000
  sharegpt_output_len: Optional[int] = None
+ sharegpt_context_len: Optional[int] = None
  random_input_len: int = 1024
  random_output_len: int = 1024
  random_range_ratio: float = 0.0
- gen_num_groups: int = 64
- gen_prompts_per_group: int = 16
- gen_system_prompt_len: int = 2048
- gen_question_len: int = 128
- gen_output_len: int = 256
+ gsp_num_groups: int = 64
+ gsp_prompts_per_group: int = 16
+ gsp_system_prompt_len: int = 2048
+ gsp_question_len: int = 128
+ gsp_output_len: int = 256
  disable_ignore_eos: bool = False
  extra_request_body: Optional[str] = None
  seed: int = 1
@@ -82,6 +84,12 @@ class BenchArgs:
  default=BenchArgs.sharegpt_output_len,
  help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
  )
+ parser.add_argument(
+ "--sharegpt-context-len",
+ type=int,
+ default=BenchArgs.sharegpt_context_len,
+ help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+ )
  parser.add_argument(
  "--random-input-len",
  type=int,
@@ -102,35 +110,35 @@
  "used only for random dataset.",
  )
  parser.add_argument(
- "--gen-num-groups",
+ "--gsp-num-groups",
  type=int,
- default=BenchArgs.gen_num_groups,
+ default=BenchArgs.gsp_num_groups,
  help="Number of groups with shared prefix, used"
  "only for generate-shared-prefix",
  )
  parser.add_argument(
- "--gen-prompts-per-group",
+ "--gsp-prompts-per-group",
  type=int,
- default=BenchArgs.gen_prompts_per_group,
+ default=BenchArgs.gsp_prompts_per_group,
  help="Number of prompts per group of shared prefix, used"
  "only for generate-shared-prefix",
  )
  parser.add_argument(
- "--gen-system-prompt-len",
+ "--gsp-system-prompt-len",
  type=int,
- default=BenchArgs.gen_system_prompt_len,
+ default=BenchArgs.gsp_system_prompt_len,
  help="System prompt length, used" "only for generate-shared-prefix",
  )
  parser.add_argument(
- "--gen-question-len",
+ "--gsp-question-len",
  type=int,
- default=BenchArgs.gen_question_len,
+ default=BenchArgs.gsp_question_len,
  help="Question length, used" "only for generate-shared-prefix",
  )
  parser.add_argument(
- "--gen-output-len",
+ "--gsp-output-len",
  type=int,
- default=BenchArgs.gen_output_len,
+ default=BenchArgs.gsp_output_len,
  help="Target length in tokens for outputs in generated-shared-prefix dataset",
  )
  parser.add_argument(
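The `--gen-*` options for the generated-shared-prefix dataset become `--gsp-*` here (and in `bench_serving.py` below), and the new `--sharegpt-context-len` option drops ShareGPT samples whose prompt plus output would exceed the given context length. An illustrative invocation with the renamed flags (the model path and values are examples, and `--dataset-name` accepting `generated-shared-prefix` is assumed from the matching bench_serving logic, not shown in this hunk):

python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --dataset-name generated-shared-prefix --gsp-num-groups 64 --gsp-prompts-per-group 16 --gsp-system-prompt-len 2048 --gsp-question-len 128 --gsp-output-len 256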
sglang/bench_one_batch.py CHANGED
@@ -9,7 +9,8 @@ It accepts server arguments (the same as launch_server.py) and benchmark argumen
  python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
  ## sweep through multiple data points and store (append) the results in a jsonl file:
  python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
-
+ ## run with profiling:
+ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
  # Usage (correctness test):
  python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct

@@ -56,12 +57,12 @@ import torch
  import torch.distributed as dist

  from sglang.srt.configs.model_config import ModelConfig
+ from sglang.srt.entrypoints.engine import _set_envs_and_config
  from sglang.srt.hf_transformers_utils import get_tokenizer
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
  from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.sampling.sampling_params import SamplingParams
- from sglang.srt.server import _set_envs_and_config
  from sglang.srt.server_args import PortArgs, ServerArgs
  from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
  from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
@@ -77,6 +78,8 @@ class BenchArgs:
  correctness_test: bool = False
  # This is only used for correctness test
  cut_len: int = 4
+ profile: bool = False
+ profile_filename_prefix: str = "profile"

  @staticmethod
  def add_cli_args(parser: argparse.ArgumentParser):
@@ -95,6 +98,19 @@ class BenchArgs:
  )
  parser.add_argument("--correctness-test", action="store_true")
  parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
+ parser.add_argument(
+ "--profile",
+ action="store_true",
+ help="Use Torch Profiler. The endpoint must be launched with "
+ "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+ )
+ parser.add_argument(
+ "--profile-filename-prefix",
+ type=str,
+ default=BenchArgs.profile_filename_prefix,
+ help="Prefix of the profiling file names. The full profiling result file(s) be "
+ '"[profile_filename_prefix]_batch[batch_size]_input[input_len]_output[output_len].trace.json.gz"',
+ )

  @classmethod
  def from_cli_args(cls, args: argparse.Namespace):
@@ -216,6 +232,7 @@ def extend(reqs, model_runner):
  model_config=model_runner.model_config,
  enable_overlap=False,
  spec_algorithm=SpeculativeAlgorithm.NONE,
+ enable_custom_logit_processor=False,
  )
  batch.prepare_for_extend()
  model_worker_batch = batch.get_model_worker_batch()
@@ -286,7 +303,16 @@ def synchronize(device):


  def latency_test_run_once(
- run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
+ run_name,
+ model_runner,
+ rank_print,
+ reqs,
+ batch_size,
+ input_len,
+ output_len,
+ device,
+ profile,
+ profile_filename_prefix,
  ):
  max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
  if batch_size > max_batch_size:
@@ -308,6 +334,17 @@

  tot_latency = 0

+ profiler = None
+ if profile:
+ profiler = torch.profiler.profile(
+ activities=[
+ torch.profiler.ProfilerActivity.CPU,
+ torch.profiler.ProfilerActivity.CUDA,
+ ],
+ with_stack=True,
+ )
+ profiler.start()
+
  # Prefill
  synchronize(device)
  tic = time.time()
@@ -338,6 +375,13 @@
  f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
  )

+ if profile:
+ profiler.stop()
+ profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}.trace.json.gz"
+ parent_dir = os.path.dirname(os.path.abspath(profile_filename))
+ os.makedirs(parent_dir, exist_ok=True)
+ profiler.export_chrome_trace(profile_filename)
+
  # Record decode timing from 2nd output
  if output_len > 1:
  med_decode_latency = np.median(decode_latencies)
@@ -386,6 +430,8 @@ def latency_test(
  bench_args.input_len[0],
  8, # shorter decoding to speed up the warmup
  server_args.device,
+ profile=False,
+ profile_filename_prefix="", # not used
  )

  rank_print("Benchmark ...")
@@ -405,6 +451,8 @@
  il,
  ol,
  server_args.device,
+ bench_args.profile,
+ bench_args.profile_filename_prefix,
  )
  if ret is not None:
  result_list.append(ret)
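The new `--profile` flag wraps the timed prefill and decode passes in `torch.profiler` and exports one Chrome-trace file per (batch_size, input_len, output_len) combination, named from `--profile-filename-prefix` as shown in the hunk above. For example (model path is illustrative; the resulting `.trace.json.gz` files can be inspected with chrome://tracing or Perfetto):

python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --output-len 32 --profile --profile-filename-prefix traces/llama3
# writes traces/llama3_batch1_input256_output32.trace.json.gz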
sglang/bench_one_batch_server.py CHANGED
@@ -22,7 +22,7 @@ from typing import Tuple
  import numpy as np
  import requests

- from sglang.srt.server import launch_server
+ from sglang.srt.entrypoints.http_server import launch_server
  from sglang.srt.server_args import ServerArgs
  from sglang.srt.utils import kill_process_tree

sglang/bench_serving.py CHANGED
@@ -452,6 +452,7 @@ def get_dataset(args, tokenizer):
  num_requests=args.num_prompts,
  tokenizer=tokenizer,
  fixed_output_len=args.sharegpt_output_len,
+ context_len=args.sharegpt_context_len,
  )
  elif args.dataset_name == "random":
  input_requests = sample_random_requests(
@@ -464,11 +465,11 @@ def get_dataset(args, tokenizer):
  )
  elif args.dataset_name == "generated-shared-prefix":
  input_requests = sample_generated_shared_prefix_requests(
- num_groups=args.gen_num_groups,
- prompts_per_group=args.gen_prompts_per_group,
- system_prompt_len=args.gen_system_prompt_len,
- question_len=args.gen_question_len,
- output_len=args.gen_output_len,
+ num_groups=args.gsp_num_groups,
+ prompts_per_group=args.gsp_prompts_per_group,
+ system_prompt_len=args.gsp_system_prompt_len,
+ question_len=args.gsp_question_len,
+ output_len=args.gsp_output_len,
  tokenizer=tokenizer,
  )
  else:
@@ -560,6 +561,7 @@ def sample_sharegpt_requests(
  num_requests: int,
  tokenizer: PreTrainedTokenizerBase,
  fixed_output_len: Optional[int] = None,
+ context_len: Optional[int] = None,
  ) -> List[Tuple[str, int, int]]:
  if fixed_output_len is not None and fixed_output_len < 4:
  raise ValueError("output_len too small")
@@ -597,14 +599,15 @@
  output_len = (
  len(completion_token_ids) if fixed_output_len is None else fixed_output_len
  )
- if prompt_len < 4 or output_len < 4:
+
+ if prompt_len < 1 or output_len < 1:
  # Prune too short sequences.
  continue
- if prompt_len > 1024 or (
- prompt_len + output_len > 2048 and fixed_output_len is None
- ):
+
+ if context_len and prompt_len + output_len > context_len:
  # Prune too long sequences.
  continue
+
  filtered_dataset.append((prompt, prompt_len, output_len))

  print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")
@@ -706,8 +709,8 @@ def get_gen_prefix_cache_path(args, tokenizer):

  # Create a unique cache filename based on the generation parameters
  cache_key = (
- f"gen_prefix_{args.gen_num_groups}_{args.gen_prompts_per_group}_"
- f"{args.gen_system_prompt_len}_{args.gen_question_len}_{args.gen_output_len}_"
+ f"gen_shared_prefix_{args.gsp_num_groups}_{args.gsp_prompts_per_group}_"
+ f"{args.gsp_system_prompt_len}_{args.gsp_question_len}_{args.gsp_output_len}_"
  f"{tokenizer.__class__.__name__}.pkl"
  )
  return cache_dir / cache_key
@@ -1374,6 +1377,12 @@ if __name__ == "__main__":
  default=None,
  help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
  )
+ parser.add_argument(
+ "--sharegpt-context-len",
+ type=int,
+ default=None,
+ help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+ )
  parser.add_argument(
  "--random-input-len",
  type=int,
@@ -1453,49 +1462,49 @@
  help="Append given JSON object to the request payload. You can use this to specify"
  "additional generate params like sampling params.",
  )
+ parser.add_argument(
+ "--profile",
+ action="store_true",
+ help="Use Torch Profiler. The endpoint must be launched with "
+ "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+ )
+ parser.add_argument(
+ "--lora-name",
+ type=str,
+ default=None,
+ help="The name of LoRA adapter",
+ )

  group = parser.add_argument_group("generated-shared-prefix dataset arguments")
  group.add_argument(
- "--gen-num-groups",
+ "--gsp-num-groups",
  type=int,
  default=64,
  help="Number of system prompt groups for generated-shared-prefix dataset",
  )
  group.add_argument(
- "--gen-prompts-per-group",
+ "--gsp-prompts-per-group",
  type=int,
  default=16,
  help="Number of prompts per system prompt group for generated-shared-prefix dataset",
  )
  group.add_argument(
- "--gen-system-prompt-len",
+ "--gsp-system-prompt-len",
  type=int,
  default=2048,
  help="Target length in tokens for system prompts in generated-shared-prefix dataset",
  )
  group.add_argument(
- "--gen-question-len",
+ "--gsp-question-len",
  type=int,
  default=128,
  help="Target length in tokens for questions in generated-shared-prefix dataset",
  )
  group.add_argument(
- "--gen-output-len",
+ "--gsp-output-len",
  type=int,
  default=256,
  help="Target length in tokens for outputs in generated-shared-prefix dataset",
  )
- parser.add_argument(
- "--profile",
- action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
- )
- parser.add_argument(
- "--lora-name",
- type=str,
- default=None,
- help="The name of LoRA adapter",
- )
  args = parser.parse_args()
  run_benchmark(args)
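The ShareGPT sampler previously hard-coded its pruning (prompt_len > 1024, or prompt_len + output_len > 2048 when no fixed output length is set); it now keeps every sample with at least one prompt token and one output token unless `--sharegpt-context-len` is given, in which case samples whose prompt and output together exceed that length are dropped. For example, with `--sharegpt-context-len 4096`, a 3,000-token prompt with a 1,500-token completion is pruned (4,500 > 4,096), while the same prompt with a 1,000-token completion is kept. An illustrative invocation (backend, model, and counts are examples, not defaults from this diff):

python -m sglang.bench_serving --backend sglang --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name sharegpt --num-prompts 1000 --sharegpt-context-len 4096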
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -1,6 +1,11 @@
+ import atexit
  import json
+ import multiprocessing
  import warnings
- from typing import List, Optional
+ from typing import Dict, List, Optional, Union
+
+ import aiohttp
+ import requests

  from sglang.global_config import global_config
  from sglang.lang.backend.base_backend import BaseBackend
@@ -251,11 +256,12 @@ class RuntimeEndpoint(BaseBackend):
  }
  obj = self._generate_http_request(s, data)

- normalized_prompt_logprobs = [
- r["meta_info"]["normalized_prompt_logprob"] for r in obj
- ]
  input_token_logprobs = [r["meta_info"]["input_token_logprobs"] for r in obj]
  output_token_logprobs = [r["meta_info"]["output_token_logprobs"] for r in obj]
+ normalized_prompt_logprobs = [
+ compute_normalized_prompt_logprobs(r["meta_info"]["input_token_logprobs"])
+ for r in obj
+ ]

  # Remove extra token if no token healing occurred
  for i in range(len(input_token_logprobs)):
@@ -319,3 +325,176 @@ class RuntimeEndpoint(BaseBackend):
  def _assert_success(self, res):
  if res.status_code != 200:
  raise RuntimeError(res.json())
+
+
+ def compute_normalized_prompt_logprobs(input_logprobs):
+ values = [x[0] for x in input_logprobs if x[0]]
+ return sum(values) / len(values)
+
+
+ class Runtime:
+ """
+ A wrapper for the HTTP server.
+ This is used for launching the server in a python program without
+ using the commond line interface.
+
+ It is mainly used for the frontend language.
+ You should use the Engine class if you want to do normal offline processing without the frontend language.
+ """
+
+ def __init__(
+ self,
+ log_level: str = "error",
+ *args,
+ **kwargs,
+ ):
+ """See the arguments in server_args.py::ServerArgs"""
+ # We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
+ # client code without installing SRT server and its dependency if they want.
+ from sglang.srt.entrypoints.http_server import launch_server
+ from sglang.srt.server_args import ServerArgs
+ from sglang.srt.utils import is_port_available
+
+ self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
+
+ # Pre-allocate ports
+ for port in range(self.server_args.port, 40000):
+ if is_port_available(port):
+ break
+ self.server_args.port = port
+
+ self.url = self.server_args.url()
+ self.generate_url = self.url + "/generate"
+
+ # NOTE: We store pid instead of proc to fix some issues during __delete__
+ self.pid = None
+ pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)
+
+ proc = multiprocessing.Process(
+ target=launch_server,
+ args=(self.server_args, pipe_writer),
+ )
+ proc.start()
+ pipe_writer.close()
+ self.pid = proc.pid
+
+ # Before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
+ atexit.register(self.shutdown)
+
+ # TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
+ try:
+ init_state = pipe_reader.recv()
+ except EOFError:
+ init_state = ""
+
+ if init_state != "ready":
+ self.shutdown()
+ raise RuntimeError(
+ "Initialization failed. Please see the error messages above."
+ )
+
+ self.endpoint = RuntimeEndpoint(self.url)
+
+ def shutdown(self):
+ from sglang.srt.utils import kill_process_tree
+
+ if self.pid is not None:
+ kill_process_tree(self.pid)
+ self.pid = None
+
+ def cache_prefix(self, prefix: str):
+ self.endpoint.cache_prefix(prefix)
+
+ def get_tokenizer(self):
+ from sglang.srt.hf_transformers_utils import get_tokenizer
+
+ return get_tokenizer(
+ self.server_args.tokenizer_path,
+ tokenizer_mode=self.server_args.tokenizer_mode,
+ trust_remote_code=self.server_args.trust_remote_code,
+ revision=self.server_args.revision,
+ )
+
+ async def async_generate(
+ self,
+ prompt: str,
+ sampling_params: Optional[Dict] = None,
+ ):
+ if self.server_args.skip_tokenizer_init:
+ json_data = {
+ "input_ids": prompt,
+ "sampling_params": sampling_params,
+ "stream": True,
+ }
+ else:
+ json_data = {
+ "text": prompt,
+ "sampling_params": sampling_params,
+ "stream": True,
+ }
+ pos = 0
+
+ timeout = aiohttp.ClientTimeout(total=3 * 3600)
+ async with aiohttp.ClientSession(timeout=timeout, trust_env=True) as session:
+ async with session.post(self.generate_url, json=json_data) as response:
+ async for chunk, _ in response.content.iter_chunks():
+ chunk = chunk.decode("utf-8")
+ if chunk and chunk.startswith("data:"):
+ if chunk == "data: [DONE]\n\n":
+ break
+ data = json.loads(chunk[5:].strip("\n"))
+ if "text" in data:
+ cur = data["text"][pos:]
+ if cur:
+ yield cur
+ pos += len(cur)
+ else:
+ yield data
+
+ add_request = async_generate
+
+ def generate(
+ self,
+ prompt: Union[str, List[str]],
+ sampling_params: Optional[Dict] = None,
+ return_logprob: Optional[Union[List[bool], bool]] = False,
+ logprob_start_len: Optional[Union[List[int], int]] = None,
+ top_logprobs_num: Optional[Union[List[int], int]] = None,
+ lora_path: Optional[List[Optional[str]]] = None,
+ ):
+ json_data = {
+ "text": prompt,
+ "sampling_params": sampling_params,
+ "return_logprob": return_logprob,
+ "logprob_start_len": logprob_start_len,
+ "top_logprobs_num": top_logprobs_num,
+ "lora_path": lora_path,
+ }
+ assert not isinstance(lora_path, list) or len(lora_path) == len(prompt)
+ response = requests.post(
+ self.url + "/generate",
+ json=json_data,
+ )
+ return json.dumps(response.json())
+
+ def encode(
+ self,
+ prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+ ):
+ json_data = {"text": prompt}
+ response = requests.post(self.url + "/encode", json=json_data)
+ return json.dumps(response.json())
+
+ async def get_server_info(self):
+ async with aiohttp.ClientSession() as session:
+ async with session.get(f"{self.url}/get_server_info") as response:
+ if response.status == 200:
+ return await response.json()
+ else:
+ error_data = await response.json()
+ raise RuntimeError(
+ f"Failed to get server info. {error_data['error']['message']}"
+ )
+
+ def __del__(self):
+ self.shutdown()
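Because `normalized_prompt_logprob` is no longer returned in `meta_info`, the frontend now recomputes it client-side with `compute_normalized_prompt_logprobs`, which averages the non-missing per-token logprobs of the prompt. A small worked example, assuming each `input_token_logprobs` entry is a `(logprob, token_id)` pair with `None` for the first token (the exact tuple layout is not shown in this diff):

input_token_logprobs = [
    (None, 128000),   # first prompt token carries no logprob
    (-1.25, 791),
    (-0.50, 6864),
    (-2.25, 315),
]
values = [x[0] for x in input_token_logprobs if x[0]]  # [-1.25, -0.5, -2.25]
print(sum(values) / len(values))                       # -4.0 / 3 ≈ -1.3333

Note that the `if x[0]` filter also discards an entry whose logprob is exactly 0.0, which is a corner case rather than the common path.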