sglang 0.2.11__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sglang/bench_latency.py +6 -4
  2. sglang/bench_serving.py +46 -22
  3. sglang/lang/compiler.py +2 -2
  4. sglang/lang/ir.py +3 -3
  5. sglang/srt/constrained/base_tool_cache.py +1 -1
  6. sglang/srt/constrained/fsm_cache.py +12 -2
  7. sglang/srt/layers/activation.py +33 -0
  8. sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
  9. sglang/srt/layers/extend_attention.py +6 -1
  10. sglang/srt/layers/layernorm.py +65 -0
  11. sglang/srt/layers/logits_processor.py +5 -0
  12. sglang/srt/layers/pooler.py +50 -0
  13. sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
  14. sglang/srt/layers/radix_attention.py +2 -2
  15. sglang/srt/managers/detokenizer_manager.py +31 -9
  16. sglang/srt/managers/io_struct.py +63 -0
  17. sglang/srt/managers/policy_scheduler.py +173 -25
  18. sglang/srt/managers/schedule_batch.py +110 -87
  19. sglang/srt/managers/tokenizer_manager.py +193 -111
  20. sglang/srt/managers/tp_worker.py +289 -352
  21. sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
  22. sglang/srt/mem_cache/chunk_cache.py +43 -20
  23. sglang/srt/mem_cache/memory_pool.py +2 -2
  24. sglang/srt/mem_cache/radix_cache.py +74 -40
  25. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  26. sglang/srt/model_executor/forward_batch_info.py +168 -105
  27. sglang/srt/model_executor/model_runner.py +24 -37
  28. sglang/srt/models/gemma2.py +0 -1
  29. sglang/srt/models/internlm2.py +2 -7
  30. sglang/srt/models/llama2.py +4 -4
  31. sglang/srt/models/llama_embedding.py +88 -0
  32. sglang/srt/models/qwen2_moe.py +0 -11
  33. sglang/srt/openai_api/adapter.py +155 -27
  34. sglang/srt/openai_api/protocol.py +37 -1
  35. sglang/srt/sampling/penaltylib/__init__.py +13 -0
  36. sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  37. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  38. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  39. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  40. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  41. sglang/srt/sampling_params.py +31 -4
  42. sglang/srt/server.py +69 -15
  43. sglang/srt/server_args.py +26 -19
  44. sglang/srt/utils.py +31 -13
  45. sglang/test/run_eval.py +10 -1
  46. sglang/test/runners.py +63 -63
  47. sglang/test/simple_eval_humaneval.py +2 -8
  48. sglang/test/simple_eval_mgsm.py +203 -0
  49. sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  50. sglang/test/test_layernorm.py +60 -0
  51. sglang/test/test_programs.py +4 -2
  52. sglang/test/test_utils.py +20 -2
  53. sglang/utils.py +0 -1
  54. sglang/version.py +1 -1
  55. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/METADATA +23 -14
  56. sglang-0.2.12.dist-info/RECORD +112 -0
  57. sglang/srt/layers/linear.py +0 -884
  58. sglang/srt/layers/quantization/__init__.py +0 -64
  59. sglang/srt/layers/quantization/fp8.py +0 -677
  60. sglang-0.2.11.dist-info/RECORD +0 -102
  61. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/LICENSE +0 -0
  62. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/WHEEL +0 -0
  63. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/top_level.txt +0 -0
sglang/srt/sampling_params.py CHANGED
@@ -23,13 +23,16 @@ _SAMPLING_EPS = 1e-6
 class SamplingParams:
     def __init__(
         self,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
+        min_new_tokens: int = 0,
         stop: Optional[Union[str, List[str]]] = None,
+        stop_token_ids: Optional[List[int]] = [],
         temperature: float = 1.0,
         top_p: float = 1.0,
         top_k: int = -1,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
+        repetition_penalty: float = 1.0,
         ignore_eos: bool = False,
         skip_special_tokens: bool = True,
         spaces_between_special_tokens: bool = True,
@@ -42,8 +45,11 @@ class SamplingParams:
         self.top_k = top_k
         self.frequency_penalty = frequency_penalty
         self.presence_penalty = presence_penalty
+        self.repetition_penalty = repetition_penalty
         self.stop_strs = stop
+        self.stop_token_ids = {*stop_token_ids}
         self.max_new_tokens = max_new_tokens
+        self.min_new_tokens = min_new_tokens
         self.ignore_eos = ignore_eos
         self.skip_special_tokens = skip_special_tokens
         self.spaces_between_special_tokens = spaces_between_special_tokens
@@ -80,23 +86,44 @@ class SamplingParams:
             raise ValueError(
                 "presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}."
             )
+        if not 0.0 <= self.repetition_penalty <= 2.0:
+            raise ValueError(
+                "repetition_penalty must be in (0, 2], got "
+                f"{self.repetition_penalty}."
+            )
+        if not 0 <= self.min_new_tokens:
+            raise ValueError(
+                f"min_new_tokens must be in (0, max_new_tokens], got "
+                f"{self.min_new_tokens}."
+            )
         if self.max_new_tokens is not None:
             if self.max_new_tokens < 0:
                 raise ValueError(
                     f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
                 )
+            if not self.min_new_tokens <= self.max_new_tokens:
+                raise ValueError(
+                    f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
+                    f"{self.min_new_tokens}."
+                )
 
     def normalize(self, tokenizer):
         # Process stop strings
         if self.stop_strs is None:
             self.stop_strs = []
-            self.stop_str_max_len = 0
+            if self.stop_token_ids is None:
+                self.stop_str_max_len = 0
+            else:
+                self.stop_str_max_len = 1
         else:
             if isinstance(self.stop_strs, str):
                 self.stop_strs = [self.stop_strs]
 
             stop_str_max_len = 0
             for stop_str in self.stop_strs:
-                stop_str_ids = tokenizer.encode(stop_str, add_special_tokens=False)
-                stop_str_max_len = max(stop_str_max_len, len(stop_str_ids))
+                if tokenizer is not None:
+                    stop_str_ids = tokenizer.encode(stop_str, add_special_tokens=False)
+                    stop_str_max_len = max(stop_str_max_len, len(stop_str_ids))
+                else:
+                    stop_str_max_len = max(stop_str_max_len, len(stop_str))
             self.stop_str_max_len = stop_str_max_len
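To illustrate the new sampling knobs above (larger default max_new_tokens, plus min_new_tokens, stop_token_ids, and repetition_penalty, and a normalize() that tolerates a missing tokenizer), here is a minimal sketch. The values are illustrative, and it assumes the range checks shown in the hunk are invoked through the class's verify()-style method:

# Illustrative only: exercising the sampling parameters added in 0.2.12.
# verify() is assumed to be the method that runs the checks shown above.
from sglang.srt.sampling_params import SamplingParams

params = SamplingParams(
    max_new_tokens=128,        # new default (was 16)
    min_new_tokens=4,          # must stay <= max_new_tokens
    stop_token_ids=[128009],   # stored internally as a set
    repetition_penalty=1.1,    # checked against the (0, 2] range
)
params.verify()                    # raises ValueError if a check above fails
params.normalize(tokenizer=None)   # now tolerated when the tokenizer is skipped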
sglang/srt/server.py CHANGED
@@ -52,7 +52,7 @@ from sglang.srt.managers.controller_single import (
     start_controller_process as start_controller_process_single,
 )
 from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
-from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.openai_api.adapter import (
     load_chat_template_for_openai_api,
@@ -60,6 +60,7 @@ from sglang.srt.openai_api.adapter import (
     v1_chat_completions,
     v1_completions,
     v1_delete_file,
+    v1_embeddings,
     v1_files_create,
     v1_retrieve_batch,
     v1_retrieve_file,
@@ -74,7 +75,8 @@ from sglang.srt.utils import (
     enable_show_time_cost,
     kill_child_process,
     maybe_set_triton_cache_manager,
-    set_torch_compile_config,
+    prepare_model,
+    prepare_tokenizer,
     set_ulimit,
 )
 from sglang.utils import get_exception_traceback
@@ -98,6 +100,7 @@ async def health() -> Response:
 async def get_model_info():
     result = {
         "model_path": tokenizer_manager.model_path,
+        "is_generation": tokenizer_manager.is_generation,
     }
     return result
 
@@ -149,6 +152,21 @@ app.post("/generate")(generate_request)
 app.put("/generate")(generate_request)
 
 
+async def encode_request(obj: EmbeddingReqInput, request: Request):
+    """Handle an embedding request."""
+    try:
+        ret = await tokenizer_manager.generate_request(obj, request).__anext__()
+        return ret
+    except ValueError as e:
+        return JSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
+
+
+app.post("/encode")(encode_request)
+app.put("/encode")(encode_request)
+
+
 @app.post("/v1/completions")
 async def openai_v1_completions(raw_request: Request):
     return await v1_completions(tokenizer_manager, raw_request)
@@ -159,6 +177,12 @@ async def openai_v1_chat_completions(raw_request: Request):
     return await v1_chat_completions(tokenizer_manager, raw_request)
 
 
+@app.post("/v1/embeddings")
+async def openai_v1_embeddings(raw_request: Request):
+    response = await v1_embeddings(tokenizer_manager, raw_request)
+    return response
+
+
 @app.get("/v1/models")
 def available_models():
     """Show available models."""
@@ -235,6 +259,10 @@ def launch_server(
     )
     logger.info(f"{server_args=}")
 
+    # Use model from www.modelscope.cn, first download the model.
+    server_args.model_path = prepare_model(server_args.model_path)
+    server_args.tokenizer_path = prepare_tokenizer(server_args.tokenizer_path)
+
     # Launch processes for multi-node tensor parallelism
     if server_args.nnodes > 1:
         if server_args.node_rank != 0:
@@ -347,10 +375,6 @@ def _set_envs_and_config(server_args: ServerArgs):
     # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
     maybe_set_triton_cache_manager()
 
-    # Set torch compile config
-    if server_args.enable_torch_compile:
-        set_torch_compile_config()
-
     # Set global chat template
     if server_args.chat_template:
         # TODO: replace this with huggingface transformers template
@@ -360,7 +384,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.3",
+            "0.1.4",
            "Please uninstall the old version and "
            "reinstall the latest version by following the instructions "
            "at https://docs.flashinfer.ai/installation.html.",
@@ -385,6 +409,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
     except (AssertionError, requests.exceptions.RequestException) as e:
         last_traceback = get_exception_traceback()
         pass
+    model_info = res.json()
 
     if not success:
         if pipe_finish_writer is not None:
@@ -393,17 +418,24 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
         sys.exit(1)
 
     # Send a warmup request
+    request_name = "/generate" if model_info["is_generation"] else "/encode"
+    max_new_tokens = 8 if model_info["is_generation"] else 1
+    json_data = {
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": max_new_tokens,
+        },
+    }
+    if server_args.skip_tokenizer_init:
+        json_data["input_ids"] = [10, 11, 12]
+    else:
+        json_data["text"] = "The capital city of France is"
+
     try:
         for _ in range(server_args.dp_size):
             res = requests.post(
-                url + "/generate",
-                json={
-                    "text": "The capital city of France is",
-                    "sampling_params": {
-                        "temperature": 0,
-                        "max_new_tokens": 8,
-                    },
-                },
+                url + request_name,
+                json=json_data,
                 headers=headers,
                 timeout=600,
             )
@@ -415,6 +447,15 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
         print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
         sys.exit(1)
 
+    # Print warnings here
+    if server_args.disable_radix_cache and server_args.chunked_prefill_size is not None:
+        logger.warning(
+            "You set both `--disable-radix-cache` and `--chunked-prefill-size`. "
+            "This combination is an experimental feature and we noticed it can lead to "
+            "wrong generation results. If you want to use chunked prefill, it is recommended "
+            "not using `--disable-radix-cache`."
+        )
+
     logger.info("The server is fired up and ready to roll!")
     if pipe_finish_writer is not None:
         pipe_finish_writer.send("init ok")
@@ -534,5 +575,18 @@ class Runtime:
         )
         return json.dumps(response.json())
 
+    def encode(
+        self,
+        prompt: str,
+    ):
+        json_data = {
+            "text": prompt,
+        }
+        response = requests.post(
+            self.url + "/encode",
+            json=json_data,
+        )
+        return json.dumps(response.json())
+
     def __del__(self):
         self.shutdown()
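A minimal client sketch against the embedding entry points added above. The base URL and model name are placeholders, and the /v1/embeddings payload is assumed to follow the usual OpenAI embeddings request shape; only the /encode body ({"text": ...}) is confirmed by Runtime.encode() in this diff:

# Hypothetical client; base_url and the model name are assumptions.
import requests

base_url = "http://127.0.0.1:30000"

# Native endpoint: mirrors Runtime.encode(), which posts {"text": prompt}.
res = requests.post(base_url + "/encode", json={"text": "The capital city of France is"})
print(res.json())

# OpenAI-compatible route served by v1_embeddings (request shape assumed).
res = requests.post(
    base_url + "/v1/embeddings",
    json={"model": "e5-mistral-7b-instruct", "input": "The capital city of France is"},
)
print(res.json())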
sglang/srt/server_args.py CHANGED
@@ -27,6 +27,7 @@ class ServerArgs:
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    skip_tokenizer_init: bool = False
     load_format: str = "auto"
     dtype: str = "auto"
     trust_remote_code: bool = True
@@ -42,10 +43,11 @@ class ServerArgs:
 
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
-    max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     max_num_reqs: Optional[int] = None
     max_total_tokens: Optional[int] = None
+    chunked_prefill_size: int = -1
+    max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
 
@@ -62,15 +64,12 @@ class ServerArgs:
 
     # Other
     api_key: Optional[str] = None
-    file_storage_pth: str = "SGlang_storage"
+    file_storage_pth: str = "SGLang_storage"
 
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
 
-    # Chunked Prefill
-    chunked_prefill_size: Optional[int] = None
-
     # Optimization/debug options
     disable_flashinfer: bool = False
     disable_flashinfer_sampling: bool = False
@@ -96,6 +95,10 @@ class ServerArgs:
         if self.served_model_name is None:
             self.served_model_name = self.model_path
 
+        if self.chunked_prefill_size <= 0:
+            # Disable chunked prefill
+            self.chunked_prefill_size = None
+
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
@@ -107,6 +110,7 @@ class ServerArgs:
                 self.mem_fraction_static = 0.87
             else:
                 self.mem_fraction_static = 0.88
+
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
@@ -151,6 +155,11 @@ class ServerArgs:
             "tokenizer if available, and 'slow' will "
             "always use the slow tokenizer.",
         )
+        parser.add_argument(
+            "--skip-tokenizer-init",
+            action="store_true",
+            help="If set, skip init tokenizer and pass input_ids in generate request",
+        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -226,12 +235,6 @@ class ServerArgs:
             default=ServerArgs.mem_fraction_static,
             help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
         )
-        parser.add_argument(
-            "--max-prefill-tokens",
-            type=int,
-            default=ServerArgs.max_prefill_tokens,
-            help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
-        )
         parser.add_argument(
             "--max-running-requests",
             type=int,
@@ -250,6 +253,18 @@ class ServerArgs:
             default=ServerArgs.max_total_tokens,
             help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
         )
+        parser.add_argument(
+            "--chunked-prefill-size",
+            type=int,
+            default=ServerArgs.chunked_prefill_size,
+            help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill",
+        )
+        parser.add_argument(
+            "--max-prefill-tokens",
+            type=int,
+            default=ServerArgs.max_prefill_tokens,
+            help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
+        )
         parser.add_argument(
             "--schedule-policy",
             type=str,
@@ -347,14 +362,6 @@ class ServerArgs:
         )
         parser.add_argument("--node-rank", type=int, help="The node rank.")
 
-        # Chunked prefill
-        parser.add_argument(
-            "--chunked-prefill-size",
-            type=int,
-            default=ServerArgs.chunked_prefill_size,
-            help="The size of the chunked prefill.",
-        )
-
        # Optimization/debug options
        parser.add_argument(
            "--disable-flashinfer",
sglang/srt/utils.py CHANGED
@@ -197,6 +197,8 @@ def allocate_init_ports(
 def get_int_token_logit_bias(tokenizer, vocab_size):
     """Get the logit bias for integer-only tokens."""
     # a bug when model's vocab size > tokenizer.vocab_size
+    if tokenizer == None:
+        return [-1e5] * vocab_size
     vocab_size = tokenizer.vocab_size
     logit_bias = np.zeros(vocab_size, dtype=np.float32)
     for t_id in range(vocab_size):
@@ -223,6 +225,15 @@ def is_multimodal_model(model):
     raise ValueError("unrecognized type")
 
 
+def is_generation_model(model_architectures):
+    if (
+        "LlamaEmbeddingModel" in model_architectures
+        or "MistralModel" in model_architectures
+    ):
+        return False
+    return True
+
+
 def decode_video_base64(video_base64):
     from PIL import Image
 
@@ -622,19 +633,6 @@ def receive_addrs(model_port_args, server_args):
     dist.destroy_process_group()
 
 
-def set_torch_compile_config():
-    # The following configurations are for torch compile optimizations
-    import torch._dynamo.config
-    import torch._inductor.config
-
-    torch._inductor.config.coordinate_descent_tuning = True
-    torch._inductor.config.triton.unique_kernel_names = True
-    torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
-
-    # FIXME: tmp workaround
-    torch._dynamo.config.accumulated_cache_size_limit = 256
-
-
 def set_ulimit(target_soft_limit=65535):
     resource_type = resource.RLIMIT_NOFILE
     current_soft, current_hard = resource.getrlimit(resource_type)
@@ -705,3 +703,23 @@ def add_api_key_middleware(app, api_key):
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return JSONResponse(content={"error": "Unauthorized"}, status_code=401)
         return await call_next(request)
+
+
+def prepare_model(model_path):
+    if "SGLANG_USE_MODELSCOPE" in os.environ:
+        if not os.path.exists(model_path):
+            from modelscope import snapshot_download
+
+            return snapshot_download(model_path)
+    return model_path
+
+
+def prepare_tokenizer(tokenizer_path):
+    if "SGLANG_USE_MODELSCOPE" in os.environ:
+        if not os.path.exists(tokenizer_path):
+            from modelscope import snapshot_download
+
+            return snapshot_download(
+                tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
+            )
+    return tokenizer_path
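The two helpers added at the bottom of utils.py gate ModelScope downloads behind the SGLANG_USE_MODELSCOPE environment variable; a small sketch of the intended flow, with a placeholder repo id:

# Sketch: with SGLANG_USE_MODELSCOPE set and a non-local path, the helpers
# download from ModelScope; otherwise the path is returned unchanged.
import os

from sglang.srt.utils import prepare_model, prepare_tokenizer

os.environ["SGLANG_USE_MODELSCOPE"] = "1"  # presence of the variable is what matters

model_path = prepare_model("qwen/Qwen2-7B-Instruct")          # placeholder repo id
tokenizer_path = prepare_tokenizer("qwen/Qwen2-7B-Instruct")  # skips *.bin / *.safetensors
print(model_path, tokenizer_path)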
sglang/test/run_eval.py CHANGED
@@ -16,6 +16,8 @@ from sglang.test.simple_eval_common import (
 
 
 def run_eval(args):
+    set_ulimit()
+
     if "OPENAI_API_KEY" not in os.environ:
         os.environ["OPENAI_API_KEY"] = "EMPTY"
 
@@ -39,6 +41,14 @@ def run_eval(args):
         eval_obj = MathEval(
             filename, equality_checker, args.num_examples, args.num_threads
         )
+    elif args.eval_name == "mgsm":
+        from sglang.test.simple_eval_mgsm import MGSMEval
+
+        eval_obj = MGSMEval(args.num_examples, args.num_threads)
+    elif args.eval_name == "mgsm_en":
+        from sglang.test.simple_eval_mgsm import MGSMEval
+
+        eval_obj = MGSMEval(args.num_examples, args.num_threads, languages=["en"])
     elif args.eval_name == "gpqa":
         from sglang.test.simple_eval_gpqa import GPQAEval
 
@@ -109,7 +119,6 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
-    set_ulimit()
     args = parser.parse_args()
 
     run_eval(args)
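The new mgsm / mgsm_en choices hook into run_eval's --eval-name dispatch; a sketch of constructing the eval objects the same way run_eval does, with made-up example counts (the constructor arguments come from the branch added above):

# Sketch: driving the new MGSM eval directly; num_examples/num_threads are arbitrary.
from sglang.test.simple_eval_mgsm import MGSMEval

num_examples, num_threads = 8, 64
eval_all = MGSMEval(num_examples, num_threads)                   # all languages
eval_en = MGSMEval(num_examples, num_threads, languages=["en"])  # English only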
sglang/test/runners.py CHANGED
@@ -23,23 +23,19 @@ import torch.nn.functional as F
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from sglang.srt.server import Runtime
+from sglang.srt.utils import is_generation_model
 
 DEFAULT_PROMPTS = [
-    "The capital of France is",
+    # the output of gemma-2-2b from SRT is unstable on the commented prompt
+    # "The capital of France is",
     "The capital of the United Kindom is",
     "Today is a sunny day and I like",
+    "AI is a field of computer science focused on",
 ]
 
 NUM_TOP_LOGPROBS = 5
 
 
-def is_embedding_model(model_path):
-    # FIXME incomplete list
-    if "e5-mistral-7b-instruct" in model_path.lower():
-        return True
-    return False
-
-
 def get_dtype_str(torch_dtype):
     if torch_dtype is torch.float16:
         return "float16"
@@ -49,10 +45,11 @@ def get_dtype_str(torch_dtype):
 
 @dataclass
 class ModelOutput:
-    output_strs: str = None
-    top_input_logprobs: torch.Tensor = None
-    top_output_logprobs: torch.Tensor = None
-    embed_logits: torch.Tensor = None
+    output_strs: List[str] = None
+    output_ids: List[int] = None
+    top_input_logprobs: List[torch.Tensor] = None
+    top_output_logprobs: List[torch.Tensor] = None
+    embed_logits: List[torch.Tensor] = None
 
 
 class HFRunner:
@@ -60,7 +57,7 @@ class HFRunner:
         self,
         model_path,
         torch_dtype=torch.float16,
-        is_embedding_model=None,
+        is_generation_model=None,
     ):
         self.in_queue = multiprocessing.Queue()
         self.out_queue = multiprocessing.Queue()
@@ -72,13 +69,13 @@ class HFRunner:
                 self.out_queue,
                 model_path,
                 torch_dtype,
-                is_embedding_model,
+                is_generation_model,
             ),
         )
         self.model_proc.start()
 
     def start_model_process(
-        self, in_queue, out_queue, model_path, torch_dtype, is_embedding_model
+        self, in_queue, out_queue, model_path, torch_dtype, is_generation_model
     ):
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_path,
@@ -86,12 +83,12 @@ class HFRunner:
             trust_remote_code=True,
         )
 
-        self.is_embedding_model = (
-            is_embedding_model(model_path)
-            if is_embedding_model is None
-            else is_embedding_model
+        self.is_generation_model = (
+            is_generation_model(model_path)
+            if is_generation_model is None
+            else is_generation_model
        )
-        if not self.is_embedding_model:
+        if self.is_generation_model:
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_path,
                 torch_dtype=torch_dtype,
@@ -103,13 +100,13 @@ class HFRunner:
 
             self.model = SentenceTransformer(
                 model_path,
-                device="cpu",
-            ).to(dtype=torch_dtype)
+                model_kwargs={"torch_dtype": torch_dtype},
+            )
 
         while True:
             prompts, max_new_tokens = in_queue.get()
             if prompts is not None:
-                if not self.is_embedding_model:
+                if self.is_generation_model:
                     output_strs = []
                     prefill_logprobs = []
                     for p in prompts:
@@ -123,7 +120,9 @@ class HFRunner:
                         output_ids = self.model.generate(
                             input_ids, do_sample=False, max_new_tokens=max_new_tokens
                        )
-                        output_strs.append(self.tokenizer.decode(output_ids[0]))
+                        output_strs.append(
+                            self.tokenizer.decode(output_ids[0][len(input_ids[0]) :])
+                        )
 
                         logits = self.model.forward(input_ids).logits[0]
                         logprobs = F.log_softmax(
@@ -144,7 +143,6 @@ class HFRunner:
                     )
 
                 else:
-                    assert isinstance(prompts, List[str])
                     logits = self.model.encode(prompts).tolist()
 
                     out_queue.put(ModelOutput(embed_logits=logits))
@@ -152,7 +150,7 @@ class HFRunner:
     def forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
-        max_new_tokens=64,
+        max_new_tokens=8,
     ):
         self.in_queue.put((prompts, max_new_tokens))
         return self.out_queue.get()
@@ -175,16 +173,13 @@ class SRTRunner:
         model_path,
         tp_size=1,
         torch_dtype=torch.float16,
-        is_embedding_model=None,
+        is_generation_model=None,
     ):
-        self.is_embedding_model = (
-            is_embedding_model(model_path)
-            if is_embedding_model is None
-            else is_embedding_model
+        self.is_generation_model = (
+            is_generation_model(model_path)
+            if is_generation_model is None
+            else is_generation_model
         )
-        if self.is_embedding_model:
-            raise NotImplementedError()
-
         self.runtime = Runtime(
             model_path=model_path,
             tp_size=tp_size,
@@ -194,40 +189,45 @@ class SRTRunner:
     def forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
-        max_new_tokens=64,
+        max_new_tokens=8,
     ):
-        # the return value contains logprobs from prefill
-        output_strs = []
-        top_input_logprobs = []
-        sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
-        for prompt in prompts:
-            response = self.runtime.generate(
-                prompt,
-                sampling_params=sampling_params,
-                return_logprob=True,
-                top_logprobs_num=NUM_TOP_LOGPROBS,
-            )
-            response = json.loads(response)
-            output_strs.append(response["text"])
-            top_input_logprobs.append(
-                [
-                    [tup[0] for tup in x[:NUM_TOP_LOGPROBS]]
-                    for x in response["meta_info"]["input_top_logprobs"][1:]
-                ]
-                + [
+        if self.is_generation_model:
+            # the return value contains logprobs from prefill
+            output_strs = []
+            top_input_logprobs = []
+            sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
+            for prompt in prompts:
+                response = self.runtime.generate(
+                    prompt,
+                    sampling_params=sampling_params,
+                    return_logprob=True,
+                    top_logprobs_num=NUM_TOP_LOGPROBS,
+                )
+                response = json.loads(response)
+                output_strs.append(response["text"])
+                top_input_logprobs.append(
                     [
-                        tup[0]
-                        for tup in response["meta_info"]["output_top_logprobs"][0][
-                            :NUM_TOP_LOGPROBS
+                        [tup[0] for tup in x[:NUM_TOP_LOGPROBS]]
+                        for x in response["meta_info"]["input_top_logprobs"][1:]
+                    ]
+                    + [
+                        [
+                            tup[0]
+                            for tup in response["meta_info"]["output_top_logprobs"][0][
+                                :NUM_TOP_LOGPROBS
+                            ]
                         ]
                     ]
-                ]
-            )
-            # print(response["meta_info"]["output_top_logprobs"][0])
+                )
 
-        return ModelOutput(
-            output_strs=output_strs, top_input_logprobs=top_input_logprobs
-        )
+            return ModelOutput(
+                output_strs=output_strs, top_input_logprobs=top_input_logprobs
+            )
+        else:
+            response = self.runtime.encode(prompts)
+            response = json.loads(response)
+            logits = [x["embedding"] for x in response]
+            return ModelOutput(embed_logits=logits)
 
     def __enter__(self):
         return self
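Finally, a sketch of how the reworked test runners are meant to be driven for both generation and embedding models. The model paths are placeholders, and the attribute and argument names are taken only from the hunks above:

# Sketch: SRTRunner.forward() branches on is_generation_model, returning
# output_strs/top_input_logprobs for generation models and embed_logits
# (served via the new /encode endpoint) for embedding models.
from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner

# Generation model (placeholder path): greedy decode, 8 new tokens by default.
gen_runner = SRTRunner("meta-llama/Meta-Llama-3-8B-Instruct", is_generation_model=True)
out = gen_runner.forward(DEFAULT_PROMPTS)
print(out.output_strs)

# Embedding model (placeholder path): results come back as embed_logits.
emb_runner = SRTRunner("intfloat/e5-mistral-7b-instruct", is_generation_model=False)
emb = emb_runner.forward(["AI is a field of computer science focused on"])
print(len(emb.embed_logits))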