sglang 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.
- sglang/api.py +6 -0
- sglang/bench_latency.py +7 -3
- sglang/bench_serving.py +50 -26
- sglang/check_env.py +15 -0
- sglang/lang/chat_template.py +10 -5
- sglang/lang/compiler.py +4 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +9 -0
- sglang/launch_server.py +8 -1
- sglang/srt/conversation.py +50 -1
- sglang/srt/hf_transformers_utils.py +22 -23
- sglang/srt/layers/activation.py +24 -1
- sglang/srt/layers/decode_attention.py +338 -50
- sglang/srt/layers/fused_moe/layer.py +2 -2
- sglang/srt/layers/layernorm.py +3 -0
- sglang/srt/layers/logits_processor.py +60 -23
- sglang/srt/layers/radix_attention.py +3 -4
- sglang/srt/layers/sampler.py +154 -0
- sglang/srt/managers/controller_multi.py +2 -8
- sglang/srt/managers/controller_single.py +7 -10
- sglang/srt/managers/detokenizer_manager.py +20 -9
- sglang/srt/managers/io_struct.py +44 -11
- sglang/srt/managers/policy_scheduler.py +5 -2
- sglang/srt/managers/schedule_batch.py +52 -167
- sglang/srt/managers/tokenizer_manager.py +192 -83
- sglang/srt/managers/tp_worker.py +130 -43
- sglang/srt/mem_cache/memory_pool.py +82 -8
- sglang/srt/mm_utils.py +79 -7
- sglang/srt/model_executor/cuda_graph_runner.py +49 -11
- sglang/srt/model_executor/forward_batch_info.py +59 -27
- sglang/srt/model_executor/model_runner.py +210 -61
- sglang/srt/models/chatglm.py +4 -12
- sglang/srt/models/commandr.py +5 -1
- sglang/srt/models/dbrx.py +5 -1
- sglang/srt/models/deepseek.py +5 -1
- sglang/srt/models/deepseek_v2.py +5 -1
- sglang/srt/models/gemma.py +5 -1
- sglang/srt/models/gemma2.py +15 -7
- sglang/srt/models/gpt_bigcode.py +5 -1
- sglang/srt/models/grok.py +16 -2
- sglang/srt/models/internlm2.py +5 -1
- sglang/srt/models/llama2.py +7 -3
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/llama_embedding.py +4 -0
- sglang/srt/models/llava.py +176 -59
- sglang/srt/models/minicpm.py +5 -1
- sglang/srt/models/mixtral.py +5 -1
- sglang/srt/models/mixtral_quant.py +5 -1
- sglang/srt/models/qwen.py +5 -2
- sglang/srt/models/qwen2.py +13 -3
- sglang/srt/models/qwen2_moe.py +5 -14
- sglang/srt/models/stablelm.py +5 -1
- sglang/srt/openai_api/adapter.py +117 -37
- sglang/srt/sampling/sampling_batch_info.py +209 -0
- sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -0
- sglang/srt/server.py +84 -56
- sglang/srt/server_args.py +43 -15
- sglang/srt/utils.py +26 -16
- sglang/test/runners.py +23 -31
- sglang/test/simple_eval_common.py +9 -10
- sglang/test/simple_eval_gpqa.py +2 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_math.py +2 -1
- sglang/test/simple_eval_mmlu.py +2 -1
- sglang/test/test_activation.py +55 -0
- sglang/test/test_utils.py +36 -53
- sglang/version.py +1 -1
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/METADATA +92 -25
- sglang-0.2.14.dist-info/RECORD +114 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
- sglang/launch_server_llavavid.py +0 -29
- sglang-0.2.13.dist-info/RECORD +0 -112
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/api.py
CHANGED
@@ -66,6 +66,7 @@ def gen(
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
@@ -103,6 +104,7 @@ def gen(
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,
@@ -123,6 +125,7 @@ def gen_int(
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
@@ -139,6 +142,7 @@ def gen_int(
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,
@@ -159,6 +163,7 @@ def gen_string(
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
@@ -175,6 +180,7 @@ def gen_string(
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,
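All of `gen`, `gen_int`, and `gen_string` above now accept a `min_p` argument; min-p sampling keeps only tokens whose probability is at least `min_p` times that of the most likely token. A minimal frontend sketch of using it (the endpoint URL and prompt are illustrative, not part of this diff):

import sglang as sgl

@sgl.function
def short_story(s, topic):
    s += "Write one sentence about " + topic + ". "
    # min_p is the new sampling knob added in 0.2.14; the other values are arbitrary.
    s += sgl.gen("story", max_tokens=64, temperature=0.8, top_p=0.95, min_p=0.05)

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
print(short_story.run(topic="a lighthouse")["story"])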
sglang/bench_latency.py
CHANGED
@@ -54,7 +54,7 @@ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_config import ModelConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.model_executor.model_runner import ModelRunner
-from sglang.srt.sampling_params import SamplingParams
+from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import suppress_other_loggers
 
@@ -111,7 +111,11 @@ def load_model(server_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
-    model_config = ModelConfig(
+    model_config = ModelConfig(
+        server_args.model_path,
+        server_args.trust_remote_code,
+        context_length=server_args.context_length,
+    )
     model_runner = ModelRunner(
         model_config=model_config,
         mem_fraction_static=server_args.mem_fraction_static,
@@ -350,7 +354,7 @@ def latency_test(
     for bs, il, ol in itertools.product(
         bench_args.batch_size, bench_args.input_len, bench_args.output_len
     ):
-
+        reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
         ret = latency_test_run_once(
             bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
         )
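With `ModelConfig` now taking `context_length` and synthetic requests regenerated per batch-size/input-length combination, a run is unchanged from the user's point of view, e.g. `python3 -m sglang.bench_latency --model-path <model> --batch-size 1 --input-len 128 --output-len 8` (flag names inferred from `bench_args.batch_size` / `input_len` / `output_len` above and sglang's usual `--model-path` option; treat them as illustrative).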
sglang/bench_serving.py
CHANGED
@@ -149,10 +149,12 @@ async def async_request_openai_completions(
         "completions"
     ), "OpenAI Completions API URL must end with 'completions'."
 
+    prompt = request_func_input.prompt
+
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         payload = {
             "model": request_func_input.model,
-            "prompt": request_func_input.prompt,
+            "prompt": prompt,
             "temperature": 0.0,
             "best_of": 1,
             "max_tokens": request_func_input.output_len,
@@ -220,6 +222,13 @@ async def async_request_openai_completions(
     return output
 
 
+async def async_request_gserver(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    raise NotImplementedError()
+
+
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv("SGLANG_USE_MODELSCOPE", "False").lower() == "true":
         import huggingface_hub.constants
@@ -238,6 +247,13 @@ def get_model(pretrained_model_name_or_path: str) -> str:
 def get_tokenizer(
     pretrained_model_name_or_path: str,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    if pretrained_model_name_or_path.endswith(
+        ".json"
+    ) or pretrained_model_name_or_path.endswith(".model"):
+        from sglang.srt.hf_transformers_utils import get_tokenizer
+
+        return get_tokenizer(pretrained_model_name_or_path)
+
     if pretrained_model_name_or_path is not None and not os.path.exists(
         pretrained_model_name_or_path
     ):
@@ -252,6 +268,7 @@ ASYNC_REQUEST_FUNCS = {
     "vllm": async_request_openai_completions,
     "lmdeploy": async_request_openai_completions,
     "trt": async_request_trt_llm,
+    "gserver": async_request_gserver,
 }
 
 
@@ -351,9 +368,9 @@ def sample_sharegpt_requests(
 
         # Tokenize the prompts and completions.
         prompt = dataset[i][0]
-        prompt_token_ids = tokenizer(prompt)
+        prompt_token_ids = tokenizer.encode(prompt)
         completion = dataset[i][1]
-        completion_token_ids = tokenizer(completion)
+        completion_token_ids = tokenizer.encode(completion)
         prompt_len = len(prompt_token_ids)
         output_len = (
             len(completion_token_ids) if fixed_output_len is None else fixed_output_len
@@ -361,7 +378,9 @@ def sample_sharegpt_requests(
         if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
             continue
-        if prompt_len > 1024 or prompt_len + output_len > 2048:
+        if prompt_len > 1024 or (
+            prompt_len + output_len > 2048 and fixed_output_len is None
+        ):
             # Prune too long sequences.
             continue
         filtered_dataset.append((prompt, prompt_len, output_len))
@@ -422,7 +441,7 @@ def sample_random_requests(
         for i in range(num_prompts):
             # Tokenize the prompts and completions.
             prompt = dataset[i][0]
-            prompt_token_ids = tokenizer(prompt)
+            prompt_token_ids = tokenizer.encode(prompt)
             prompt_len = len(prompt_token_ids)
 
             if prompt_len > input_lens[i]:
@@ -488,7 +507,7 @@ def calculate_metrics(
             output_len = outputs[i].output_len
             output_lens.append(output_len)
             retokenized_output_len = len(
-                tokenizer(outputs[i].generated_text, add_special_tokens=False)
+                tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
             )
             retokenized_output_lens.append(retokenized_output_len)
             total_input += input_requests[i][1]
@@ -547,7 +566,6 @@ async def benchmark(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
     disable_tqdm: bool,
-    enable_multi: bool,
     extra_request_body: Dict[str, Any],
 ):
     if backend in ASYNC_REQUEST_FUNCS:
@@ -756,6 +774,7 @@ def run_benchmark(args_: argparse.Namespace):
     global args
     args = args_
 
+    # Set global environments
     set_ulimit()
     random.seed(args.seed)
     np.random.seed(args.seed)
@@ -764,12 +783,14 @@ def run_benchmark(args_: argparse.Namespace):
     if args.extra_request_body:
         extra_request_body = json.loads(args.extra_request_body)
 
+    # Set url
     if args.port is None:
         args.port = {
             "sglang": 30000,
             "lmdeploy": 23333,
             "vllm": 8000,
             "trt": 8000,
+            "gserver": 9988,
         }.get(args.backend, 30000)
 
     api_url = (
@@ -792,7 +813,11 @@ def run_benchmark(args_: argparse.Namespace):
         if args.model is None:
             print("Please provide a model using `--model` when using `trt` backend.")
             sys.exit(1)
+    elif args.backend == "gserver":
+        api_url = args.base_url if args.base_url else f"{args.host}:{args.port}"
+        args.model = args.model or "default"
 
+    # Get model name
     if args.model is None:
         try:
             response = requests.get(model_url)
@@ -817,6 +842,7 @@ def run_benchmark(args_: argparse.Namespace):
 
     print(f"{args}\n")
 
+    # Read dataset
     backend = args.backend
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
@@ -842,7 +868,21 @@ def run_benchmark(args_: argparse.Namespace):
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
-    if args.multi:
+    if not args.multi:
+        return asyncio.run(
+            benchmark(
+                backend=backend,
+                api_url=api_url,
+                model_id=model_id,
+                tokenizer=tokenizer,
+                input_requests=input_requests,
+                request_rate=args.request_rate,
+                disable_tqdm=args.disable_tqdm,
+                extra_request_body=extra_request_body,
+            )
+        )
+    else:
+        # Benchmark multiple rps. TODO: use a fixed duration to compute num_prompts
         request_rates = parse_request_rate_range(args.request_rate_range)
 
         for rate in request_rates:
@@ -855,27 +895,11 @@ def run_benchmark(args_: argparse.Namespace):
                     input_requests=input_requests,
                     request_rate=rate,
                     disable_tqdm=args.disable_tqdm,
-                    enable_multi=args.multi,
                     extra_request_body=extra_request_body,
                 )
             )
-    else:
-        return asyncio.run(
-            benchmark(
-                backend=backend,
-                api_url=api_url,
-                model_id=model_id,
-                tokenizer=tokenizer,
-                input_requests=input_requests,
-                request_rate=args.request_rate,
-                disable_tqdm=args.disable_tqdm,
-                enable_multi=args.multi,
-                extra_request_body=extra_request_body,
-            )
-        )
 
 
-# to avoid relying on SGLang's components
 def set_ulimit(target_soft_limit=65535):
     resource_type = resource.RLIMIT_NOFILE
     current_soft, current_hard = resource.getrlimit(resource_type)
@@ -966,9 +990,9 @@ if __name__ == "__main__":
         type=float,
         default=float("inf"),
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
-        "Otherwise, we use Poisson process to synthesize the request arrival times. Default is
+        "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
    )
-    parser.add_argument("--seed", type=int, default=
+    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
         "--multi",
         action="store_true",
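For reference, a typical single-run invocation after this refactor looks roughly like `python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --request-rate 8 --seed 1`, while `--multi` together with `--request-rate-range` switches to the multi-rate sweep; the flag names are taken from the argparse options and `args.*` attributes visible above, and the exact set of required options may differ.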
sglang/check_env.py
CHANGED
@@ -170,6 +170,17 @@ def get_gpu_topology():
     return None
 
 
+def get_hypervisor_vendor():
+    try:
+        output = subprocess.check_output(["lscpu"], text=True)
+        for line in output.split("\n"):
+            if "Hypervisor vendor:" in line:
+                return line.split(":")[1].strip()
+        return None
+    except:
+        return None
+
+
 def check_env():
     """
     Check and print environment information.
@@ -184,6 +195,10 @@ def check_env():
     if gpu_topo:
         env_info["NVIDIA Topology"] = gpu_topo
 
+    hypervisor_vendor = get_hypervisor_vendor()
+    if hypervisor_vendor:
+        env_info["Hypervisor vendor"] = hypervisor_vendor
+
     ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
     env_info["ulimit soft"] = ulimit_soft
 
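The new field is picked up automatically when the environment report is generated; assuming the module's public entry point is unchanged, a quick sketch of seeing it (not part of the diff):

# Prints the environment report, which now includes "Hypervisor vendor"
# whenever lscpu reports one (e.g. when running inside a VM).
from sglang.check_env import check_env

check_env()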
sglang/lang/chat_template.py
CHANGED
@@ -1,6 +1,6 @@
-from dataclasses import dataclass
+from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Callable, Dict, List,
+from typing import Callable, Dict, List, Tuple
 
 
 class ChatTemplateStyle(Enum):
@@ -137,7 +137,7 @@ register_chat_template(
 register_chat_template(
     ChatTemplate(
         name="chatml-llava",
-        default_system_prompt="
+        default_system_prompt="You are a helpful assistant.",
         role_prefix_and_suffix={
             "system": ("<|im_start|>system\n", "<|im_end|>\n"),
             "user": ("<|im_start|>user\n", "<|im_end|>\n"),
@@ -145,7 +145,7 @@ register_chat_template(
         },
         style=ChatTemplateStyle.PLAIN,
         stop_str=("<|im_end|>",),
-        image_token="
+        image_token="<image>\n",
     )
 )
 
@@ -322,12 +322,17 @@ def match_chat_ml(model_path: str):
     if "tinyllama" in model_path:
         return get_chat_template("chatml")
     # Now the suffix for qwen2 chat model is "instruct"
-    if "qwen" in model_path and ("chat" in model_path or "instruct" in model_path):
+    if (
+        "qwen" in model_path
+        and ("chat" in model_path or "instruct" in model_path)
+        and ("llava" not in model_path)
+    ):
         return get_chat_template("qwen")
     if (
         "llava-v1.6-34b" in model_path
         or "llava-v1.6-yi-34b" in model_path
         or "llava-next-video-34b" in model_path
+        or "llava-onevision-qwen2" in model_path
     ):
         return get_chat_template("chatml-llava")
 
sglang/lang/compiler.py
CHANGED
@@ -130,6 +130,7 @@ class CompiledFunction:
         temperature: float = 1.0,
         top_p: float = 1.0,
         top_k: int = -1,
+        min_p: float = 0.0,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         backend=None,
@@ -145,6 +146,7 @@ class CompiledFunction:
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
+            min_p=min_p,
             frequency_penalty=frequency_penalty,
             presence_penalty=presence_penalty,
         )
@@ -160,6 +162,7 @@ class CompiledFunction:
         temperature: float = 1.0,
         top_p: float = 1.0,
         top_k: int = -1,
+        min_p: float = 0.0,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         backend=None,
@@ -178,6 +181,7 @@ class CompiledFunction:
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
+            min_p=min_p,
             frequency_penalty=frequency_penalty,
             presence_penalty=presence_penalty,
         )
sglang/lang/interpreter.py
CHANGED
sglang/lang/ir.py
CHANGED
@@ -22,6 +22,7 @@ class SglSamplingParams:
     temperature: float = 1.0
     top_p: float = 1.0
     top_k: int = -1  # -1 means disable
+    min_p: float = 0.0
     frequency_penalty: float = 0.0
     presence_penalty: float = 0.0
     ignore_eos: bool = False
@@ -42,6 +43,7 @@ class SglSamplingParams:
             self.temperature,
             self.top_p,
             self.top_k,
+            self.min_p,
             self.frequency_penalty,
             self.presence_penalty,
             self.ignore_eos,
@@ -114,6 +116,7 @@ class SglSamplingParams:
             "temperature": self.temperature,
             "top_p": self.top_p,
             "top_k": self.top_k,
+            "min_p": self.min_p,
             "frequency_penalty": self.frequency_penalty,
             "presence_penalty": self.presence_penalty,
             "ignore_eos": self.ignore_eos,
@@ -149,6 +152,7 @@ class SglFunction:
         temperature: float = 1.0,
         top_p: float = 1.0,
         top_k: int = -1,
+        min_p: float = 0.0,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         ignore_eos: bool = False,
@@ -169,6 +173,7 @@ class SglFunction:
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
+            min_p=min_p,
             frequency_penalty=frequency_penalty,
             presence_penalty=presence_penalty,
             ignore_eos=ignore_eos,
@@ -190,6 +195,7 @@ class SglFunction:
         temperature: float = 1.0,
         top_p: float = 1.0,
         top_k: int = -1,
+        min_p: float = 0.0,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         ignore_eos: bool = False,
@@ -228,6 +234,7 @@ class SglFunction:
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
+            min_p=min_p,
             frequency_penalty=frequency_penalty,
             presence_penalty=presence_penalty,
             ignore_eos=ignore_eos,
@@ -408,6 +415,7 @@ class SglGen(SglExpr):
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
         top_k: Optional[int] = None,
+        min_p: Optional[float] = None,
         frequency_penalty: Optional[float] = None,
         presence_penalty: Optional[float] = None,
         ignore_eos: Optional[bool] = None,
@@ -428,6 +436,7 @@ class SglGen(SglExpr):
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
+            min_p=min_p,
             frequency_penalty=frequency_penalty,
             presence_penalty=presence_penalty,
             ignore_eos=ignore_eos,
sglang/launch_server.py
CHANGED
@@ -1,9 +1,11 @@
 """Launch the inference server."""
 
 import argparse
+import os
 
 from sglang.srt.server import launch_server
 from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_child_process
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -11,4 +13,9 @@ if __name__ == "__main__":
     args = parser.parse_args()
     server_args = ServerArgs.from_cli_args(args)
 
-    launch_server(server_args)
+    try:
+        launch_server(server_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_child_process(os.getpid(), including_parent=False)
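The launcher is still started the same way, e.g. `python3 -m sglang.launch_server --model-path <model> --port 30000` (illustrative flags); the new `finally` block only ensures that any child processes are cleaned up when the server exits or crashes.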
sglang/srt/conversation.py
CHANGED
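The hunks below add a LLAMA3 separator style plus `chatml-llava` and `llava_llama_3` conversation templates. As a worked illustration derived from that code (with `\n` shown literally), a single user turn "Hello" followed by an empty assistant slot under `llava_llama_3` renders roughly as:

<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system message}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n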
@@ -34,6 +34,7 @@ class SeparatorStyle(IntEnum):
     NO_COLON_TWO = auto()
     ADD_NEW_LINE_SINGLE = auto()
     LLAMA2 = auto()
+    LLAMA3 = auto()
     CHATGLM = auto()
     CHATML = auto()
     CHATINTERN = auto()
@@ -137,6 +138,20 @@ class Conversation:
             else:
                 ret += role + ":"
             return ret
+        elif self.sep_style == SeparatorStyle.LLAMA3:
+            ret = "<|begin_of_text|>"
+            if self.system_message:
+                ret += system_prompt
+            else:
+                ret += ""
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+                    ret += f"{message.strip()}<|eot_id|>"
+                else:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+            # print(ret)
+            return ret
         elif self.sep_style == SeparatorStyle.LLAMA2:
             seps = [self.sep, self.sep2]
             if self.system_message:
@@ -379,12 +394,23 @@ def generate_chat_conv(
                 conv.append_message(conv.roles[0], message.content)
             else:
                 real_content = ""
+                # calculate number of image_url
+                num_image_url = 0
+                for content in message.content:
+                    if content.type == "image_url":
+                        num_image_url += 1
+                if num_image_url > 1:
+                    image_token = "<image>"
+                else:
+                    image_token = "<image>\n"
                 for content in message.content:
                     if content.type == "text":
+                        if num_image_url > 16:
+                            real_content += "\n" # for video
                         real_content += content.text
                     elif content.type == "image_url":
                         # NOTE: Only works for llava
-                        real_content +=
+                        real_content += image_token
                         conv.append_image(content.image_url.url)
                 conv.append_message(conv.roles[0], real_content)
         elif msg_role == "assistant":
@@ -425,6 +451,18 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="chatml-llava",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_str=["<|endoftext|>", "<|im_end|>"],
+    )
+)
+
 register_conv_template(
     Conversation(
         name="vicuna_v1.1",
@@ -437,6 +475,17 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="llava_llama_3",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA3,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot_id|>"],
+    )
+)
 # Reference: https://github.com/InternLM/lmdeploy/blob/387bf54b4f124e72aab30ae9755f562e435d3d01/lmdeploy/model.py#L425-L442
 register_conv_template(
     Conversation(

sglang/srt/hf_transformers_utils.py
CHANGED
@@ -30,14 +30,19 @@ from transformers import (
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
 )
-from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig
 
-
+try:
+    from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig
+
+    _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
+        ChatGLMConfig.model_type: ChatGLMConfig,
+        DbrxConfig.model_type: DbrxConfig,
+    }
+except ImportError:
+    # We want this file to run without vllm dependency
+    _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {}
 
-
-    ChatGLMConfig.model_type: ChatGLMConfig,
-    DbrxConfig.model_type: DbrxConfig,
-}
+from sglang.srt.utils import is_multimodal_model
 
 
 def download_from_hf(model_path: str):
@@ -137,18 +142,6 @@ def get_tokenizer(
             raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
         kwargs["use_fast"] = False
 
-    if (
-        "llama" in tokenizer_name.lower()
-        and kwargs.get("use_fast", True)
-        and tokenizer_name != _FAST_LLAMA_TOKENIZER
-    ):
-        pass
-        # warnings.warn(
-        #     "For some LLaMA V1 models, initializing the fast tokenizer may "
-        #     "take a long time. To reduce the initialization time, consider "
-        #     f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
-        #     "tokenizer."
-        # )
     try:
         tokenizer = AutoTokenizer.from_pretrained(
             tokenizer_name,
@@ -229,6 +222,8 @@ class TiktokenTokenizer:
         }
         assert tok_dict["word_split"] == "V1"
 
+        default_allowed_special = None
+
         kwargs = {
             "name": name,
             "pat_str": tok_dict.get("pat_str", PAT_STR_B),
@@ -242,14 +237,18 @@ class TiktokenTokenizer:
                     for bytes_list in tok_dict["default_allowed_special"]
                 ]
             )
-        else:
-            default_allowed_special = None
         if "vocab_size" in tok_dict:
             kwargs["explicit_n_vocab"] = tok_dict["vocab_size"]
 
+        PAD = "<|pad|>"
+        EOS = "<|eos|>"
+        SEP = "<|separator|>"
+
+        DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": EOS, "eos": SEP}
+
         tokenizer = tiktoken.Encoding(**kwargs)
         tokenizer._default_allowed_special = default_allowed_special or set()
-        tokenizer.
+        tokenizer._control_tokens = DEFAULT_CONTROL_TOKENS
 
         def encode_patched(
             self,
@@ -266,14 +265,14 @@ class TiktokenTokenizer:
                 self,
                 text,
                 allowed_special=allowed_special,
-                disallowed_special=
+                disallowed_special=(),
             )
 
         tokenizer.encode = functools.partial(encode_patched, tokenizer)
 
         # Convert to HF interface
         self.tokenizer = tokenizer
-        self.eos_token_id = tokenizer._special_tokens[
+        self.eos_token_id = tokenizer._special_tokens[EOS]
         self.vocab_size = tokenizer.n_vocab
         self.chat_template = Template(
             "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"