sglang 0.3.4.post2__py3-none-any.whl → 0.3.5.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. sglang/api.py +1 -1
  2. sglang/bench_latency.py +3 -3
  3. sglang/bench_server_latency.py +2 -3
  4. sglang/bench_serving.py +205 -3
  5. sglang/global_config.py +9 -3
  6. sglang/lang/chat_template.py +50 -25
  7. sglang/lang/interpreter.py +9 -1
  8. sglang/lang/ir.py +11 -2
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/configs/model_config.py +54 -13
  11. sglang/srt/constrained/__init__.py +2 -48
  12. sglang/srt/constrained/base_grammar_backend.py +72 -0
  13. sglang/srt/constrained/outlines_backend.py +165 -0
  14. sglang/srt/constrained/outlines_jump_forward.py +182 -0
  15. sglang/srt/constrained/xgrammar_backend.py +114 -0
  16. sglang/srt/hf_transformers_utils.py +6 -5
  17. sglang/srt/layers/attention/triton_ops/decode_attention.py +117 -30
  18. sglang/srt/layers/attention/triton_ops/extend_attention.py +6 -0
  19. sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
  20. sglang/srt/layers/fused_moe/fused_moe.py +27 -10
  21. sglang/srt/layers/fused_moe/layer.py +28 -0
  22. sglang/srt/layers/quantization/base_config.py +14 -1
  23. sglang/srt/layers/vocab_parallel_embedding.py +552 -0
  24. sglang/srt/managers/data_parallel_controller.py +7 -6
  25. sglang/srt/managers/detokenizer_manager.py +9 -11
  26. sglang/srt/managers/image_processor.py +4 -3
  27. sglang/srt/managers/io_struct.py +74 -80
  28. sglang/srt/managers/schedule_batch.py +35 -57
  29. sglang/srt/managers/schedule_policy.py +24 -13
  30. sglang/srt/managers/scheduler.py +266 -150
  31. sglang/srt/managers/tokenizer_manager.py +292 -340
  32. sglang/srt/managers/tp_worker.py +5 -5
  33. sglang/srt/mem_cache/flush_cache.py +1 -1
  34. sglang/srt/metrics/collector.py +211 -0
  35. sglang/srt/metrics/func_timer.py +108 -0
  36. sglang/srt/mm_utils.py +1 -1
  37. sglang/srt/model_executor/cuda_graph_runner.py +9 -6
  38. sglang/srt/model_executor/forward_batch_info.py +7 -3
  39. sglang/srt/model_executor/model_runner.py +10 -18
  40. sglang/srt/models/baichuan.py +4 -4
  41. sglang/srt/models/chatglm.py +4 -4
  42. sglang/srt/models/commandr.py +1 -1
  43. sglang/srt/models/dbrx.py +5 -5
  44. sglang/srt/models/deepseek.py +4 -4
  45. sglang/srt/models/deepseek_v2.py +4 -4
  46. sglang/srt/models/exaone.py +4 -4
  47. sglang/srt/models/gemma.py +1 -1
  48. sglang/srt/models/gemma2.py +1 -1
  49. sglang/srt/models/gemma2_reward.py +69 -0
  50. sglang/srt/models/gpt2.py +281 -0
  51. sglang/srt/models/gpt_bigcode.py +1 -1
  52. sglang/srt/models/grok.py +4 -4
  53. sglang/srt/models/internlm2.py +4 -4
  54. sglang/srt/models/internlm2_reward.py +62 -0
  55. sglang/srt/models/llama.py +25 -12
  56. sglang/srt/models/llama_embedding.py +2 -10
  57. sglang/srt/models/llama_reward.py +10 -26
  58. sglang/srt/models/minicpm.py +4 -4
  59. sglang/srt/models/minicpm3.py +4 -4
  60. sglang/srt/models/mixtral.py +7 -5
  61. sglang/srt/models/mixtral_quant.py +4 -4
  62. sglang/srt/models/mllama.py +5 -5
  63. sglang/srt/models/olmo.py +4 -4
  64. sglang/srt/models/olmoe.py +4 -4
  65. sglang/srt/models/qwen.py +4 -4
  66. sglang/srt/models/qwen2.py +4 -4
  67. sglang/srt/models/qwen2_moe.py +4 -4
  68. sglang/srt/models/qwen2_vl.py +9 -15
  69. sglang/srt/models/stablelm.py +4 -4
  70. sglang/srt/models/torch_native_llama.py +4 -4
  71. sglang/srt/models/xverse.py +4 -4
  72. sglang/srt/models/xverse_moe.py +4 -4
  73. sglang/srt/openai_api/adapter.py +58 -68
  74. sglang/srt/sampling/sampling_batch_info.py +6 -13
  75. sglang/srt/sampling/sampling_params.py +0 -14
  76. sglang/srt/server.py +84 -46
  77. sglang/srt/server_args.py +61 -12
  78. sglang/srt/utils.py +127 -56
  79. sglang/test/runners.py +2 -1
  80. sglang/test/simple_eval_common.py +1 -1
  81. sglang/test/simple_eval_humaneval.py +2 -2
  82. sglang/test/simple_eval_mgsm.py +2 -2
  83. sglang/test/test_utils.py +89 -27
  84. sglang/utils.py +63 -1
  85. sglang/version.py +1 -1
  86. sglang-0.3.5.post1.dist-info/METADATA +348 -0
  87. sglang-0.3.5.post1.dist-info/RECORD +155 -0
  88. {sglang-0.3.4.post2.dist-info → sglang-0.3.5.post1.dist-info}/WHEEL +1 -1
  89. sglang/srt/constrained/base_tool_cache.py +0 -65
  90. sglang/srt/constrained/fsm_cache.py +0 -95
  91. sglang/srt/constrained/jump_forward.py +0 -203
  92. sglang-0.3.4.post2.dist-info/METADATA +0 -899
  93. sglang-0.3.4.post2.dist-info/RECORD +0 -148
  94. {sglang-0.3.4.post2.dist-info → sglang-0.3.5.post1.dist-info}/LICENSE +0 -0
  95. {sglang-0.3.4.post2.dist-info → sglang-0.3.5.post1.dist-info}/top_level.txt +0 -0
sglang/api.py CHANGED
@@ -99,7 +99,7 @@ def gen(
     regex: Optional[str] = None,
     json_schema: Optional[str] = None,
 ):
-    """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
+    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
 
     if choices:
         return SglSelect(
sglang/bench_latency.py CHANGED
@@ -129,9 +129,9 @@ def load_model(server_args, port_args, tp_rank):
 
     model_config = ModelConfig(
         server_args.model_path,
-        server_args.trust_remote_code,
+        trust_remote_code=server_args.trust_remote_code,
         context_length=server_args.context_length,
-        model_override_args=json.loads(server_args.json_model_override_args),
+        model_override_args=server_args.json_model_override_args,
     )
     model_runner = ModelRunner(
         model_config=model_config,
@@ -550,4 +550,4 @@ if __name__ == "__main__":
     except Exception as e:
         raise e
     finally:
-        kill_child_process(os.getpid(), including_parent=False)
+        kill_child_process()
sglang/bench_server_latency.py CHANGED
@@ -15,7 +15,6 @@ import dataclasses
 import itertools
 import json
 import multiprocessing
-import os
 import time
 from typing import Tuple
 
@@ -70,7 +69,7 @@ def launch_server_internal(server_args):
     except Exception as e:
         raise e
     finally:
-        kill_child_process(os.getpid(), including_parent=False)
+        kill_child_process()
 
 
 def launch_server_process(server_args: ServerArgs):
@@ -176,7 +175,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         )
     finally:
         if proc:
-            kill_child_process(proc.pid)
+            kill_child_process(proc.pid, include_self=True)
 
     print(f"\nResults are saved to {bench_args.result_filename}")
 
sglang/bench_serving.py CHANGED
@@ -222,6 +222,85 @@ async def async_request_openai_completions(
     return output
 
 
+async def async_request_truss(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+
+    prompt = request_func_input.prompt
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "model": request_func_input.model,
+            "prompt": prompt,
+            "temperature": 0.0,
+            "best_of": 1,
+            "max_tokens": request_func_input.output_len,
+            "stream": not args.disable_stream,
+            "ignore_eos": not args.disable_ignore_eos,
+            **request_func_input.extra_request_body,
+        }
+        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if data["choices"][0]["delta"]["content"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text += data["choices"][0]["delta"]["content"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.output_len = request_func_input.output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 async def async_request_sglang_generate(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
@@ -350,6 +429,7 @@ ASYNC_REQUEST_FUNCS = {
     "lmdeploy": async_request_openai_completions,
     "trt": async_request_trt_llm,
     "gserver": async_request_gserver,
+    "truss": async_request_truss,
 }
 
 
@@ -516,12 +596,20 @@ def sample_random_requests(
 
     # Filter out sequences that are too long or too short
     input_requests: List[Tuple[str, int, int]] = []
-    for i in range(num_prompts):
+    for data in dataset:
+        i = len(input_requests)
+        if i == num_prompts:
+            break
+
         # Tokenize the prompts and completions.
-        prompt = dataset[i][0]
+        prompt = data[0]
         prompt_token_ids = tokenizer.encode(prompt)
         prompt_len = len(prompt_token_ids)
 
+        # Skip empty prompt
+        if prompt_len == 0:
+            continue
+
         if prompt_len > input_lens[i]:
             input_ids = prompt_token_ids[: input_lens[i]]
         else:
@@ -547,6 +635,66 @@ def sample_random_requests(
     return input_requests
 
 
+def gen_prompt(tokenizer, token_num):
+    """Generate a random prompt of specified token length using tokenizer vocabulary."""
+    all_available_tokens = list(tokenizer.get_vocab().values())
+    selected_tokens = random.choices(all_available_tokens, k=token_num)
+    return tokenizer.decode(selected_tokens)
+
+
+def sample_generated_shared_prefix_requests(
+    num_groups: int,
+    prompts_per_group: int,
+    system_prompt_len: int,
+    question_len: int,
+    output_len: int,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int]]:
+    """Generate benchmark requests with shared system prompts using random tokens."""
+    # Generate system prompts for each group
+    system_prompts = []
+    for _ in range(num_groups):
+        system_prompt = gen_prompt(tokenizer, system_prompt_len)
+        system_prompts.append(system_prompt)
+
+    # Generate questions
+    questions = []
+    for _ in range(num_groups * prompts_per_group):
+        question = gen_prompt(tokenizer, question_len)
+        questions.append(question)
+
+    # Combine system prompts with questions
+    input_requests = []
+    total_input_tokens = 0
+    total_output_tokens = 0
+
+    for group_idx in range(num_groups):
+        system_prompt = system_prompts[group_idx]
+        for prompt_idx in range(prompts_per_group):
+            question = questions[group_idx * prompts_per_group + prompt_idx]
+            full_prompt = f"{system_prompt}\n\n{question}"
+            prompt_len = len(tokenizer.encode(full_prompt))
+
+            input_requests.append((full_prompt, prompt_len, output_len))
+            total_input_tokens += prompt_len
+            total_output_tokens += output_len
+
+    print(f"\nGenerated shared prefix dataset statistics:")
+    print(f"Number of groups: {num_groups}")
+    print(f"Prompts per group: {prompts_per_group}")
+    print(f"Total prompts: {len(input_requests)}")
+    print(f"Total input tokens: {total_input_tokens}")
+    print(f"Total output tokens: {total_output_tokens}")
+    print(
+        f"Average system prompt length: {sum(len(tokenizer.encode(sp)) for sp in system_prompts) / len(system_prompts):.1f} tokens"
+    )
+    print(
+        f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
+    )
+
+    return input_requests
+
+
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
@@ -873,6 +1021,7 @@ def run_benchmark(args_: argparse.Namespace):
         "vllm": 8000,
         "trt": 8000,
         "gserver": 9988,
+        "truss": 8080,
     }.get(args.backend, 30000)
 
     model_url = (
@@ -905,9 +1054,20 @@ def run_benchmark(args_: argparse.Namespace):
     elif args.backend == "gserver":
         api_url = args.base_url if args.base_url else f"{args.host}:{args.port}"
         args.model = args.model or "default"
+    elif args.backend == "truss":
+        api_url = (
+            f"{args.base_url}/v1/models/model:predict"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/models/model:predict"
+        )
 
     # Get model name
     if args.model is None:
+        if args.backend == "truss":
+            print(
+                "Please provide a model with `--model` when using truss backend. e.g. --model meta-llama/Llama-3.1-8B-Instruct"
+            )
+            sys.exit(1)
         try:
             response = requests.get(model_url)
             model_list = response.json().get("data", [])
@@ -956,6 +1116,15 @@ def run_benchmark(args_: argparse.Namespace):
             tokenizer=tokenizer,
             dataset_path=args.dataset_path,
         )
+    elif args.dataset_name == "generated-shared-prefix":
+        input_requests = sample_generated_shared_prefix_requests(
+            num_groups=args.gen_num_groups,
+            prompts_per_group=args.gen_prompts_per_group,
+            system_prompt_len=args.gen_system_prompt_len,
+            question_len=args.gen_question_len,
+            output_len=args.gen_output_len,
+            tokenizer=tokenizer,
+        )
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
@@ -1029,7 +1198,7 @@ if __name__ == "__main__":
         "--dataset-name",
        type=str,
         default="sharegpt",
-        choices=["sharegpt", "random"],
+        choices=["sharegpt", "random", "generated-shared-prefix"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
@@ -1116,5 +1285,38 @@ if __name__ == "__main__":
         help="Append given JSON object to the request payload. You can use this to specify"
         "additional generate params like sampling params.",
     )
+
+    group = parser.add_argument_group("generated-shared-prefix dataset arguments")
+    group.add_argument(
+        "--gen-num-groups",
+        type=int,
+        default=64,
+        help="Number of system prompt groups for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-prompts-per-group",
+        type=int,
+        default=16,
+        help="Number of prompts per system prompt group for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-system-prompt-len",
+        type=int,
+        default=2048,
+        help="Target length in tokens for system prompts in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-question-len",
+        type=int,
+        default=128,
+        help="Target length in tokens for questions in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-output-len",
+        type=int,
+        default=256,
+        help="Target length in tokens for outputs in generated-shared-prefix dataset",
+    )
+
     args = parser.parse_args()
     run_benchmark(args)
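The new generated-shared-prefix dataset builds groups of prompts that share a long random system prefix. Below is a minimal sketch (not part of the diff) of driving the new sampler directly rather than through the CLI; it assumes a Hugging Face tokenizer is available locally, reuses the example model name mentioned in the diff, and picks small sizes for a quick smoke test.

```python
# Sketch only: exercises sample_generated_shared_prefix_requests outside the CLI.
from transformers import AutoTokenizer

from sglang.bench_serving import sample_generated_shared_prefix_requests

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
requests = sample_generated_shared_prefix_requests(
    num_groups=4,              # --gen-num-groups (default 64)
    prompts_per_group=8,       # --gen-prompts-per-group (default 16)
    system_prompt_len=2048,    # --gen-system-prompt-len
    question_len=128,          # --gen-question-len
    output_len=256,            # --gen-output-len
    tokenizer=tokenizer,
)
prompt, prompt_len, output_len = requests[0]
print(len(requests), prompt_len, output_len)
```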
sglang/global_config.py CHANGED
@@ -14,9 +14,15 @@ class GlobalConfig:
         self.default_backend = None
 
         # Runtime constants: New generation token ratio estimation
-        self.init_new_token_ratio = 0.7
-        self.base_min_new_token_ratio = 0.1
-        self.new_token_ratio_decay = 0.001
+        self.default_init_new_token_ratio = float(
+            os.environ.get("SGLANG_INIT_NEW_TOKEN_RATIO", 0.7)
+        )
+        self.default_min_new_token_ratio_factor = float(
+            os.environ.get("SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR", 0.14)
+        )
+        self.default_new_token_ratio_decay_steps = float(
+            os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
+        )
 
         # Runtime constants: others
         self.retract_decode_steps = 20
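The three token-ratio constants are now read from environment variables with new names and defaults. A minimal sketch of overriding them, assuming the module-level `global_config` instance is imported only after the variables are set (the values below are illustrative, not recommendations):

```python
# Sketch only: override the new scheduler-ratio defaults via environment variables.
import os

os.environ["SGLANG_INIT_NEW_TOKEN_RATIO"] = "0.5"         # default 0.7
os.environ["SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR"] = "0.2"   # default 0.14
os.environ["SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS"] = "800"  # default 600

from sglang.global_config import global_config  # reads the variables at import time

print(global_config.default_init_new_token_ratio)
print(global_config.default_min_new_token_ratio_factor)
print(global_config.default_new_token_ratio_decay_steps)
```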
sglang/lang/chat_template.py CHANGED
@@ -116,12 +116,10 @@ register_chat_template(
     )
 )
 
-# There is default system prompt for qwen
-# reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
-# The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+
 register_chat_template(
     ChatTemplate(
-        name="qwen",
+        name="chatml-llava",
         default_system_prompt="You are a helpful assistant.",
         role_prefix_and_suffix={
             "system": ("<|im_start|>system\n", "<|im_end|>\n"),
@@ -130,13 +128,17 @@ register_chat_template(
         },
         style=ChatTemplateStyle.PLAIN,
         stop_str=("<|im_end|>",),
+        image_token="<image>\n",
     )
 )
 
-# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+
+# There is default system prompt for qwen
+# reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
+# The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
 register_chat_template(
     ChatTemplate(
-        name="qwen2-vl",
+        name="qwen",
         default_system_prompt="You are a helpful assistant.",
         role_prefix_and_suffix={
             "system": ("<|im_start|>system\n", "<|im_end|>\n"),
@@ -144,15 +146,14 @@ register_chat_template(
             "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
         },
         style=ChatTemplateStyle.PLAIN,
-        stop_str=("<|im_end|>"),
-        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        stop_str=("<|im_end|>",),
     )
 )
 
-
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
 register_chat_template(
     ChatTemplate(
-        name="chatml-llava",
+        name="qwen2-vl",
         default_system_prompt="You are a helpful assistant.",
         role_prefix_and_suffix={
             "system": ("<|im_start|>system\n", "<|im_end|>\n"),
@@ -161,7 +162,7 @@ register_chat_template(
         },
         style=ChatTemplateStyle.PLAIN,
         stop_str=("<|im_end|>",),
-        image_token="<image>\n",
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
     )
 )
 
@@ -182,37 +183,46 @@ register_chat_template(
     )
 )
 
-# Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
 register_chat_template(
     ChatTemplate(
-        name="yi-1.5",
+        name="llama-2-chat",
         default_system_prompt=None,
         role_prefix_and_suffix={
-            "system": ("", ""),
-            "user": ("<|im_start|>user\n", "<|im_end|>\n<|im_start|>assistant\n"),
-            "assistant": ("", "<|im_end|>\n"),
+            "system": ("<<SYS>>\n", "\n<</SYS>>\n\n"),
+            "user": ("[INST] ", " [/INST]"),
+            "assistant": ("", " </s><s>"),
         },
-        style=ChatTemplateStyle.PLAIN,
-        stop_str=("<|im_end|>",),
+        style=ChatTemplateStyle.LLAMA2,
     )
 )
 
 register_chat_template(
     ChatTemplate(
-        name="llama-2-chat",
+        name="llama-3-instruct",
         default_system_prompt=None,
         role_prefix_and_suffix={
-            "system": ("<<SYS>>\n", "\n<</SYS>>\n\n"),
-            "user": ("[INST] ", " [/INST]"),
-            "assistant": ("", " </s><s>"),
+            "system": (
+                "<|start_header_id|>system<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "user": (
+                "<|start_header_id|>user<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "assistant": (
+                "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
         },
-        style=ChatTemplateStyle.LLAMA2,
+        stop_str=("<|eot_id|>",),
+        image_token="<|image|>",
     )
 )
 
+# The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
-        name="llama-3-instruct",
+        name="llama-3-instruct-llava",
         default_system_prompt=None,
         role_prefix_and_suffix={
             "system": (
@@ -229,7 +239,22 @@ register_chat_template(
             ),
         },
         stop_str=("<|eot_id|>",),
-        image_token="<|image|>",
+        image_token="<image>\n",
+    )
+)
+
+# Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
+register_chat_template(
+    ChatTemplate(
+        name="yi-1.5",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", ""),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n<|im_start|>assistant\n"),
+            "assistant": ("", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",),
     )
 )
 
sglang/lang/interpreter.py CHANGED
@@ -54,7 +54,14 @@ def run_internal(state, program, func_args, func_kwargs, sync):
 
 
 def run_program(
-    program, backend, func_args, func_kwargs, default_sampling_para, stream, sync=False
+    program,
+    backend,
+    func_args,
+    func_kwargs,
+    default_sampling_para,
+    stream,
+    sync=False,
+    use_thread=True,
 ):
     if hasattr(backend, "endpoint"):
         backend = backend.endpoint
@@ -67,6 +74,7 @@ def run_program(
         chat_template=None,
         stream=stream,
         num_api_spec_tokens=program.num_api_spec_tokens,
+        use_thread=use_thread,
     )
     state = ProgramState(stream_executor)
 
sglang/lang/ir.py CHANGED
@@ -168,6 +168,7 @@ class SglFunction:
         return_text_in_logprobs: Optional[bool] = None,
         stream: bool = False,
         backend=None,
+        use_thread: bool = True,
         **kwargs,
     ):
         from sglang.lang.interpreter import run_program
@@ -195,7 +196,15 @@ class SglFunction:
             return_text_in_logprobs=return_text_in_logprobs,
         )
         backend = backend or global_config.default_backend
-        return run_program(self, backend, args, kwargs, default_sampling_para, stream)
+        return run_program(
+            self,
+            backend,
+            args,
+            kwargs,
+            default_sampling_para,
+            stream,
+            use_thread=use_thread,
+        )
 
     def run_batch(
         self,
@@ -445,7 +454,7 @@ class SglGen(SglExpr):
         regex: Optional[str] = None,
         json_schema: Optional[str] = None,
     ):
-        """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
+        """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
         super().__init__()
         self.name = name
         self.sampling_params = SglSamplingParams(
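`SglFunction.run` gains a `use_thread` flag that is forwarded through `run_program` to the stream executor. A minimal sketch of passing it from user code, assuming a default backend has already been configured with `sgl.set_default_backend`; the program itself is only an example, not part of the diff.

```python
# Sketch only: the new use_thread flag on SglFunction.run.
import sglang as sgl

@sgl.function
def answer(s, question):
    s += question
    s += sgl.gen("reply", max_tokens=16)

# use_thread=False asks the interpreter to execute without the background
# worker thread that run() would otherwise use (the default remains True).
state = answer.run(question="What is 2 + 2? ", use_thread=False)
print(state["reply"])
```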
sglang/launch_server.py CHANGED
@@ -15,4 +15,4 @@ if __name__ == "__main__":
     except Exception as e:
         raise e
     finally:
-        kill_child_process(os.getpid(), including_parent=False)
+        kill_child_process()
sglang/srt/configs/model_config.py CHANGED
@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+import json
 import logging
 import os
 from enum import IntEnum, auto
-from typing import Optional
+from typing import List, Optional
 
 from transformers import PretrainedConfig
 
@@ -38,18 +39,26 @@ class ModelConfig:
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
         model_override_args: Optional[dict] = None,
+        is_embedding: Optional[bool] = None,
     ) -> None:
-        self.path = path
-        self.trust_remote_code = trust_remote_code
-        self.revision = revision
-        self.model_override_args = model_override_args
+        # Parse args
+        self.model_override_args = json.loads(model_override_args)
         self.hf_config = get_config(
-            self.path,
-            trust_remote_code,
-            revision,
-            model_override_args=model_override_args,
+            path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            model_override_args=self.model_override_args,
         )
         self.hf_text_config = get_hf_text_config(self.hf_config)
+
+        # Check model type
+        self.is_generation = is_generation_model(
+            self.hf_config.architectures, is_embedding
+        )
+        self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
+        self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
+
+        # Derive context length
         derived_context_len = get_context_length(self.hf_text_config)
         allow_long_context = os.environ.get(
             "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
@@ -81,7 +90,7 @@ class ModelConfig:
             self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads,
         )
 
-        # FIXME: temporary special judge for deepseek v2 MLA architecture
+        # FIXME: temporary special judge for MLA architecture
         if "DeepseekV2ForCausalLM" in self.hf_config.architectures:
             self.head_dim = 256
             self.attention_arch = AttentionArch.MLA
@@ -112,8 +121,6 @@ class ModelConfig:
         self.num_hidden_layers = self.hf_text_config.num_hidden_layers
         self.vocab_size = self.hf_text_config.vocab_size
 
-        self.is_encoder_decoder = self.hf_config.model_type in ["mllama"]
-
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
@@ -163,7 +170,6 @@ class ModelConfig:
         # equal to the number of attention heads.
         return self.hf_text_config.num_attention_heads
 
-    # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L328
     def get_num_kv_heads(self, tensor_parallel_size) -> int:
         """Returns the number of KV heads per GPU."""
         total_num_kv_heads = self.get_total_num_kv_heads()
@@ -192,3 +198,38 @@ def get_hf_text_config(config: PretrainedConfig):
         return config.text_config
     else:
         return config
+
+
+def is_generation_model(model_architectures: List[str], is_embedding: bool = False):
+    # We have two ways to determine whether a model is a generative model.
+    # 1. Check the model architectue
+    # 2. check the `is_embedding` server args
+
+    if (
+        "LlamaEmbeddingModel" in model_architectures
+        or "MistralModel" in model_architectures
+        or "LlamaForSequenceClassification" in model_architectures
+        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
+        or "InternLM2ForRewardModel" in model_architectures
+    ):
+        return False
+    else:
+        return not is_embedding
+
+
+def is_multimodal_model(model_architectures: List[str]):
+    if (
+        "LlavaLlamaForCausalLM" in model_architectures
+        or "LlavaQwenForCausalLM" in model_architectures
+        or "LlavaMistralForCausalLM" in model_architectures
+        or "LlavaVidForCausalLM" in model_architectures
+        or "MllamaForConditionalGeneration" in model_architectures
+        or "Qwen2VLForConditionalGeneration" in model_architectures
+    ):
+        return True
+    else:
+        return False
+
+
+def is_encoder_decoder_model(model_architectures: List[str]):
+    return "MllamaForConditionalGeneration" in model_architectures