sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +2 -1
 - sglang/lang/chat_template.py +17 -0
 - sglang/launch_server_llavavid.py +1 -1
 - sglang/srt/configs/__init__.py +3 -0
 - sglang/srt/configs/model_config.py +27 -2
 - sglang/srt/configs/qwen2vl.py +133 -0
 - sglang/srt/constrained/fsm_cache.py +10 -3
 - sglang/srt/conversation.py +27 -0
 - sglang/srt/hf_transformers_utils.py +16 -1
 - sglang/srt/layers/attention/__init__.py +16 -5
 - sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
 - sglang/srt/layers/attention/flashinfer_backend.py +174 -54
 - sglang/srt/layers/attention/triton_backend.py +22 -6
 - sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
 - sglang/srt/layers/linear.py +89 -63
 - sglang/srt/layers/logits_processor.py +5 -5
 - sglang/srt/layers/rotary_embedding.py +112 -0
 - sglang/srt/layers/sampler.py +51 -39
 - sglang/srt/lora/lora.py +3 -1
 - sglang/srt/managers/data_parallel_controller.py +1 -1
 - sglang/srt/managers/detokenizer_manager.py +4 -0
 - sglang/srt/managers/image_processor.py +186 -13
 - sglang/srt/managers/io_struct.py +10 -0
 - sglang/srt/managers/schedule_batch.py +238 -68
 - sglang/srt/managers/scheduler.py +69 -50
 - sglang/srt/managers/tokenizer_manager.py +24 -4
 - sglang/srt/managers/tp_worker.py +26 -111
 - sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
 - sglang/srt/mem_cache/memory_pool.py +56 -10
 - sglang/srt/mem_cache/radix_cache.py +4 -3
 - sglang/srt/model_executor/cuda_graph_runner.py +87 -28
 - sglang/srt/model_executor/forward_batch_info.py +83 -3
 - sglang/srt/model_executor/model_runner.py +32 -11
 - sglang/srt/models/chatglm.py +3 -3
 - sglang/srt/models/deepseek_v2.py +2 -2
 - sglang/srt/models/mllama.py +1004 -0
 - sglang/srt/models/qwen2_vl.py +724 -0
 - sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
 - sglang/srt/sampling/sampling_batch_info.py +13 -3
 - sglang/srt/sampling/sampling_params.py +5 -7
 - sglang/srt/server.py +12 -0
 - sglang/srt/server_args.py +10 -0
 - sglang/srt/utils.py +22 -0
 - sglang/test/run_eval.py +2 -0
 - sglang/test/runners.py +20 -1
 - sglang/test/srt/sampling/penaltylib/utils.py +1 -0
 - sglang/test/test_utils.py +100 -3
 - sglang/version.py +1 -1
 - {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
 - {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
 - {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
 - {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
 - {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
 
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py
CHANGED

@@ -31,9 +31,12 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
         padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
             sequences=[
                 torch.tensor(
-                    data=
-
-
+                    data=(
+                        list(
+                            (req.sampling_params.stop_token_ids or set())
+                            | (req.tokenizer.additional_stop_token_ids or set())
+                            | {req.tokenizer.eos_token_id}
+                        )
                     ),
                     dtype=torch.int64,
                     device=self.orchestrator.device,
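For context on the padding step this hunk feeds into: `torch.nn.utils.rnn.pad_sequence` right-pads the ragged per-request stop-token lists into one matrix. A minimal sketch with hypothetical token ids (the real penalizer's `padding_value` lies outside this hunk):

```python
import torch

# Ragged per-request stop-token id lists (hypothetical values).
stop_ids_per_req = [
    torch.tensor([2, 32000], dtype=torch.int64),
    torch.tensor([2], dtype=torch.int64),
]

# pad_sequence right-pads to the longest sequence, giving shape
# (num_reqs, max_num_stop_tokens); batch_first=True keeps requests as rows.
padded = torch.nn.utils.rnn.pad_sequence(
    sequences=stop_ids_per_req, batch_first=True, padding_value=0
)
print(padded)  # tensor([[    2, 32000], [    2,     0]])
```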
sglang/srt/sampling/sampling_batch_info.py
CHANGED

@@ -51,7 +51,7 @@ class SamplingBatchInfo:
         disable_penalizer: bool,
     ):
         reqs = batch.reqs
-        device = batch.
+        device = batch.device
         temperatures = (
             torch.tensor(
                 [r.sampling_params.temperature for r in reqs],

@@ -78,7 +78,7 @@ class SamplingBatchInfo:
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             is_all_greedy=top_ks.max().item() <= 1,
             vocab_size=vocab_size,
-            device=
+            device=device,
         )
         # TODO (lianmin): `need_min_p_sampling` needs to be updated in filter and merge.

@@ -95,7 +95,7 @@ class SamplingBatchInfo:
         ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
             vocab_size=vocab_size,
             batch=batch,
-            device=batch.
+            device=batch.device,
             Penalizers={
                 penaltylib.BatchedFrequencyPenalizer,
                 penaltylib.BatchedMinNewTokensPenalizer,

@@ -224,3 +224,13 @@ class SamplingBatchInfo:
             vocab_size=self.vocab_size,
             device=self.device,
         )
+
+    def to(self, device: str):
+        for item in [
+            "temperatures",
+            "top_ps",
+            "top_ks",
+            "min_ps",
+        ]:
+            value = getattr(self, item)
+            setattr(self, item, value.to(device, non_blocking=True))
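The new `to()` method makes the batch's sampling tensors movable after construction, which fits the overlap scheduler introduced elsewhere in this release (CPU-side scheduling overlapped with GPU work). A hedged sketch of the same pattern on a standalone stand-in class, not the real `SamplingBatchInfo`:

```python
import torch

class _SamplingTensors:
    """Stand-in that mirrors the to() pattern in the hunk above."""

    def __init__(self):
        self.temperatures = torch.tensor([[0.7], [1.0]])
        self.top_ps = torch.tensor([0.9, 0.95])
        self.top_ks = torch.tensor([40, 1])
        self.min_ps = torch.tensor([0.0, 0.05])

    def to(self, device: str):
        for item in ["temperatures", "top_ps", "top_ks", "min_ps"]:
            value = getattr(self, item)
            # non_blocking=True lets the copy overlap with other work
            # when the source tensor lives in pinned memory.
            setattr(self, item, value.to(device, non_blocking=True))

info = _SamplingTensors()
info.to("cuda" if torch.cuda.is_available() else "cpu")
```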
sglang/srt/sampling/sampling_params.py
CHANGED

@@ -50,9 +50,10 @@ class SamplingParams:
         self.presence_penalty = presence_penalty
         self.repetition_penalty = repetition_penalty
         self.stop_strs = stop
-        if stop_token_ids
-            stop_token_ids =
-
+        if stop_token_ids:
+            self.stop_token_ids = set(stop_token_ids)
+        else:
+            self.stop_token_ids = None
         self.max_new_tokens = max_new_tokens
         self.min_new_tokens = min_new_tokens
         self.ignore_eos = ignore_eos

@@ -119,10 +120,7 @@ class SamplingParams:
         # Process stop strings
         if self.stop_strs is None:
             self.stop_strs = []
-
-            self.stop_str_max_len = 0
-        else:
-            self.stop_str_max_len = 1
+            self.stop_str_max_len = 0
         else:
             if isinstance(self.stop_strs, str):
                 self.stop_strs = [self.stop_strs]
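After this change, `stop_token_ids` is normalized to a `set` when non-empty and to `None` otherwise, which is what the `(... or set())` fallback in the min_new_tokens hunk above relies on. The expected behavior, sketched with hypothetical ids:

```python
# Hypothetical inputs to SamplingParams(stop_token_ids=...):
#   [128001, 128009, 128009] -> {128001, 128009}  (duplicates collapse)
#   []                       -> None              (empty list is falsy)
#   None                     -> None

def normalize_stop_token_ids(stop_token_ids):
    """Mirror of the constructor logic in the hunk above."""
    return set(stop_token_ids) if stop_token_ids else None

assert normalize_stop_token_ids([128001, 128009, 128009]) == {128001, 128009}
assert normalize_stop_token_ids([]) is None
assert normalize_stop_token_ids(None) is None
```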
sglang/srt/server.py
CHANGED

@@ -172,6 +172,18 @@ async def stop_profile():
     )


+@app.api_route("/get_memory_pool_size", methods=["GET", "POST"])
+async def get_memory_pool_size():
+    """Get the memory pool size in number of tokens"""
+    try:
+        ret = await tokenizer_manager.get_memory_pool_size()
+        return ret.size
+    except Exception as e:
+        return JSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
+
+
 @app.post("/update_weights")
 async def update_weights(obj: UpdateWeightReqInput, request: Request):
     """Update the weights inplace without re-launching the server."""
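The new endpoint accepts both GET and POST. A minimal client-side sketch, assuming a local server on sglang's default port (adjust host/port to your deployment):

```python
import requests

# Hypothetical local deployment; the endpoint is the one added above.
resp = requests.get("http://127.0.0.1:30000/get_memory_pool_size")
print(resp.json())  # size of the KV-cache memory pool, in tokens
```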
sglang/srt/server_args.py
CHANGED

@@ -177,6 +177,16 @@ class ServerArgs:
         if self.sampling_backend is None:
             self.sampling_backend = "flashinfer"

+        if self.enable_overlap_schedule:
+            logger.warning(
+                "Overlap scheduler mode is enabled. This is an experimental feature. "
+                "Sampling penalizer (e.g., frequency and repetition penalty), constrained decoding (e.g., regex, JSON), "
+                "and embedding APIs are not supported and will lead to wrong results. "
+                "The NaN detection is also disabled."
+            )
+            self.disable_penalizer = True
+            self.disable_nan_detection = True
+
         # Model-specific patches
         if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
             logger.info(
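The net effect is that enabling the overlap scheduler silently flips two other flags. A sketch of that behavior, assuming `ServerArgs` can be constructed directly and applies the patch above in its post-init hook (the model path here is purely illustrative):

```python
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",  # hypothetical
    enable_overlap_schedule=True,
)
# Per the hunk above, these are force-set alongside the warning.
assert args.disable_penalizer and args.disable_nan_detection
```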
sglang/srt/utils.py
CHANGED

@@ -209,6 +209,28 @@ def is_multimodal_model(model_architectures):
         or "LlavaQwenForCausalLM" in model_architectures
         or "LlavaMistralForCausalLM" in model_architectures
         or "LlavaVidForCausalLM" in model_architectures
+        or "MllamaForConditionalGeneration" in model_architectures
+        or "Qwen2VLForConditionalGeneration" in model_architectures
+    ):
+        return True
+    else:
+        return False
+
+
+def is_attention_free_model(model_architectures):
+    return False
+
+
+def model_has_inner_state(model_architectures):
+    return False
+
+
+def is_embedding_model(model_architectures):
+    if (
+        "LlamaEmbeddingModel" in model_architectures
+        or "MistralModel" in model_architectures
+        or "LlamaForSequenceClassification" in model_architectures
+        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
     ):
         return True
     else:
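With the two added architectures, Mllama and Qwen2-VL checkpoints are now routed through the multimodal path. A small illustration of the predicate, using architecture lists as they appear in HF configs:

```python
from sglang.srt.utils import is_multimodal_model

assert is_multimodal_model(["Qwen2VLForConditionalGeneration"])
assert is_multimodal_model(["MllamaForConditionalGeneration"])
assert not is_multimodal_model(["LlamaForCausalLM"])
```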
sglang/test/run_eval.py
CHANGED

@@ -67,6 +67,7 @@ def run_eval(args):
         model=args.model,
         max_tokens=2048,
         base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
     )

     # Run eval

@@ -119,6 +120,7 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--temperature", type=float, default=0.0)
     args = parser.parse_args()

     run_eval(args)
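Because the new argument is read with `getattr(..., "temperature", 0.0)`, callers that build the args namespace by hand (as `run_mmlu_test` does further down) keep working without setting it. A hedged usage sketch against a hypothetical endpoint:

```python
from types import SimpleNamespace

from sglang.test.run_eval import run_eval

# Hypothetical server URL and model; temperature may be omitted
# entirely thanks to the getattr() default shown above.
args = SimpleNamespace(
    base_url="http://127.0.0.1:30000",
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    eval_name="mmlu",
    num_examples=8,
    num_threads=8,
    temperature=0.0,
)
metrics = run_eval(args)
```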
sglang/test/runners.py
CHANGED

@@ -102,8 +102,10 @@ class HFRunner:
         return False

     def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
-
+        # Apply model-specific patches
+        monkey_patch_gemma2_sdpa()

+        # Load the model and tokenizer
         if self.model_type == "generation":
             self.base_model = AutoModelForCausalLM.from_pretrained(
                 model_path,

@@ -128,7 +130,9 @@ class HFRunner:
             ).cuda()
         else:
             raise Exception(f"Unrecognized model type {self.model_type}")
+        self.tokenizer = get_tokenizer(model_path, torch_dtype=torch.dtype)

+        # Run forward
         while True:
             prompts, max_new_tokens, lora_paths = in_queue.get()
             if lora_paths is not None:

@@ -370,3 +374,18 @@ class SRTRunner:
     def __exit__(self, exc_type, exc_value, traceback):
         self.runtime.shutdown()
         del self.runtime
+
+
+def monkey_patch_gemma2_sdpa():
+    """
+    Use sdpa by default to fix the OOM issue.
+    Revert this commit:
+    https://github.com/huggingface/transformers/commit/975b988bfe6e7ebb47390cd9a1556c6888804883#diff-5f76eac6f18f4b491521314c318a9692318feb4d19228e9576cce7bde4240834R660
+    """
+    from transformers.models.gemma2.modeling_gemma2 import Gemma2PreTrainedModel
+
+    def _check_and_enable_sdpa(config, hard_check_only: bool = False):
+        config._attn_implementation = "sdpa"
+        return config
+
+    setattr(Gemma2PreTrainedModel, "_check_and_enable_sdpa", _check_and_enable_sdpa)
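`monkey_patch_gemma2_sdpa` rebinds a method on a Transformers class at import time so Gemma-2 loads with the `sdpa` attention backend. The same pattern, reduced to its essentials on a purely hypothetical class:

```python
class Model:
    @classmethod
    def check(cls, config):
        config["attn"] = "eager"
        return config

def patched_check(config):
    # Force the sdpa backend, mirroring the patch above.
    config["attn"] = "sdpa"
    return config

# setattr on the class rebinds the method for all subsequent callers.
setattr(Model, "check", patched_check)
assert Model.check({"x": 1})["attn"] == "sdpa"
```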
sglang/test/test_utils.py
CHANGED

@@ -3,6 +3,7 @@
 import argparse
 import asyncio
 import os
+import random
 import subprocess
 import threading
 import time

@@ -20,6 +21,7 @@ from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"

@@ -400,7 +402,7 @@ def popen_launch_server(
     api_key: Optional[str] = None,
     other_args: tuple = (),
     env: Optional[dict] = None,
-    return_stdout_stderr:
+    return_stdout_stderr: Optional[tuple] = None,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]

@@ -423,8 +425,8 @@ def popen_launch_server(
     if return_stdout_stderr:
         process = subprocess.Popen(
             command,
-            stdout=
-            stderr=
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
             env=env,
             text=True,
         )

@@ -631,3 +633,98 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
         rouge_l_scores.append(fmeasure)

     return rouge_l_scores
+
+
+STDOUT_FILENAME = "stdout.txt"
+STDERR_FILENAME = "stderr.txt"
+
+
+def read_output(output_lines):
+    """Print the output in real time with another thread."""
+    while not os.path.exists(STDERR_FILENAME):
+        time.sleep(1)
+
+    pt = 0
+    while pt >= 0:
+        if pt > 0 and not os.path.exists(STDERR_FILENAME):
+            break
+        lines = open(STDERR_FILENAME).readlines()
+        for line in lines[pt:]:
+            print(line, end="", flush=True)
+            output_lines.append(line)
+            pt += 1
+        time.sleep(0.1)
+
+
+def run_mmlu_test(
+    disable_radix_cache,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+    other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
+    if disable_radix_cache:
+        other_args += ["--disable-radix-cache"]
+    if enable_mixed_chunk:
+        other_args += ["--enable-mixed-chunk"]
+    if enable_overlap:
+        other_args += ["--enable-overlap-scheduler"]
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    port = random.randint(4000, 5000)
+    base_url = f"http://127.0.0.1:{port}"
+
+    # Create files and launch the server
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+        return_stdout_stderr=(stdout, stderr),
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines,))
+    t.start()
+
+    # Run the eval
+    args = SimpleNamespace(
+        base_url=base_url,
+        model=model,
+        eval_name="mmlu",
+        num_examples=128,
+        num_threads=128,
+    )
+
+    try:
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        assert metrics["score"] >= 0.65
+    finally:
+        pass
+
+    # Clean up everything
+    kill_child_process(process.pid)
+    kill_child_process(process.pid)
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    t.join()
+
+    # Assert success
+    has_new_server = False
+    has_leak = False
+    for line in output_lines:
+        if "The server is fired" in line:
+            has_new_server = True
+        if "leak" in line:
+            has_leak = True
+
+    assert has_new_server
+    # assert not has_leak
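The new helper is parameterized so one body can cover several scheduler configurations. A typical call from a test case, assuming a GPU machine with the default test model available:

```python
import unittest

from sglang.test.test_utils import run_mmlu_test

class TestOverlapScheduler(unittest.TestCase):
    def test_overlap(self):
        # Launches a server, runs 128 MMLU examples, asserts score >= 0.65.
        run_mmlu_test(disable_radix_cache=False, enable_overlap=True)

if __name__ == "__main__":
    unittest.main()
```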
sglang/version.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.3.4"
+__version__ = "0.3.4.post2"
{sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.4
+Version: 0.3.4.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                    Version 2.0, January 2004

@@ -259,7 +259,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm==0.
+Requires-Dist: vllm==0.6.3.post1; extra == "srt"
 Provides-Extra: srt_xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: test

@@ -284,17 +284,17 @@ Requires-Dist: peft; extra == "test"
 --------------------------------------------------------------------------------

 | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
-[**Join Bi-Weekly Development Meeting
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |

 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -328,23 +328,27 @@ You can install SGLang using any of the methods below.
 pip install --upgrade pip
 pip install "sglang[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).

@@ -498,7 +502,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable
+- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduling with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currently.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currently.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.

@@ -519,7 +524,6 @@ We also provide an inference engine **without a HTTP server**. For example,
 ```python
 import sglang as sgl

-
 def main():
     prompts = [
         "Hello, my name is",

@@ -539,12 +543,8 @@ if __name__ == "__main__":
     main()
 ```

-This can be used for
-
-1. **Offline Batch Inference**
-2. **Building Custom Servers**
-
-You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+This can be used for offline batch inference and building custom servers.
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).

 ### Supported Models

@@ -552,7 +552,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
 - DeepSeek / DeepSeek 2
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)

@@ -575,6 +575,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
 - SmolLM
+- GLM-4

 **Embedding Models**

@@ -711,7 +712,6 @@ print(state["answer_1"])
 ```

 #### More Examples
-
 Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

@@ -892,7 +892,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).

-
 <p align="center">
   <a href="#sglangtop" target="_blank">
   <bold>Back To Top </bold>