sglang 0.3.4.post1__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_latency.py +3 -3
- sglang/bench_server_latency.py +2 -3
- sglang/bench_serving.py +92 -0
- sglang/global_config.py +9 -3
- sglang/lang/chat_template.py +50 -25
- sglang/lang/interpreter.py +9 -1
- sglang/lang/ir.py +11 -2
- sglang/launch_server.py +1 -1
- sglang/srt/configs/model_config.py +76 -15
- sglang/srt/constrained/__init__.py +18 -0
- sglang/srt/constrained/bnf_cache.py +61 -0
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/constrained/grammar.py +190 -0
- sglang/srt/hf_transformers_utils.py +20 -5
- sglang/srt/layers/attention/flashinfer_backend.py +5 -5
- sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
- sglang/srt/layers/fused_moe/fused_moe.py +4 -3
- sglang/srt/layers/fused_moe/layer.py +28 -0
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/quantization/base_config.py +16 -1
- sglang/srt/layers/rotary_embedding.py +15 -48
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/layers/vocab_parallel_embedding.py +486 -0
- sglang/srt/managers/data_parallel_controller.py +8 -7
- sglang/srt/managers/detokenizer_manager.py +11 -9
- sglang/srt/managers/image_processor.py +4 -3
- sglang/srt/managers/io_struct.py +80 -78
- sglang/srt/managers/schedule_batch.py +46 -52
- sglang/srt/managers/schedule_policy.py +24 -13
- sglang/srt/managers/scheduler.py +145 -82
- sglang/srt/managers/tokenizer_manager.py +236 -334
- sglang/srt/managers/tp_worker.py +5 -5
- sglang/srt/managers/tp_worker_overlap_thread.py +58 -21
- sglang/srt/mem_cache/flush_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +10 -3
- sglang/srt/model_executor/cuda_graph_runner.py +34 -23
- sglang/srt/model_executor/forward_batch_info.py +6 -9
- sglang/srt/model_executor/model_runner.py +10 -19
- sglang/srt/models/baichuan.py +4 -4
- sglang/srt/models/chatglm.py +4 -4
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/dbrx.py +5 -5
- sglang/srt/models/deepseek.py +4 -4
- sglang/srt/models/deepseek_v2.py +4 -4
- sglang/srt/models/exaone.py +4 -4
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +1 -1
- sglang/srt/models/gpt2.py +287 -0
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +4 -4
- sglang/srt/models/internlm2.py +4 -4
- sglang/srt/models/llama.py +15 -7
- sglang/srt/models/llama_embedding.py +2 -10
- sglang/srt/models/llama_reward.py +5 -0
- sglang/srt/models/minicpm.py +4 -4
- sglang/srt/models/minicpm3.py +4 -4
- sglang/srt/models/mixtral.py +7 -5
- sglang/srt/models/mixtral_quant.py +4 -4
- sglang/srt/models/mllama.py +5 -5
- sglang/srt/models/olmo.py +4 -4
- sglang/srt/models/olmoe.py +4 -4
- sglang/srt/models/qwen.py +4 -4
- sglang/srt/models/qwen2.py +4 -4
- sglang/srt/models/qwen2_moe.py +4 -4
- sglang/srt/models/qwen2_vl.py +4 -8
- sglang/srt/models/stablelm.py +4 -4
- sglang/srt/models/torch_native_llama.py +4 -4
- sglang/srt/models/xverse.py +4 -4
- sglang/srt/models/xverse_moe.py +4 -4
- sglang/srt/openai_api/adapter.py +52 -66
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_batch_info.py +7 -13
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +41 -33
- sglang/srt/server_args.py +34 -5
- sglang/srt/utils.py +40 -56
- sglang/test/run_eval.py +2 -0
- sglang/test/runners.py +2 -1
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +151 -6
- sglang/utils.py +62 -1
- sglang/version.py +1 -1
- sglang-0.3.5.dist-info/METADATA +344 -0
- sglang-0.3.5.dist-info/RECORD +152 -0
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
- sglang-0.3.4.post1.dist-info/METADATA +0 -900
- sglang-0.3.4.post1.dist-info/RECORD +0 -148
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
sglang/srt/layers/rotary_embedding.py
CHANGED
@@ -22,64 +22,33 @@ class MRotaryEmbedding:
 
     @staticmethod
     def get_input_positions(
-        input_tokens: List[int],
+        input_tokens: torch.Tensor,
         image_grid_thw: Union[List[List[int]], torch.Tensor],
-        video_grid_thw: Union[List[List[int]], torch.Tensor],
-        image_token_id: int,
-        video_token_id: int,
         vision_start_token_id: int,
-        vision_end_token_id: int,
         spatial_merge_size: int,
         context_len: int = 0,
-        extend_prefix_len: int = 0,
     ) -> Tuple[List[List[int]], int]:
         """Get mrope input positions and delta value."""
 
         if isinstance(image_grid_thw, torch.Tensor):
             image_grid_thw = image_grid_thw.tolist()
-        if isinstance(video_grid_thw, torch.Tensor):
-            video_grid_thw = video_grid_thw.tolist()
 
-        input_tokens_tensor = torch.tensor(input_tokens)
         vision_start_indices = torch.argwhere(
-            input_tokens_tensor == vision_start_token_id
+            input_tokens == vision_start_token_id
         ).squeeze(1)
-        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
-        image_nums = (vision_tokens == image_token_id).sum()
-        video_nums = (vision_tokens == video_token_id).sum()
+        image_indices = vision_start_indices + 1
+        image_nums = image_indices.shape[0]
         llm_pos_ids_list: list = []
 
         st = 0
-        remain_images, remain_videos = image_nums, video_nums
-
-        image_index, video_index = 0, 0
-        for _ in range(image_nums + video_nums):
-            if image_token_id in input_tokens and remain_images > 0:
-                ed_image = input_tokens.index(image_token_id, st)
-            else:
-                ed_image = len(input_tokens) + 1
-            if video_token_id in input_tokens and remain_videos > 0:
-                ed_video = input_tokens.index(video_token_id, st)
-            else:
-                ed_video = len(input_tokens) + 1
-            if ed_image < ed_video:
-                t, h, w = (
-                    image_grid_thw[image_index][0],
-                    image_grid_thw[image_index][1],
-                    image_grid_thw[image_index][2],
-                )
-                image_index += 1
-                remain_images -= 1
-                ed = ed_image
-            else:
-                t, h, w = (
-                    video_grid_thw[video_index][0],
-                    video_grid_thw[video_index][1],
-                    video_grid_thw[video_index][2],
-                )
-                video_index += 1
-                remain_videos -= 1
-                ed = ed_video
+        input_tokens_len = input_tokens.shape[0]
+        for image_index in range(image_nums):
+            ed = image_indices[image_index].item()
+            t, h, w = (
+                image_grid_thw[image_index][0],
+                image_grid_thw[image_index][1],
+                image_grid_thw[image_index][2],
+            )
             llm_grid_t, llm_grid_h, llm_grid_w = (
                 t,
                 h // spatial_merge_size,
@@ -115,18 +84,16 @@ class MRotaryEmbedding:
             )
             st = ed + llm_grid_t * llm_grid_h * llm_grid_w
 
-        if st < len(input_tokens):
+        if st < input_tokens_len:
             st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-            text_len = len(input_tokens) - st
+            text_len = input_tokens_len - st
             llm_pos_ids_list.append(
                 torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
             )
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         llm_positions = llm_positions[:, context_len:]
-        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions += extend_prefix_len
-
+        mrope_position_delta = (llm_positions.max() + 1 - input_tokens_len).item()
         return llm_positions.tolist(), mrope_position_delta
 
     @staticmethod
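Note on the change above: get_input_positions now takes the prompt as a torch.Tensor, drops the video branch entirely, and treats every vision_start marker as introducing an image block. The sketch below is our own illustration of the image-only position logic after this change, not code from the package; the t/h/w index construction paraphrases the unchanged middle of the function, and the token ids in the toy call are hypothetical.

import torch

def mrope_positions_sketch(
    input_tokens: torch.Tensor,  # 1-D prompt token ids
    image_grid_thw: list,        # one [t, h, w] patch grid per image
    vision_start_token_id: int,
    spatial_merge_size: int,
    context_len: int = 0,
):
    # Every vision_start marker is followed by an image block (no video path).
    vision_start_indices = torch.argwhere(
        input_tokens == vision_start_token_id
    ).squeeze(1)
    image_indices = vision_start_indices + 1
    image_nums = image_indices.shape[0]
    llm_pos_ids_list = []
    st = 0
    input_tokens_len = input_tokens.shape[0]
    for image_index in range(image_nums):
        ed = image_indices[image_index].item()
        t, h, w = image_grid_thw[image_index]
        llm_grid_t = t
        llm_grid_h = h // spatial_merge_size
        llm_grid_w = w // spatial_merge_size
        # Text tokens before the image share one position across all 3 axes.
        text_len = ed - st
        st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
        llm_pos_ids_list.append(
            torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
        )
        # Image tokens get separate temporal / height / width position streams.
        t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
            -1, llm_grid_h * llm_grid_w).flatten()
        h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
            llm_grid_t, -1, llm_grid_w).flatten()
        w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
            llm_grid_t, llm_grid_h, -1).flatten()
        llm_pos_ids_list.append(
            torch.stack([t_index, h_index, w_index]) + text_len + st_idx
        )
        st = ed + llm_grid_t * llm_grid_h * llm_grid_w
    # Trailing text after the last image.
    if st < input_tokens_len:
        st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
        text_len = input_tokens_len - st
        llm_pos_ids_list.append(
            torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
        )
    llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
    llm_positions = llm_positions[:, context_len:]
    mrope_position_delta = (llm_positions.max() + 1 - input_tokens_len).item()
    return llm_positions.tolist(), mrope_position_delta

# Toy call: token 9 marks vision start; a 1x4x4 grid merged 2x2 -> 4 image tokens.
pos, delta = mrope_positions_sketch(
    torch.tensor([5, 6, 9, 7, 7, 7, 7, 8, 8]), [[1, 4, 4]], 9, 2
)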
sglang/srt/layers/sampler.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+import os
 from typing import Union
 
 import torch
@@ -17,6 +18,11 @@ if is_flashinfer_available():
         top_p_renorm_prob,
     )
 
+
+# Crash on warning if we are running CI tests
+crash_on_warning = os.getenv("SGLANG_IS_IN_CI", "false") == "true"
+
+
 logger = logging.getLogger(__name__)
 
 
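Note: the hunk above adds a module-level flag so that sampling warnings become hard failures when the CI environment sets SGLANG_IS_IN_CI. A hedged sketch of the pattern follows; SGLANG_IS_IN_CI is the variable the diff actually reads, while warn_or_crash is a hypothetical helper for illustration, not part of sglang.

import os
import sys

# Opt into fail-fast behavior only when CI sets this variable.
crash_on_warning = os.getenv("SGLANG_IS_IN_CI", "false") == "true"

def warn_or_crash(message: str) -> None:
    # Log locally, but abort in CI so regressions surface instead of being hidden.
    print(f"WARNING: {message}", file=sys.stderr)
    if crash_on_warning:
        sys.exit(1)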
@@ -33,56 +39,62 @@ class Sampler(nn.Module):
         if isinstance(logits, LogitsProcessorOutput):
             logits = logits.next_token_logits
 
-        # Post process logits
         logits = logits.contiguous()
-        logits.div_(sampling_info.temperatures)
-        probs = torch.softmax(logits, dim=-1)
-        logits = None
-        del logits
 
-        if self.use_nan_detectioin and torch.any(torch.isnan(probs)):
-            logger.warning("Detected errors during sampling! NaN in the probability.")
-            probs = torch.where(
-                torch.isnan(probs), torch.full_like(probs, 1e-10), probs
+        if self.use_nan_detectioin and torch.any(torch.isnan(logits)):
+            logger.warning("Detected errors during sampling! NaN in the logits.")
+            logits = torch.where(
+                torch.isnan(logits), torch.full_like(logits, -1e5), logits
             )
+            exit(1) if crash_on_warning else None
 
         if sampling_info.is_all_greedy:
             # Use torch.argmax if all requests use greedy sampling
-            batch_next_token_ids = torch.argmax(probs, -1)
-        elif global_server_args_dict["sampling_backend"] == "flashinfer":
-            max_top_k_round, batch_size = 32, probs.shape[0]
-            uniform_samples = torch.rand(
-                (max_top_k_round, batch_size), device=probs.device
-            )
-            if sampling_info.need_min_p_sampling:
-                probs = top_k_renorm_prob(probs, sampling_info.top_ks)
-                probs = top_p_renorm_prob(probs, sampling_info.top_ps)
-                batch_next_token_ids, success = min_p_sampling_from_probs(
-                    probs, uniform_samples, sampling_info.min_ps
-                )
-            else:
-                batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
-                    probs,
-                    uniform_samples,
-                    sampling_info.top_ks,
-                    sampling_info.top_ps,
-                )
-
-            if not torch.all(success):
-                logger.warning("Detected errors during sampling!")
-                batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
-        elif global_server_args_dict["sampling_backend"] == "pytorch":
-            # Here we provide a slower fallback implementation.
-            batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
-                probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
-            )
-        else:
-            raise ValueError(
-                f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
-            )
-
-        return batch_next_token_ids
+            batch_next_token_ids = torch.argmax(logits, -1)
+        else:
+            # Post process logits
+            logits.div_(sampling_info.temperatures)
+            probs = torch.softmax(logits, dim=-1)
+            logits = None
+            del logits
+
+            if global_server_args_dict["sampling_backend"] == "flashinfer":
+                max_top_k_round, batch_size = 32, probs.shape[0]
+                uniform_samples = torch.rand(
+                    (max_top_k_round, batch_size), device=probs.device
+                )
+                if sampling_info.need_min_p_sampling:
+                    probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+                    probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+                    batch_next_token_ids, success = min_p_sampling_from_probs(
+                        probs, uniform_samples, sampling_info.min_ps
+                    )
+                else:
+                    batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
+                        probs,
+                        uniform_samples,
+                        sampling_info.top_ks,
+                        sampling_info.top_ps,
+                        filter_apply_order="joint",
+                    )
+
+                if not torch.all(success):
+                    logger.warning("Detected errors during sampling!")
+                    batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
+            elif global_server_args_dict["sampling_backend"] == "pytorch":
+                # A slower fallback implementation with torch native operations.
+                batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
+                    probs,
+                    sampling_info.top_ks,
+                    sampling_info.top_ps,
+                    sampling_info.min_ps,
+                )
+            else:
+                raise ValueError(
+                    f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
+                )
+
+        return batch_next_token_ids.to(torch.int32)
 
 
 def top_k_top_p_min_p_sampling_from_probs_torch(
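Note on the restructured forward: greedy requests now argmax the raw logits directly, temperature scaling and softmax happen only on the sampling path, and the result is cast to int32. The hunk ends at the signature of top_k_top_p_min_p_sampling_from_probs_torch, the PyTorch fallback that now receives probs plus per-request top_ks/top_ps/min_ps tensors. Below is a minimal, self-contained sketch of joint top-k/top-p/min-p filtering in plain PyTorch; it illustrates the technique under the assumption of [batch, vocab] probs and per-row parameter tensors, and is not the function body shipped in the wheel.

import torch

def sample_probs_sketch(probs: torch.Tensor, top_ks: torch.Tensor,
                        top_ps: torch.Tensor, min_ps: torch.Tensor) -> torch.Tensor:
    # probs: [batch, vocab] probabilities; top_ks (int), top_ps, min_ps: [batch].
    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    # min-p: zero out tokens whose probability falls below min_p * max_prob.
    probs_sort[probs_sort < (probs_sort[:, 0:1] * min_ps.unsqueeze(-1))] = 0.0
    # top-p: zero out tokens once the cumulative mass before them exceeds top_p.
    probs_sort[(probs_sum - probs_sort) > top_ps.unsqueeze(-1)] = 0.0
    # top-k: keep only the k highest-probability tokens in each row.
    ranks = torch.arange(probs.shape[-1], device=probs.device).expand_as(probs_sort)
    probs_sort[ranks >= top_ks.unsqueeze(-1)] = 0.0
    # Renormalize the surviving mass and sample one token per row.
    probs_sort = probs_sort / probs_sort.sum(dim=-1, keepdim=True)
    sampled_rank = torch.multinomial(probs_sort, num_samples=1)
    return probs_idx.gather(-1, sampled_rank).squeeze(-1)

# Toy call: batch of 2 rows over a 4-token vocabulary.
probs = torch.tensor([[0.5, 0.3, 0.1, 0.1], [0.7, 0.1, 0.1, 0.1]])
ids = sample_probs_sketch(probs, torch.tensor([2, 1]),
                          torch.tensor([0.9, 0.9]), torch.tensor([0.1, 0.1]))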