sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/conversation.py +6 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +196 -51
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +18 -13
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +128 -43
- sglang/srt/disaggregation/utils.py +127 -123
- sglang/srt/entrypoints/engine.py +15 -1
- sglang/srt/entrypoints/http_server.py +13 -2
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +15 -2
- sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
- sglang/srt/layers/attention/flashattention_backend.py +53 -64
- sglang/srt/layers/attention/flashinfer_backend.py +1 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/triton_backend.py +119 -119
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +23 -5
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
- sglang/srt/layers/moe/ep_moe/layer.py +42 -32
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
- sglang/srt/layers/moe/topk.py +16 -8
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/lora/lora_manager.py +79 -34
- sglang/srt/lora/mem_pool.py +4 -5
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/io_struct.py +28 -4
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +39 -6
- sglang/srt/managers/scheduler.py +73 -17
- sglang/srt/managers/tokenizer_manager.py +29 -2
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/model_executor/cuda_graph_runner.py +122 -55
- sglang/srt/model_executor/forward_batch_info.py +14 -5
- sglang/srt/model_executor/model_runner.py +6 -6
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_v2.py +113 -155
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +162 -4
- sglang/srt/openai_api/protocol.py +37 -1
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +318 -233
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
- sglang/srt/speculative/eagle_utils.py +389 -109
- sglang/srt/speculative/eagle_worker.py +134 -43
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +58 -0
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
@@ -35,11 +35,17 @@ from sglang.srt.speculative.eagle_utils import (
     EagleVerifyInput,
     EagleVerifyOutput,
     assign_draft_cache_locs,
+    fast_topk,
     generate_token_bitmask,
     select_top_k_tokens,
 )
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    empty_context,
+    get_available_gpu_memory,
+    is_cuda,
+    next_power_of_2,
+)

 if is_cuda():
     from sgl_kernel import segment_packbits
@@ -152,6 +158,12 @@ class EAGLEWorker(TpModelWorker):
         self.init_attention_backend()
         self.init_cuda_graphs()

+        # Some dummy tensors
+        self.num_new_pages_per_topk = torch.empty(
+            (), dtype=torch.int64, device=self.device
+        )
+        self.extend_lens = torch.empty((), dtype=torch.int64, device=self.device)
+
     def init_attention_backend(self):
         # Create multi-step attn backends and cuda graph runners
         if self.server_args.attention_backend == "flashinfer":
@@ -254,7 +266,7 @@ class EAGLEWorker(TpModelWorker):
             self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
             after_mem = get_available_gpu_memory(self.device, self.gpu_id)
             logger.info(
-                f"Capture draft cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s.
+                f"Capture draft cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
             )

         # Capture extend
@@ -269,7 +281,7 @@ class EAGLEWorker(TpModelWorker):
             )
             after_mem = get_available_gpu_memory(self.device, self.gpu_id)
             logger.info(
-                f"Capture draft extend cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s.
+                f"Capture draft extend cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
             )

     @property
@@ -365,14 +377,21 @@ class EAGLEWorker(TpModelWorker):
         )

         # Allocate cache locations
+        # Layout of the out_cache_loc
+        # [ topk 0 ] [ topk 1 ]
+        # [iter=0, iter=1, iter=2] [iter=0, iter=1, iter=2]
         if self.page_size == 1:
             out_cache_loc, token_to_kv_pool_state_backup = batch.alloc_token_slots(
-                num_seqs * self.
+                num_seqs * self.speculative_num_steps * self.topk, backup_state=True
             )
         else:
             if self.topk == 1:
-                prefix_lens =
-
+                prefix_lens, seq_lens, last_loc = get_last_loc_large_page_size_top_k_1(
+                    batch.req_to_token_pool.req_to_token,
+                    batch.req_pool_indices,
+                    batch.seq_lens,
+                    self.speculative_num_steps,
+                )
                 extend_num_tokens = num_seqs * self.speculative_num_steps
             else:
                 # In this case, the last partial page needs to be duplicated.
@@ -385,29 +404,33 @@ class EAGLEWorker(TpModelWorker):
                 # "x" means speculative draft tokens
                 # "." means padded tokens

-                # TODO:
-
-
-
-
-
-
-
-
-
-
-
+                # TODO(lmzheng): The current implementation is still a fake support
+                # for page size > 1. In the `assign_draft_cache_locs` below,
+                # we directly move the indices instead of the real kv cache.
+                # This only works when the kernel backend runs with page size = 1.
+                # If the kernel backend runs with page size > 1, we need to
+                # duplicate the real KV cache. The overhead of duplicating KV
+                # cache seems okay because the draft KV cache only has one layer.
+                # see a related copy operation in MHATokenToKVPool::move_kv_cache.
+
+                (
+                    prefix_lens,
+                    seq_lens,
+                    last_loc,
+                    self.num_new_pages_per_topk,
+                    self.extend_lens,
+                ) = get_last_loc_large_page_size_large_top_k(
+                    batch.req_to_token_pool.req_to_token,
+                    batch.req_pool_indices,
+                    batch.seq_lens,
+                    self.speculative_num_steps,
+                    self.topk,
+                    self.page_size,
                 )
-
-                #
-
-
-            last_loc = get_last_loc(
-                batch.req_to_token_pool.req_to_token,
-                batch.req_pool_indices,
-                prefix_lens,
-            )
+
+                # TODO(lmzheng): remove this device sync
+                extend_num_tokens = torch.sum(self.extend_lens).item()
+
             out_cache_loc, token_to_kv_pool_state_backup = (
                 batch.alloc_paged_token_slots_extend(
                     prefix_lens,
@@ -422,18 +445,30 @@ class EAGLEWorker(TpModelWorker):
             batch.req_pool_indices,
             batch.req_to_token_pool.req_to_token,
             batch.seq_lens,
+            self.extend_lens,
+            self.num_new_pages_per_topk,
             out_cache_loc,
             batch.req_to_token_pool.req_to_token.shape[1],
             self.topk,
             self.speculative_num_steps,
             self.page_size,
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
         )
+
+        if self.page_size > 1 and self.topk > 1:
+            # Remove padded slots
+            out_cache_loc = out_cache_loc[
+                : num_seqs * self.topk * self.speculative_num_steps
+            ]
+
         batch.out_cache_loc = out_cache_loc
         batch.seq_lens_sum = torch.sum(batch.seq_lens).item()
+        batch.return_hidden_states = False
         spec_info.positions = batch.seq_lens.repeat_interleave(self.topk, dim=0)
+        spec_info.capture_hidden_mode = CaptureHiddenMode.LAST

         # Get forward batch
-        spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
         model_worker_batch = batch.get_model_worker_batch()
         forward_batch = ForwardBatch.init_new(
             model_worker_batch, self.draft_model_runner
@@ -448,9 +483,6 @@ class EAGLEWorker(TpModelWorker):
         else:
             # Initialize attention backend
             self.draft_attn_backend.init_forward_metadata(forward_batch)
-            forward_batch = ForwardBatch.init_new(
-                model_worker_batch, self.draft_model_runner
-            )
         # Run forward steps
         score_list, token_list, parents_list = self.draft_forward(forward_batch)

@@ -503,6 +535,13 @@ class EAGLEWorker(TpModelWorker):
         if self.hot_token_id is not None:
             topk_index = self.hot_token_id[topk_index]

+        out_cache_loc = out_cache_loc.reshape(
+            forward_batch.batch_size, self.topk, self.speculative_num_steps
+        )
+        out_cache_loc = out_cache_loc.permute((2, 0, 1)).reshape(
+            self.speculative_num_steps, -1
+        )
+
         # Return values
         score_list: List[torch.Tensor] = []
         token_list: List[torch.Tensor] = []
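The reshape/permute added above regroups the flat cache-slot allocation so that `out_cache_loc[i]` selects the slots written by draft step `i` across all requests and topk branches. A standalone sketch of the same index shuffle, with made-up sizes (not taken from the diff):

```python
import torch

# Hypothetical sizes: 2 requests, topk=3 branches, 4 speculative steps.
bs, topk, steps = 2, 3, 4
out_cache_loc = torch.arange(bs * topk * steps)  # flat allocation

# [bs * topk * steps] -> [steps, bs * topk]: row i holds the cache locations
# used by draft forward step i for every (request, branch) pair.
per_step = out_cache_loc.reshape(bs, topk, steps).permute(2, 0, 1).reshape(steps, -1)
assert per_step.shape == (steps, bs * topk)
```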
@@ -524,10 +563,7 @@ class EAGLEWorker(TpModelWorker):

             # Set inputs
             forward_batch.input_ids = input_ids
-            out_cache_loc = out_cache_loc
-            forward_batch.out_cache_loc = out_cache_loc[
-                :, self.topk * i : self.topk * (i + 1)
-            ].flatten()
+            forward_batch.out_cache_loc = out_cache_loc[i]
             forward_batch.positions.add_(1)
             forward_batch.attn_backend = self.draft_attn_backend.attn_backends[i]
             spec_info.hidden_states = hidden_states
@@ -547,11 +583,13 @@ class EAGLEWorker(TpModelWorker):

     def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput):
         spec_info.prepare_for_verify(batch, self.page_size)
+        batch.return_hidden_states = False
         batch.forward_mode = ForwardMode.TARGET_VERIFY
         batch.spec_info = spec_info
         model_worker_batch = batch.get_model_worker_batch(
             seq_lens_cpu_cache=spec_info.seq_lens_cpu
         )
+        assert model_worker_batch.capture_hidden_mode == spec_info.capture_hidden_mode

         if batch.has_grammar:
             retrieve_next_token_cpu = spec_info.retrive_next_token.cpu()
@@ -583,7 +621,7 @@ class EAGLEWorker(TpModelWorker):
         if vocab_mask is not None:
             assert spec_info.grammar is not None
             vocab_mask = vocab_mask.to(spec_info.retrive_next_token.device)
-            # otherwise, this vocab mask will be the one from the previous extend stage
+            # NOTE (sk): otherwise, this vocab mask will be the one from the previous extend stage
             # and will be applied to produce wrong results
             batch.sampling_info.vocab_mask = None

@@ -604,13 +642,13 @@ class EAGLEWorker(TpModelWorker):
         ]
         logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]

+        if batch.return_logprob:
+            self.add_logprob_values(batch, res, logits_output)
+
         # Prepare the batch for the next draft forwards.
         batch.forward_mode = ForwardMode.DECODE
         batch.spec_info = res.draft_input

-        if batch.return_logprob:
-            self.add_logprob_values(batch, res, logits_output)
-
         return logits_output, res, model_worker_batch, can_run_cuda_graph

     def add_logprob_values(
@@ -623,8 +661,16 @@ class EAGLEWorker(TpModelWorker):
         logits_output = res.logits_output
         top_logprobs_nums = batch.top_logprobs_nums
         token_ids_logprobs = batch.token_ids_logprobs
+        accepted_indices = res.accepted_indices
+        assert len(accepted_indices) == len(logits_output.next_token_logits)
+        temperatures = batch.sampling_info.temperatures
+        num_draft_tokens = batch.spec_info.draft_token_num
+        # acceptance indices are the indices in a "flattened" batch.
+        # dividing it to num_draft_tokens will yield the actual batch index.
+        temperatures = temperatures[accepted_indices // num_draft_tokens]
+
         logprobs = torch.nn.functional.log_softmax(
-            logits_output.next_token_logits, dim=-1
+            logits_output.next_token_logits / temperatures, dim=-1
         )
         batch_next_token_ids = res.verified_id
         num_tokens_per_req = [accept + 1 for accept in res.accept_length_per_req_cpu]
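The `accepted_indices // num_draft_tokens` trick above maps each surviving row of the flattened verify batch back to its request, so every logit row is divided by the right sampling temperature before `log_softmax`. A standalone sketch with made-up shapes (not the worker's real tensors):

```python
import torch

num_draft_tokens = 4                                # draft tokens per request
temperatures = torch.tensor([[0.7], [1.0]])         # one temperature per request (2 requests)
accepted_indices = torch.tensor([0, 1, 4, 5, 6])    # rows kept after verification

# Row r of the flattened batch belongs to request r // num_draft_tokens.
per_row_temperature = temperatures[accepted_indices // num_draft_tokens]  # shape [5, 1]

logits = torch.randn(len(accepted_indices), 32)     # fake vocabulary of 32 tokens
logprobs = torch.nn.functional.log_softmax(logits / per_row_temperature, dim=-1)
assert logprobs.shape == (5, 32)
```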
@@ -659,7 +705,7 @@ class EAGLEWorker(TpModelWorker):
         pt = 0
         next_token_logprobs = logits_output.next_token_logprobs.tolist()
         verified_ids = batch_next_token_ids.tolist()
-        for req, num_tokens in zip(batch.reqs, num_tokens_per_req):
+        for req, num_tokens in zip(batch.reqs, num_tokens_per_req, strict=True):
             for _ in range(num_tokens):
                 if req.return_logprob:
                     req.output_token_logprobs_val.append(next_token_logprobs[pt])
@@ -691,6 +737,7 @@ class EAGLEWorker(TpModelWorker):
             hidden_states=hidden_states,
             verified_id=next_token_ids,
         )
+        batch.return_hidden_states = False
         batch.spec_info.prepare_for_extend(batch)
         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
         model_worker_batch = batch.get_model_worker_batch(
@@ -781,4 +828,48 @@ def load_token_map(token_map_path: str) -> List[int]:
     )
     token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
     hot_token_id = torch.load(token_map_path, weights_only=True)
-    return torch.tensor(hot_token_id, dtype=torch.
+    return torch.tensor(hot_token_id, dtype=torch.int64)
+
+
+@torch.compile(dynamic=True)
+def get_last_loc_large_page_size_top_k_1(
+    req_to_token: torch.Tensor,
+    req_pool_indices: torch.Tensor,
+    seq_lens,
+    speculative_num_steps: int,
+):
+    prefix_lens = seq_lens
+    seq_lens = prefix_lens + speculative_num_steps
+    last_loc = get_last_loc(
+        req_to_token,
+        req_pool_indices,
+        prefix_lens,
+    )
+    return prefix_lens, seq_lens, last_loc
+
+
+@torch.compile(dynamic=True)
+def get_last_loc_large_page_size_large_top_k(
+    req_to_token: torch.Tensor,
+    req_pool_indices: torch.Tensor,
+    seq_lens: torch.Tensor,
+    speculative_num_steps: int,
+    topk: int,
+    page_size: int,
+):
+    prefix_lens = seq_lens
+    last_page_lens = prefix_lens % page_size
+    num_new_pages_per_topk = (
+        last_page_lens + speculative_num_steps + page_size - 1
+    ) // page_size
+    seq_lens = prefix_lens // page_size * page_size + num_new_pages_per_topk * (
+        page_size * topk
+    )
+    extend_lens = seq_lens - prefix_lens
+    last_loc = get_last_loc(
+        req_to_token,
+        req_pool_indices,
+        prefix_lens,
+    )
+
+    return prefix_lens, seq_lens, last_loc, num_new_pages_per_topk, extend_lens
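To make the paging arithmetic in `get_last_loc_large_page_size_large_top_k` concrete, here is a small worked example with illustrative numbers (not taken from the diff); each topk branch is padded out to whole pages, which is where the extra `extend_lens` tokens come from:

```python
# Illustrative numbers only: page_size=4, 3 speculative steps, topk=2,
# and one request whose prefix already holds 10 tokens.
page_size, speculative_num_steps, topk = 4, 3, 2
prefix_len = 10

last_page_len = prefix_len % page_size              # 2 tokens sit on a partial page
num_new_pages_per_topk = (
    last_page_len + speculative_num_steps + page_size - 1
) // page_size                                       # ceil((2 + 3) / 4) = 2 pages per branch
seq_len = prefix_len // page_size * page_size + num_new_pages_per_topk * page_size * topk
extend_len = seq_len - prefix_len

print(num_new_pages_per_topk, seq_len, extend_len)   # 2 24 14
```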
sglang/srt/two_batch_overlap.py
CHANGED
@@ -11,7 +11,7 @@ from sglang.srt.layers.communicator import (
     ScatterMode,
 )
 from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
-from sglang.srt.layers.quantization
+from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.operations import execute_operations, execute_overlapped_operations
@@ -479,7 +479,9 @@ def _model_forward_tbo(
     )
     del inputs

-    with configure_deep_gemm_num_sms(
+    with deep_gemm_wrapper.configure_deep_gemm_num_sms(
+        operations_strategy.deep_gemm_num_sms
+    ):
         outputs_arr = execute_overlapped_operations(
             inputs_arr=inputs_arr,
             operations_arr=[operations_strategy.operations] * 2,
sglang/srt/utils.py
CHANGED
@@ -17,6 +17,7 @@ import base64
 import builtins
 import ctypes
 import dataclasses
+import functools
 import importlib
 import io
 import ipaddress
@@ -837,6 +838,7 @@ class CustomCacheManager(FileCacheManager):


 def set_ulimit(target_soft_limit=65535):
+    # number of open files
     resource_type = resource.RLIMIT_NOFILE
     current_soft, current_hard = resource.getrlimit(resource_type)

@@ -846,6 +848,18 @@ def set_ulimit(target_soft_limit=65535):
         except ValueError as e:
             logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")

+    # stack size
+    resource_type = resource.RLIMIT_STACK
+    current_soft, current_hard = resource.getrlimit(resource_type)
+    target_soft_limit_stack_size = 1024 * target_soft_limit
+    if current_soft < target_soft_limit_stack_size:
+        try:
+            resource.setrlimit(
+                resource_type, (target_soft_limit_stack_size, current_hard)
+            )
+        except ValueError as e:
+            logger.warning(f"Fail to set RLIMIT_STACK: {e}")
+

 def add_api_key_middleware(app, api_key: str):
     @app.middleware("http")
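The patched `set_ulimit` now raises two separate soft limits: `RLIMIT_NOFILE` (open file descriptors) and `RLIMIT_STACK` (stack size, scaled by 1024). A quick way to inspect both on a POSIX machine (illustrative snippet, not part of sglang):

```python
import resource

# Print the current soft/hard values for the two limits the function touches.
for name in ("RLIMIT_NOFILE", "RLIMIT_STACK"):
    soft, hard = resource.getrlimit(getattr(resource, name))
    print(f"{name}: soft={soft} hard={hard}")
```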
@@ -1373,6 +1387,11 @@ def print_warning_once(msg: str) -> None:
     logger.warning(msg, stacklevel=2)


+@functools.lru_cache(None)
+def print_info_once(msg: str) -> None:
+    logger.info(msg)
+
+
 def get_device_name(device_id: int = 0) -> str:
     if hasattr(torch, "cuda") and torch.cuda.is_available():
         return torch.cuda.get_device_name(device_id)
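`print_info_once` relies on `functools.lru_cache(None)` to deduplicate log lines: the first call with a given message runs and is cached, repeat calls are served from the cache and never reach the logger. A minimal standalone demo of the same pattern (hypothetical helper name):

```python
import functools
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")

@functools.lru_cache(None)
def log_once(msg: str) -> None:
    # Only the first call per unique message reaches the logger.
    logger.info(msg)

log_once("weights loaded")
log_once("weights loaded")  # cache hit, nothing is logged
```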
@@ -2197,6 +2216,45 @@ class Withable(Generic[T]):
         self._value = None


+def merge_bias_tensor(
+    lhs: Optional[torch.Tensor],
+    rhs: Optional[torch.Tensor],
+    bs1: int,
+    bs2: int,
+    device: str,
+    default: float,
+):
+    """Merge two bias tensors for batch merging.
+
+    Args:
+        lhs: Left-hand side tensor
+        rhs: Right-hand side tensor
+        bs1: Batch size of left-hand side tensor
+        bs2: Batch size of right-hand side tensor
+        device: Device to place the merged tensor on
+        default: Default value for missing tensor elements
+
+    Returns:
+        Merged tensor or None if both inputs are None
+    """
+    if lhs is None and rhs is None:
+        return None
+
+    if lhs is not None and rhs is not None:
+        return torch.cat([lhs, rhs])
+    else:
+        if lhs is not None:
+            shape, dtype = lhs.shape[1:], lhs.dtype
+        else:
+            shape, dtype = rhs.shape[1:], rhs.dtype
+
+        if lhs is None:
+            lhs = torch.empty((bs1, *shape), device=device, dtype=dtype).fill_(default)
+        if rhs is None:
+            rhs = torch.empty((bs2, *shape), device=device, dtype=dtype).fill_(default)
+        return torch.cat([lhs, rhs])
+
+
 def find_local_repo_dir(repo_id: str, revision: Optional[str] = None) -> Optional[str]:
     import huggingface_hub as hf

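A quick usage sketch for `merge_bias_tensor` (illustrative tensors; the import path assumes the function lands in `sglang.srt.utils` as shown above). Merging a batch that carries a per-request bias with one that has none fills the missing rows with the default value:

```python
import torch

from sglang.srt.utils import merge_bias_tensor  # assumed import path

lhs = torch.tensor([[0.5], [1.5]])  # bias for a 2-request batch
merged = merge_bias_tensor(lhs, None, bs1=2, bs2=3, device="cpu", default=0.0)
print(merged.shape)  # torch.Size([5, 1]); the last 3 rows are filled with 0.0
```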
sglang/test/attention/test_prefix_chunk_info.py
CHANGED
@@ -2,6 +2,8 @@ import unittest

 import torch

+from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend
+from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.test.test_utils import CustomTestCase
sglang/test/runners.py
CHANGED
@@ -42,6 +42,21 @@ DEFAULT_PROMPTS = [
     # the output of gemma-2-2b from SRT is unstable on the commented prompt
     # "The capital of France is",
 ]
+TEST_RERANK_QUERY_DOCS = [
+    {
+        "query": "How many people live in Berlin?",
+        "documents": [
+            "Berlin is well known for its museums.",
+        ],
+    },
+    {
+        "query": "How many people live in Berlin?",
+        "documents": [
+            "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.",
+            "Berlin is well known for its museums.",
+        ],
+    },
+]

 dirpath = os.path.dirname(__file__)
 with open(os.path.join(dirpath, "long_prompt.txt"), "r") as f:
@@ -241,7 +256,7 @@ class HFRunner:
                 self.model = _get_sentence_transformer_embedding_model(
                     model_path, torch_dtype
                 )
-            elif self.model_type == "reward":
+            elif self.model_type == "reward" or self.model_type == "cross_encoder":
                 from transformers import AutoModelForSequenceClassification

                 self.model = AutoModelForSequenceClassification.from_pretrained(
@@ -303,6 +318,15 @@ class HFRunner:
                 else:
                     logits = self.model.encode(prompts).tolist()
                 out_queue.put(ModelOutput(embed_logits=logits))
+            elif self.model_type == "cross_encoder":
+                inputs = self.tokenizer(
+                    prompts, padding=True, return_tensors="pt"
+                ).to("cuda")
+                scores = self.model(**inputs).logits
+                scores = scores.squeeze().tolist()
+                if not isinstance(scores, list):
+                    scores = [scores]
+                out_queue.put(ModelOutput(scores=scores))

             elif self.model_type == "reward":
                 scores = []
@@ -322,7 +346,9 @@ class HFRunner:

     def forward(
         self,
-        prompts: Union[
+        prompts: Union[
+            List[List[str]], List[str], List[torch.Tensor]
+        ] = DEFAULT_PROMPTS,
         image_data: Optional[List[str]] = None,
         max_new_tokens: int = 8,
         lora_paths: Optional[List[str]] = None,
@@ -526,7 +552,9 @@ class SRTRunner:

     def forward(
         self,
-        prompts: Union[
+        prompts: Union[
+            List[List[str]], List[str], List[torch.Tensor]
+        ] = DEFAULT_PROMPTS,
         image_data: Optional[List[str]] = None,
         max_new_tokens: int = 8,
         lora_paths: Optional[List[str]] = None,
@@ -552,6 +580,13 @@ class SRTRunner:
             else:
                 logits = [response["embedding"]]
             return ModelOutput(embed_logits=logits)
+        # cross encoder model
+        elif self.model_type == "cross_encoder":
+            response = self.engine.rerank(prompts)
+            if not isinstance(response, list):
+                response = [response]
+            scores = [x["embedding"] for x in response]
+            return ModelOutput(scores=scores)
         # reward model
         else:
             response = self.engine.encode(prompts)
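For context, the `cross_encoder` branch added to `HFRunner` follows the usual Hugging Face cross-encoder recipe: each (query, document) pair is tokenized together and the sequence-classification logit is the relevance score. A hedged standalone sketch (the model name is only an example, not necessarily what the sglang tests use):

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

name = "cross-encoder/ms-marco-MiniLM-L-6-v2"  # example checkpoint
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name).eval()

query = "How many people live in Berlin?"
docs = [
    "Berlin is well known for its museums.",
    "Berlin had a population of 3,520,031 registered inhabitants.",
]

# One row per (query, document) pair; the single logit per row is the relevance score.
inputs = tokenizer([query] * len(docs), docs, padding=True, return_tensors="pt")
with torch.no_grad():
    scores = model(**inputs).logits.squeeze(-1).tolist()
print(scores)
```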