PyPI - sglang - Versions diffs - 0.4.1.post1__py3-none-any.whl → 0.4.1.post2__py3-none-any.whl - Mend

sglang 0.4.1.post1__py3-none-any.whl → 0.4.1.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

sglang/bench_offline_throughput.py +1 -0
sglang/srt/configs/model_config.py +11 -2
sglang/srt/layers/attention/__init__.py +0 -1
sglang/srt/layers/attention/flashinfer_backend.py +54 -41
sglang/srt/layers/logits_processor.py +30 -2
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -26
sglang/srt/layers/quantization/fp8.py +42 -2
sglang/srt/layers/quantization/fp8_kernel.py +77 -18
sglang/srt/layers/quantization/fp8_utils.py +8 -2
sglang/srt/managers/io_struct.py +29 -8
sglang/srt/managers/schedule_batch.py +22 -15
sglang/srt/managers/scheduler.py +60 -20
sglang/srt/managers/session_controller.py +102 -27
sglang/srt/managers/tokenizer_manager.py +41 -10
sglang/srt/managers/tp_worker.py +7 -0
sglang/srt/managers/tp_worker_overlap_thread.py +5 -0
sglang/srt/model_executor/forward_batch_info.py +42 -3
sglang/srt/model_executor/model_runner.py +4 -0
sglang/srt/models/llama.py +11 -0
sglang/srt/models/llama_eagle.py +132 -0
sglang/srt/openai_api/adapter.py +60 -2
sglang/srt/openai_api/protocol.py +48 -0
sglang/srt/server.py +26 -3
sglang/srt/server_args.py +17 -30
sglang/srt/speculative/spec_info.py +19 -0
sglang/srt/utils.py +62 -0
sglang/version.py +1 -1
{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/METADATA +3 -3
{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/RECORD +32 -30
{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/LICENSE +0 -0
{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/WHEEL +0 -0
{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/top_level.txt +0 -0

sglang/srt/openai_api/protocol.py CHANGED Viewed

@@ -257,6 +257,34 @@ class ResponseFormat(BaseModel):
     json_schema: Optional[JsonSchemaResponseFormat] = None
+class Function(BaseModel):
+    """Function descriptions."""
+    description: Optional[str] = Field(default=None, examples=[None])
+    name: str
+    parameters: Optional[object] = None
+class Tool(BaseModel):
+    """Function wrapper."""
+    type: str = Field(default="function", examples=["function"])
+    function: Function
+class ToolChoiceFuncName(BaseModel):
+    """The name of tool choice function."""
+    name: str
+class ToolChoice(BaseModel):
+    """The tool choice definition."""
+    function: ToolChoiceFuncName
+    type: Literal["function"] = Field(default="function", examples=["function"])
 class ChatCompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
@@ -277,6 +305,10 @@ class ChatCompletionRequest(BaseModel):
     temperature: float = 0.7
     top_p: float = 1.0
     user: Optional[str] = None
+    tools: Optional[List[Tool]] = Field(default=None, examples=[None])
+    tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
+        default="auto", examples=["none"]
+    )  # noqa
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
@@ -292,9 +324,25 @@ class ChatCompletionRequest(BaseModel):
     ebnf: Optional[str] = None
+class FunctionResponse(BaseModel):
+    """Function response."""
+    name: str
+    arguments: str
+class ToolCall(BaseModel):
+    """Tool call response."""
+    id: str
+    type: Literal["function"] = "function"
+    function: FunctionResponse
 class ChatMessage(BaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
 class ChatCompletionResponseChoice(BaseModel):

sglang/srt/server.py CHANGED Viewed

@@ -57,6 +57,7 @@ from sglang.srt.managers.io_struct import (
     OpenSessionReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromTensorReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -109,6 +110,7 @@ app.add_middleware(
 tokenizer_manager: TokenizerManager = None
 scheduler_info: Dict = None
 ##### Native API endpoints #####
@@ -257,6 +259,10 @@ async def open_session(obj: OpenSessionReqInput, request: Request):
     """Open a session, and return its unique session id."""
     try:
         session_id = await tokenizer_manager.open_session(obj, request)
+        if session_id is None:
+            raise Exception(
+                "Failed to open the session. Check if a session with the same id is still open."
+            )
         return session_id
     except Exception as e:
         return _create_error_response(e)
@@ -484,7 +490,16 @@ def launch_engine(
     # Wait for model to finish loading
     scheduler_infos = []
     for i in range(len(scheduler_pipe_readers)):
-        data = scheduler_pipe_readers[i].recv()
+        try:
+            data = scheduler_pipe_readers[i].recv()
+        except EOFError as e:
+            logger.exception(e)
+            logger.error(
+                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
+            )
+            scheduler_procs[i].join()
+            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
+            raise
         if data["status"] != "ready":
             raise RuntimeError(
@@ -492,7 +507,7 @@ def launch_engine(
             )
         scheduler_infos.append(data)
-    # Assume all schedulers have same max_total_num_tokens
+    # Assume all schedulers have same scheduler_info
     scheduler_info = scheduler_infos[0]
@@ -857,6 +872,14 @@ class Engine:
             tokenizer_manager.update_weights_from_distributed(obj, None)
         )
+    def update_weights_from_tensor(self, name, tensor):
+        """Update weights from distributed source."""
+        obj = UpdateWeightsFromTensorReqInput(name=name, tensor=tensor)
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            tokenizer_manager.update_weights_from_tensor(obj, None)
+        )
     def get_weights_by_name(self, name, truncate_size=100):
         """Get weights by parameter name."""
         obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
@@ -871,7 +894,7 @@ class Runtime:
     using the commond line interface.
     It is mainly used for the frontend language.
-    You should use the Engine class if you want to do normal offline processing.
+    You should use the Engine class above if you want to do normal offline processing.
     """
     def __init__(

sglang/srt/server_args.py CHANGED Viewed

@@ -55,7 +55,7 @@ class ServerArgs:
     is_embedding: bool = False
     revision: Optional[str] = None
-    # Port
+    # Port for the HTTP server
     host: str = "127.0.0.1"
     port: int = 30000
@@ -68,6 +68,7 @@ class ServerArgs:
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
+    prefill_only_one_req: bool = False
     # Other runtime options
     tp_size: int = 1
@@ -94,6 +95,7 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
     # Expert parallelism
     ep_size: int = 1
@@ -217,6 +219,13 @@ class ServerArgs:
             )
             self.disable_cuda_graph = True
+        # Expert parallelism
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            logger.info(
+                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
         # Others
         if self.enable_dp_attention:
             self.dp_size = self.tp_size
@@ -229,12 +238,6 @@ class ServerArgs:
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
                 "Overlap scheduler is disabled."
             )
-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )
         # GGUF
         if (
@@ -430,13 +433,18 @@ class ServerArgs:
             default=ServerArgs.schedule_conservativeness,
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
         parser.add_argument(
             "--cpu-offload-gb",
             type=int,
             default=ServerArgs.cpu_offload_gb,
             help="How many GBs of RAM to reserve for CPU offloading",
         )
+        parser.add_argument(
+            "--prefill-only-one-req",
+            type=bool,
+            help="If true, we only prefill one request at one prefill batch",
+            default=ServerArgs.prefill_only_one_req,
+        )
         # Other runtime options
         parser.add_argument(
@@ -555,6 +563,7 @@ class ServerArgs:
                 "shortest_queue",
             ],
         )
         # Expert parallelism
         parser.add_argument(
             "--expert-parallel-size",
@@ -777,28 +786,6 @@ class ServerArgs:
             help="Delete the model checkpoint after loading the model.",
         )
-        # Deprecated arguments
-        parser.add_argument(
-            "--enable-overlap-schedule",
-            action=DeprecatedAction,
-            help="'--enable-overlap-schedule' is deprecated. It is enabled by default now. Please drop this argument.",
-        )
-        parser.add_argument(
-            "--disable-flashinfer",
-            action=DeprecatedAction,
-            help="'--disable-flashinfer' is deprecated. Please use '--attention-backend triton' instead.",
-        )
-        parser.add_argument(
-            "--disable-flashinfer-sampling",
-            action=DeprecatedAction,
-            help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead.",
-        )
-        parser.add_argument(
-            "--disable-disk-cache",
-            action=DeprecatedAction,
-            help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
-        )
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size

sglang/srt/speculative/spec_info.py ADDED Viewed

@@ -0,0 +1,19 @@
+from enum import IntEnum, auto
+class SpeculativeAlgorithm(IntEnum):
+    EAGLE = auto()
+    def is_eagle(self):
+        return self == SpeculativeAlgorithm.EAGLE
+    @staticmethod
+    def from_string(name: str):
+        name_map = {
+            "EAGLE": SpeculativeAlgorithm.EAGLE,
+        }
+        return name_map[name]
+class SpecInfo:
+    pass

sglang/srt/utils.py CHANGED Viewed

@@ -1273,3 +1273,65 @@ def dataclass_to_string_truncated(data, max_length=2048):
         )
     else:
         return str(data)
+TOOLS_TAG_LIST = ["<|plugin|>", "<function=", "<tool_call>", "<|python_tag|>"]
+def parse_tool_response(text, tools, **kwargs):
+    """Parse model response containing tool information.
+    Args:
+        text(str): model response in string format
+        tools(List): tools from user request
+    """
+    if "<|plugin|>" in text:  # internlm2
+        text, action = text.split("<|action_start|><|plugin|>")
+        action = action.split("<|action_end|>".strip())[0]
+        action = action[action.find("{") :]
+        action = json.loads(action)
+        name, parameters = action["name"], json.dumps(
+            action.get("parameters", action.get("arguments", {})), ensure_ascii=False
+        )
+        call_info_list = [(name, parameters)]
+    elif "<function=" in text:  # llama3.1
+        action, _ = text.split("</function>")
+        parameters = action[action.find("{") :]
+        name = action.split("<function=")[1].split(">{")[0]
+        call_info_list = [(name, parameters)]
+    elif "<tool_call>" in text and "</tool_call>" in text:  # qwen2.5
+        # get tool_call in text
+        pattern = r"<tool_call>(.*?)</tool_call>"
+        match_result_list = re.findall(pattern, text, re.DOTALL)
+        call_info_list = []
+        for match_result in match_result_list:
+            action = json.loads(match_result)
+            call_info_list.append(
+                (action["name"], json.dumps(action["arguments"], ensure_ascii=False))
+            )
+        # get text outside of tags
+        if not text.startswith("<tool_call>"):
+            text = text[: text.find("<tool_call>")]
+        elif not text.endswith("</tool_call>"):
+            text = text[text.rfind("</tool_call>") + len("</tool_call>") :]
+        else:
+            text = ""
+    elif "<|python_tag|>" in text:  # llama3.2
+        _, action = text.split("<|python_tag|>")
+        action = json.loads(action)
+        name, parameters = action["name"], json.dumps(
+            action.get("parameters", action.get("arguments", {})), ensure_ascii=False
+        )
+        call_info_list = [(name, parameters)]
+    else:
+        raise RuntimeError(f"Unexpected model response: {text}")
+    call_info_list = [
+        (
+            [tool.function.name for tool in tools].index(call_info[0]),
+            call_info[0],
+            call_info[1],
+        )
+        for call_info in call_info_list
+    ]
+    return text, call_info_list

sglang/version.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "0.4.1.post1"
+__version__ = "0.4.1.post2"

{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.1.post1
+Version: 0.4.1.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -351,14 +351,14 @@ The core features include:
 - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
 - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
-## Benchmark And Performance
+## Benchmark and Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).

{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,7 @@
 sglang/__init__.py,sha256=b2oIdWzp5P8SzieeOs2TzJoN3Do3tfJbV8gZS_imVcs,1619
 sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
 sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
-sglang/bench_offline_throughput.py,sha256=iQiJCK3KQDCdwU1NVbIwbtthssWzBXiIsKUDA7Z_hO0,12510
+sglang/bench_offline_throughput.py,sha256=r-uBvpnx-30mAnVwQB4WlqiXxy2fn5a1NUARwZcaIo4,12533
 sglang/bench_one_batch.py,sha256=jkyMhK0lqn5dRCYgAh30qZrNHP4gAbXODymBMNXK86I,15859
 sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
 sglang/bench_serving.py,sha256=YQiCZreejCPBTqMmZsCB99RMi1N-Jx-dZtaafcQ8-14,53377
@@ -11,7 +11,7 @@ sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
 sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
 sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
 sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
-sglang/version.py,sha256=ARioq8ApVNckeQorLPVfHZeN9mlHMLbaNgLGNbGq-ys,28
+sglang/version.py,sha256=1g4t88smWYt1DD5SJZdzXI13OPbsQPOEMX9twA4wq6A,28
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -32,14 +32,14 @@ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21
 sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
 sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
 sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
-sglang/srt/server.py,sha256=vDucJl6qtEK2swzPJ_wYitaJvsI4MigMagGlBlH5V54,34033
-sglang/srt/server_args.py,sha256=LgnQ-kBJZ3E7hMMZj9bSK0mn7Bhjk1nJHxLcxl-lGTM,34572
-sglang/srt/utils.py,sha256=J8kFl6kDBwFZCM6AKaVTiqdhJKRg0JOH0pNrD1ZeWmM,41726
+sglang/srt/server.py,sha256=sDERAZlRa6OTaUk-SfW5aKJbPui1COpPG34HDlMHMNc,34916
+sglang/srt/server_args.py,sha256=lBMOME7OSqG0Opinsin-QsioNrWpp2M0ZosB8cXoRrY,33917
+sglang/srt/utils.py,sha256=i8MjcaSQjPPfPZ0txufTtqLr4Q7YhHQ86L1i9j-y5yY,44131
 sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
 sglang/srt/configs/device_config.py,sha256=dResqHjkg_dq10v6rnVpbXpvABZRB0jylOm-2_JAnx0,428
 sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
 sglang/srt/configs/load_config.py,sha256=TcPi_HY6xu5SiVZsxPOoB5pGeDUNebOk7muoUH9VBDg,3083
-sglang/srt/configs/model_config.py,sha256=vVarlLTw9Ged1PXIwRP-R8UhiG6oaezNIZhTNuF0eQc,16070
+sglang/srt/configs/model_config.py,sha256=QP_6WaWMrE4NNF-XODRomiQPO0FABmVZIj5A-qJfnYg,16427
 sglang/srt/configs/qwen2vl.py,sha256=ZjLy9v2eZY4wptUfY3CWgYKg2B5DDrkfCSyTy_Zf_bg,4351
 sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
 sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
@@ -63,16 +63,16 @@ sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwx
 sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
 sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
 sglang/srt/layers/linear.py,sha256=KyRFU0VcoNuN-hnQB9QQcBN9NCpeqPtLzzufIHUpV6w,47064
-sglang/srt/layers/logits_processor.py,sha256=JlOU0x8vBGIuTwHSdjR6Kly9_uzilBMv0NE_rvUx0W4,14747
+sglang/srt/layers/logits_processor.py,sha256=Imh-qY1D9J80DZVSVV0LfTiHMEw6oQ3JbY9lXxPZAXE,15656
 sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
 sglang/srt/layers/radix_attention.py,sha256=E4cmvkcCdCtb6VyLNrCKy1D6VwHQ063oH3JQXPaRy6w,2178
 sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
 sglang/srt/layers/sampler.py,sha256=k4Op_HMkQfT7t9wgQwBVotfTUXEocrzRyQqEFnff1pc,5511
 sglang/srt/layers/torchao_utils.py,sha256=dQVuWNXxAvOPjr2G5BBMWqC2oKcS2B52rx-fEc_elmc,3545
 sglang/srt/layers/vocab_parallel_embedding.py,sha256=slGwLiWjuFLCUdRe-GTlfumyZpqVX9VF6No_UGOT-hA,21624
-sglang/srt/layers/attention/__init__.py,sha256=KIJhzOJWYioQE7Va4D83-V-ZUZVMZcczuNgDC3dlSRo,2583
+sglang/srt/layers/attention/__init__.py,sha256=lNLfWqePc5NMej-AcXl97vxVXsxQOgP7dNNb2ibyUWI,2562
 sglang/srt/layers/attention/double_sparsity_backend.py,sha256=RQdEKRykSLf9ilnaHmR6T7RFqh4emH_adfB3aJN2BUU,10920
-sglang/srt/layers/attention/flashinfer_backend.py,sha256=umD1E2zvMnPbbgvx2Ex5LQB6a4a41brjsks1M0gFMMU,26357
+sglang/srt/layers/attention/flashinfer_backend.py,sha256=8nH4EIEXvNk9yZVl7mSn78w5Dli5UiWL-ZCeYykG9HI,27280
 sglang/srt/layers/attention/torch_native_backend.py,sha256=nQdeqWEMMH_wrod5wssDCJG-uPKm0uslvkALKqPRPQ8,10509
 sglang/srt/layers/attention/triton_backend.py,sha256=-TobyZHwlbJ5HhbFg-jgCqVOw4Y-opgEuFo-EusASQc,6264
 sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=oJ_UK1t229zF3hbTDiQe7t-X-IbM2dOxx4U2ch-vmjA,17847
@@ -85,27 +85,27 @@ sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
 sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
 sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
-sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=zXwWUtthLa9E35EvlQ9A_mnIsQyA0_NYKsUBdJqONHo,31163
+sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=KvOy544x_4nRqg50o5YHQpHvF8TUD7q9LXDAWPGJlAA,31796
 sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=BclDj5JyCy-8Bfue4broL1-IG6a4dUyggE9WQLa06sg,20575
 sglang/srt/layers/quantization/__init__.py,sha256=VPYXShHvbvkOgVBlkIqic4RhdJ1y6EZ3r34T-nZMT1k,4606
 sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
-sglang/srt/layers/quantization/fp8.py,sha256=wNnpXLroIl7D98mlfCiXZPE9hrP5ricHrXY1WZBzEEo,30810
-sglang/srt/layers/quantization/fp8_kernel.py,sha256=eoO1enzD9jPC80id2oC3i8bt-LN6-4Ey223yOQ9yIPE,8792
-sglang/srt/layers/quantization/fp8_utils.py,sha256=HBJBaNcln1NrLxzw0ppUjMd6w-ryuGDDHCYJq7mRQac,4035
+sglang/srt/layers/quantization/fp8.py,sha256=k4mw-iKxlaEWRkGgaoxCLzZ_dYydyRj0y1N1B_umMwU,32668
+sglang/srt/layers/quantization/fp8_kernel.py,sha256=cYF4ckqrUyhCO9Ha7zi05R8EhRaqSa8rFpYisz-9Ed0,10743
+sglang/srt/layers/quantization/fp8_utils.py,sha256=qBVJXxbxqmf8-Juq0t-IXWjlaZoePJqFNYcs9-oT5Yo,4150
 sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
 sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
 sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
 sglang/srt/managers/data_parallel_controller.py,sha256=psI4FAuBGjtdnEuwagnGdtRqvqSSxOROfNKQqVDqlVA,8382
 sglang/srt/managers/detokenizer_manager.py,sha256=nZkbwt4yty_oy8rvg4T7PbgyVLoBLohvHl25xlQpBoo,8439
 sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
-sglang/srt/managers/io_struct.py,sha256=_LWWqT3LNwZGaWhg2d3kTg1V2MTHKzRasCvxF9Nfpi4,15429
-sglang/srt/managers/schedule_batch.py,sha256=qryPWCdOTFzxomDa80U-5guShOb1K4kBUWcPCCchYB8,45762
+sglang/srt/managers/io_struct.py,sha256=Gru7LEyc3tcM_LewoteCb7GXIrh-OYhA2CnEvjc1Cis,15769
+sglang/srt/managers/schedule_batch.py,sha256=KnoVuWgINnyard-BOXCo0jm3IMdXN9wIwnftMKcag-s,46097
 sglang/srt/managers/schedule_policy.py,sha256=QxjQ8-le062AMHHxool6CxkhvB4FIwhOQPzTX_JwL6U,15447
-sglang/srt/managers/scheduler.py,sha256=Yh15uQFhJlku8a20-lhtIsiEHAcUmpL3BzL42kLVwiI,61637
-sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
-sglang/srt/managers/tokenizer_manager.py,sha256=uKiTt__lCFXG60zQhmM_K7dU7IuedVSIQHVw3x3y5-E,31758
-sglang/srt/managers/tp_worker.py,sha256=X1EwFX3FSsmXx7jeeX2tjZRocaujabQYWm-M-0CFEBE,7363
-sglang/srt/managers/tp_worker_overlap_thread.py,sha256=-QNBJRKxraa9Xt2WI1AFzZYdneIJ1eXv0GjFzDqXoE0,8926
+sglang/srt/managers/scheduler.py,sha256=Z1_wf6OCC8Hevc7y0D4Rt4EW5Et9bgTL9oJOkmNmDjo,63490
+sglang/srt/managers/session_controller.py,sha256=3laMRIXEYWDjfytCjPs0vw_Tw__k-nKBY-bYzycYbfc,5482
+sglang/srt/managers/tokenizer_manager.py,sha256=SXvVZHFMBCtcLkKnq-O3uzwrEhfVqk6Y1fzeBEFNq0E,33010
+sglang/srt/managers/tp_worker.py,sha256=8RVBLQaS3TnX7Z4J35RVrFN0M6PVnRBhct3sczBL4dY,7644
+sglang/srt/managers/tp_worker_overlap_thread.py,sha256=JQfrVPeE56ZGJ3nozkhZR-RSb2oePsY7iuedM7XCtdQ,9157
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
 sglang/srt/mem_cache/chunk_cache.py,sha256=R2gHAuqKd5ayQW3NnsgoGUH31---Z5izCDyCqLL0FjQ,2524
 sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
@@ -114,8 +114,8 @@ sglang/srt/mem_cache/radix_cache.py,sha256=c5voySV5L855c0G9cBEc9iQ4nR7PDDmg0V6fW
 sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTluMU,6861
 sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
 sglang/srt/model_executor/cuda_graph_runner.py,sha256=1n5WxoE9-0B3unwkkcR355K_D290h2LGt_7EvH02DQM,16246
-sglang/srt/model_executor/forward_batch_info.py,sha256=L5mVoW5SaO6To-7nGk0TZM-FFB5_78cARpJ-aC2rwD0,12883
-sglang/srt/model_executor/model_runner.py,sha256=MLYBcYIQihu2I3PBTUghiU2mSWsDMzlKzcnX7yHa9JU,29837
+sglang/srt/model_executor/forward_batch_info.py,sha256=vqF8XrHQPk3ZL7HqPvvkfP53oqBx0Fajb5lAIkdifBo,13961
+sglang/srt/model_executor/model_runner.py,sha256=TjvAwwr7EqZdmE-5HbuQMeEa0e0FqY6LeqqzEAHXMPU,30012
 sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
 sglang/srt/model_loader/loader.py,sha256=7OG_8-66vFDFZ9kVKGNK1BFBjZ6ql449dlyvdCbMqvE,43876
 sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
@@ -136,8 +136,9 @@ sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,
 sglang/srt/models/grok.py,sha256=J9lgNbFebvXgF19nfZyHwlGPlGWY_m0LgP506YvOYrU,15668
 sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
 sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
-sglang/srt/models/llama.py,sha256=o3FYyOhkZJirzugyYz1kxs6RpY84O_uKowWWmt3jv24,19929
+sglang/srt/models/llama.py,sha256=4UPKF7erp7qqBD11uvvQkO1Fo_wDs71BmA8Y2csXRcA,20302
 sglang/srt/models/llama_classification.py,sha256=DwboM1xHXdf3Fddf7xGnrfdOLJwXdiJs994cIpAPa2g,2984
+sglang/srt/models/llama_eagle.py,sha256=88DzR54DKBIKJ1h-bkIa8mc1qJnlkdZ1eGYY3c5mpBY,4442
 sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
 sglang/srt/models/llama_reward.py,sha256=oPxh5E2UkxLULNdR68dFvt2I7j33CJFN6nyA-8L2_cg,4516
 sglang/srt/models/llava.py,sha256=xrkg8sht8tBOID7427IEZtHL-KKWfEivDe2NqGjTSAs,26373
@@ -162,8 +163,8 @@ sglang/srt/models/torch_native_llama.py,sha256=YeXHorFm6QfnczLXwPb5TG9a-He0uiA9R
 sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
 sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQs4,15541
 sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
-sglang/srt/openai_api/adapter.py,sha256=X0HLuNhg-chDQjcdsQIRpZijlImEwZLHum3G0JgU4Go,54834
-sglang/srt/openai_api/protocol.py,sha256=RMzeDfh2tZITjhNwB2nX68wZwQe40N6HBuVebCzEWiU,10468
+sglang/srt/openai_api/adapter.py,sha256=HvgeFPWv-v8LOiYF2iNCo-14BIZLAPznNTCUbubB2Rg,57091
+sglang/srt/openai_api/protocol.py,sha256=anWGr2Br8gVYm6Z0yvDwjXLaPCPuvJZ28gr5rV2dhVQ,11613
 sglang/srt/sampling/sampling_batch_info.py,sha256=s--zNjk-LErZ5lMqnZ7KiuJltaziKRbQAU5qYpKIxAc,8564
 sglang/srt/sampling/sampling_params.py,sha256=BkgCJAOSmQXwJrNXg26zSjKfMy0d5mMN6oHRk_ZuESI,5499
 sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
@@ -172,6 +173,7 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD6
 sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=_Nxv0XgUPirZjw2SEJYp_Cd9ZcLwmt7h6JE6J4hhFq4,3629
 sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=5tOgCg7OvE9kSN9VMCpH1hwqo1YMxt9iS5PVpct9HpU,2468
 sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=m22Rfn1RuB1HpImBDECsiJ2VooBYpsFADAwnk1EPzk0,2751
+sglang/srt/speculative/spec_info.py,sha256=d-82uWEC-QBqAgv3XGDNDW8DlHv4MtUsZghFqzGwV7U,352
 sglang/test/few_shot_gsm8k.py,sha256=7yDbEQe49gZeJhz2wFFX-gf_59ThDKsCS1xwfogNc7k,4034
 sglang/test/few_shot_gsm8k_engine.py,sha256=QQbrwOX6-cJDD3RZC_e7zPnt6aSo8JdF8X_lRHSjdDM,3886
 sglang/test/run_eval.py,sha256=9yO0hXZOcn4abEOs96T-XPguDEklK16Ltco0pGF3zCg,4020
@@ -188,8 +190,8 @@ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c
 sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
 sglang/test/test_utils.py,sha256=HJG7kUQOk6n9FBbH89PDtQ41C3kt1cfJODhAEcFT0AQ,23823
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
-sglang-0.4.1.post1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
-sglang-0.4.1.post1.dist-info/METADATA,sha256=R2YDOrUU_49x5TEbNUODNlXvkSIzFqT7-hvInlSCs5k,22527
-sglang-0.4.1.post1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-sglang-0.4.1.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.4.1.post1.dist-info/RECORD,,
+sglang-0.4.1.post2.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
+sglang-0.4.1.post2.dist-info/METADATA,sha256=eORQMKMQDt_eTOh6PxMzIvyjNbg2FllxXss2Z9jU3Ug,22544
+sglang-0.4.1.post2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+sglang-0.4.1.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.4.1.post2.dist-info/RECORD,,

{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/LICENSE RENAMED Viewed

File without changes

{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/WHEEL RENAMED Viewed

File without changes

{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/top_level.txt RENAMED Viewed

File without changes

sglang 0.4.1.post1__py3-none-any.whl → 0.4.1.post2__py3-none-any.whl

sglang 0.4.1.post1__py3-none-any.whl → 0.4.1.post2__py3-none-any.whl