sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +26 -4
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +676 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +49 -8
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/parallel_state.py +42 -8
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +78 -13
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +133 -55
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +434 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +41 -19
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +25 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/topk.py +60 -20
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +80 -53
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +25 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -19
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +78 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +87 -33
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +67 -30
- sglang/srt/lora/mem_pool.py +117 -52
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +18 -1
- sglang/srt/managers/cache_controller.py +2 -5
- sglang/srt/managers/data_parallel_controller.py +30 -8
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +43 -5
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/clip.py +63 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +134 -30
- sglang/srt/managers/scheduler.py +290 -31
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -24
- sglang/srt/managers/tp_worker.py +4 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +18 -7
- sglang/srt/mem_cache/memory_pool.py +255 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +36 -21
- sglang/srt/model_executor/forward_batch_info.py +68 -11
- sglang/srt/model_executor/model_runner.py +75 -8
- sglang/srt/model_loader/loader.py +171 -3
- sglang/srt/model_loader/weight_utils.py +51 -3
- sglang/srt/models/clip.py +563 -0
- sglang/srt/models/deepseek_janus_pro.py +31 -88
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +329 -73
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +694 -0
- sglang/srt/models/gemma3_mm.py +468 -0
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +201 -104
- sglang/srt/openai_api/protocol.py +33 -7
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +114 -14
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +140 -54
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +215 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +29 -2
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +56 -5
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +16 -8
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +180 -132
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py
CHANGED
@@ -27,12 +27,16 @@ import signal
 import threading
 from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union
 
+import zmq
+import zmq.asyncio
+
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
 
 import torch
 import uvloop
 
+from sglang.srt.code_completion_parser import load_completion_template_for_openai_api
 from sglang.srt.managers.data_parallel_controller import (
     run_data_parallel_controller_process,
 )
@@ -44,6 +48,8 @@ from sglang.srt.managers.io_struct import (
     InitWeightsUpdateGroupReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    RpcReqInput,
+    RpcReqOutput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
@@ -57,6 +63,7 @@ from sglang.srt.utils import (
     MultiprocessingSerializer,
     assert_pkg_version,
     configure_logger,
+    get_zmq_socket,
     kill_process_tree,
     launch_dummy_health_check_server,
     maybe_set_triton_cache_manager,
@@ -102,15 +109,25 @@ class Engine:
         # Shutdown the subprocesses automatically when the program exits
         atexit.register(self.shutdown)
 
+        # Allocate ports for inter-process communications
+        port_args = PortArgs.init_new(server_args)
+        logger.info(f"{server_args=}")
+
         # Launch subprocesses
         tokenizer_manager, scheduler_info = _launch_subprocesses(
-            server_args=server_args
+            server_args=server_args,
+            port_args=port_args,
         )
 
         self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
         self.scheduler_info = scheduler_info
 
+        context = zmq.Context(2)
+        self.send_to_rpc = get_zmq_socket(
+            context, zmq.DEALER, port_args.rpc_ipc_name, True
+        )
+
     def generate(
         self,
         # The input prompt. It can be a single prompt or a batch of prompts.
@@ -232,6 +249,13 @@ class Engine:
         """Shutdown the engine"""
         kill_process_tree(os.getpid(), include_parent=False)
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.shutdown()
+        return False
+
     def start_profile(self):
         loop = asyncio.get_event_loop()
         loop.run_until_complete(self.tokenizer_manager.start_profile())
@@ -296,7 +320,10 @@ class Engine:
         """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be true
         to avoid duplicated operations such as clearing cache."""
         obj = UpdateWeightsFromTensorReqInput(
-            serialized_named_tensors=MultiprocessingSerializer.serialize(named_tensors),
+            serialized_named_tensors=[
+                MultiprocessingSerializer.serialize(named_tensors)
+                for _ in range(self.server_args.tp_size)
+            ],
             load_format=load_format,
             flush_cache=flush_cache,
         )
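The request field changes from a single serialized blob to a list holding one serialized copy per tensor-parallel rank. A sketch of the new payload shape, with toy tensors standing in for real weights:

import torch

from sglang.srt.utils import MultiprocessingSerializer

named_tensors = [("lm_head.weight", torch.zeros(4, 4))]  # toy weights
tp_size = 2
serialized = [
    MultiprocessingSerializer.serialize(named_tensors) for _ in range(tp_size)
]
assert len(serialized) == tp_size  # one payload per TP rank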
@@ -350,6 +377,23 @@ class Engine:
             self.tokenizer_manager.resume_memory_occupation(obj, None)
         )
 
+    """
+    Execute an RPC call on all scheduler processes.
+    """
+
+    def collective_rpc(self, method: str, **kwargs):
+        obj = RpcReqInput(method=method, parameters=kwargs)
+        self.send_to_rpc.send_pyobj(obj)
+        recv_req = self.send_to_rpc.recv_pyobj(zmq.BLOCKY)
+        assert isinstance(recv_req, RpcReqOutput)
+        assert recv_req.success, recv_req.message
+
+    def save_remote_model(self, **kwargs):
+        self.collective_rpc("save_remote_model", **kwargs)
+
+    def save_sharded_model(self, **kwargs):
+        self.collective_rpc("save_sharded_model", **kwargs)
+
 
 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
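collective_rpc pickles an RpcReqInput onto the DEALER socket created in __init__ and blocks until the scheduler acknowledges, so save_remote_model and save_sharded_model are thin wrappers over the same path. A hedged sketch; the url keyword is illustrative only and must match whatever the scheduler-side method actually accepts:

import sglang as sgl

engine = sgl.Engine(model_path="meta-llama/Llama-3.2-1B-Instruct")
engine.collective_rpc("save_remote_model", url="s3://my-bucket/ckpt")  # illustrative kwargs
engine.shutdown()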
@@ -408,7 +452,9 @@ def _set_envs_and_config(server_args: ServerArgs):
     mp.set_start_method("spawn", force=True)
 
 
-def _launch_subprocesses(server_args: ServerArgs) -> Tuple[TokenizerManager, Dict]:
+def _launch_subprocesses(
+    server_args: ServerArgs, port_args: Optional[PortArgs] = None
+) -> Tuple[TokenizerManager, Dict]:
     """
     Launch the TokenizerManager in the main process, the Scheduler in a subprocess, and the DetokenizerManager in another subprocess.
     """
@@ -418,8 +464,9 @@ def _launch_subprocesses(server_args: ServerArgs) -> Tuple[TokenizerManager, Dict]:
     _set_envs_and_config(server_args)
 
     # Allocate ports for inter-process communications
-    port_args = PortArgs.init_new(server_args)
-    logger.info(f"{server_args=}")
+    if port_args is None:
+        port_args = PortArgs.init_new(server_args)
+    logger.info(f"{server_args=}")
 
     # If using model from www.modelscope.cn, first download the model.
     server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer(
@@ -502,6 +549,9 @@ def _launch_subprocesses(server_args: ServerArgs) -> Tuple[TokenizerManager, Dict]:
         tokenizer_manager, server_args.chat_template, server_args.model_path
     )
 
+    if server_args.completion_template:
+        load_completion_template_for_openai_api(server_args.completion_template)
+
     # Wait for the model to finish loading
     scheduler_infos = []
     for i in range(len(scheduler_pipe_readers)):
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -14,11 +14,12 @@
 """
 The entry point of inference server. (SRT = SGLang Runtime)
 
-This file implements HTTP APIs for the
+This file implements HTTP APIs for the inference engine via fastapi.
 """
 
 import asyncio
 import dataclasses
+import json
 import logging
 import multiprocessing as multiprocessing
 import os
@@ -259,6 +260,29 @@ async def generate_request(obj: GenerateReqInput, request: Request):
         return _create_error_response(e)
 
 
+@app.api_route("/generate_from_file", methods=["POST"])
+async def generate_from_file_request(file: UploadFile, request: Request):
+    """Handle a generate request, this is purely to work with input_embeds."""
+    content = await file.read()
+    input_embeds = json.loads(content.decode("utf-8"))
+
+    obj = GenerateReqInput(
+        input_embeds=input_embeds,
+        sampling_params={
+            "repetition_penalty": 1.2,
+            "temperature": 0.2,
+            "max_new_tokens": 512,
+        },
+    )
+
+    try:
+        ret = await _global_state.generate_request(obj, request).__anext__()
+        return ret
+    except ValueError as e:
+        logger.error(f"Error: {e}")
+        return _create_error_response(e)
+
+
 @app.api_route("/encode", methods=["POST", "PUT"])
 async def encode_request(obj: EmbeddingReqInput, request: Request):
     """Handle an embedding request."""
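A hedged client sketch for the new endpoint, assuming a server on localhost:30000. The uploaded JSON holds the raw input_embeds matrix; the endpoint fixes its own sampling parameters, so none are sent:

import json

import requests

# Toy (seq_len, hidden_size) embedding matrix serialized as nested lists.
with open("embeds.json", "w") as f:
    json.dump([[0.0] * 4096 for _ in range(8)], f)

with open("embeds.json", "rb") as f:
    resp = requests.post(
        "http://localhost:30000/generate_from_file",
        files={"file": f},  # field name matches the UploadFile parameter
    )
print(resp.json())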
@@ -283,7 +307,7 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
         return _create_error_response(e)
 
 
-@app.get("/flush_cache")
+@app.api_route("/flush_cache", methods=["GET", "POST"])
 async def flush_cache():
     """Flush the radix cache."""
     _global_state.tokenizer_manager.flush_cache()
@@ -319,6 +343,36 @@ async def stop_profile_async():
     )
 
 
+@app.api_route("/start_expert_distribution_record", methods=["GET", "POST"])
+async def start_expert_distribution_record_async():
+    """Start recording the expert distribution. Clear the previous record if any."""
+    await _global_state.tokenizer_manager.start_expert_distribution_record()
+    return Response(
+        content="Start recording the expert distribution.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/stop_expert_distribution_record", methods=["GET", "POST"])
+async def stop_expert_distribution_record_async():
+    """Stop recording the expert distribution."""
+    await _global_state.tokenizer_manager.stop_expert_distribution_record()
+    return Response(
+        content="Stop recording the expert distribution.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/dump_expert_distribution_record", methods=["GET", "POST"])
+async def dump_expert_distribution_record_async():
+    """Dump expert distribution record."""
+    await _global_state.tokenizer_manager.dump_expert_distribution_record()
+    return Response(
+        content="Dump expert distribution record.\n",
+        status_code=200,
+    )
+
+
 @app.post("/update_weights_from_disk")
 async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
     """Update the weights from disk inplace without re-launching the server."""
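All three record endpoints accept GET or POST. A typical record/dump cycle against a MoE model, assuming a server on localhost:30000:

import requests

base = "http://localhost:30000"  # assumes a running MoE server
requests.post(f"{base}/start_expert_distribution_record")
requests.post(
    f"{base}/generate",
    json={"text": "Hello", "sampling_params": {"max_new_tokens": 8}},
)
requests.post(f"{base}/stop_expert_distribution_record")
requests.post(f"{base}/dump_expert_distribution_record")  # record is written server-side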
@@ -507,7 +561,13 @@ def available_models():
     served_model_names = [_global_state.tokenizer_manager.served_model_name]
     model_cards = []
     for served_model_name in served_model_names:
-        model_cards.append(ModelCard(id=served_model_name, root=served_model_name))
+        model_cards.append(
+            ModelCard(
+                id=served_model_name,
+                root=served_model_name,
+                max_model_len=_global_state.tokenizer_manager.model_config.context_len,
+            )
+        )
     return ModelList(data=model_cards)
@@ -706,9 +766,15 @@ def _wait_and_warmup(
         },
     }
     if server_args.skip_tokenizer_init:
-        json_data["input_ids"] = [10, 11, 12]
+        json_data["input_ids"] = [[10, 11, 12] for _ in range(server_args.dp_size)]
+        # TODO Workaround the bug that embedding errors for list of size 1
+        if server_args.dp_size == 1:
+            json_data["input_ids"] = json_data["input_ids"][0]
     else:
-        json_data["text"] = "The capital city of France is"
+        json_data["text"] = ["The capital city of France is"] * server_args.dp_size
+        # TODO Workaround the bug that embedding errors for list of size 1
+        if server_args.dp_size == 1:
+            json_data["text"] = json_data["text"][0]
 
     # Debug dumping
     if server_args.debug_tensor_dump_input_file:
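The warmup request now batches one prompt per data-parallel rank, unwrapping to a scalar when dp_size == 1 to sidestep the single-element-batch bug noted in the TODOs. A self-contained sketch of the resulting payload shapes:

def warmup_text(dp_size: int):
    # Mirrors the new warmup logic: one prompt per DP rank,
    # unwrapped for dp_size == 1 (the workaround noted above).
    text = ["The capital city of France is"] * dp_size
    return text[0] if dp_size == 1 else text

assert warmup_text(1) == "The capital city of France is"
assert warmup_text(2) == ["The capital city of France is"] * 2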
@@ -719,14 +785,13 @@ def _wait_and_warmup(
         json_data["sampling_params"]["max_new_tokens"] = 0
 
     try:
-        for _ in range(server_args.dp_size):
-            res = requests.post(
-                url + request_name,
-                json=json_data,
-                headers=headers,
-                timeout=600,
-            )
-            assert res.status_code == 200, f"{res}"
+        res = requests.post(
+            url + request_name,
+            json=json_data,
+            headers=headers,
+            timeout=600,
+        )
+        assert res.status_code == 200, f"{res}"
     except Exception:
         last_traceback = get_exception_traceback()
         if pipe_finish_writer is not None:
sglang/srt/entrypoints/verl_engine.py
CHANGED
@@ -19,6 +19,7 @@ import torch.distributed as dist
 from torch.distributed.tensor import DeviceMesh, DTensor
 
 from sglang.srt.model_executor.model_runner import LocalSerializedTensor
+from sglang.srt.patch_torch import monkey_patch_torch_reductions
 from sglang.srt.server import Engine
 from sglang.srt.utils import MultiprocessingSerializer, broadcast_pyobj
 
@@ -30,6 +31,7 @@ class VerlEngine:
         nnodes: int = 1,
         **kwargs,
     ):
+        monkey_patch_torch_reductions()
         self._device_mesh_cpu = device_mesh_cpu
         self._tp_rank = device_mesh_cpu.get_local_rank()
         self._tp_size = device_mesh_cpu.size()
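The patch is installed before any tensors cross process boundaries. A hedged sketch of the intended call order; the tensor here is a CPU toy, while the patch matters for CUDA tensors shipped between ranks:

import torch

from sglang.srt.patch_torch import monkey_patch_torch_reductions
from sglang.srt.utils import MultiprocessingSerializer

monkey_patch_torch_reductions()  # install first, as VerlEngine.__init__ now does
payload = MultiprocessingSerializer.serialize([("w", torch.zeros(2, 2))])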
sglang/srt/function_call_parser.py
CHANGED
@@ -1,12 +1,21 @@
 import json
 import logging
 import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from json import JSONDecodeError, JSONDecoder
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
 
 import partial_json_parser
+from partial_json_parser.core.exceptions import MalformedJSON
 from partial_json_parser.core.options import Allow
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
+
+from sglang.srt.openai_api.protocol import (
+    StructuralTagResponseFormat,
+    StructuresResponseFormat,
+    Tool,
+)
 
 logger = logging.getLogger(__name__)
@@ -19,14 +28,6 @@ TOOLS_TAG_LIST = [
 ]
 
 
-class Function(BaseModel):
-    """Function Tool Template."""
-
-    description: Optional[str] = Field(default=None, examples=[None])
-    name: Optional[str] = None
-    parameters: Optional[object] = None
-
-
 class ToolCallItem(BaseModel):
     """Simple encapsulation of the parsed ToolCall result for easier usage in streaming contexts."""
 
@@ -74,7 +75,22 @@ class StreamingParseResult:
         self.calls = calls or []
 
 
-class BaseFormatDetector:
+@dataclass
+class StructureInfo:
+    begin: str
+    end: str
+    trigger: str
+
+
+_GetInfoFunc = Callable[[str], StructureInfo]
+"""
+helper alias of function
+usually it is a function that takes a name string and returns a StructureInfo object,
+which can be used to construct a structural_tag object
+"""
+
+
+class BaseFormatDetector(ABC):
     """Base class providing two sets of interfaces: one-time and streaming incremental."""
 
     def __init__(self):
@@ -90,26 +106,12 @@ class BaseFormatDetector:
         self.bot_token = ""
         self.eot_token = ""
 
-    def parse_base_json(self, action: Any, tools: List[Function]) -> List[ToolCallItem]:
+    def parse_base_json(self, action: Any, tools: List[Tool]) -> List[ToolCallItem]:
         tool_indices = {
             tool.function.name: i for i, tool in enumerate(tools) if tool.function.name
         }
         if not isinstance(action, list):
-            name = action.get("name")
-            if not name or name not in tool_indices:
-                logger.warning(f"Model attempted to call undefined function: {name}")
-                return []
-
-            return [
-                ToolCallItem(
-                    tool_index=tool_indices[name],
-                    name=name,
-                    parameters=json.dumps(
-                        action.get("parameters") or action.get("arguments", {}),
-                        ensure_ascii=False,
-                    ),
-                )
-            ]
+            action = [action]
 
         results = []
         for act in action:
@@ -125,19 +127,22 @@ class BaseFormatDetector:
                         ),
                     )
                 )
+            else:
+                logger.warning(f"Model attempted to call undefined function: {name}")
 
         return results
 
-    def detect_and_parse(self, text: str, tools: List[Function]):
+    @abstractmethod
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
         """
         Parses the text in one go. Returns success=True if the format matches, otherwise False.
         Note that leftover_text here represents "content that this parser will not consume further".
         """
         action = json.loads(text)
-        return self.parse_base_json(action, tools)
+        return StreamingParseResult(calls=self.parse_base_json(action, tools))
 
     def parse_streaming_increment(
-        self, new_text: str, tools: List[Function]
+        self, new_text: str, tools: List[Tool]
     ) -> StreamingParseResult:
         """
         Streaming incremental parsing with tool validation.
@@ -196,7 +201,7 @@ class BaseFormatDetector:
                     obj["arguments"] = obj["parameters"]
                 tool_call_arr.append(obj)
 
-            except partial_json_parser.core.exceptions.MalformedJSON:
+            except MalformedJSON:
                 return StreamingParseResult()
 
             if len(tool_call_arr) == 0:
@@ -285,7 +290,6 @@ class BaseFormatDetector:
                         calls=[
                             ToolCallItem(
                                 tool_index=self.current_tool_id,
-                                name="",
                                 parameters=argument_diff,
                             )
                         ],
@@ -302,6 +306,14 @@ class BaseFormatDetector:
             logger.error(f"Error in parse_streaming_increment: {e}")
             return StreamingParseResult()
 
+    @abstractmethod
+    def has_tool_call(self, text: str) -> bool:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
+
 
 class Qwen25Detector(BaseFormatDetector):
     """
@@ -322,7 +334,7 @@ class Qwen25Detector(BaseFormatDetector):
         """Check if the text contains a Qwen 2.5 format tool call."""
         return self.bot_token in text
 
-    def detect_and_parse(self, text: str, tools: List[Function]):
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
         """
         One-time parsing: Detects and parses tool calls in the provided text.
 
@@ -330,15 +342,24 @@ class Qwen25Detector(BaseFormatDetector):
         :param tools: List of available tools.
         :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
         """
-        if self.bot_token not in text:
-            return []
-
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
         pattern = rf"{self.bot_token}(.*?){self.eot_token}"
         match_result_list = re.findall(pattern, text, re.DOTALL)
         calls = []
         for match_result in match_result_list:
             match_result = json.loads(match_result)
             calls.extend(self.parse_base_json(match_result, tools))
-        return calls
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='<tool_call>{"name":"' + name + '", "arguments":',
+            end="}</tool_call>",
+            trigger="<tool_call>",
+        )
 
 
 class MistralDetector(BaseFormatDetector):
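detect_and_parse now returns a StreamingParseResult carrying both the plain text before the tool call and the parsed calls, and structure_info exposes the begin/end/trigger markers used for constrained decoding. A hedged sketch; the Tool/Function field names and the <tool_call> markers are assumed from sglang.srt.openai_api.protocol and this version's detector:

from sglang.srt.function_call_parser import Qwen25Detector
from sglang.srt.openai_api.protocol import Function, Tool

tools = [Tool(type="function", function=Function(name="get_weather", parameters={}))]
det = Qwen25Detector()
res = det.detect_and_parse(
    'Sure.<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>',
    tools,
)
print(res.normal_text)          # "Sure."
print(res.calls[0].parameters)  # '{"city": "Paris"}'
print(det.structure_info()("get_weather").trigger)  # "<tool_call>"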
@@ -374,7 +395,7 @@ class MistralDetector(BaseFormatDetector):
         else:
             return ""
 
-    def detect_and_parse(self, text: str, tools: List[Function]):
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
         """
         One-time parsing: Detects and parses tool calls in the provided text.
 
@@ -382,6 +403,8 @@ class MistralDetector(BaseFormatDetector):
         :param tools: List of available tools.
         :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
         """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
         text = self._clean_text(text)
         tool_content = text.replace("[TOOL_CALLS]", "").strip()
         raw_tool_calls = self.tool_call_regex.findall(tool_content)
@@ -391,7 +414,14 @@ class MistralDetector(BaseFormatDetector):
             function_call_arr = json.loads(raw_tool_call)
             for match_result in function_call_arr:
                 calls.extend(self.parse_base_json(match_result, tools))
-        return calls
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='[TOOL_CALLS] [{"name":"' + name + '", "arguments":',
+            end="}]",
+            trigger="[TOOL_CALLS]",
+        )
 
 
 class Llama32Detector(BaseFormatDetector):
@@ -411,19 +441,18 @@ class Llama32Detector(BaseFormatDetector):
         # prefix the output with the <|python_tag|> token
         return "<|python_tag|>" in text or text.startswith("{")
 
-    def detect_and_parse(self, text: str, tools: List[Function]):
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
         """Parse function calls from text, handling multiple JSON objects."""
         if "<|python_tag|>" not in text and not text.startswith("{"):
-            return []
+            return StreamingParseResult(normal_text=text, calls=[])
 
         if "<|python_tag|>" in text:
-            _, action_text = text.split("<|python_tag|>")
+            normal_text, action_text = text.split("<|python_tag|>")
         else:
-            action_text = text
+            normal_text, action_text = "", text
 
         # Split by semicolon and process each part
         json_parts = [part.strip() for part in action_text.split(";") if part.strip()]
-
         all_actions = []
         for part in json_parts:
             try:
@@ -434,12 +463,18 @@ class Llama32Detector(BaseFormatDetector):
                 logger.warning(f"Failed to parse JSON part: {part}")
                 logger.warning(f"JSON parse error: {str(e)}")
                 continue
-
+        calls = []
         # Only process if we found valid JSON objects
         if all_actions:
-            return self.parse_base_json(all_actions, tools)
-
-        return []
+            calls = self.parse_base_json(all_actions, tools)
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='<|python_tag|>{"name":"' + name + '", "arguments":',
+            end="}",
+            trigger="<|python_tag|>",
+        )
 
 
 class MultiFormatParser:
|
@@ -449,7 +484,9 @@ class MultiFormatParser:
|
|
449
484
|
"""
|
450
485
|
self.detectors = detectors
|
451
486
|
|
452
|
-
def parse_once(
|
487
|
+
def parse_once(
|
488
|
+
self, text: str, tools: List[Tool]
|
489
|
+
) -> Tuple[str, list[ToolCallItem]]:
|
453
490
|
"""
|
454
491
|
One-time parsing: Loop through detectors until there are no new matches or text is exhausted
|
455
492
|
Return: (final_text, all_calls)
|
@@ -459,15 +496,19 @@ class MultiFormatParser:
         final_calls = []
         final_normal_text = text
         for detector in self.detectors:
-            tool_call_list = detector.detect_and_parse(text, tools)
+            parsed_result = detector.detect_and_parse(text, tools)
+            tool_call_list = parsed_result.calls
             if len(tool_call_list) > 0:  # parsed successfully
                 final_calls = tool_call_list
+                final_normal_text = parsed_result.normal_text
                 break
 
         # leftover_text is the normal text not consumed by any Detector
         return final_normal_text, final_calls
 
-    def parse_streaming_increment(self, new_text: str, tools: List[Function]):
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> Tuple[str, list[ToolCallItem]]:
         """
         Streaming incremental parsing: Feed new_text to each detector's parse_streaming_increment
         and merge their produced normal_text/calls to return.
@@ -498,13 +539,13 @@ class FunctionCallParser:
     and returns the resulting normal_text and calls to the upper layer (or SSE).
     """
 
-    ToolCallParserEnum: Dict[str, BaseFormatDetector] = {
+    ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
         "llama3": Llama32Detector,
        "qwen25": Qwen25Detector,
         "mistral": MistralDetector,
     }
 
-    def __init__(self, tools: List[Function], tool_call_parser: str):
+    def __init__(self, tools: List[Tool], tool_call_parser: str):
         detectors = []
         if tool_call_parser:
             detector_class = self.ToolCallParserEnum.get(tool_call_parser)
@@ -532,7 +573,7 @@ class FunctionCallParser:
                 return True
         return False
 
-    def parse_non_stream(self, full_text: str):
+    def parse_non_stream(self, full_text: str) -> Tuple[str, list[ToolCallItem]]:
         """
         Non-streaming call: one-time parsing
         """
@@ -541,7 +582,7 @@ class FunctionCallParser:
         )
         return full_normal_text, calls
 
-    def parse_stream_chunk(self, chunk_text: str):
+    def parse_stream_chunk(self, chunk_text: str) -> Tuple[str, list[ToolCallItem]]:
         """
         Streaming call: incremental parsing
         """
@@ -549,3 +590,40 @@ class FunctionCallParser:
             chunk_text, self.tools
         )
         return normal_text, calls
+
+    def structure_infos(self) -> List[_GetInfoFunc]:
+        """
+        Returns a list of structure_info functions for each detector
+        """
+        return [
+            detector.structure_info() for detector in self.multi_format_parser.detectors
+        ]
+
+    def get_structure_tag(self) -> StructuralTagResponseFormat:
+        tool_structures: List[StructuresResponseFormat] = list()
+        tool_trigger_set: Set[str] = set()
+
+        for wrapper in self.structure_infos():
+            for tool in self.tools:
+                function = tool.function
+                name = function.name
+                assert name is not None
+                info = wrapper(name)
+
+                # accept all if not strict, otherwise only accept the schema
+                schema = function.parameters if function.strict else {}
+
+                tool_structures.append(
+                    StructuresResponseFormat(
+                        begin=info.begin,
+                        schema=schema,  # type: ignore
+                        end=info.end,
+                    )
+                )
+                tool_trigger_set.add(info.trigger)
+
+        return StructuralTagResponseFormat(
+            type="structural_tag",
+            structures=tool_structures,
+            triggers=list(tool_trigger_set),
+        )
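get_structure_tag folds every detector's StructureInfo over every declared tool into one structural_tag response format: non-strict functions get an accept-anything {} schema, strict ones keep their parameter schema. A hedged usage sketch with the same assumed protocol models as above:

from sglang.srt.function_call_parser import FunctionCallParser
from sglang.srt.openai_api.protocol import Function, Tool

tools = [
    Tool(
        type="function",
        function=Function(name="get_weather", parameters={"type": "object"}),
    )
]
parser = FunctionCallParser(tools=tools, tool_call_parser="qwen25")
tag = parser.get_structure_tag()
print(tag.triggers)  # ['<tool_call>']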