sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py
CHANGED
@@ -40,7 +40,7 @@ from sglang.srt.conversation import (
     get_conv_template_by_model_path,
     register_conv_template,
 )
-from sglang.srt.function_call_parser import FunctionCallParser
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
 from sglang.srt.openai_api.protocol import (
     BatchRequest,
@@ -69,10 +69,16 @@ from sglang.srt.openai_api.protocol import (
     FunctionResponse,
     LogProbs,
     MultimodalEmbeddingInput,
+    ScoringRequest,
+    ScoringResponse,
     ToolCall,
     TopLogprob,
     UsageInfo,
 )
+from sglang.srt.openai_api.utils import (
+    detect_template_content_format,
+    process_content_for_template_format,
+)
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str, get_exception_traceback
 
@@ -80,6 +86,11 @@ logger = logging.getLogger(__name__)
 
 chat_template_name = None
 
+# Global cache for template content format detection (one model/template per instance)
+# NOTE: A better approach would be to initialize the chat template format when the endpoint is created
+_cached_chat_template = None
+_cached_template_format = None
+
 
 class FileMetadata:
     def __init__(self, filename: str, purpose: str):
@@ -604,6 +615,9 @@ def v1_generate_request(
         stream=all_requests[0].stream,
         rid=request_ids,
         lora_path=lora_paths,
+        bootstrap_host=all_requests[0].bootstrap_host,
+        bootstrap_port=all_requests[0].bootstrap_port,
+        bootstrap_room=all_requests[0].bootstrap_room,
     )
 
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
@@ -970,7 +984,7 @@ def v1_chat_generate_request(
     # - image_data: None or a list of image strings (URLs or base64 strings).
     # - audio_data: None or a list of audio strings (URLs).
     # None skips any image processing in GenerateReqInput.
-
+    tool_call_constraint = None
     prompt = ""
     prompt_ids = []
    if not isinstance(request.messages, str):
@@ -989,27 +1003,48 @@ def v1_chat_generate_request(
 
         tool_call_parser = tokenizer_manager.server_args.tool_call_parser
         parser = FunctionCallParser(request.tools, tool_call_parser)
-
+        tool_call_constraint = parser.get_structure_constraint(
+            request.tool_choice
+        )
 
     if chat_template_name is None:
         openai_compatible_messages = []
+        image_data = []
+        audio_data = []
+        modalities = []
+
+        # Detect template content format by analyzing the jinja template (cached globally)
+        global _cached_chat_template, _cached_template_format
+        current_template = tokenizer_manager.tokenizer.chat_template
+
+        if current_template != _cached_chat_template:
+            # Template changed or first time - analyze it
+            _cached_chat_template = current_template
+            _cached_template_format = detect_template_content_format(
+                current_template
+            )
+            logger.info(
+                f"Detected chat template content format: {_cached_template_format}"
+            )
+
+        template_content_format = _cached_template_format
 
         for message in request.messages:
             if message.content is None:
                 message.content = ""
-            msg_dict = message.…
-            […removed lines not captured in the source…]
+            msg_dict = message.model_dump()
+
+            # Process content based on detected template format
+            processed_msg = process_content_for_template_format(
+                msg_dict,
+                template_content_format,
+                image_data,
+                audio_data,
+                modalities,
+            )
+            openai_compatible_messages.append(processed_msg)
+
+        # Handle assistant prefix for continue_final_message
         if (
             openai_compatible_messages
             and openai_compatible_messages[-1]["role"] == "assistant"
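Note: the template-format cache above is module-level state, so the Jinja AST analysis runs only when the served chat template actually changes (in practice, once per process). A minimal standalone sketch of the same cache-on-change pattern, with illustrative names rather than the adapter's:

import logging

logger = logging.getLogger(__name__)

_cached_template = None
_cached_format = None

def get_template_format(template: str, analyze) -> str:
    # Re-run the (expensive) analysis only when the template string changes.
    global _cached_template, _cached_format
    if template != _cached_template:
        _cached_template = template
        _cached_format = analyze(template)  # e.g. detect_template_content_format
        logger.info("Detected chat template content format: %s", _cached_format)
    return _cached_format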
@@ -1063,9 +1098,9 @@ def v1_chat_generate_request(
     if is_multimodal:
         prompt = tokenizer_manager.tokenizer.decode(prompt_ids)
         stop = request.stop
-        image_data = None
-        audio_data = None
-        modalities = []
+        image_data = image_data if image_data else None
+        audio_data = audio_data if audio_data else None
+        modalities = modalities if modalities else []
     else:
         conv = generate_chat_conv(request, chat_template_name)
         # If we should continue the final assistant message, adjust the conversation.
@@ -1156,20 +1191,24 @@ def v1_chat_generate_request(
                 request.response_format.model_dump(by_alias=True)
             )
 
-        if …
-        […removed lines not captured in the source…]
+        # Check if there are already existing output constraints
+        has_existing_constraints = (
+            sampling_params.get("regex")
+            or sampling_params.get("ebnf")
+            or sampling_params.get("structural_tag")
+            or sampling_params.get("json_schema")
+        )
+
+        if tool_call_constraint and has_existing_constraints:
+            logger.warning("Constrained decoding is not compatible with tool calls.")
+        elif tool_call_constraint:
+            constraint_type, constraint_value = tool_call_constraint
+            if constraint_type == "structural_tag":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value.model_dump(by_alias=True)
                 )
             else:
-                sampling_params[…
-                    strict_tag.model_dump(by_alias=True)
-                )
+                sampling_params[constraint_type] = constraint_value
 
         sampling_params_list.append(sampling_params)
 
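The branch added above gives user-supplied decoding constraints priority over the constraint derived from tool_choice: the tool-call constraint is only installed when none of the four constraint keys is already set. A small standalone illustration of that precedence, with made-up values:

# Made-up inputs; mirrors the precedence logic above, not the adapter itself.
sampling_params = {"json_schema": '{"type": "object"}'}  # user-supplied constraint
tool_call_constraint = ("ebnf", "root ::= ...")          # derived from tool_choice

has_existing_constraints = any(
    sampling_params.get(k) for k in ("regex", "ebnf", "structural_tag", "json_schema")
)
if tool_call_constraint and has_existing_constraints:
    print("warning: tool-call constraint dropped, user constraint kept")  # runs here
elif tool_call_constraint:
    constraint_type, constraint_value = tool_call_constraint
    sampling_params[constraint_type] = constraint_value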
@@ -1193,6 +1232,7 @@ def v1_chat_generate_request(
         top_logprobs_nums = top_logprobs_nums[0]
         modalities_list = modalities_list[0]
         lora_paths = lora_paths[0]
+        request_ids = request_ids[0]
     else:
         if tokenizer_manager.model_config.is_multimodal:
             # processor will need text input
@@ -1320,7 +1360,6 @@ def v1_chat_generate_response(
         tool_calls = [
             ToolCall(
                 id=f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}",
-                index=call_info.tool_index,
                 function=FunctionResponse(
                     name=call_info.name, arguments=call_info.parameters
                 ),
@@ -1384,7 +1423,9 @@ def v1_chat_generate_response(
             "id": ret[i]["meta_info"]["id"],
             "object": "chat.completion",
             "created": created,
-            "model": …
+            "model": (
+                request[i].model if isinstance(request, list) else request.model
+            ),
             "choices": choice,
             "usage": {
                 "prompt_tokens": ret[i]["meta_info"]["prompt_tokens"],
@@ -1429,7 +1470,9 @@ async def v1_chat_completions(
         return create_error_response("Invalid request body, error: ", str(e))
     all_requests = [ChatCompletionRequest(**request_json)]
     created = int(time.time())
-    adapted_request, request = v1_chat_generate_request(…
+    adapted_request, request = v1_chat_generate_request(
+        all_requests, tokenizer_manager, request_ids=[all_requests[0].rid]
+    )
 
     if adapted_request.stream:
         parser_dict = {}
@@ -1609,14 +1652,14 @@ async def v1_chat_completions(
                                 latest_delta_len = len(call_item.parameters)
 
                             expected_call = json.dumps(
-                                parser.…
-                                […removed lines not captured in the source…]
+                                parser.detector.prev_tool_call_arr[index].get(
+                                    "arguments", {}
+                                ),
                                 ensure_ascii=False,
                             )
-                            actual_call = parser.…
-                            […removed line not captured in the source…]
-                            ]
+                            actual_call = parser.detector.streamed_args_for_tool[
+                                index
+                            ]
                             if latest_delta_len > 0:
                                 actual_call = actual_call[:-latest_delta_len]
                             remaining_call = expected_call.replace(
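The replacement logic above computes the tail of the tool-call arguments that has not yet been streamed: serialize the parser's final view of the arguments, drop the prefix already sent (minus the latest in-flight delta), and emit the remainder. A toy reproduction of that arithmetic, with invented values:

import json

# Stand-ins for parser.detector state; values are invented for illustration.
final_args = {"city": "Paris", "unit": "celsius"}
streamed_so_far = '{"city": "Paris", "unit": "cel'  # includes the latest delta
latest_delta = "cel"

expected_call = json.dumps(final_args, ensure_ascii=False)
actual_call = streamed_so_far[:-len(latest_delta)]  # roll back the in-flight delta
remaining_call = expected_call.replace(actual_call, "", 1)
print(remaining_call)  # -> 'celsius"}', the tail still owed to the client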
@@ -1812,6 +1855,7 @@ def v1_embedding_request(all_requests, tokenizer_manager):
             prompt_kwargs = {"text": generate_prompts, "image_data": images}
         else:
             prompt_kwargs = {"input_ids": prompt}
+        request_ids = all_requests[0].rid
     else:
         if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
             prompt_kwargs = {"text": prompts}
@@ -1824,8 +1868,10 @@ def v1_embedding_request(all_requests, tokenizer_manager):
             )
         else:
             prompt_kwargs = {"input_ids": prompts}
+        request_ids = [req.rid for req in all_requests]
 
     adapted_request = EmbeddingReqInput(
+        rid=request_ids,
         **prompt_kwargs,
     )
 
@@ -1914,3 +1960,31 @@ def to_openai_style_logprobs(
         append_top_logprobs(output_top_logprobs)
 
     return ret_logprobs
+
+
+async def v1_score(tokenizer_manager, raw_request):
+    try:
+        # Parse request
+        request_data = await raw_request.json()
+        request = ScoringRequest(**request_data)
+
+        # Use tokenizer_manager's score_request method directly
+        scores = await tokenizer_manager.score_request(
+            query=request.query,
+            items=request.items,
+            label_token_ids=request.label_token_ids,
+            apply_softmax=request.apply_softmax,
+            item_first=request.item_first,
+            request=request,
+        )
+
+        # Create response with just the scores, without usage info
+        response = ScoringResponse(
+            scores=scores,
+            model=request.model,
+        )
+        return response
+
+    except Exception as e:
+        logger.error(f"Error in v1_score: {str(e)}")
+        return create_error_response(str(e))
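v1_score parses the body into a ScoringRequest and delegates to tokenizer_manager.score_request. Assuming the HTTP server wires this handler to a /v1/score route (an assumption here; this diff shows only the handler), a client call could look like:

import requests

# Hypothetical request; the /v1/score path, port, and token ids are placeholders.
resp = requests.post(
    "http://localhost:30000/v1/score",
    json={
        "model": "my-model",
        "query": "Is the sky blue?",
        "items": ["Yes", "No"],
        "label_token_ids": [9454, 2753],  # example ids standing in for "Yes"/"No"
        "apply_softmax": True,
    },
)
print(resp.json()["scores"])  # one list of probabilities per item, ordered by label_token_ids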
sglang/srt/openai_api/protocol.py
CHANGED
@@ -183,12 +183,17 @@ class CompletionRequest(BaseModel):
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
 
+    # For PD disaggregation
+    bootstrap_host: Optional[str] = None
+    bootstrap_port: Optional[int] = None
+    bootstrap_room: Optional[int] = None
+
 
 class CompletionResponseChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Literal["stop", "length", "content_filter"]
+    finish_reason: Literal["stop", "length", "content_filter", "abort"]
     matched_stop: Union[None, int, str] = None
 
 
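The bootstrap fields added above let a completion request carry prefill/decode (PD) disaggregation routing information; v1_generate_request in adapter.py copies them straight into GenerateReqInput. A hedged example payload (all values illustrative):

# Illustrative /v1/completions body using the new PD-disaggregation fields.
payload = {
    "model": "my-model",
    "prompt": "Hello",
    "max_tokens": 16,
    "bootstrap_host": "10.0.0.5",  # prefill-side bootstrap server (example)
    "bootstrap_port": 8998,        # example
    "bootstrap_room": 42,          # example
}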
@@ -392,6 +397,9 @@ class ChatCompletionRequest(BaseModel):
     stream_reasoning: bool = True
     chat_template_kwargs: Optional[Dict] = None
 
+    # The request id.
+    rid: Optional[str] = None
+
     # For PD disaggregation
     bootstrap_host: Optional[str] = None
     bootstrap_port: Optional[int] = None
@@ -410,7 +418,7 @@ class ChatCompletionResponseChoice(BaseModel):
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
     finish_reason: Literal[
-        "stop", "length", "tool_calls", "content_filter", "function_call"
+        "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
     ]
     matched_stop: Union[None, int, str] = None
 
@@ -466,6 +474,9 @@ class EmbeddingRequest(BaseModel):
     dimensions: int = None
     user: Optional[str] = None
 
+    # The request id.
+    rid: Optional[str] = None
+
 
 class EmbeddingObject(BaseModel):
     embedding: List[float]
@@ -478,3 +489,27 @@ class EmbeddingResponse(BaseModel):
     model: str
     object: str = "list"
     usage: Optional[UsageInfo] = None
+
+
+class ScoringRequest(BaseModel):
+    query: Optional[Union[str, List[int]]] = (
+        None  # Query text or pre-tokenized token IDs
+    )
+    items: Optional[Union[str, List[str], List[List[int]]]] = (
+        None  # Item text(s) or pre-tokenized token IDs
+    )
+    label_token_ids: Optional[List[int]] = (
+        None  # Token IDs to compute probabilities for
+    )
+    apply_softmax: bool = False
+    item_first: bool = False
+    model: str
+
+
+class ScoringResponse(BaseModel):
+    scores: List[
+        List[float]
+    ]  # List of lists of probabilities, each in the order of label_token_ids
+    model: str
+    usage: Optional[UsageInfo] = None
+    object: str = "scoring"
sglang/srt/openai_api/utils.py
ADDED
@@ -0,0 +1,172 @@
+"""
+Utility functions for OpenAI API adapter.
+"""
+
+import logging
+from typing import Dict, List
+
+import jinja2.nodes
+import transformers.utils.chat_template_utils as hf_chat_utils
+
+logger = logging.getLogger(__name__)
+
+# ============================================================================
+# JINJA TEMPLATE CONTENT FORMAT DETECTION
+# ============================================================================
+#
+# This adapts vLLM's approach for detecting chat template content format:
+# https://github.com/vllm-project/vllm/blob/02f0c7b220422792f5e53de2a7d51d2d3ff2df28/vllm/entrypoints/chat_utils.py#L296-L313
+# - Analyzes Jinja template AST to detect content iteration patterns
+# - 'openai' format: templates with {%- for content in message['content'] -%} loops
+# - 'string' format: templates that expect simple string content
+# - Processes content accordingly to match template expectations
+
+
+def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool:
+    """Check if node is a variable access like {{ varname }}"""
+    if isinstance(node, jinja2.nodes.Name):
+        return node.ctx == "load" and node.name == varname
+    return False
+
+
+def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
+    """Check if node is an attribute access like {{ varname['key'] }} or {{ varname.key }}"""
+    if isinstance(node, jinja2.nodes.Getitem):
+        return (
+            _is_var_access(node.node, varname)
+            and isinstance(node.arg, jinja2.nodes.Const)
+            and node.arg.value == key
+        )
+
+    if isinstance(node, jinja2.nodes.Getattr):
+        return _is_var_access(node.node, varname) and node.attr == key
+
+    return False
+
+
+def _is_var_or_elems_access(
+    node: jinja2.nodes.Node,
+    varname: str,
+    key: str = None,
+) -> bool:
+    """Check if node accesses varname or varname[key] with filters/tests"""
+    if isinstance(node, jinja2.nodes.Filter):
+        return node.node is not None and _is_var_or_elems_access(
+            node.node, varname, key
+        )
+    if isinstance(node, jinja2.nodes.Test):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    if isinstance(node, jinja2.nodes.Getitem) and isinstance(
+        node.arg, jinja2.nodes.Slice
+    ):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
+
+
+def _try_extract_ast(chat_template: str):
+    """Try to parse the Jinja template into an AST"""
+    try:
+        jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template)
+        return jinja_compiled.environment.parse(chat_template)
+    except Exception as e:
+        logger.debug(f"Error when compiling Jinja template: {e}")
+        return None
+
+
+def detect_template_content_format(chat_template: str) -> str:
+    """
+    Detect whether a chat template expects 'string' or 'openai' content format.
+
+    - 'string': content is a simple string (like DeepSeek templates)
+    - 'openai': content is a list of structured dicts (like Llama4 templates)
+
+    Detection logic:
+    - If template has loops like {%- for content in message['content'] -%} → 'openai'
+    - Otherwise → 'string'
+    """
+    jinja_ast = _try_extract_ast(chat_template)
+    if jinja_ast is None:
+        return "string"
+
+    try:
+        # Look for patterns like: {%- for content in message['content'] -%}
+        for loop_ast in jinja_ast.find_all(jinja2.nodes.For):
+            loop_iter = loop_ast.iter
+
+            # Check if iterating over message['content'] or similar
+            if _is_var_or_elems_access(loop_iter, "message", "content"):
+                return "openai"  # Found content iteration → openai format
+
+        return "string"  # No content loops found → string format
+    except Exception as e:
+        logger.debug(f"Error when parsing AST of Jinja template: {e}")
+        return "string"
+
+
+def process_content_for_template_format(
+    msg_dict: dict,
+    content_format: str,
+    image_data: list,
+    audio_data: list,
+    modalities: list,
+) -> dict:
+    """
+    Process message content based on detected template format.
+
+    Args:
+        msg_dict: Message dictionary with content
+        content_format: 'string' or 'openai' (detected via AST analysis)
+        image_data: List to append extracted image URLs
+        audio_data: List to append extracted audio URLs
+        modalities: List to append modalities
+
+    Returns:
+        Processed message dictionary
+    """
+    if not isinstance(msg_dict.get("content"), list):
+        # Already a string or None, no processing needed
+        return {k: v for k, v in msg_dict.items() if v is not None}
+
+    if content_format == "openai":
+        # OpenAI format: preserve structured content list, normalize types
+        processed_content_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict):
+                chunk_type = chunk.get("type")
+
+                if chunk_type == "image_url":
+                    image_data.append(chunk["image_url"]["url"])
+                    if chunk.get("modalities"):
+                        modalities.append(chunk.get("modalities"))
+                    # Normalize to simple 'image' type for template compatibility
+                    processed_content_parts.append({"type": "image"})
+                elif chunk_type == "audio_url":
+                    audio_data.append(chunk["audio_url"]["url"])
+                    # Normalize to simple 'audio' type
+                    processed_content_parts.append({"type": "audio"})
+                else:
+                    # Keep other content as-is (text, etc.)
+                    processed_content_parts.append(chunk)
+
+        new_msg = {
+            k: v for k, v in msg_dict.items() if v is not None and k != "content"
+        }
+        new_msg["content"] = processed_content_parts
+        return new_msg
+
+    else:  # content_format == "string"
+        # String format: flatten to text only (for templates like DeepSeek)
+        text_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict) and chunk.get("type") == "text":
+                text_parts.append(chunk["text"])
+        # Note: For string format, we ignore images/audio since the template
+        # doesn't expect structured content - multimodal placeholders would
+        # need to be inserted differently
+
+        new_msg = msg_dict.copy()
+        new_msg["content"] = " ".join(text_parts) if text_parts else ""
+        new_msg = {k: v for k, v in new_msg.items() if v is not None}
+        return new_msg
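A quick demonstration of the two entry points defined above; the toy templates and message are invented for illustration:

from sglang.srt.openai_api.utils import (
    detect_template_content_format,
    process_content_for_template_format,
)

# A template that iterates over message['content'] -> detected as "openai".
openai_style = (
    "{%- for message in messages -%}"
    "{%- for content in message['content'] -%}{{ content['text'] }}{%- endfor -%}"
    "{%- endfor -%}"
)
# A template that treats content as a plain string -> detected as "string".
string_style = "{%- for message in messages -%}{{ message['content'] }}{%- endfor -%}"

print(detect_template_content_format(openai_style))  # "openai"
print(detect_template_content_format(string_style))  # "string"

# Flattening a structured message for a string-format template:
image_data, audio_data, modalities = [], [], []
msg = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this"},
        {"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
    ],
}
print(process_content_for_template_format(msg, "string", image_data, audio_data, modalities))
# -> {'role': 'user', 'content': 'Describe this'} (image parts are dropped in string format)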