PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py ADDED Viewed

@@ -0,0 +1,316 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ast
+import json
+from collections.abc import Sequence
+from typing import Any, Union
+import regex as re
+from transformers import PreTrainedTokenizerBase
+import vllm.envs as envs
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+class _UnexpectedAstError(Exception):
+    pass
+@ToolParserManager.register_module("llama4_pythonic")
+class Llama4PythonicToolParser(ToolParser):
+    """
+    Toolcall parser for Llama4 that produce tool calls in a pythonic style
+    Use --enable-auto-tool-choice --tool-call-parser llama4_pythonic
+    """
+    # TODO(mdepinet): Possible future improvements:
+    #   1. Support text + tools separated by either <|python_tag|> or \n\n
+    #   2. Support tools outside of a list (or separated by a semicolon).
+    #      This depends on item 1 for consistent streaming.
+    # Neither of these are necessary for e.g. ToolACE, but both would help make
+    # Llama3.2 models more reliable.
+    TOOL_CALL_REGEX = re.compile(
+        r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+        re.DOTALL)
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+    # Rename for readability. This is NOT a tool id.
+    @property
+    def current_tool_index(self) -> int:
+        return self.current_tool_id
+    @current_tool_index.setter
+    def current_tool_index(self, value: int) -> None:
+        self.current_tool_id = value
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        """
+        Extract the tool calls from a complete model response.
+        """
+        # remove <|python_start|> and <|python_end|>
+        # as Llama 4 model sometime will output those tokens
+        if model_output.startswith("<|python_start|>"):
+            model_output = model_output[len("<|python_start|>"):]
+            model_output = model_output.replace("<|python_end|>", "")
+        is_tool_call_pattern = False
+        try:
+            is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
+                model_output,
+                timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
+        except TimeoutError:
+            logger.warning(
+                "Regex timeout occurred when matching tool call pattern.")
+            logger.debug("Regex timeout occurred when matching user input: %s",
+                         model_output)
+        if not is_tool_call_pattern:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+        try:
+            module = ast.parse(model_output)
+            parsed = getattr(module.body[0], "value", None)
+            if isinstance(parsed, ast.List) and all(
+                    isinstance(e, ast.Call) for e in parsed.elts):
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=[
+                        _handle_single_tool(e)  # type: ignore
+                        for e in parsed.elts
+                    ],
+                    content=None)
+            else:
+                raise _UnexpectedAstError(
+                    "Tool output must be a list of function calls")
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            # Treat as regular text
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        if not current_text.startswith("[") and not current_text.startswith(
+                "<|python_start|>"):
+            return DeltaMessage(content=delta_text)
+        try:
+            # remove <|python_start|> and <|python_end|>
+            if current_text.startswith("<|python_start|>"):
+                current_text = current_text[len("<|python_start|>"):]
+            if current_text.endswith("<|python_end|>"):
+                current_text = current_text[:current_text.
+                                            rfind("<|python_end|>")]
+            valid_and_added_text = _make_valid_python(current_text)
+            if valid_and_added_text is None:
+                return None
+            valid_text, added_text = valid_and_added_text
+            module = ast.parse(valid_text)
+            parsed = getattr(module.body[0], "value", None)
+            if not isinstance(parsed, ast.List) or not all(
+                    isinstance(e, ast.Call) for e in parsed.elts):
+                raise _UnexpectedAstError(
+                    "Tool output must be a list of function calls")
+            tool_calls = [
+                _handle_single_tool(e)  # type: ignore
+                for e in parsed.elts
+            ]
+            tool_deltas = []
+            for index, new_call in enumerate(tool_calls):
+                if index < self.current_tool_index:
+                    continue
+                self.current_tool_index = index
+                if len(self.streamed_args_for_tool) == index:
+                    self.streamed_args_for_tool.append("")
+                new_call_complete = index < len(
+                    tool_calls) - 1 or ")]" not in added_text
+                if new_call_complete:
+                    self.current_tool_index += 1
+                withheld_suffix = (added_text[:-2]
+                                   if not new_call_complete else "")
+                if not new_call_complete and added_text[-2] == ")":
+                    # Function call is incomplete. Withhold the closing bracket.
+                    withheld_suffix = withheld_suffix + "}"
+                # Strings get single quotes in the model-produced string.
+                # JSON requires double quotes.
+                withheld_suffix = withheld_suffix.replace("'", '"')
+                delta = _compute_tool_delta(self.streamed_args_for_tool[index],
+                                            new_call, index, withheld_suffix)
+                if delta is not None:
+                    tool_deltas.append(delta)
+                    if (delta.function is not None
+                            and delta.function.arguments is not None):
+                        self.streamed_args_for_tool[
+                            index] += delta.function.arguments
+        # HACK: serving_chat.py inspects the internal state of tool parsers
+        # when determining its final streaming delta, automatically
+        # adding autocompleted JSON.
+        # These two lines avoid that nonsense while ensuring finish_reason
+        # is set to tool_calls when at least one tool is called.
+            if tool_deltas and not self.prev_tool_call_arr:
+                self.prev_tool_call_arr = [{"arguments": {}}]
+            if tool_deltas:
+                return DeltaMessage(tool_calls=tool_deltas)
+            elif not added_text and self.current_tool_id > 0:
+                # Return an empty DeltaMessage once the tool calls are all done
+                # so that finish_reason gets set.
+                return DeltaMessage(content='')
+            else:
+                return None
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction "
+                "error")
+            return None
+def _get_parameter_value(val: ast.expr) -> Any:
+    if isinstance(val, ast.Constant):
+        return val.value
+    elif isinstance(val, ast.Dict):
+        if not all(isinstance(k, ast.Constant) for k in val.keys):
+            raise _UnexpectedAstError(
+                "Dict tool call arguments must have literal keys")
+        return {
+            k.value: _get_parameter_value(v)  # type: ignore
+            for k, v in zip(val.keys, val.values)
+        }
+    elif isinstance(val, ast.List):
+        return [_get_parameter_value(v) for v in val.elts]
+    else:
+        raise _UnexpectedAstError("Tool call arguments must be literals")
+def _handle_single_tool(call: ast.Call) -> ToolCall:
+    if not isinstance(call.func, ast.Name):
+        raise _UnexpectedAstError("Invalid tool call name")
+    function_name = call.func.id
+    arguments = {}
+    for keyword in call.keywords:
+        arguments[keyword.arg] = _get_parameter_value(keyword.value)
+    return ToolCall(type="function",
+                    function=FunctionCall(name=function_name,
+                                          arguments=json.dumps(arguments)))
+def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
+    bracket_stack = []
+    for index, char in enumerate(text):
+        if char in {"[", "(", "{"}:
+            bracket_stack.append(char)
+        elif char == "]":
+            if not bracket_stack or bracket_stack.pop() != "[":
+                raise _UnexpectedAstError("Mismatched square brackets")
+        elif char == ")":
+            if not bracket_stack or bracket_stack.pop() != "(":
+                raise _UnexpectedAstError("Mismatched parentheses")
+        elif char == "}":
+            if not bracket_stack or bracket_stack.pop() != "{":
+                raise _UnexpectedAstError("Mismatched curly braces")
+        elif char in {"'", '"'}:
+            if bracket_stack and bracket_stack[-1] == char:
+                if index > 0 and text[index - 1] == "\\":
+                    # Treat an escaped quote as a regular character
+                    pass
+                else:
+                    bracket_stack.pop()
+            elif bracket_stack and bracket_stack[-1] in {"'", '"'}:
+                # Double quote within a single quote string or vice versa.
+                pass
+            else:
+                bracket_stack.append(char)
+    text = text.rstrip()
+    if text.endswith("=") or text.endswith(":"):
+        # Since we have no type information for this property/parameter value,
+        # we can't fill in a valid value.
+        return None
+    if bracket_stack and bracket_stack[-1] == "{":
+        trailing_dict_text = text[:text.rfind("{")]
+        num_keys = trailing_dict_text.count(":")
+        num_values = trailing_dict_text.count(",")
+        if num_keys <= num_values:
+            return None  # Incomplete property name within parameter value
+    if bracket_stack and bracket_stack[-1] == "(":
+        trailing_params_text = text[:text.rfind("(")]
+        num_full_param_names = trailing_params_text.count("=")
+        num_full_param_values = trailing_params_text.count(",")
+        if num_full_param_names <= num_full_param_values:
+            return None  # Incomplete parameter name
+    if text.endswith(","):
+        text = text[:-1]
+    if bracket_stack and bracket_stack[-1] == "[" and not text.endswith(
+            "[") and not text.endswith(")"):
+        return None  # Incomplete function name
+    added_text = ""
+    for char in reversed(bracket_stack):
+        if char == "[":
+            added_text += "]"
+        elif char == "(":
+            added_text += ")"
+        elif char == "{":
+            added_text += "}"
+        elif char == "'":
+            added_text += "'"
+        elif char == '"':
+            added_text += '"'
+    return text + added_text, added_text
+def _compute_tool_delta(previously_sent_args: str, new_call: ToolCall,
+                        index: int,
+                        withheld_suffix: str) -> Union[DeltaToolCall, None]:
+    new_call_args = new_call.function.arguments
+    if withheld_suffix:
+        assert new_call_args.endswith(withheld_suffix)
+        new_call_args = new_call_args[:-len(withheld_suffix)]
+    if not previously_sent_args:
+        return DeltaToolCall(id=new_call.id,
+                             type="function",
+                             index=index,
+                             function=DeltaFunctionCall(
+                                 name=new_call.function.name,
+                                 arguments=new_call_args,
+                             ))
+    arg_diff = new_call_args[len(previously_sent_args):]
+    return DeltaToolCall(
+        id=None, index=index, function=DeltaFunctionCall(
+            arguments=arg_diff)) if arg_diff else None

vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py ADDED Viewed

@@ -0,0 +1,269 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from collections.abc import Sequence
+from typing import Union
+import partial_json_parser
+import regex as re
+from partial_json_parser.core.options import Allow
+from transformers import PreTrainedTokenizerBase
+from vllm.entrypoints.chat_utils import make_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.entrypoints.openai.tool_parsers.utils import (find_common_prefix,
+                                                        is_complete_json,
+                                                        partial_json_loads)
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+@ToolParserManager.register_module("llama3_json")
+@ToolParserManager.register_module("llama4_json")
+class Llama3JsonToolParser(ToolParser):
+    """
+    Tool call parser for Llama 3.x and 4 models intended for use with the
+    examples/tool_chat_template_llama.jinja template.
+    Used when --enable-auto-tool-choice --tool-call-parser llama3_json or
+    llama4_json are set.
+    """
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+        # initialize properties used for state when parsing tool calls in
+        # streaming mode
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id: int = -1
+        self.current_tool_name_sent: bool = False
+        self.streamed_args_for_tool: list[str] = [
+        ]  # map what has been streamed for each tool so far to a list
+        self.bot_token = "<|python_tag|>"
+        self.bot_token_id = tokenizer.encode(self.bot_token,
+                                             add_special_tokens=False)[0]
+        # Updated regex to match multiple JSONs separated by semicolons
+        # This pattern is more robust and can handle nested JSON objects
+        self.tool_call_regex = re.compile(
+            r'{[^{}]*(?:{[^{}]*}[^{}]*)*}(?:\s*;\s*{[^{}]*(?:{[^{}]*}[^{}]*)*})*',
+            re.DOTALL)
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        """
+        Extract the tool calls from a complete model response.
+        Only extracts JSON content and ignores any surrounding plain text.
+        Supports both single JSON and multiple JSONs separated by semicolons.
+        """
+        # Quick check before running regex
+        if not (self.bot_token in model_output or '{' in model_output):
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+        # Find JSON object(s) in the text using regex
+        match = self.tool_call_regex.search(model_output)
+        if not match:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+        try:
+            json_str = match.group(0)
+            # Split by semicolon and strip whitespace
+            json_objects = [obj.strip() for obj in json_str.split(';')]
+            tool_calls: list[ToolCall] = []
+            for json_obj in json_objects:
+                if not json_obj:  # Skip empty strings
+                    continue
+                obj = json.loads(json_obj)
+                tool_calls.append(
+                    ToolCall(
+                        type="function",
+                        function=FunctionCall(
+                            name=obj["name"],
+                            # function call args are JSON but as a string
+                            arguments=json.dumps(
+                                obj["arguments"]
+                                if "arguments" in obj else obj["parameters"],
+                                ensure_ascii=False))))
+            return ExtractedToolCallInformation(tools_called=True,
+                                                tool_calls=tool_calls,
+                                                content=None)
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            # return information to just treat the tool call as regular JSON
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        if not (current_text.startswith(self.bot_token)
+                or current_text.startswith('{')):
+            return DeltaMessage(content=delta_text)
+        # bit mask flags for partial JSON parsing. If the name hasn't been
+        # sent yet, don't allow sending
+        # an incomplete string since OpenAI only ever (as far as I have
+        # seen) allows sending the entire tool/ function name at once.
+        flags = Allow.ALL if self.current_tool_name_sent \
+            else Allow.ALL & ~Allow.STR
+        try:
+            tool_call_arr = []
+            is_complete = []
+            try:
+                # depending on the prompt format the Llama model may or may not
+                # prefix the output with the <|python_tag|> token
+                start_idx = len(self.bot_token) if current_text.startswith(
+                    self.bot_token) else 0
+                while start_idx < len(current_text):
+                    (obj,
+                     end_idx) = partial_json_loads(current_text[start_idx:],
+                                                   flags)
+                    is_complete.append(
+                        is_complete_json(current_text[start_idx:start_idx +
+                                                      end_idx]))
+                    start_idx += end_idx + len('; ')
+                    # depending on the prompt Llama can use
+                    # either arguments or parameters
+                    if "parameters" in obj:
+                        assert "arguments" not in obj, \
+                            "model generated both parameters and arguments"
+                        obj["arguments"] = obj["parameters"]
+                    tool_call_arr.append(obj)
+            except partial_json_parser.core.exceptions.MalformedJSON:
+                logger.debug('not enough tokens to parse into JSON yet')
+                return None
+            # select as the current tool call the one we're on the state at
+            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
+                if len(tool_call_arr) > 0 else {}
+            # case -- if no tokens have been streamed for the tool, e.g.
+            #   only the array brackets, stream nothing
+            if len(tool_call_arr) == 0:
+                return None
+            # case: we are starting a new tool in the array
+            #   -> array has > 0 length AND length has moved past cursor
+            elif (len(tool_call_arr) > 0
+                  and len(tool_call_arr) > self.current_tool_id + 1):
+                # if we're moving on to a new call, first make sure we
+                # haven't missed anything in the previous one that was
+                # auto-generated due to JSON completions, but wasn't
+                # streamed to the client yet.
+                if self.current_tool_id >= 0:
+                    cur_arguments = current_tool_call.get("arguments")
+                    if cur_arguments:
+                        cur_args_json = json.dumps(cur_arguments,
+                                                   ensure_ascii=False)
+                        sent = len(
+                            self.streamed_args_for_tool[self.current_tool_id])
+                        argument_diff = cur_args_json[sent:]
+                        logger.debug("got arguments diff: %s", argument_diff)
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=argument_diff).
+                                          model_dump(exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += argument_diff
+                    else:
+                        delta = None
+                else:
+                    delta = None
+                # re-set stuff pertaining to progress in the current tool
+                self.current_tool_id = len(tool_call_arr) - 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("starting on new tool %d", self.current_tool_id)
+                return delta
+            # if the current tool name hasn't been sent, send if available
+            # - otherwise send nothing
+            elif not self.current_tool_name_sent:
+                function_name = current_tool_call.get("name")
+                if function_name:
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      type="function",
+                                      id=make_tool_call_id(),
+                                      function=DeltaFunctionCall(
+                                          name=function_name).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.current_tool_name_sent = True
+                else:
+                    delta = None
+            # now we know we're on the same tool call and we're streaming
+            # arguments
+            else:
+                cur_arguments = current_tool_call.get("arguments")
+                delta = None
+                if cur_arguments:
+                    sent = len(
+                        self.streamed_args_for_tool[self.current_tool_id])
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
+                    prev_arguments = self.prev_tool_call_arr[
+                        self.current_tool_id].get("arguments")
+                    argument_diff = None
+                    if is_complete[self.current_tool_id]:
+                        argument_diff = cur_args_json[sent:]
+                    elif prev_arguments:
+                        prev_args_json = json.dumps(prev_arguments,
+                                                    ensure_ascii=False)
+                        if cur_args_json != prev_args_json:
+                            prefix = find_common_prefix(
+                                prev_args_json, cur_args_json)
+                            argument_diff = prefix[sent:]
+                    if argument_diff is not None:
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=argument_diff).
+                                          model_dump(exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += argument_diff
+            self.prev_tool_call_arr = tool_call_arr
+            return delta
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction "
+                "error")
+            return None

vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py ADDED Viewed

@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import regex as re
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParserManager)
+from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import (
+    Hermes2ProToolParser)
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+@ToolParserManager.register_module("longcat")
+class LongcatFlashToolParser(Hermes2ProToolParser):
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+        self.tool_call_start_token: str = "<longcat_tool_call>"
+        self.tool_call_end_token: str = "</longcat_tool_call>"
+        self.tool_call_regex = re.compile(
+            r"<longcat_tool_call>(.*?)</longcat_tool_call>|<longcat_tool_call>(.*)",
+            re.DOTALL)
+        self.tool_call_start_token_ids = self.model_tokenizer.encode(
+            self.tool_call_start_token, add_special_tokens=False)
+        self.tool_call_end_token_ids = self.model_tokenizer.encode(
+            self.tool_call_end_token, add_special_tokens=False)
+        self.tool_call_start_token_array = [
+            self.model_tokenizer.decode([token_id])
+            for token_id in self.tool_call_start_token_ids
+        ]
+        self.tool_call_end_token_array = [
+            self.model_tokenizer.decode([token_id])
+            for token_id in self.tool_call_end_token_ids
+        ]