PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/entrypoints/tool.py ADDED Viewed

@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any
+from openai_harmony import Author, Message, Role, TextContent
+from vllm.logger import init_logger
+if TYPE_CHECKING:
+    # Avoid circular import.
+    from vllm.entrypoints.context import ConversationContext
+logger = init_logger(__name__)
+def validate_gpt_oss_install():
+    """
+    Verify that the ``gpt_oss`` distribution is installed at version 0.0.3
+    or newer.
+    Raises:
+        ImportError: If ``gpt_oss`` is not installed, if its reported
+            version string cannot be parsed, or if it is older than 0.0.3.
+    """
+    import importlib.metadata as dist_metadata
+    from packaging import version as pkg_versions
+    minimum = pkg_versions.Version("0.0.3")
+    try:
+        # e.g. "0.0.5"
+        installed = pkg_versions.Version(dist_metadata.version("gpt_oss"))
+    except dist_metadata.PackageNotFoundError:
+        raise ImportError("Package 'gpt_oss' is not installed.") from None
+    except pkg_versions.InvalidVersion as exc:
+        raise ImportError(
+            f"Invalid version string for 'gpt_oss': {exc}") from None
+    if installed >= minimum:
+        return
+    raise ImportError(
+        f"gpt_oss >= 0.0.3 is required, but {installed} is installed."
+    ) from None
+class Tool(ABC):
+    """Abstract base for tools that produce a result from a conversation."""
+    @abstractmethod
+    async def get_result(self, context: "ConversationContext") -> Any:
+        """Run the tool against ``context`` and return its result.
+        Subclasses must override this coroutine; the base implementation
+        does nothing and returns None.
+        """
+class HarmonyBrowserTool(Tool):
+    """Browser tool wrapping gpt_oss's ``SimpleBrowserTool`` with an Exa
+    backend.
+    ``enabled`` is False, and ``browser_tool`` is never assigned, when the
+    ``EXA_API_KEY`` environment variable is unset or empty, or when gpt_oss
+    fails the install check or cannot be imported.
+    """
+    def __init__(self):
+        api_key = os.getenv("EXA_API_KEY")
+        self.enabled = bool(api_key)
+        if not self.enabled:
+            logger.warning_once("EXA_API_KEY is not set, browsing is disabled")
+            return
+        try:
+            validate_gpt_oss_install()
+            from gpt_oss.tools.simple_browser import SimpleBrowserTool
+            from gpt_oss.tools.simple_browser.backend import ExaBackend
+        except ImportError as err:
+            self.enabled = False
+            logger.warning_once(
+                "gpt_oss is not installed properly (%s), browsing is disabled",
+                err)
+            return
+        self.browser_tool = SimpleBrowserTool(
+            backend=ExaBackend(source="web", api_key=api_key))
+        logger.info_once("Browser tool initialized")
+    async def get_result(self, context: "ConversationContext") -> Any:
+        """Feed the last message of ``context`` to the browser tool and
+        return every message it yields, in order, as a list."""
+        # Imported lazily, like the TYPE_CHECKING import at module level, to
+        # avoid a circular import.
+        from vllm.entrypoints.context import HarmonyContext
+        assert isinstance(context, HarmonyContext)
+        latest = context.messages[-1]
+        return [reply async for reply in self.browser_tool.process(latest)]
+    @property
+    def tool_config(self) -> Any:
+        """The wrapped browser tool's ``tool_config``."""
+        return self.browser_tool.tool_config
+class HarmonyPythonTool(Tool):
+    """Code-interpreter tool wrapping gpt_oss's docker-based ``PythonTool``.
+    ``enabled`` is False, and ``python_tool`` is never assigned, when gpt_oss
+    fails the install check or cannot be imported. ``validate()`` may also
+    clear ``enabled`` if a smoke-test execution fails.
+    """
+    def __init__(self):
+        self.enabled = True
+        try:
+            validate_gpt_oss_install()
+            from gpt_oss.tools.python_docker.docker_tool import PythonTool
+        except ImportError as e:
+            self.enabled = False
+            logger.warning_once(
+                "gpt_oss is not installed properly (%s), code interpreter is "
+                "disabled", e)
+            return
+        self.python_tool = PythonTool()
+    async def validate(self):
+        """Smoke-test the interpreter by running ``print('Hello, world!')``.
+        Does nothing if the tool is already disabled. Any exception, or any
+        output other than ``"Hello, world!\\n"``, disables the tool and logs
+        a warning instead of propagating.
+        """
+        if not self.enabled:
+            return
+        try:
+            message = Message(
+                author=Author(role=Role.ASSISTANT),
+                content=[TextContent(text="print('Hello, world!')")],
+                channel="analysis",
+                recipient="python",
+                content_type="code",
+            )
+            msgs = []
+            async for msg in self.python_tool.process(message):
+                msgs.append(msg)
+            output = msgs[0].content[0].text
+            # Explicit check rather than `assert`: asserts are stripped under
+            # `python -O`, which would let a broken interpreter pass, and a
+            # bare assert logs an empty failure reason.
+            if output != "Hello, world!\n":
+                raise RuntimeError(
+                    f"unexpected smoke-test output: {output!r}")
+        except Exception as e:
+            self.enabled = False
+            logger.warning_once(
+                "Code interpreter tool failed to initialize (%s), code "
+                "interpreter is disabled", e)
+            return
+        logger.info_once("Code interpreter tool initialized")
+    async def get_result(self, context: "ConversationContext") -> Any:
+        """Feed the last message of ``context`` to the Python tool and
+        return every message it yields, in order, as a list."""
+        # Imported lazily, like the TYPE_CHECKING import at module level, to
+        # avoid a circular import.
+        from vllm.entrypoints.context import HarmonyContext
+        assert isinstance(context, HarmonyContext)
+        last_msg = context.messages[-1]
+        tool_output_msgs = []
+        async for msg in self.python_tool.process(last_msg):
+            tool_output_msgs.append(msg)
+        return tool_output_msgs
+    @property
+    def tool_config(self) -> Any:
+        """The wrapped Python tool's ``tool_config``."""
+        return self.python_tool.tool_config

vllm/entrypoints/tool_server.py ADDED Viewed

@@ -0,0 +1,206 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from contextlib import AbstractAsyncContextManager, asynccontextmanager
+from typing import TYPE_CHECKING, Any, Optional
+from openai_harmony import ToolDescription, ToolNamespaceConfig
+from vllm.entrypoints.tool import HarmonyBrowserTool, HarmonyPythonTool, Tool
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+if TYPE_CHECKING:
+    from mcp.types import ListToolsResult
+async def list_server_and_tools(server_url: str):
+    """Open an MCP client session over SSE at ``server_url`` and return the
+    pair ``(initialize response, list_tools response)``."""
+    from mcp import ClientSession
+    from mcp.client.sse import sse_client
+    async with sse_client(url=server_url) as streams:
+        async with ClientSession(*streams) as session:
+            init_result = await session.initialize()
+            tools_result = await session.list_tools()
+            return init_result, tools_result
+def trim_schema(schema: dict) -> dict:
+    """Turn an MCP-generated JSON Schema into Harmony's variant, in place.
+    - Drops ``"title"``.
+    - Drops ``"default"`` when its value is None.
+    - Rewrites ``"anyOf": [{"type": "a"}, {"type": "b"}]`` as
+      ``"type": ["a", "b"]``, omitting ``"null"`` because Harmony ignores
+      it. This is done only when every ``anyOf`` member is a dict with a
+      ``"type"`` key; otherwise (e.g. a ``$ref`` member) ``anyOf`` is left
+      untouched.
+    - Recurses into each schema under ``"properties"``.
+    Args:
+        schema: The schema dict; it is mutated.
+    Returns:
+        The same ``schema`` object.
+    """
+    schema.pop("title", None)
+    if "default" in schema and schema["default"] is None:
+        del schema["default"]
+    variants = schema.get("anyOf")
+    # Collapsing a member without "type" would raise KeyError and would also
+    # discard that member's information, so such schemas keep their anyOf.
+    if variants is not None and all(
+            isinstance(variant, dict) and "type" in variant
+            for variant in variants):
+        schema["type"] = [
+            variant["type"] for variant in variants
+            if variant["type"] != 'null'
+        ]
+        del schema["anyOf"]
+    if "properties" in schema:
+        schema["properties"] = {
+            k: trim_schema(v)
+            for k, v in schema["properties"].items()
+        }
+    return schema
+def post_process_tools_description(
+        list_tools_result: "ListToolsResult") -> "ListToolsResult":
+    """Adapt an MCP ``list_tools`` result for Harmony, in place.
+    Every tool's ``inputSchema`` is passed through ``trim_schema``. Tools
+    whose ``annotations`` carry a falsy ``include_in_prompt`` are then
+    dropped, since some tool schemas don't need to be part of the prompt
+    (e.g. simple text in, text out for Python). A missing attribute counts
+    as True.
+    Returns:
+        The same ``list_tools_result`` object.
+    """
+    kept = []
+    for tool in list_tools_result.tools:
+        tool.inputSchema = trim_schema(tool.inputSchema)
+        if getattr(tool.annotations, "include_in_prompt", True):
+            kept.append(tool)
+    list_tools_result.tools = kept
+    return list_tools_result
+class ToolServer(ABC):
+    """Abstract interface for looking up tools by name and opening
+    sessions with them."""
+    @abstractmethod
+    def has_tool(self, tool_name: str) -> bool:
+        """
+        Tell whether ``tool_name`` is supported: True if so, False if not.
+        """
+    @abstractmethod
+    def get_tool_description(self,
+                             tool_name: str) -> Optional[ToolNamespaceConfig]:
+        """
+        Look up the description of ``tool_name``.
+        Returns None when the tool is not supported.
+        """
+    @abstractmethod
+    def new_session(
+        self,
+        tool_name: str,
+        session_id: str,
+        headers: Optional[dict[str, str]] = None
+    ) -> AbstractAsyncContextManager[Any]:
+        """
+        Open a session for ``tool_name``, returned as an async context
+        manager.
+        """
+        ...
+class MCPToolServer(ToolServer):
+    """Tool server whose tools are discovered from MCP servers over SSE.
+    ``harmony_tool_descriptions`` maps each server's reported name to its
+    ``ToolNamespaceConfig``; ``urls`` maps the same name to the SSE endpoint
+    used to open sessions.
+    """
+    def __init__(self):
+        """
+        Raises:
+            ImportError: If the ``mcp`` package is not installed.
+        """
+        try:
+            import mcp  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "mcp is not installed. Please run `pip install mcp` to use "
+                "MCPToolServer.") from None
+        self.harmony_tool_descriptions = {}
+        # Created here so that new_session() called before add_tool_server()
+        # raises the same KeyError as any unknown tool, not AttributeError.
+        self.urls: dict[str, str] = {}
+    async def add_tool_server(self, server_url: str):
+        """Query each server in ``server_url`` and register its tools.
+        Args:
+            server_url: Comma-separated server addresses; each one is
+                expanded to ``http://<address>/sse``. Whitespace around an
+                address is ignored.
+        Any previously registered servers are discarded. If two servers
+        report the same name, the first one wins and later ones are
+        skipped with a warning.
+        """
+        tool_urls = server_url.split(",")
+        self.harmony_tool_descriptions = {}
+        self.urls = {}
+        for url in tool_urls:
+            url = f"http://{url.strip()}/sse"
+            initialize_response, list_tools_response = (
+                await list_server_and_tools(url))
+            list_tools_response = post_process_tools_description(
+                list_tools_response)
+            tool_from_mcp = ToolNamespaceConfig(
+                name=initialize_response.serverInfo.name,
+                description=initialize_response.instructions,
+                tools=[
+                    ToolDescription.new(name=tool.name,
+                                        description=tool.description,
+                                        parameters=tool.inputSchema)
+                    for tool in list_tools_response.tools
+                ],
+            )
+            # Skip a duplicate before storing anything, so the description
+            # and the URL always belong to the same (first) server.
+            if tool_from_mcp.name in self.urls:
+                logger.warning(
+                    "Tool %s already exists. Ignoring duplicate tool server %s",
+                    tool_from_mcp.name, url)
+                continue
+            self.harmony_tool_descriptions[tool_from_mcp.name] = tool_from_mcp
+            self.urls[tool_from_mcp.name] = url
+        logger.info("MCPToolServer initialized with tools: %s",
+                    list(self.harmony_tool_descriptions.keys()))
+    def has_tool(self, tool_name: str):
+        """Return True if a server named ``tool_name`` is registered."""
+        return tool_name in self.harmony_tool_descriptions
+    def get_tool_description(self, tool_name: str):
+        """Return the registered description for ``tool_name``, or None."""
+        return self.harmony_tool_descriptions.get(tool_name)
+    @asynccontextmanager
+    async def new_session(self,
+                          tool_name: str,
+                          session_id: str,
+                          headers: Optional[dict[str, str]] = None):
+        """Open an initialized MCP client session for ``tool_name``.
+        The request carries an ``x-session-id`` header set to
+        ``session_id``; entries in ``headers`` are merged on top and may
+        override it.
+        Raises:
+            KeyError: If ``tool_name`` has no registered URL.
+        """
+        from mcp import ClientSession
+        from mcp.client.sse import sse_client
+        url = self.urls.get(tool_name)
+        request_headers = {"x-session-id": session_id}
+        if headers is not None:
+            request_headers.update(headers)
+        if not url:
+            raise KeyError(f"Tool '{tool_name}' is not supported")
+        async with sse_client(
+                url=url, headers=request_headers) as streams, ClientSession(
+                    *streams) as session:
+            await session.initialize()
+            yield session
+class DemoToolServer(ToolServer):
+    """Tool server exposing the in-process ``browser`` and ``python``
+    Harmony tools, each only if it reports itself enabled."""
+    def __init__(self):
+        self.tools: dict[str, Tool] = {}
+    async def init_and_validate(self):
+        """Construct both tools, run the Python tool's validation, and
+        register whichever tools remain enabled."""
+        browser_tool = HarmonyBrowserTool()
+        python_tool = HarmonyPythonTool()
+        await python_tool.validate()
+        for name, tool in (("browser", browser_tool), ("python",
+                                                       python_tool)):
+            if tool.enabled:
+                self.tools[name] = tool
+        logger.info("DemoToolServer initialized with tools: %s",
+                    list(self.tools.keys()))
+    def has_tool(self, tool_name: str) -> bool:
+        """Return True if ``tool_name`` has been registered."""
+        return tool_name in self.tools
+    def get_tool_description(self,
+                             tool_name: str) -> Optional[ToolNamespaceConfig]:
+        """Return the built-in Harmony namespace config for ``tool_name``,
+        or None if the tool is not registered.
+        Raises:
+            ValueError: If the tool is registered under a name other than
+                ``"browser"`` or ``"python"``.
+        """
+        if tool_name not in self.tools:
+            return None
+        if tool_name == "browser":
+            return ToolNamespaceConfig.browser()
+        if tool_name == "python":
+            return ToolNamespaceConfig.python()
+        raise ValueError(f"Unknown tool {tool_name}")
+    @asynccontextmanager
+    async def new_session(self,
+                          tool_name: str,
+                          session_id: str,
+                          headers: Optional[dict[str, str]] = None):
+        """Yield the registered tool object for ``tool_name``.
+        ``session_id`` and ``headers`` are accepted for interface
+        compatibility and are not used.
+        Raises:
+            KeyError: If ``tool_name`` is not registered.
+        """
+        if tool_name not in self.tools:
+            raise KeyError(f"Tool '{tool_name}' is not supported")
+        yield self.tools[tool_name]

vllm/entrypoints/utils.py ADDED Viewed

@@ -0,0 +1,233 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import dataclasses
+import functools
+import os
+from argparse import Namespace
+from typing import Any, Optional, Union
+from fastapi import Request
+from fastapi.responses import JSONResponse, StreamingResponse
+from starlette.background import BackgroundTask, BackgroundTasks
+from vllm.engine.arg_utils import EngineArgs
+from vllm.entrypoints.openai.cli_args import make_arg_parser
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              CompletionRequest)
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+logger = init_logger(__name__)
+# Help-epilog template listing the supported `--help=...` forms and the
+# documentation URL. `{subcmd}` is a placeholder that is not filled in here;
+# presumably callers substitute the subcommand name via `str.format` --
+# TODO confirm against the CLI code.
+VLLM_SUBCMD_PARSER_EPILOG = (
+    "For full list:            vllm {subcmd} --help=all\n"
+    "For a section:            vllm {subcmd} --help=ModelConfig    (case-insensitive)\n"  # noqa: E501
+    "For a flag:               vllm {subcmd} --help=max-model-len  (_ or - accepted)\n"  # noqa: E501
+    "Documentation:            https://docs.vllm.ai\n")
+async def listen_for_disconnect(request: Request) -> None:
+    """Consume messages from ``request`` until an ``http.disconnect``
+    arrives, then return.
+    On disconnect, the app-state counter ``server_load_metrics`` is
+    decremented, but only if ``enable_server_load_tracking`` is truthy and
+    the counter already exists.
+    """
+    message = await request.receive()
+    while message["type"] != "http.disconnect":
+        message = await request.receive()
+    state = request.app.state
+    tracking_enabled = getattr(state, "enable_server_load_tracking", False)
+    if tracking_enabled and hasattr(state, "server_load_metrics"):
+        state.server_load_metrics -= 1
+def with_cancellation(handler_func):
+    """Decorate a route handler so a client disconnect cancels it.
+    ``request.is_disconnected`` is _not_ used, since it does not work with
+    middleware. Following the pattern of starlette.StreamingResponse, two
+    tasks are awaited at once: one waits for an http disconnect message,
+    the other runs the handler. Whichever finishes first wins and the
+    other is cancelled.
+    This relies on the request body having been read already. That holds
+    for fastapi handlers whose body was parsed into a pydantic model. It is
+    unsafe elsewhere, because every incoming message for the request is
+    consumed and discarded while looking for the disconnect.
+    If the handler returns a `StreamingResponse`, this wrapper stops
+    listening and the response object listens for disconnects instead.
+    Returns:
+        The wrapped coroutine function. It returns the handler's result,
+        or None when the disconnect arrived first.
+    """
+    # functools.wraps makes the wrapper look like a normal route handler to
+    # fastapi, with the correct request type hinting.
+    @functools.wraps(handler_func)
+    async def wrapper(*args, **kwargs):
+        # The request is the second positional arg, else `raw_request`.
+        if len(args) > 1:
+            request = args[1]
+        else:
+            request = kwargs["raw_request"]
+        handler_task = asyncio.create_task(handler_func(*args, **kwargs))
+        disconnect_task = asyncio.create_task(listen_for_disconnect(request))
+        finished, unfinished = await asyncio.wait(
+            [handler_task, disconnect_task],
+            return_when=asyncio.FIRST_COMPLETED)
+        for leftover in unfinished:
+            leftover.cancel()
+        return handler_task.result() if handler_task in finished else None
+    return wrapper
+def decrement_server_load(request: Request):
+    """Subtract one from the ``server_load_metrics`` counter kept on the
+    app state of ``request``."""
+    app_state = request.app.state
+    app_state.server_load_metrics -= 1
+def load_aware_call(func):
+    """Decorate an async handler so the app-state counter
+    ``server_load_metrics`` tracks calls in flight.
+    The request is taken from the ``raw_request`` keyword, else from the
+    second positional argument. If ``enable_server_load_tracking`` is not
+    truthy on the app state, the handler is simply awaited.
+    Otherwise the counter is incremented before the call and decremented:
+    - immediately, if the handler raises an ``Exception`` or returns
+      something other than a JSONResponse / StreamingResponse;
+    - in a background task attached to the response, otherwise.
+    Raises (from the wrapper):
+        ValueError: If no request can be found in the arguments.
+    """
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        if "raw_request" in kwargs:
+            raw_request = kwargs["raw_request"]
+        elif len(args) > 1:
+            raw_request = args[1]
+        else:
+            raw_request = None
+        if raw_request is None:
+            raise ValueError(
+                "raw_request required when server load tracking is enabled")
+        state = raw_request.app.state
+        if not getattr(state, "enable_server_load_tracking", False):
+            return await func(*args, **kwargs)
+        # Make sure the counter exists before touching it.
+        if not hasattr(state, "server_load_metrics"):
+            state.server_load_metrics = 0
+        state.server_load_metrics += 1
+        try:
+            response = await func(*args, **kwargs)
+        except Exception:
+            state.server_load_metrics -= 1
+            raise
+        if not isinstance(response, (JSONResponse, StreamingResponse)):
+            state.server_load_metrics -= 1
+            return response
+        existing = response.background
+        if existing is None:
+            response.background = BackgroundTask(decrement_server_load,
+                                                 raw_request)
+        elif isinstance(existing, BackgroundTasks):
+            existing.add_task(decrement_server_load, raw_request)
+        elif isinstance(existing, BackgroundTask):
+            # Promote the lone BackgroundTask to a BackgroundTasks group and
+            # chain the decrement after it.
+            chained = BackgroundTasks()
+            chained.add_task(existing.func, *existing.args,
+                             **existing.kwargs)
+            chained.add_task(decrement_server_load, raw_request)
+            response.background = chained
+        return response
+    return wrapper
+def cli_env_setup():
+    """Default ``VLLM_WORKER_MULTIPROC_METHOD`` to ``spawn`` unless the
+    environment already defines it."""
+    # `spawn` is the safest multiprocessing method: the default `fork` is
+    # incompatible with some accelerators, and the default will change in
+    # future Python versions, so it is set explicitly where possible.
+    #
+    # It is set only here, in the CLI entrypoint. Switching to `spawn` could
+    # break existing code that uses vLLM as a library, because `spawn`
+    # misbehaves when the code is not protected by
+    # `if __name__ == "__main__":`.
+    #
+    # References:
+    # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+    # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
+    # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
+    # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders
+    if "VLLM_WORKER_MULTIPROC_METHOD" in os.environ:
+        return
+    logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'")
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+def _validate_truncation_size(
+    max_model_len: int,
+    truncate_prompt_tokens: Optional[int],
+    tokenization_kwargs: Optional[dict[str, Any]] = None,
+) -> Optional[int]:
+    """Resolve the prompt truncation size and record it in
+    ``tokenization_kwargs``.
+    Args:
+        max_model_len: Upper bound for the truncation size.
+        truncate_prompt_tokens: Requested size. None disables truncation;
+            any value <= -1 means "use ``max_model_len``".
+        tokenization_kwargs: Optional dict updated in place. It gets
+            ``truncation`` (bool) and, when truncating, ``max_length``.
+    Returns:
+        The resolved truncation size, or None when truncation is disabled.
+    Raises:
+        ValueError: If the requested size exceeds ``max_model_len``.
+    """
+    wants_kwargs = tokenization_kwargs is not None
+    if truncate_prompt_tokens is None:
+        if wants_kwargs:
+            tokenization_kwargs["truncation"] = False
+        return truncate_prompt_tokens
+    if truncate_prompt_tokens <= -1:
+        truncate_prompt_tokens = max_model_len
+    if truncate_prompt_tokens > max_model_len:
+        raise ValueError(
+            f"truncate_prompt_tokens value ({truncate_prompt_tokens}) "
+            f"is greater than max_model_len ({max_model_len})."
+            f" Please, select a smaller truncation size.")
+    if wants_kwargs:
+        tokenization_kwargs["truncation"] = True
+        tokenization_kwargs["max_length"] = truncate_prompt_tokens
+    return truncate_prompt_tokens
+def get_max_tokens(max_model_len: int, request: Union[ChatCompletionRequest,
+                                                      CompletionRequest],
+                   input_length: int, default_sampling_params: dict) -> int:
+    """Return the smallest applicable limit on generated tokens.
+    The candidates, with None values skipped, are:
+    - ``max_model_len - input_length``;
+    - the request's ``max_completion_tokens`` if present and truthy, else
+      its ``max_tokens``;
+    - ``current_platform.get_max_output_tokens(input_length)``;
+    - ``default_sampling_params["max_tokens"]`` if set.
+    """
+    requested = getattr(request, "max_completion_tokens",
+                        None) or request.max_tokens
+    candidates = (
+        max_model_len - input_length,
+        requested,
+        current_platform.get_max_output_tokens(input_length),
+        default_sampling_params.get("max_tokens"),
+    )
+    return min(limit for limit in candidates if limit is not None)
+def log_non_default_args(args: Union[Namespace, EngineArgs]):
+    """Log, at info level, the arguments whose values differ from their
+    defaults.
+    For a ``Namespace``, defaults come from parsing an empty argument list
+    with the parser built by ``make_arg_parser``. For an ``EngineArgs``,
+    they come from a fresh ``EngineArgs`` built with the same ``model``.
+    ``model`` is then reported separately if it differs from the class-level
+    default.
+    Raises:
+        TypeError: If ``args`` is neither a Namespace nor an EngineArgs.
+    """
+    if isinstance(args, Namespace):
+        defaults = vars(
+            make_arg_parser(FlexibleArgumentParser()).parse_args([]))
+        non_default_args = {
+            name: getattr(args, name)
+            for name, default in defaults.items()
+            if default != getattr(args, name)
+        }
+    elif isinstance(args, EngineArgs):
+        non_default_args = {}
+        # Baseline instance; it shares `model` with `args`, so `model` can
+        # never differ in the loop and is handled after it.
+        baseline = EngineArgs(model=args.model)
+        for spec in dataclasses.fields(args):
+            actual = getattr(args, spec.name)
+            expected = getattr(baseline, spec.name)
+            if actual != expected:
+                non_default_args[spec.name] = actual
+        if baseline.model != EngineArgs.model:
+            non_default_args["model"] = baseline.model
+    else:
+        raise TypeError("Unsupported argument type. "
+                        "Must be Namespace or EngineArgs instance.")
+    logger.info("non-default args: %s", non_default_args)

vllm/env_override.py ADDED Viewed

@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+import torch
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+# Set common config/environment variables that should apply to all
+# processes created by vllm and all processes that interact with vllm
+# workers. These statements run at import time, whenever `import vllm` is
+# called.
+# See https://github.com/vllm-project/vllm/pull/15951: this avoids
+# unintentional CUDA initialization from torch.cuda.is_available().
+os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
+# See https://github.com/vllm-project/vllm/issues/10480.
+os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
+# See https://github.com/vllm-project/vllm/issues/10619. This sets the same
+# limit on the already-imported torch inductor config object.
+torch._inductor.config.compile_threads = 1