PyPI - vllm-cpu - Versions diffs - 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu might be problematic. Click here for more details.

Files changed (1103) hide show

vllm/entrypoints/cli/__init__.py ADDED Viewed

File without changes

vllm/entrypoints/cli/benchmark/__init__.py ADDED Viewed

File without changes

vllm/entrypoints/cli/benchmark/base.py ADDED Viewed

@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.utils import FlexibleArgumentParser
+class BenchmarkSubcommandBase(CLISubcommand):
+    """ Common scaffolding shared by every `vllm bench` subcommand. """
+    @property
+    def help(self) -> str:
+        """Help text for the subcommand; concrete classes must override."""
+        raise NotImplementedError
+    def add_cli_args(self, parser: argparse.ArgumentParser) -> None:
+        """Register the subcommand's options on `parser`; must be overridden."""
+        raise NotImplementedError
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Execute the benchmark; concrete classes must override.
+        Args:
+            args: The parsed command-line arguments.
+        """
+        raise NotImplementedError
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        """Create this subcommand's parser under `subparsers` and fill it in.
+        The help text doubles as the parser description.
+        Returns:
+            The parser returned by `subparsers.add_parser`.
+        """
+        help_text = self.help
+        bench_parser = subparsers.add_parser(
+            self.name,
+            help=help_text,
+            description=help_text,
+            usage=f"vllm bench {self.name} [options]",
+        )
+        self.add_cli_args(bench_parser)
+        return bench_parser

vllm/entrypoints/cli/benchmark/latency.py ADDED Viewed

@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+from vllm.benchmarks.latency import add_cli_args, main
+from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+from vllm.entrypoints.cli.types import CLISubcommand
+class BenchmarkLatencySubcommand(BenchmarkSubcommandBase):
+    """ `vllm bench latency`: thin adapter over vllm.benchmarks.latency. """
+    def __init__(self):
+        super().__init__()
+        self.name = "latency"
+    @property
+    def help(self) -> str:
+        """One-line description of the latency benchmark."""
+        return "Benchmark the latency of a single batch of requests."
+    def add_cli_args(self, parser: argparse.ArgumentParser) -> None:
+        """Forward to the benchmark module's `add_cli_args`."""
+        add_cli_args(parser)
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Forward the parsed arguments to the benchmark module's `main`."""
+        main(args)
+def cmd_init() -> list[CLISubcommand]:
+    """Return a fresh one-element list holding the latency subcommand."""
+    latency_cmd = BenchmarkLatencySubcommand()
+    return [latency_cmd]

vllm/entrypoints/cli/benchmark/main.py ADDED Viewed

@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import vllm.entrypoints.cli.benchmark.latency
+import vllm.entrypoints.cli.benchmark.serve
+import vllm.entrypoints.cli.benchmark.throughput
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.utils import FlexibleArgumentParser
+# Modules whose `cmd_init()` supplies the subcommands registered under
+# `vllm bench`; BenchmarkSubcommand.subparser_init iterates this list.
+BENCHMARK_CMD_MODULES = [
+    vllm.entrypoints.cli.benchmark.latency,
+    vllm.entrypoints.cli.benchmark.serve,
+    vllm.entrypoints.cli.benchmark.throughput,
+]
+class BenchmarkSubcommand(CLISubcommand):
+    """ The `bench` subcommand for the vLLM CLI.
+    Groups the subcommands contributed by BENCHMARK_CMD_MODULES and keeps
+    them in `self.cmds`, keyed by name, so validation can be delegated to
+    whichever one `bench_type` selects.
+    """
+    def __init__(self):
+        self.name = "bench"
+        # Populated by subparser_init(); created here so validate() does not
+        # raise AttributeError when it runs before the parsers are built.
+        self.cmds: dict[str, CLISubcommand] = {}
+        super().__init__()
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Call `args.dispatch_function` with the parsed arguments."""
+        args.dispatch_function(args)
+    def validate(self, args: argparse.Namespace) -> None:
+        """Delegate validation to the subcommand named by `args.bench_type`.
+        Unknown names are ignored.
+        """
+        selected = self.cmds.get(args.bench_type)
+        if selected is not None:
+            selected.validate(args)
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        """Build the `bench` parser and one nested parser per benchmark.
+        Each nested parser gets `dispatch_function` defaulted to its
+        subcommand's `cmd`.
+        Returns:
+            The top-level `bench` parser.
+        """
+        bench_parser = subparsers.add_parser(
+            "bench",
+            help="vLLM bench subcommand.",
+            description="vLLM bench subcommand.",
+            usage="vllm bench <bench_type> [options]")
+        bench_subparsers = bench_parser.add_subparsers(required=True,
+                                                       dest="bench_type")
+        # Start from an empty table so repeated initialisation does not keep
+        # stale entries.
+        self.cmds = {}
+        for cmd_module in BENCHMARK_CMD_MODULES:
+            for cmd in cmd_module.cmd_init():
+                cmd.subparser_init(bench_subparsers).set_defaults(
+                    dispatch_function=cmd.cmd)
+                self.cmds[cmd.name] = cmd
+        return bench_parser
+def cmd_init() -> list[CLISubcommand]:
+    """Return a fresh one-element list holding the `bench` subcommand."""
+    bench_cmd = BenchmarkSubcommand()
+    return [bench_cmd]

vllm/entrypoints/cli/benchmark/serve.py ADDED Viewed

@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+from vllm.benchmarks.serve import add_cli_args, main
+from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+from vllm.entrypoints.cli.types import CLISubcommand
+class BenchmarkServingSubcommand(BenchmarkSubcommandBase):
+    """ `vllm bench serve`: thin adapter over vllm.benchmarks.serve. """
+    def __init__(self):
+        super().__init__()
+        self.name = "serve"
+    @property
+    def help(self) -> str:
+        """One-line description of the serving benchmark."""
+        return "Benchmark the online serving throughput."
+    def add_cli_args(self, parser: argparse.ArgumentParser) -> None:
+        """Forward to the benchmark module's `add_cli_args`."""
+        add_cli_args(parser)
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Forward the parsed arguments to the benchmark module's `main`."""
+        main(args)
+def cmd_init() -> list[CLISubcommand]:
+    """Return a fresh one-element list holding the serving subcommand."""
+    serving_cmd = BenchmarkServingSubcommand()
+    return [serving_cmd]

vllm/entrypoints/cli/benchmark/throughput.py ADDED Viewed

@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+from vllm.benchmarks.throughput import add_cli_args, main
+from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+from vllm.entrypoints.cli.types import CLISubcommand
+class BenchmarkThroughputSubcommand(BenchmarkSubcommandBase):
+    """ `vllm bench throughput`: adapter over vllm.benchmarks.throughput. """
+    def __init__(self):
+        super().__init__()
+        self.name = "throughput"
+    @property
+    def help(self) -> str:
+        """One-line description of the throughput benchmark."""
+        return "Benchmark offline inference throughput."
+    def add_cli_args(self, parser: argparse.ArgumentParser) -> None:
+        """Forward to the benchmark module's `add_cli_args`."""
+        add_cli_args(parser)
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Forward the parsed arguments to the benchmark module's `main`."""
+        main(args)
+def cmd_init() -> list[CLISubcommand]:
+    """Return a fresh one-element list holding the throughput subcommand."""
+    throughput_cmd = BenchmarkThroughputSubcommand()
+    return [throughput_cmd]

vllm/entrypoints/cli/collect_env.py ADDED Viewed

@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+from vllm.collect_env import main as collect_env_main
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.entrypoints.openai.cli_args import make_arg_parser
+from vllm.utils import FlexibleArgumentParser
+class CollectEnvSubcommand(CLISubcommand):
+    """The `collect-env` subcommand for the vLLM CLI. """
+    def __init__(self):
+        self.name = "collect-env"
+        super().__init__()
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Collect information about the environment.
+        `args` is accepted for interface compatibility and is not used.
+        """
+        collect_env_main()
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        """Register the `collect-env` parser.
+        Returns:
+            The parser returned by `subparsers.add_parser`.
+        """
+        # The usage line advertises no options and cmd() ignores `args`, so
+        # the parser is returned as-is instead of being populated with
+        # server options by make_arg_parser().
+        collect_env_parser = subparsers.add_parser(
+            "collect-env",
+            help="Start collecting environment information.",
+            description="Start collecting environment information.",
+            usage="vllm collect-env")
+        return collect_env_parser
+def cmd_init() -> list[CLISubcommand]:
+    """Return a fresh one-element list holding the collect-env subcommand."""
+    collect_env_cmd = CollectEnvSubcommand()
+    return [collect_env_cmd]

vllm/entrypoints/cli/main.py ADDED Viewed

@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# The CLI entrypoint to vLLM.
+import signal
+import sys
+import vllm.entrypoints.cli.benchmark.main
+import vllm.entrypoints.cli.collect_env
+import vllm.entrypoints.cli.openai
+import vllm.entrypoints.cli.serve
+import vllm.version
+from vllm.entrypoints.utils import cli_env_setup
+from vllm.utils import FlexibleArgumentParser
+# Modules whose `cmd_init()` supplies the top-level `vllm` subcommands;
+# main() iterates this list to build the subparsers.
+CMD_MODULES = [
+    vllm.entrypoints.cli.openai,
+    vllm.entrypoints.cli.serve,
+    vllm.entrypoints.cli.benchmark.main,
+    vllm.entrypoints.cli.collect_env,
+]
+def register_signal_handlers():
+    """Make SIGINT and, where defined, SIGTSTP exit the process with code 0."""
+    def signal_handler(sig, frame):
+        sys.exit(0)
+    signal.signal(signal.SIGINT, signal_handler)
+    # SIGTSTP is only defined on Unix; skip it elsewhere rather than fail
+    # with AttributeError on the attribute lookup.
+    sigtstp = getattr(signal, "SIGTSTP", None)
+    if sigtstp is not None:
+        signal.signal(sigtstp, signal_handler)
+def main():
+    """Entry point of the `vllm` command.
+    Builds the top-level parser, lets every module in CMD_MODULES register
+    its subcommands, validates the selected one and dispatches to it.
+    Prints the help text when no subcommand supplied a dispatch function.
+    """
+    cli_env_setup()
+    parser = FlexibleArgumentParser(description="vLLM CLI")
+    parser.add_argument('-v',
+                        '--version',
+                        action='version',
+                        version=vllm.version.__version__)
+    subparsers = parser.add_subparsers(required=False, dest="subparser")
+    cmds = {}
+    for module in CMD_MODULES:
+        for subcommand in module.cmd_init():
+            subcommand_parser = subcommand.subparser_init(subparsers)
+            subcommand_parser.set_defaults(dispatch_function=subcommand.cmd)
+            cmds[subcommand.name] = subcommand
+    args = parser.parse_args()
+    if args.subparser in cmds:
+        cmds[args.subparser].validate(args)
+    # No subcommand chosen: nothing to dispatch to, so show the help.
+    if not hasattr(args, "dispatch_function"):
+        parser.print_help()
+        return
+    args.dispatch_function(args)
+# Run the CLI when this module is executed directly as a script.
+if __name__ == "__main__":
+    main()

vllm/entrypoints/cli/openai.py ADDED Viewed

@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: Apache-2.0
+# Commands that act as an interactive OpenAI API client
+import argparse
+import os
+import signal
+import sys
+from typing import Optional
+from openai import OpenAI
+from openai.types.chat import ChatCompletionMessageParam
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.utils import FlexibleArgumentParser
+def _register_signal_handlers():
+    """Make SIGINT and, where defined, SIGTSTP exit the process with code 0."""
+    def signal_handler(sig, frame):
+        sys.exit(0)
+    signal.signal(signal.SIGINT, signal_handler)
+    # SIGTSTP is only defined on Unix; skip it elsewhere rather than fail
+    # with AttributeError on the attribute lookup.
+    sigtstp = getattr(signal, "SIGTSTP", None)
+    if sigtstp is not None:
+        signal.signal(sigtstp, signal_handler)
+def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]:
+    """Prepare an interactive session against the API server.
+    Installs the signal handlers, builds an OpenAI client for `args.url`
+    and picks the model: `args.model_name` when given, otherwise the first
+    entry returned by the server's model listing.
+    Returns:
+        A `(model_name, client)` pair.
+    """
+    _register_signal_handlers()
+    # An explicit --api-key wins over the OPENAI_API_KEY environment variable.
+    api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY")
+    openai_client = OpenAI(api_key=api_key, base_url=args.url)
+    model_name = (args.model_name
+                  or openai_client.models.list().data[0].id)
+    print(f"Using model: {model_name}")
+    return model_name, openai_client
+def chat(system_prompt: Optional[str], model_name: str,
+         client: OpenAI) -> None:
+    """Run an interactive multi-turn chat loop on stdin/stdout.
+    The whole conversation so far, optionally seeded with a system message,
+    is sent on every turn. The loop ends when stdin reaches EOF.
+    """
+    history: list[ChatCompletionMessageParam] = []
+    if system_prompt is not None:
+        history.append({"role": "system", "content": system_prompt})
+    print("Please enter a message for the chat model:")
+    while True:
+        try:
+            user_text = input("> ")
+        except EOFError:
+            return
+        history.append({"role": "user", "content": user_text})
+        completion = client.chat.completions.create(model=model_name,
+                                                    messages=history)
+        reply = completion.choices[0].message
+        # Keep the assistant turn so the next request carries full context.
+        history.append(reply)  # type: ignore
+        print(reply.content)
+def _add_query_options(
+        parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+    """Add the client options shared by the interactive subcommands.
+    Registers `--url`, `--model-name` and `--api-key` (all strings) on
+    `parser` and returns the same parser.
+    """
+    # (flag, default, help) for each option, in registration order.
+    shared_options = (
+        ("--url", "http://localhost:8000/v1",
+         "url of the running OpenAI-Compatible RESTful API server"),
+        ("--model-name", None,
+         ("The model name used in prompt completion, default to "
+          "the first model in list models API call.")),
+        ("--api-key", None,
+         ("API key for OpenAI services. If provided, this api key "
+          "will overwrite the api key obtained through environment "
+          "variables.")),
+    )
+    for flag, default, help_text in shared_options:
+        parser.add_argument(flag, type=str, default=default, help=help_text)
+    return parser
+class ChatCommand(CLISubcommand):
+    """The `chat` subcommand for the vLLM CLI. """
+    def __init__(self):
+        self.name = "chat"
+        super().__init__()
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Start an interactive chat session against the running API server.
+        Args:
+            args: Parsed arguments; reads the shared query options and
+                `system_prompt`.
+        """
+        model_name, client = _interactive_cli(args)
+        # The loop lives in the module-level chat() helper, so it is not
+        # duplicated here.
+        chat(args.system_prompt, model_name, client)
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        """Register the `chat` parser with the shared query options plus
+        `--system-prompt`, and return it.
+        """
+        chat_parser = subparsers.add_parser(
+            "chat",
+            help="Generate chat completions via the running API server.",
+            description="Generate chat completions via the running API server.",
+            usage="vllm chat [options]")
+        _add_query_options(chat_parser)
+        chat_parser.add_argument(
+            "--system-prompt",
+            type=str,
+            default=None,
+            help=("The system prompt to be added to the chat template, "
+                  "used for models that support system prompts."))
+        return chat_parser
+class CompleteCommand(CLISubcommand):
+    """The `complete` subcommand for the vLLM CLI. """
+    def __init__(self):
+        self.name = "complete"
+        super().__init__()
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Read prompts from stdin and print one completion for each.
+        The loop ends when stdin reaches EOF.
+        Args:
+            args: Parsed arguments; reads the shared query options.
+        """
+        model_name, client = _interactive_cli(args)
+        print("Please enter prompt to complete:")
+        while True:
+            try:
+                input_prompt = input("> ")
+            except EOFError:
+                # Ctrl-D / closed stdin ends the session quietly, as in the
+                # chat loop, instead of surfacing a traceback.
+                return
+            completion = client.completions.create(model=model_name,
+                                                   prompt=input_prompt)
+            output = completion.choices[0].text
+            print(output)
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        """Register the `complete` parser with the shared query options and
+        return it.
+        """
+        complete_parser = subparsers.add_parser(
+            "complete",
+            help=("Generate text completions based on the given prompt "
+                  "via the running API server."),
+            description=("Generate text completions based on the given prompt "
+                         "via the running API server."),
+            usage="vllm complete [options]")
+        _add_query_options(complete_parser)
+        return complete_parser
+def cmd_init() -> list[CLISubcommand]:
+    """Return fresh `chat` and `complete` subcommands, in that order."""
+    chat_cmd = ChatCommand()
+    complete_cmd = CompleteCommand()
+    return [chat_cmd, complete_cmd]

vllm/entrypoints/cli/serve.py ADDED Viewed

@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import uvloop
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.entrypoints.openai.api_server import run_server
+from vllm.entrypoints.openai.cli_args import (make_arg_parser,
+                                              validate_parsed_serve_args)
+from vllm.utils import FlexibleArgumentParser
+class ServeSubcommand(CLISubcommand):
+    """The `serve` subcommand for the vLLM CLI. """
+    def __init__(self):
+        self.name = "serve"
+        super().__init__()
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Run the API server with the parsed arguments under uvloop."""
+        # If model is specified in CLI (as positional arg), it takes precedence
+        if hasattr(args, 'model_tag') and args.model_tag is not None:
+            args.model = args.model_tag
+        uvloop.run(run_server(args))
+    def validate(self, args: argparse.Namespace) -> None:
+        """Validate the parsed arguments with `validate_parsed_serve_args`."""
+        validate_parsed_serve_args(args)
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        """Register the `serve` parser.
+        Adds the optional positional `model_tag` and `--config`, then hands
+        the parser to `make_arg_parser` and returns its result.
+        """
+        serve_parser = subparsers.add_parser(
+            "serve",
+            help="Start the vLLM OpenAI Compatible API server.",
+            description="Start the vLLM OpenAI Compatible API server.",
+            usage="vllm serve [model_tag] [options]")
+        serve_parser.add_argument("model_tag",
+                                  type=str,
+                                  nargs='?',
+                                  help="The model tag to serve "
+                                  "(optional if specified in config)")
+        # Adjacent literals are concatenated verbatim, so each fragment needs
+        # its own trailing space to keep the rendered help readable.
+        serve_parser.add_argument(
+            "--config",
+            type=str,
+            default='',
+            required=False,
+            help="Read CLI options from a config file. "
+            "Must be a YAML with the following options: "
+            "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference"
+        )
+        return make_arg_parser(serve_parser)
+def cmd_init() -> list[CLISubcommand]:
+    """Return a fresh one-element list holding the `serve` subcommand."""
+    serve_cmd = ServeSubcommand()
+    return [serve_cmd]

vllm/entrypoints/cli/types.py ADDED Viewed

@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+from vllm.utils import FlexibleArgumentParser
+class CLISubcommand:
+    """Interface implemented by every vLLM CLI subcommand."""
+    # Name of the subcommand; set by each concrete class.
+    name: str
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Execute the subcommand with the parsed arguments."""
+        raise NotImplementedError("Subclasses should implement this method")
+    def validate(self, args: argparse.Namespace) -> None:
+        """Check the parsed arguments; the default accepts everything."""
+        return None
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        """Create and return this subcommand's parser under `subparsers`."""
+        raise NotImplementedError("Subclasses should implement this method")

vllm/entrypoints/launcher.py ADDED Viewed

@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+import signal
+import socket
+from http import HTTPStatus
+from typing import Any, Optional
+import uvicorn
+from fastapi import FastAPI, Request, Response
+from vllm import envs
+from vllm.engine.async_llm_engine import AsyncEngineDeadError
+from vllm.engine.multiprocessing import MQEngineDeadError
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.ssl import SSLCertRefresher
+from vllm.logger import init_logger
+from vllm.utils import find_process_using_port
+from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
+logger = init_logger(__name__)
+async def serve_http(app: FastAPI,
+                     sock: Optional[socket.socket],
+                     enable_ssl_refresh: bool = False,
+                     **uvicorn_kwargs: Any):
+    """Serve `app` with uvicorn until it exits or a signal cancels it.
+    Args:
+        app: The FastAPI application; `app.state.engine_client` is handed
+            to the watchdog task.
+        sock: Optional socket passed to `server.serve`; when falsy, no
+            sockets are passed.
+        enable_ssl_refresh: When true, create an SSLCertRefresher from the
+            uvicorn SSL settings.
+        **uvicorn_kwargs: Forwarded to `uvicorn.Config`.
+    Returns:
+        A not-yet-awaited shutdown awaitable: a no-op coroutine when the
+        server task finished by itself, otherwise `server.shutdown()`.
+    """
+    logger.info("Available routes are:")
+    for route in app.routes:
+        methods = getattr(route, "methods", None)
+        path = getattr(route, "path", None)
+        if methods is None or path is None:
+            continue
+        logger.info("Route: %s, Methods: %s", path, ', '.join(methods))
+    config = uvicorn.Config(app, **uvicorn_kwargs)
+    config.load()
+    server = uvicorn.Server(config)
+    _add_shutdown_handlers(app, server)
+    loop = asyncio.get_running_loop()
+    watchdog_task = loop.create_task(
+        watchdog_loop(server, app.state.engine_client))
+    server_task = loop.create_task(
+        server.serve(sockets=[sock] if sock else None))
+    ssl_cert_refresher = None if not enable_ssl_refresh else SSLCertRefresher(
+        ssl_context=config.ssl,
+        key_path=config.ssl_keyfile,
+        cert_path=config.ssl_certfile,
+        ca_path=config.ssl_ca_certs)
+    def signal_handler() -> None:
+        # prevents the uvicorn signal handler to exit early
+        server_task.cancel()
+        watchdog_task.cancel()
+        if ssl_cert_refresher:
+            ssl_cert_refresher.stop()
+    async def dummy_shutdown() -> None:
+        pass
+    loop.add_signal_handler(signal.SIGINT, signal_handler)
+    loop.add_signal_handler(signal.SIGTERM, signal_handler)
+    try:
+        await server_task
+        return dummy_shutdown()
+    except asyncio.CancelledError:
+        # "port" is an optional uvicorn setting; a missing key must not raise
+        # KeyError here and derail the shutdown path.
+        port = uvicorn_kwargs.get("port")
+        process = (find_process_using_port(port)
+                   if port is not None else None)
+        if process is not None:
+            logger.debug(
+                "port %s is used by process %s launched with command:\n%s",
+                port, process, " ".join(process.cmdline()))
+        logger.info("Shutting down FastAPI HTTP server.")
+        return server.shutdown()
+    finally:
+        watchdog_task.cancel()
+async def watchdog_loop(server: uvicorn.Server, engine: EngineClient):
+    """
+    Watchdog task that runs in the background, checking for an error
+    state in the engine. Needed to trigger shutdown if an exception
+    arises in a StreamingResponse() generator.
+    """
+    poll_interval_s = 5.0
+    while True:
+        await asyncio.sleep(poll_interval_s)
+        terminate_if_errored(server, engine)
+def terminate_if_errored(server: uvicorn.Server, engine: EngineClient):
+    """
+    Ask the uvicorn server to exit when the engine has errored and is no
+    longer running, unless VLLM_KEEP_ALIVE_ON_ENGINE_DEATH is set.
+    See discussions here on shutting down a uvicorn server
+    https://github.com/encode/uvicorn/discussions/1103
+    In this case we cannot await the server shutdown here
+    because handler must first return to close the connection
+    for this request.
+    """
+    engine_is_dead = engine.errored and not engine.is_running
+    if envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH or not engine_is_dead:
+        return
+    server.should_exit = True
+def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:
+    """
+    Install one exception handler on `app` for engine failures.
+    VLLM V1 AsyncLLM catches exceptions and returns
+    only two types: EngineGenerateError and EngineDeadError.
+    EngineGenerateError is raised by the per request generate()
+    method. This error could be request specific (and therefore
+    recoverable - e.g. if there is an error in input processing).
+    EngineDeadError is raised by the background output_handler
+    method. This error is global and therefore not recoverable.
+    We register these @app.exception_handlers to return nice
+    responses to the end user if they occur and shut down if needed.
+    See https://fastapi.tiangolo.com/tutorial/handling-errors/
+    for more details on how exception handlers work.
+    If an exception is encountered in a StreamingResponse
+    generator, the exception is not raised, since we already sent
+    a 200 status. Rather, we send an error message as the next chunk.
+    Since the exception is not raised, this means that the server
+    will not automatically shut down. Instead, we use the watchdog
+    background task to check for errored state.
+    """
+    async def runtime_exception_handler(request: Request, __):
+        # Flag the server for exit if the engine is dead, then answer 500.
+        terminate_if_errored(
+            server=server,
+            engine=request.app.state.engine_client,
+        )
+        return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
+    # The same handler serves every engine failure type plus RuntimeError.
+    handled_errors = (
+        EngineGenerateError,
+        EngineDeadError,
+        MQEngineDeadError,
+        AsyncEngineDeadError,
+        RuntimeError,
+    )
+    for error_type in handled_errors:
+        app.exception_handler(error_type)(runtime_exception_handler)