PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/entrypoints/cli/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
+from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
+from vllm.entrypoints.cli.benchmark.throughput import (
+    BenchmarkThroughputSubcommand)
+__all__: list[str] = [
+    "BenchmarkLatencySubcommand",
+    "BenchmarkServingSubcommand",
+    "BenchmarkThroughputSubcommand",
+]

vllm/entrypoints/cli/benchmark/__init__.py ADDED Viewed

File without changes

vllm/entrypoints/cli/benchmark/base.py ADDED Viewed

@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+from vllm.entrypoints.cli.types import CLISubcommand
+class BenchmarkSubcommandBase(CLISubcommand):
+    """ The base class of subcommands for vllm bench. """
+    help: str
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
+        """Add the CLI arguments to the parser."""
+        raise NotImplementedError
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Run the benchmark.
+        Args:
+            args: The arguments to the command.
+        """
+        raise NotImplementedError

vllm/entrypoints/cli/benchmark/latency.py ADDED Viewed

@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+from vllm.benchmarks.latency import add_cli_args, main
+from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+class BenchmarkLatencySubcommand(BenchmarkSubcommandBase):
+    """ The `latency` subcommand for vllm bench. """
+    name = "latency"
+    help = "Benchmark the latency of a single batch of requests."
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
+        add_cli_args(parser)
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        main(args)

vllm/entrypoints/cli/benchmark/main.py ADDED Viewed

@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+import argparse
+import typing
+from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
+if typing.TYPE_CHECKING:
+    from vllm.utils import FlexibleArgumentParser
+class BenchmarkSubcommand(CLISubcommand):
+    """ The `bench` subcommand for the vLLM CLI. """
+    name = "bench"
+    help = "vLLM bench subcommand."
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        args.dispatch_function(args)
+    def validate(self, args: argparse.Namespace) -> None:
+        pass
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        bench_parser = subparsers.add_parser(
+            self.name,
+            description=self.help,
+            usage=f"vllm {self.name} <bench_type> [options]")
+        bench_subparsers = bench_parser.add_subparsers(required=True,
+                                                       dest="bench_type")
+        for cmd_cls in BenchmarkSubcommandBase.__subclasses__():
+            cmd_subparser = bench_subparsers.add_parser(
+                cmd_cls.name,
+                help=cmd_cls.help,
+                description=cmd_cls.help,
+                usage=f"vllm {self.name} {cmd_cls.name} [options]",
+            )
+            cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd)
+            cmd_cls.add_cli_args(cmd_subparser)
+            cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
+                subcmd=f"{self.name} {cmd_cls.name}")
+        return bench_parser
+def cmd_init() -> list[CLISubcommand]:
+    return [BenchmarkSubcommand()]

vllm/entrypoints/cli/benchmark/serve.py ADDED Viewed

@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+from vllm.benchmarks.serve import add_cli_args, main
+from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+class BenchmarkServingSubcommand(BenchmarkSubcommandBase):
+    """ The `serve` subcommand for vllm bench. """
+    name = "serve"
+    help = "Benchmark the online serving throughput."
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
+        add_cli_args(parser)
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        main(args)

vllm/entrypoints/cli/benchmark/throughput.py ADDED Viewed

@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+from vllm.benchmarks.throughput import add_cli_args, main
+from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+class BenchmarkThroughputSubcommand(BenchmarkSubcommandBase):
+    """ The `throughput` subcommand for vllm bench. """
+    name = "throughput"
+    help = "Benchmark offline inference throughput."
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
+        add_cli_args(parser)
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        main(args)

vllm/entrypoints/cli/collect_env.py ADDED Viewed

@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+import argparse
+import typing
+from vllm.collect_env import main as collect_env_main
+from vllm.entrypoints.cli.types import CLISubcommand
+if typing.TYPE_CHECKING:
+    from vllm.utils import FlexibleArgumentParser
+class CollectEnvSubcommand(CLISubcommand):
+    """The `collect-env` subcommand for the vLLM CLI. """
+    name = "collect-env"
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Collect information about the environment."""
+        collect_env_main()
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        return subparsers.add_parser(
+            "collect-env",
+            help="Start collecting environment information.",
+            description="Start collecting environment information.",
+            usage="vllm collect-env")
+def cmd_init() -> list[CLISubcommand]:
+    return [CollectEnvSubcommand()]

vllm/entrypoints/cli/main.py ADDED Viewed

@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+'''The CLI entrypoints of vLLM
+Note that all future modules must be lazily loaded within main
+to avoid certain eager import breakage.'''
+from __future__ import annotations
+import importlib.metadata
+def main():
+    import vllm.entrypoints.cli.benchmark.main
+    import vllm.entrypoints.cli.collect_env
+    import vllm.entrypoints.cli.openai
+    import vllm.entrypoints.cli.run_batch
+    import vllm.entrypoints.cli.serve
+    from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
+    from vllm.utils import FlexibleArgumentParser
+    CMD_MODULES = [
+        vllm.entrypoints.cli.openai,
+        vllm.entrypoints.cli.serve,
+        vllm.entrypoints.cli.benchmark.main,
+        vllm.entrypoints.cli.collect_env,
+        vllm.entrypoints.cli.run_batch,
+    ]
+    cli_env_setup()
+    parser = FlexibleArgumentParser(
+        description="vLLM CLI",
+        epilog=VLLM_SUBCMD_PARSER_EPILOG.format(subcmd="[subcommand]"),
+    )
+    parser.add_argument(
+        '-v',
+        '--version',
+        action='version',
+        version=importlib.metadata.version('vllm'),
+    )
+    subparsers = parser.add_subparsers(required=False, dest="subparser")
+    cmds = {}
+    for cmd_module in CMD_MODULES:
+        new_cmds = cmd_module.cmd_init()
+        for cmd in new_cmds:
+            cmd.subparser_init(subparsers).set_defaults(
+                dispatch_function=cmd.cmd)
+            cmds[cmd.name] = cmd
+    args = parser.parse_args()
+    if args.subparser in cmds:
+        cmds[args.subparser].validate(args)
+    if hasattr(args, "dispatch_function"):
+        args.dispatch_function(args)
+    else:
+        parser.print_help()
+if __name__ == "__main__":
+    main()

vllm/entrypoints/cli/openai.py ADDED Viewed

@@ -0,0 +1,233 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+import argparse
+import os
+import signal
+import sys
+from typing import TYPE_CHECKING
+from openai import OpenAI
+from openai.types.chat import ChatCompletionMessageParam
+from vllm.entrypoints.cli.types import CLISubcommand
+if TYPE_CHECKING:
+    from vllm.utils import FlexibleArgumentParser
+def _register_signal_handlers():
+    def signal_handler(sig, frame):
+        sys.exit(0)
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTSTP, signal_handler)
+def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]:
+    _register_signal_handlers()
+    base_url = args.url
+    api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY")
+    openai_client = OpenAI(api_key=api_key, base_url=base_url)
+    if args.model_name:
+        model_name = args.model_name
+    else:
+        available_models = openai_client.models.list()
+        model_name = available_models.data[0].id
+    print(f"Using model: {model_name}")
+    return model_name, openai_client
+def _print_chat_stream(stream) -> str:
+    output = ""
+    for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.content:
+            output += delta.content
+            print(delta.content, end="", flush=True)
+    print()
+    return output
+def _print_completion_stream(stream) -> str:
+    output = ""
+    for chunk in stream:
+        text = chunk.choices[0].text
+        if text is not None:
+            output += text
+            print(text, end="", flush=True)
+    print()
+    return output
+def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None:
+    conversation: list[ChatCompletionMessageParam] = []
+    if system_prompt is not None:
+        conversation.append({"role": "system", "content": system_prompt})
+    print("Please enter a message for the chat model:")
+    while True:
+        try:
+            input_message = input("> ")
+        except EOFError:
+            break
+        conversation.append({"role": "user", "content": input_message})
+        stream = client.chat.completions.create(model=model_name,
+                                                messages=conversation,
+                                                stream=True)
+        output = _print_chat_stream(stream)
+        conversation.append({"role": "assistant", "content": output})
+def _add_query_options(
+        parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="http://localhost:8000/v1",
+        help="url of the running OpenAI-Compatible RESTful API server")
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default=None,
+        help=("The model name used in prompt completion, default to "
+              "the first model in list models API call."))
+    parser.add_argument(
+        "--api-key",
+        type=str,
+        default=None,
+        help=(
+            "API key for OpenAI services. If provided, this api key "
+            "will overwrite the api key obtained through environment variables."
+        ))
+    return parser
+class ChatCommand(CLISubcommand):
+    """The `chat` subcommand for the vLLM CLI. """
+    name = "chat"
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        model_name, client = _interactive_cli(args)
+        system_prompt = args.system_prompt
+        conversation: list[ChatCompletionMessageParam] = []
+        if system_prompt is not None:
+            conversation.append({"role": "system", "content": system_prompt})
+        if args.quick:
+            conversation.append({"role": "user", "content": args.quick})
+            stream = client.chat.completions.create(model=model_name,
+                                                    messages=conversation,
+                                                    stream=True)
+            output = _print_chat_stream(stream)
+            conversation.append({"role": "assistant", "content": output})
+            return
+        print("Please enter a message for the chat model:")
+        while True:
+            try:
+                input_message = input("> ")
+            except EOFError:
+                break
+            conversation.append({"role": "user", "content": input_message})
+            stream = client.chat.completions.create(model=model_name,
+                                                    messages=conversation,
+                                                    stream=True)
+            output = _print_chat_stream(stream)
+            conversation.append({"role": "assistant", "content": output})
+    @staticmethod
+    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+        """Add CLI arguments for the chat command."""
+        _add_query_options(parser)
+        parser.add_argument(
+            "--system-prompt",
+            type=str,
+            default=None,
+            help=("The system prompt to be added to the chat template, "
+                  "used for models that support system prompts."))
+        parser.add_argument("-q",
+                            "--quick",
+                            type=str,
+                            metavar="MESSAGE",
+                            help=("Send a single prompt as MESSAGE "
+                                  "and print the response, then exit."))
+        return parser
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        parser = subparsers.add_parser(
+            "chat",
+            help="Generate chat completions via the running API server.",
+            description="Generate chat completions via the running API server.",
+            usage="vllm chat [options]")
+        return ChatCommand.add_cli_args(parser)
+class CompleteCommand(CLISubcommand):
+    """The `complete` subcommand for the vLLM CLI. """
+    name = 'complete'
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        model_name, client = _interactive_cli(args)
+        if args.quick:
+            stream = client.completions.create(model=model_name,
+                                               prompt=args.quick,
+                                               stream=True)
+            _print_completion_stream(stream)
+            return
+        print("Please enter prompt to complete:")
+        while True:
+            try:
+                input_prompt = input("> ")
+            except EOFError:
+                break
+            stream = client.completions.create(model=model_name,
+                                               prompt=input_prompt,
+                                               stream=True)
+            _print_completion_stream(stream)
+    @staticmethod
+    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+        """Add CLI arguments for the complete command."""
+        _add_query_options(parser)
+        parser.add_argument(
+            "-q",
+            "--quick",
+            type=str,
+            metavar="PROMPT",
+            help=
+            "Send a single prompt and print the completion output, then exit.")
+        return parser
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        parser = subparsers.add_parser(
+            "complete",
+            help=("Generate text completions based on the given prompt "
+                  "via the running API server."),
+            description=("Generate text completions based on the given prompt "
+                         "via the running API server."),
+            usage="vllm complete [options]")
+        return CompleteCommand.add_cli_args(parser)
+def cmd_init() -> list[CLISubcommand]:
+    return [ChatCommand(), CompleteCommand()]

vllm/entrypoints/cli/run_batch.py ADDED Viewed

@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+import argparse
+import asyncio
+import importlib.metadata
+import typing
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
+from vllm.logger import init_logger
+if typing.TYPE_CHECKING:
+    from vllm.utils import FlexibleArgumentParser
+logger = init_logger(__name__)
+class RunBatchSubcommand(CLISubcommand):
+    """The `run-batch` subcommand for vLLM CLI."""
+    name = "run-batch"
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        from vllm.entrypoints.openai.run_batch import main as run_batch_main
+        logger.info("vLLM batch processing API version %s",
+                    importlib.metadata.version("vllm"))
+        logger.info("args: %s", args)
+        # Start the Prometheus metrics server.
+        # LLMEngine uses the Prometheus client
+        # to publish metrics at the /metrics endpoint.
+        if args.enable_metrics:
+            from prometheus_client import start_http_server
+            logger.info("Prometheus metrics enabled")
+            start_http_server(port=args.port, addr=args.url)
+        else:
+            logger.info("Prometheus metrics disabled")
+        asyncio.run(run_batch_main(args))
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        from vllm.entrypoints.openai.run_batch import make_arg_parser
+        run_batch_parser = subparsers.add_parser(
+            self.name,
+            help="Run batch prompts and write results to file.",
+            description=(
+                "Run batch prompts using vLLM's OpenAI-compatible API.\n"
+                "Supports local or HTTP input/output files."),
+            usage=
+            "vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>",
+        )
+        run_batch_parser = make_arg_parser(run_batch_parser)
+        run_batch_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
+            subcmd=self.name)
+        return run_batch_parser
+def cmd_init() -> list[CLISubcommand]:
+    return [RunBatchSubcommand()]