flyteplugins_vllm-2.0.0b40-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
flyteplugins/__init__.py — File without changes
flyteplugins/vllm/__init__.py
@@ -0,0 +1,3 @@
+ __all__ = ["VLLMAppEnvironment"]
+
+ from flyteplugins.vllm._app_environment import VLLMAppEnvironment
flyteplugins/vllm/_app_environment.py
@@ -0,0 +1,196 @@
+ from __future__ import annotations
+
+ import shlex
+ from dataclasses import dataclass, field, replace
+ from typing import Any, Literal, Optional, Union
+
+ import flyte.app
+ import rich.repr
+ from flyte import Environment, Image, Resources, SecretRequest
+ from flyte.app import Input, RunOutput
+ from flyte.app._types import Port
+ from flyte.models import SerializationContext
+
+ DEFAULT_VLLM_IMAGE = (
+     flyte.Image.from_debian_base(name="vllm-app-image", python_version=(3, 12))
+     # install flashinfer (vllm itself is pulled in by the plugin below)
+     .with_pip_packages("flashinfer-python", "flashinfer-cubin")
+     .with_pip_packages("flashinfer-jit-cache", index_url="https://flashinfer.ai/whl/cu129")
+     # install the vllm flyte plugin
+     .with_pip_packages("flyteplugins-vllm", pre=True)
+ )
+
+
+ @rich.repr.auto
+ @dataclass(kw_only=True, repr=True)
+ class VLLMAppEnvironment(flyte.app.AppEnvironment):
+     """
+     App environment backed by vLLM for serving large language models.
+
+     This environment sets up a vLLM server with the specified model and configuration.
+
+     :param name: The name of the application.
+     :param image: The container image to use for the application.
+     :param port: Port the application listens on. Defaults to 8080.
+     :param requests: Compute resource requests for the application.
+     :param secrets: Secrets requested for the application.
+     :param limits: Compute resource limits for the application.
+     :param env_vars: Environment variables to set for the application.
+     :param scaling: Scaling configuration for the app environment.
+     :param domain: Domain to use for the app.
+     :param cluster_pool: The target cluster pool where the app should be deployed.
+     :param requires_auth: Whether the public URL requires authentication.
+     :param type: Type of app.
+     :param extra_args: Extra arguments to pass to `vllm serve`. See
+         https://docs.vllm.ai/en/stable/configuration/engine_args
+         or run `vllm serve --help` for details.
+     :param model_path: Remote path to the model (e.g., s3://bucket/path/to/model).
+     :param model_hf_path: Hugging Face path to the model (e.g., Qwen/Qwen3-0.6B).
+     :param model_id: Model ID exposed by the vLLM server.
+     :param stream_model: Set to True to stream the model from blob storage directly to the
+         GPU. If False, the model is downloaded to the local file system first and then
+         loaded into the GPU.
+     """
+
+     port: int | Port = 8080
+     type: str = "vLLM"
+     extra_args: str | list[str] = ""
+     model_path: str | RunOutput = ""
+     model_hf_path: str = ""
+     model_id: str = ""
+     stream_model: bool = True
+     image: str | Image | Literal["auto"] = DEFAULT_VLLM_IMAGE
+     _model_mount_path: str = field(default="/root/flyte", init=False)
+
+     def __post_init__(self):
+         if self.env_vars is None:
+             self.env_vars = {}
+
+         if self.model_id == "":
+             raise ValueError("model_id must be defined")
+
+         if self.model_path == "" and self.model_hf_path == "":
+             raise ValueError("model_path or model_hf_path must be defined")
+         if self.model_path != "" and self.model_hf_path != "":
+             raise ValueError("model_path and model_hf_path cannot be set at the same time")
+
+         if self.model_hf_path:
+             self._model_mount_path = self.model_hf_path
+
+         if self.args:
+             raise ValueError("args cannot be set for VLLMAppEnvironment. Use `extra_args` to add extra arguments.")
+
+         if isinstance(self.extra_args, str):
+             extra_args = shlex.split(self.extra_args)
+         else:
+             extra_args = self.extra_args
+
+         stream_model_args = []
+         if self.stream_model:
+             stream_model_args.extend(["--load-format", "flyte-vllm-streaming"])
+
+         self.args = [
+             "vllm-fserve",
+             "serve",
+             self._model_mount_path,
+             "--served-model-name",
+             self.model_id,
+             "--port",
+             str(self.get_port().port),
+             *stream_model_args,
+             *extra_args,
+         ]
+
+         if self.inputs:
+             raise ValueError("inputs cannot be set for VLLMAppEnvironment")
+
+         input_kwargs = {}
+         if self.stream_model:
+             self.env_vars["FLYTE_MODEL_LOADER_STREAM_SAFETENSORS"] = "true"
+             input_kwargs["env_var"] = "FLYTE_MODEL_LOADER_REMOTE_MODEL_PATH"
+             input_kwargs["download"] = False
+         else:
+             self.env_vars["FLYTE_MODEL_LOADER_STREAM_SAFETENSORS"] = "false"
+             input_kwargs["download"] = True
+             input_kwargs["mount"] = self._model_mount_path
+
+         if self.model_path:
+             self.inputs = [Input(name="model_path", value=self.model_path, **input_kwargs)]
+
+         self.env_vars["FLYTE_MODEL_LOADER_LOCAL_MODEL_PATH"] = self._model_mount_path
+         self.links = [flyte.app.Link(path="/docs", title="vLLM OpenAPI Docs", is_relative=True)]
+
+         if self.image is None or self.image == "auto":
+             self.image = DEFAULT_VLLM_IMAGE
+
+         super().__post_init__()
+
+     def container_args(self, serialization_context: SerializationContext) -> list[str]:
+         """Return the container arguments for vLLM."""
+         if isinstance(self.args, str):
+             return shlex.split(self.args)
+         return self.args or []
+
+     def clone_with(
+         self,
+         name: str,
+         image: Optional[Union[str, Image, Literal["auto"]]] = None,
+         resources: Optional[Resources] = None,
+         env_vars: Optional[dict[str, str]] = None,
+         secrets: Optional[SecretRequest] = None,
+         depends_on: Optional[list[Environment]] = None,
+         description: Optional[str] = None,
+         interruptible: Optional[bool] = None,
+         **kwargs: Any,
+     ) -> VLLMAppEnvironment:
+         port = kwargs.pop("port", None)
+         extra_args = kwargs.pop("extra_args", None)
+         if "model_path" in kwargs:
+             set_model_path = True
+             model_path = kwargs.pop("model_path", "") or ""
+         else:
+             set_model_path = False
+             model_path = self.model_path
+         if "model_hf_path" in kwargs:
+             set_model_hf_path = True
+             model_hf_path = kwargs.pop("model_hf_path", "") or ""
+         else:
+             set_model_hf_path = False
+             model_hf_path = self.model_hf_path
+         model_id = kwargs.pop("model_id", None)
+         stream_model = kwargs.pop("stream_model", None)
+
+         if kwargs:
+             raise TypeError(f"Unexpected keyword arguments: {list(kwargs.keys())}")
+
+         kwargs = self._get_kwargs()
+         kwargs["name"] = name
+         kwargs["args"] = None
+         kwargs["inputs"] = None
+         if image is not None:
+             kwargs["image"] = image
+         if resources is not None:
+             kwargs["resources"] = resources
+         if env_vars is not None:
+             kwargs["env_vars"] = env_vars
+         if secrets is not None:
+             kwargs["secrets"] = secrets
+         if depends_on is not None:
+             kwargs["depends_on"] = depends_on
+         if description is not None:
+             kwargs["description"] = description
+         if interruptible is not None:
+             kwargs["interruptible"] = interruptible
+         if port is not None:
+             kwargs["port"] = port
+         if extra_args is not None:
+             kwargs["extra_args"] = extra_args
+         if set_model_path:
+             kwargs["model_path"] = model_path
+         if set_model_hf_path:
+             kwargs["model_hf_path"] = model_hf_path
+         if model_id is not None:
+             kwargs["model_id"] = model_id
+         if stream_model is not None:
+             kwargs["stream_model"] = stream_model
+         return replace(self, **kwargs)
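
For reference, `__post_init__` compiles the dataclass fields into the `vllm-fserve serve` command line stored in `self.args`. A minimal sketch of the result, assuming the parent `AppEnvironment` defaults suffice and using illustrative field values:

```python
from flyteplugins.vllm import VLLMAppEnvironment

env = VLLMAppEnvironment(
    name="qwen-demo",                 # illustrative name
    model_hf_path="Qwen/Qwen3-0.6B",  # served straight from the Hugging Face path
    model_id="qwen3-0.6b",
    stream_model=False,               # skip the flyte-vllm-streaming load format
)
print(env.args)
# ['vllm-fserve', 'serve', 'Qwen/Qwen3-0.6B',
#  '--served-model-name', 'qwen3-0.6b', '--port', '8080']
```

With `stream_model=True` (the default), `--load-format flyte-vllm-streaming` is appended and the streaming environment variables are set instead.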
flyteplugins/vllm/_model_loader/__init__.py — File without changes
flyteplugins/vllm/_model_loader/shim.py
@@ -0,0 +1,126 @@
+ import logging
+ from typing import Generator
+
+ import torch
+ import vllm
+ import vllm.entrypoints.cli.main
+ from flyte.app.extras._model_loader.config import (
+     LOCAL_MODEL_PATH,
+     REMOTE_MODEL_PATH,
+     STREAM_SAFETENSORS,
+ )
+ from flyte.app.extras._model_loader.loader import SafeTensorsStreamer, prefetch
+ from vllm.config import ModelConfig, VllmConfig
+ from vllm.distributed import get_tensor_model_parallel_rank
+ from vllm.model_executor.model_loader import register_model_loader
+ from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
+ from vllm.model_executor.model_loader.dummy_loader import DummyModelLoader
+ from vllm.model_executor.model_loader.sharded_state_loader import ShardedStateLoader
+ from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+
+ logger = logging.getLogger(__name__)
+
+
+ @register_model_loader("flyte-vllm-streaming")
+ class FlyteModelLoader(DefaultModelLoader):
+     """Custom model loader for streaming model weights from object storage."""
+
+     def _get_weights_iterator(
+         self, source: DefaultModelLoader.Source
+     ) -> Generator[tuple[str, torch.Tensor], None, None]:
+         # Try to load weights using the Flyte SafeTensorsStreamer; fall back to the default loader otherwise.
+         try:
+             streamer = SafeTensorsStreamer(REMOTE_MODEL_PATH, LOCAL_MODEL_PATH)
+         except ValueError:
+             yield from super()._get_weights_iterator(source)
+         else:
+             for name, tensor in streamer.get_tensors():
+                 yield source.prefix + name, tensor
+
+     def download_model(self, model_config: ModelConfig) -> None:
+         # This model loader supports streaming only
+         pass
+
+     def _load_sharded_model(self, vllm_config: VllmConfig, model_config: ModelConfig) -> torch.nn.Module:
+         # Forked from: https://github.com/vllm-project/vllm/blob/99d01a5e3d5278284bad359ac8b87ee7a551afda/vllm/model_executor/model_loader/loader.py#L613
+         # Sanity checks
+         tensor_parallel_size = vllm_config.parallel_config.tensor_parallel_size
+         rank = get_tensor_model_parallel_rank()
+         if rank >= tensor_parallel_size:
+             raise ValueError(f"Invalid rank {rank} for tensor parallel size {tensor_parallel_size}")
+         with set_default_torch_dtype(vllm_config.model_config.dtype):  # type: ignore[arg-type]
+             with torch.device(vllm_config.device_config.device):  # type: ignore[arg-type]
+                 # Materialize the model architecture with dummy weights, then stream the real ones in.
+                 model_loader = DummyModelLoader(load_config=vllm_config.load_config)
+                 model = model_loader.load_model(vllm_config=vllm_config, model_config=model_config)
+                 for name, module in model.named_modules():
+                     quant_method = getattr(module, "quant_method", None)
+                     if quant_method is not None:
+                         quant_method.process_weights_after_loading(module)
+                 state_dict = ShardedStateLoader._filter_subtensors(model.state_dict())
+                 streamer = SafeTensorsStreamer(
+                     REMOTE_MODEL_PATH,
+                     LOCAL_MODEL_PATH,
+                     rank=rank,
+                     tensor_parallel_size=tensor_parallel_size,
+                 )
+                 for name, tensor in streamer.get_tensors():
+                     # If loading with LoRA enabled, additional padding may
+                     # be added to certain parameters. We only load into a
+                     # narrowed view of the parameter data.
+                     param_data = state_dict[name].data
+                     param_shape = state_dict[name].shape
+                     for dim, size in enumerate(tensor.shape):
+                         if size < param_shape[dim]:
+                             param_data = param_data.narrow(dim, 0, size)
+                     if tensor.shape != param_shape:
+                         logger.warning(
+                             "loading tensor of shape %s into parameter '%s' of shape %s",
+                             tensor.shape,
+                             name,
+                             param_shape,
+                         )
+                     param_data.copy_(tensor)
+                     state_dict.pop(name)
+                 if state_dict:
+                     raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
+         return model.eval()
+
+     def load_model(
+         self,
+         vllm_config: VllmConfig,
+         model_config: ModelConfig,
+     ) -> torch.nn.Module:
+         logger.info("Loading model with FlyteModelLoader")
+         if vllm_config.parallel_config.tensor_parallel_size > 1:
+             return self._load_sharded_model(vllm_config, model_config)
+         else:
+             return super().load_model(vllm_config, model_config)
+
+
+ async def _get_model_files():
+     import flyte.storage as storage
+
+     if not await storage.exists(REMOTE_MODEL_PATH):
+         raise FileNotFoundError(f"Model path not found: {REMOTE_MODEL_PATH}")
+
+     await prefetch(
+         REMOTE_MODEL_PATH,
+         LOCAL_MODEL_PATH,
+         exclude_safetensors=STREAM_SAFETENSORS,
+     )
+
+
+ def main():
+     import asyncio
+
+     # TODO: add CLI here to be able to pass in serialized inputs from AppEnvironment
+     logging.basicConfig(
+         level=logging.INFO,
+         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+     )
+
+     # Prefetch the model files (safetensors are excluded when streaming is enabled)
+     asyncio.run(_get_model_files())
+
+     # Hand off to the regular vLLM CLI entry point
+     vllm.entrypoints.cli.main.main()
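
The narrowing loop in `_load_sharded_model` is worth a note: with LoRA enabled, a parameter buffer can be padded beyond the checkpoint tensor's shape, so the copy targets only a leading slice of the buffer. A standalone sketch of that behavior using plain torch tensors (shapes invented for the example):

```python
import torch

param = torch.zeros(10, 8)   # parameter buffer, padded (e.g., for LoRA) to 10 rows
loaded = torch.ones(8, 8)    # smaller tensor streamed from object storage

view = param
for dim, size in enumerate(loaded.shape):
    if size < view.shape[dim]:
        view = view.narrow(dim, 0, size)  # view over the leading 8 rows

view.copy_(loaded)           # in-place: rows 0-7 become ones, the padding rows stay zero
assert param[8:].abs().sum() == 0
```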
flyteplugins_vllm-2.0.0b40.dist-info/METADATA
@@ -0,0 +1,54 @@
+ Metadata-Version: 2.4
+ Name: flyteplugins-vllm
+ Version: 2.0.0b40
+ Summary: vLLM plugin for flyte
+ Author-email: Niels Bantilan <cosmicbboy@users.noreply.github.com>
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: vllm>=0.11.0
+
+ # Union vLLM Plugin
+
+ Serve large language models using vLLM with Flyte Apps.
+
+ This plugin provides the `VLLMAppEnvironment` class for deploying and serving LLMs using [vLLM](https://docs.vllm.ai/).
+
+ ## Installation
+
+ ```bash
+ pip install --pre flyteplugins-vllm
+ ```
+
+ ## Usage
+
+ ```python
+ import flyte
+ import flyte.app
+ from flyteplugins.vllm import VLLMAppEnvironment
+
+ # Define the vLLM app environment
+ vllm_app = VLLMAppEnvironment(
+     name="my-llm-app",
+     model_path="s3://your-bucket/models/your-model",
+     model_id="your-model-id",
+     resources=flyte.Resources(cpu="4", memory="16Gi", gpu="L40s:1"),
+     stream_model=True,  # Stream model directly from blob store to GPU
+     scaling=flyte.app.Scaling(
+         replicas=(0, 1),
+         scaledown_after=300,
+     ),
+ )
+
+ if __name__ == "__main__":
+     flyte.init_from_config()
+     app = flyte.serve(vllm_app)
+     print(f"Deployed vLLM app: {app.url}")
+ ```
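+
+ Once deployed, the app exposes an OpenAI-compatible API, so any OpenAI client can query it. A minimal sketch (the base URL placeholder and API key handling depend on your deployment and whether `requires_auth` is set):
+
+ ```python
+ from openai import OpenAI
+
+ client = OpenAI(base_url="https://<your-app-url>/v1", api_key="EMPTY")
+ response = client.chat.completions.create(
+     model="your-model-id",  # must match the model_id passed to VLLMAppEnvironment
+     messages=[{"role": "user", "content": "Hello!"}],
+ )
+ print(response.choices[0].message.content)
+ ```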
+
+ ## Features
+
+ - **Streaming Model Loading**: Stream model weights directly from object storage to GPU memory, reducing startup time and disk requirements.
+ - **OpenAI-Compatible API**: The deployed app exposes an OpenAI-compatible API for chat completions.
+ - **Auto-scaling**: Configure scaling policies to scale up or down based on traffic.
+ - **Tensor Parallelism**: Support for distributed inference across multiple GPUs, as sketched below.
+
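+ For multi-GPU serving, tensor parallelism can be enabled through `extra_args` (a sketch continuing the example above; `--tensor-parallel-size` is a standard `vllm serve` flag, and the GPU type and count are illustrative):
+
+ ```python
+ vllm_app_tp = VLLMAppEnvironment(
+     name="my-llm-app-tp",
+     model_path="s3://your-bucket/models/your-model",
+     model_id="your-model-id",
+     resources=flyte.Resources(cpu="8", memory="64Gi", gpu="L40s:2"),
+     extra_args="--tensor-parallel-size 2",
+ )
+ ```
+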
flyteplugins_vllm-2.0.0b40.dist-info/RECORD
@@ -0,0 +1,10 @@
+ flyteplugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ flyteplugins/vllm/__init__.py,sha256=FhdW2e_f6PsGo4wyV07jradAFbg7WmB0Luz3zHIDd7A,100
+ flyteplugins/vllm/_app_environment.py,sha256=RYB2Oj0aHa_fj9N49_4p89qEYU3GRPuszO5g_U597N4,7664
+ flyteplugins/vllm/_model_loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ flyteplugins/vllm/_model_loader/shim.py,sha256=vSb7_r0sGJJVkWVIaqkypry0OTa5PAv-REqdigufYd0,5348
+ flyteplugins_vllm-2.0.0b40.dist-info/METADATA,sha256=c7i4cEbfX-tK4i2rmRhNnIr61bUagNRsHxOZhQHLj4A,1577
+ flyteplugins_vllm-2.0.0b40.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ flyteplugins_vllm-2.0.0b40.dist-info/entry_points.txt,sha256=lC-uwvkaytwtzbkJWdS69np63yLAakaDpI4mV1Yp9l8,74
+ flyteplugins_vllm-2.0.0b40.dist-info/top_level.txt,sha256=cgd779rPu9EsvdtuYgUxNHHgElaQvPn74KhB5XSeMBE,13
+ flyteplugins_vllm-2.0.0b40.dist-info/RECORD,,
flyteplugins_vllm-2.0.0b40.dist-info/WHEEL
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
flyteplugins_vllm-2.0.0b40.dist-info/entry_points.txt
@@ -0,0 +1,2 @@
+ [console_scripts]
+ vllm-fserve = flyteplugins.vllm._model_loader.shim:main
flyteplugins_vllm-2.0.0b40.dist-info/top_level.txt
@@ -0,0 +1 @@
+ flyteplugins