PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1197) hide show

vllm/compilation/inductor_pass.py ADDED Viewed

@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import hashlib
+import inspect
+import json
+import types
+from contextlib import contextmanager
+from typing import Any, Callable, Optional, Union
+import torch
+from torch import fx
+from vllm.utils import is_torch_equal_or_newer
+if is_torch_equal_or_newer("2.6"):
+    from torch._inductor.custom_graph_pass import CustomGraphPass
+else:
+    # CustomGraphPass is not present in 2.5 or lower, import our version
+    from .torch25_custom_graph_pass import (  # noqa: E501
+        Torch25CustomGraphPass as CustomGraphPass)
+_pass_context = None
+class PassContext:
+    def __init__(self, runtime_shape: Optional[int]):
+        self.runtime_shape = runtime_shape
+def get_pass_context() -> PassContext:
+    """Get the current pass context."""
+    assert _pass_context is not None
+    return _pass_context
+@contextmanager
+def pass_context(runtime_shape: Optional[int]):
+    """A context manager that stores the current pass context,
+    usually it is a list of sizes to specialize.
+    """
+    global _pass_context
+    prev_context = _pass_context
+    _pass_context = PassContext(runtime_shape)
+    try:
+        yield
+    finally:
+        _pass_context = prev_context
+class InductorPass(CustomGraphPass):
+    """
+    A custom graph pass that uses a hash of its source as the UUID.
+    This is defined as a convenience and should work in most cases.
+    """
+    def uuid(self) -> Any:
+        """
+        Provide a unique identifier for the pass, used in Inductor code cache.
+        This should depend on the pass implementation, so that changes to the
+        pass result in recompilation.
+        By default, the object source is hashed.
+        """
+        return InductorPass.hash_source(self)
+    @staticmethod
+    def hash_source(*srcs: Union[str, Any]):
+        """
+        Utility method to hash the sources of functions or objects.
+        :param srcs: strings or objects to add to the hash.
+        Objects and functions have their source inspected.
+        :return:
+        """
+        hasher = hashlib.sha256()
+        for src in srcs:
+            if isinstance(src, str):
+                src_str = src
+            elif isinstance(src, types.FunctionType):
+                src_str = inspect.getsource(src)
+            else:
+                src_str = inspect.getsource(src.__class__)
+            hasher.update(src_str.encode("utf-8"))
+        return hasher.hexdigest()
+    @staticmethod
+    def hash_dict(dict_: dict[Any, Any]):
+        """
+        Utility method to hash a dictionary, can alternatively be used for uuid.
+        :return: A sha256 hash of the json rep of the dictionary.
+        """
+        encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
+        return hashlib.sha256(encoded).hexdigest()
+    def is_applicable_for_shape(self, shape: Optional[int]):
+        return True
+class CallableInductorPass(InductorPass):
+    """
+    This class is a wrapper for a callable that automatically provides an
+    implementation of the UUID.
+    """
+    def __init__(self,
+                 callable: Callable[[fx.Graph], None],
+                 uuid: Optional[Any] = None):
+        self.callable = callable
+        self._uuid = self.hash_source(callable) if uuid is None else uuid
+    def __call__(self, graph: torch.fx.Graph):
+        self.callable(graph)
+    def uuid(self) -> Any:
+        return self._uuid

vllm/compilation/monitor.py ADDED Viewed

@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+import time
+from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+context_manager = None
+torch_compile_start_time: float = 0.0
+def start_monitoring_torch_compile(vllm_config: VllmConfig):
+    global torch_compile_start_time
+    torch_compile_start_time = time.time()
+    compilation_config: CompilationConfig = vllm_config.compilation_config
+    if compilation_config.level == CompilationLevel.PIECEWISE and \
+        compilation_config.debug_dump_path:
+        import depyf
+        path = os.path.join(compilation_config.debug_dump_path,
+                            f"rank_{vllm_config.parallel_config.rank}")
+        global context_manager
+        context_manager = depyf.prepare_debug(path)
+        context_manager.__enter__()
+def end_monitoring_torch_compile(vllm_config: VllmConfig):
+    compilation_config: CompilationConfig = vllm_config.compilation_config
+    if compilation_config.level == CompilationLevel.PIECEWISE:
+        logger.info("torch.compile takes %.2f s in total",
+                    compilation_config.compilation_time)
+        global context_manager
+        if context_manager is not None:
+            context_manager.__exit__(None, None, None)
+            context_manager = None

vllm/compilation/multi_output_match.py ADDED Viewed

@@ -0,0 +1,109 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import abc
+import operator
+from abc import abstractmethod
+from collections.abc import Iterable
+from torch import fx
+from torch._higher_order_ops.auto_functionalize import auto_functionalized
+from torch._inductor import pattern_matcher as pm
+from torch._ops import OpOverload
+from torch.fx import Node
+from vllm.compilation.fx_utils import find_auto_fn
+class MultiOutputMatch(abc.ABC):
+    """
+    This class provides utilities to process multi-output matches and
+    manually insert replacements.
+    This is necessary because the automatic replacement for multi-output
+    matches is broken: https://github.com/pytorch/pytorch/issues/137280
+    """
+    def __init__(self, match: pm.Match):
+        self.match = match
+    @abstractmethod
+    def process(self):
+        """
+        Process a multi-output match and manually insert the replacement.
+        This method should:
+        1. Insert the replacement nodes after the last node in the match.
+        2. Rebind the users of nodes in the match to use the new nodes.
+        3. Set meta["val"] for de-functionalization.
+        The result of an auto-functionalized node is a tuple of tensors.
+        The first element is the return value of the function, usually None.
+        The remaining elements are the mutated args of the function.
+        All auto-functionalized nodes must contain a proper meta["val"],
+        as it is used by de-functionalization. meta["val"] has to contain the
+        value of the node (tuple of tensors) that would be returned by the
+        functionalized node during tracing.
+        Existing nodes in the graph all have this property set, but we have
+        to set it manually for new nodes we insert.
+        Example:
+        # op schema: foo(a: Tensor!, b: Tensor, c: Tensor!) -> None
+        at = auto_functionalized(torch.ops._C.foo.default, a, b, c)
+        # at.meta["val"] = (None, a, c)
+        """
+        raise NotImplementedError
+    @property
+    def nodes(self) -> list[fx.Node]:
+        return self.match.nodes
+    @property
+    def graph(self) -> fx.Graph:
+        return self.match.graph
+    def find_auto_fn(self, op) -> fx.Node:
+        """
+        Find the first auto_functionalized node with the given op in the match.
+        """
+        return find_auto_fn(self.nodes, op)
+    def inserting_after_match(self):
+        """
+        Insert nodes after the last node in the match.
+        This is done to avoid use-before-definition errors after inserting
+        replacement nodes.
+        """
+        # match.nodes is not guaranteed to be sorted.
+        # Find the last node in the match.
+        for last_node_in_match in reversed(self.graph.nodes):
+            if last_node_in_match in self.match.nodes:
+                break
+        else:
+            raise ValueError("No nodes in graph")
+        return self.graph.inserting_after(last_node_in_match)
+    def insert_getitems(self, tuple_node: fx.Node,
+                        indices: Iterable[int]) -> tuple[fx.Node, ...]:
+        """
+        Insert operator.getitem nodes to extract elements from a tuple node.
+        :param tuple_node: The tuple node to extract elements from.
+        :param indices: The indices of the elements to extract.
+        :return: Tuple of the new getitem nodes, corresponding to the indices.
+        """
+        with self.graph.inserting_after(tuple_node):
+            return tuple(
+                self.graph.call_function(operator.getitem, (tuple_node, idx))
+                for idx in indices)
+    def insert_auto_fn(self, op: OpOverload, kwargs) -> Node:
+        """
+        Insert an auto_functionalized node with the given op and kwargs.
+        """
+        return self.graph.call_function(auto_functionalized, (op, ),
+                                        kwargs=kwargs)

vllm/compilation/noop_elimination.py ADDED Viewed

@@ -0,0 +1,137 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable
+from typing import Union
+import torch.fx
+from torch import SymInt
+from vllm.logger import init_logger
+from .fx_utils import is_func
+from .vllm_inductor_pass import VllmInductorPass
+logger = init_logger(__name__)
+class NoOpEliminationPass(VllmInductorPass):
+    """
+    This is an inductor pass that removes redundant reshape/slice operations.
+    It is required for RMSNorm-quant fusion to work properly.
+    That's because apply_fp8_linear adds a reshape, which is redundant
+    in the 2D-case. Additionally, torch internal no-op elimination pass does
+    not handle certain slice variants.
+    Example graph 1:
+    getitem_1: "f16[s0, 4096]" = ...
+    view_1: "f16[s0, 4096]" = torch.reshape(getitem_1, [-1, 4096])
+    at = auto_functionalized(static_scaled_fp8_quant, input = view_1, ...)
+    out: "f8e4m3fn[s0, 4096]" = at[1]
+    Can be replaced with:
+    getitem_1: "f16[s0, 4096]" = ...
+    at = auto_functionalized(static_scaled_fp8_quant, input = getitem_1, ...)
+    out: "f8e4m3fn[s0, 4096]" = at[1]
+    Example graph 2:
+    arg0: "s0" = SymInt(s0)
+    scaled_mm: "f16[s0, 4096]" = ...
+    slice_1: "f16[s0, 4096]" = torch.slice(scaled_mm, -1, 0, arg0)
+    at = auto_functionalized(fused_add_rms_norm, input = slice_1, ...)
+    out: "f16[s0, 4096]" = torch.slice_scatter(scaled_mm, at[1], 0, 0, arg0)
+    Can be replaced with:
+    arg0: "s0" = SymInt(s0)
+    scaled_mm: "f16[s0, 4096]" = ...
+    at = auto_functionalized(fused_add_rms_norm, input = scaled_mm, ...)
+    out: "f16[s0, 4096]" = at[1]
+    TODO(luka): This is currently tested in test_fusion,
+     but separate tests could be good.
+    """
+    def __call__(self, graph: torch.fx.Graph):
+        self.begin()
+        self.dump_graph(graph, "before_noop_elimination")
+        count = 0
+        # Remove no-op reshapes/views:
+        for node in graph.nodes:
+            if is_func(node, torch.ops.aten.reshape.default):
+                input, shape = node.args[:2]
+                input_shape = input.meta["val"].shape
+                if len(shape) != len(input_shape):
+                    # Reshape changing rank, skip
+                    continue
+                if shape.count(-1) > 1:
+                    # Invalid reshape args, skip
+                    continue
+                if self.all_dims_equivalent(shape, input_shape):
+                    node.replace_all_uses_with(input)
+                    graph.erase_node(node)
+                    count += 1
+            elif is_func(node, torch.ops.aten.slice.Tensor):
+                input, dim_index, start, end = node.args[:4]
+                input_shape = input.meta["val"].shape
+                i_dim = input_shape[dim_index]
+                if start == 0 and self.dims_equivalent(end, i_dim):
+                    node.replace_all_uses_with(input)
+                    graph.erase_node(node)
+                    count += 1
+            elif is_func(node, torch.ops.aten.slice_scatter.default):
+                base, view, dim_index, start, end = node.args[:5]
+                base_shape = base.meta["val"].shape
+                view_shape = view.meta["val"].shape
+                view_dim = view_shape[dim_index]
+                # Check that view fully covers base and the full view is used
+                # (if the view fully covered the base after slicing but was not
+                # fully used, we could replace slice_scatter with a simple slice
+                # but that's a niche case).
+                if (base_shape == view_shape and start == 0
+                        and self.dims_equivalent(end, view_dim)):
+                    node.replace_all_uses_with(view)
+                    graph.erase_node(node)
+                    count += 1
+        logger.debug("Removed %s no-op reshapes and slices", count)
+        self.dump_graph(graph, "after_noop_elimination")
+        self.end_and_log()
+    def all_dims_equivalent(self, dims: Iterable[Union[int, torch.fx.Node]],
+                            i_dims: Iterable[Union[int, SymInt]]):
+        return all(
+            self.dims_equivalent(s, i_s) for s, i_s in zip(dims, i_dims))
+    def dims_equivalent(self, dim: Union[int, torch.fx.Node],
+                        i_dim: Union[int, SymInt]) -> bool:
+        """
+        This function checks if two dimensions are equivalent.
+        :param dim: The dimension arg to reshape/slice
+        :param i_dim: The corresponding dimension in the input tensor
+        :return: Are the dimensions equivalent?
+        There are three cases in which the dimensions are equivalent:
+        1. The dimensions are equal (both integers)
+        2. The reshape dimension is -1 (i.e. inferred)
+        3. The dimensions both correspond to the same SymInt
+        While case 2 does not guarantee the dimensions are equal,
+        they are equal if all other dimensions are equal.
+        In case 3, the reshape dimension is a torch.fx.Node,
+        and its value is a SymInt. That value is equal to the
+        input dimension.
+        """
+        # Case 1 and 2
+        if dim == i_dim or dim == -1:
+            return True
+        # Case 3
+        return isinstance(dim, torch.fx.Node) and dim.meta["val"] == i_dim

vllm/compilation/pass_manager.py ADDED Viewed

@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from torch import fx as fx
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from .activation_quant_fusion import ActivationQuantFusionPass
+from .collective_fusion import AsyncTPPass
+from .fix_functionalization import FixFunctionalizationPass
+from .fusion import FusionPass
+from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context
+from .noop_elimination import NoOpEliminationPass
+from .sequence_parallelism import SequenceParallelismPass
+from .vllm_inductor_pass import VllmInductorPass
+logger = init_logger(__name__)
+class PostGradPassManager(CustomGraphPass):
+    """
+    The pass manager for post-grad passes.
+    It handles configuration, adding custom passes, and running passes.
+    It supports uuid for the Inductor code cache. That includes torch<2.6
+    support using pickling (in .inductor_pass.CustomGraphPass).
+    The order of the post-grad post-passes is:
+    1. passes (constructor parameter)
+    2. default passes (NoopEliminationPass, FusionPass)
+    3. config["post_grad_custom_post_pass"] (if it exists)
+    4. fix_functionalization
+    This way, all passes operate on a functionalized graph.
+    """
+    def __init__(self):
+        self.passes: list[VllmInductorPass] = []
+    def __call__(self, graph: fx.Graph):
+        shape = get_pass_context().runtime_shape
+        for pass_ in self.passes:
+            if pass_.is_applicable_for_shape(shape):
+                pass_(graph)
+        # always run fix_functionalization last
+        self.fix_functionalization(graph)
+    def configure(self, config: VllmConfig):
+        self.pass_config = config.compilation_config.pass_config
+        if self.pass_config.enable_noop:
+            self.passes += [NoOpEliminationPass(config)]
+        if self.pass_config.enable_fusion:
+            self.passes += [FusionPass.instance(config)]
+            self.passes += [ActivationQuantFusionPass(config)]
+        if self.pass_config.enable_sequence_parallelism:
+            self.passes += [SequenceParallelismPass(config)]
+            if self.pass_config.enable_async_tp:
+                self.passes += [AsyncTPPass(config)]
+        self.fix_functionalization = FixFunctionalizationPass(config)
+    def add(self, pass_: InductorPass):
+        assert isinstance(pass_, InductorPass)
+        self.passes.append(pass_)
+    def uuid(self):
+        """
+        The PostGradPassManager is set as a custom pass in the Inductor and
+        affects compilation caching. Its uuid depends on the UUIDs of all
+        dependent passes and the pass config. See InductorPass for more info.
+        """
+        state = {"pass_config": self.pass_config.uuid(), "passes": []}
+        for pass_ in self.passes:
+            state["passes"].append(pass_.uuid())
+        state["passes"].append(self.fix_functionalization.uuid())
+        return InductorPass.hash_dict(state)