cuda_morph-0.9.1-py3-none-any.whl
- ascend_compat/__init__.py +207 -0
- ascend_compat/__main__.py +34 -0
- ascend_compat/_backend.py +334 -0
- ascend_compat/_exceptions.py +69 -0
- ascend_compat/_logging.py +124 -0
- ascend_compat/backends/__init__.py +49 -0
- ascend_compat/backends/ascend.py +61 -0
- ascend_compat/backends/cambricon.py +82 -0
- ascend_compat/backends/intel.py +74 -0
- ascend_compat/backends/registry.py +138 -0
- ascend_compat/backends/rocm.py +106 -0
- ascend_compat/bench.py +650 -0
- ascend_compat/cli/__init__.py +66 -0
- ascend_compat/cli/_info.py +49 -0
- ascend_compat/cli/_porter.py +71 -0
- ascend_compat/cli/_scanner.py +368 -0
- ascend_compat/cli/bench.py +33 -0
- ascend_compat/cli/check.py +35 -0
- ascend_compat/cli/compile.py +27 -0
- ascend_compat/cli/doctor.py +19 -0
- ascend_compat/cli/error.py +13 -0
- ascend_compat/cli/info.py +12 -0
- ascend_compat/cli/port.py +18 -0
- ascend_compat/cli/quant.py +14 -0
- ascend_compat/cli/run.py +40 -0
- ascend_compat/cli/scaffold.py +31 -0
- ascend_compat/cli/security.py +29 -0
- ascend_compat/cli/verify.py +23 -0
- ascend_compat/cli/vllm.py +19 -0
- ascend_compat/cli.py +879 -0
- ascend_compat/cuda_shim/__init__.py +40 -0
- ascend_compat/cuda_shim/_import_hook.py +123 -0
- ascend_compat/cuda_shim/_monkey_patch.py +569 -0
- ascend_compat/cuda_shim/_patch_manager.py +313 -0
- ascend_compat/cuda_shim/_registry.py +195 -0
- ascend_compat/cuda_shim/compile_helpers.py +603 -0
- ascend_compat/cuda_shim/dtype_manager.py +307 -0
- ascend_compat/cuda_shim/quantization.py +279 -0
- ascend_compat/device.py +127 -0
- ascend_compat/doctor/__init__.py +40 -0
- ascend_compat/doctor/env_setup.py +507 -0
- ascend_compat/doctor/error_codes.py +408 -0
- ascend_compat/doctor/fallback_monitor.py +274 -0
- ascend_compat/doctor/op_auditor.py +287 -0
- ascend_compat/doctor/security_check.py +274 -0
- ascend_compat/doctor/version_check.py +254 -0
- ascend_compat/ecosystem/__init__.py +23 -0
- ascend_compat/ecosystem/_flash_attn_hook.py +134 -0
- ascend_compat/ecosystem/deepspeed_patch.py +180 -0
- ascend_compat/ecosystem/flash_attn.py +334 -0
- ascend_compat/ecosystem/transformers_patch.py +223 -0
- ascend_compat/ecosystem/triton_bridge.py +188 -0
- ascend_compat/ecosystem/vllm_patch.py +289 -0
- ascend_compat/exceptions.py +41 -0
- ascend_compat/kernel_helper/__init__.py +31 -0
- ascend_compat/kernel_helper/scaffold.py +681 -0
- ascend_compat/kernel_helper/spec.py +128 -0
- ascend_compat/memory.py +83 -0
- ascend_compat/ops.py +147 -0
- ascend_compat/py.typed +0 -0
- ascend_compat/streams.py +62 -0
- ascend_compat/validation/__init__.py +21 -0
- ascend_compat/validation/op_verifier.py +365 -0
- cuda_morph-0.9.1.dist-info/METADATA +167 -0
- cuda_morph-0.9.1.dist-info/RECORD +69 -0
- cuda_morph-0.9.1.dist-info/WHEEL +5 -0
- cuda_morph-0.9.1.dist-info/entry_points.txt +3 -0
- cuda_morph-0.9.1.dist-info/licenses/LICENSE +189 -0
- cuda_morph-0.9.1.dist-info/top_level.txt +1 -0

ascend_compat/__init__.py
@@ -0,0 +1,207 @@
"""cuda-morph: CUDA → Ascend NPU compatibility shim for PyTorch.

This is **not** a replacement for torch_npu. torch_npu already handles the
hard C++/CANN integration via PyTorch's PrivateUse1 dispatch mechanism.
cuda-morph is a *thin, high-value ecosystem compatibility bridge* that
fixes the last mile: existing CUDA-assuming Python code that hard-codes
``torch.cuda`` calls.

Architecture (four-layer stack)::

    ┌─────────────────────────────────────────────────────┐
    │ Layer 4: ascend_compat.doctor                       │
    │   Environment validation, error translation,        │
    │   diagnostics CLI                                    │
    ├─────────────────────────────────────────────────────┤
    │ Layer 3: ascend_compat.ecosystem                     │
    │   HuggingFace, DeepSpeed, flash-attn, vLLM shims     │
    ├─────────────────────────────────────────────────────┤
    │ Layer 2: ascend_compat.cuda_shim                     │
    │   torch.cuda API interception + intelligent routing  │
    ├─────────────────────────────────────────────────────┤
    │ Layer 1: torch_npu (Huawei — already exists)         │
    │   PrivateUse1 backend, C++ dispatch, CANN/ACL        │
    └─────────────────────────────────────────────────────┘

Activation Modes
----------------
``import ascend_compat`` does **not** automatically patch ``torch.cuda``
by default. This is a deliberate design choice: imports should not have
global side effects, especially when cuda-morph might be imported
transitively by a library.

There are three ways to activate the shim:

1. **Explicit activation** (recommended for applications)::

       import ascend_compat
       ascend_compat.activate()

2. **CLI launcher** (recommended for running existing scripts unchanged)::

       cuda-morph run script.py

3. **Environment variable** (opt-in to auto-activate on import)::

       export ASCEND_COMPAT_AUTO_ACTIVATE=1
       python script.py  # import ascend_compat now auto-activates

To prevent activation entirely (e.g. in testing)::

    export ASCEND_COMPAT_NO_PATCH=1

After activation the shim:
- Detects your hardware (NPU > CUDA > CPU)
- Routes ``torch.cuda.*`` calls to ``torch.npu.*`` equivalents
- Makes ``torch.cuda.is_available()`` return ``False`` to prevent the
  NCCL-vs-HCCL misdetection bug in accelerate/DeepSpeed
- Patches ``torch.device("cuda")`` → ``torch.device("npu")``
- Patches ``Tensor.cuda()`` → ``Tensor.npu()``

For ecosystem-specific fixes::

    from ascend_compat.ecosystem import transformers_patch
    transformers_patch.apply()  # Fixes device_map="auto" on NPU

    from ascend_compat.ecosystem import flash_attn  # Drop-in flash_attn replacement

Observability
-------------
After activation, you can inspect which patches are being hit::

    stats = ascend_compat.get_patch_stats()
    # => {"cuda.is_available": 42, "torch.device": 137, ...}

Migration from v0.2.x
---------------------
In v0.2.x, ``import ascend_compat`` auto-activated the shim. As of v0.3.0+,
you must explicitly call ``ascend_compat.activate()`` or use the CLI launcher.
See MIGRATION.md for details.

Environment Variables
---------------------
``ASCEND_COMPAT_AUTO_ACTIVATE``
    Set to ``1`` to auto-activate on ``import ascend_compat``.
``ASCEND_COMPAT_LOG_LEVEL``
    Set to ``DEBUG`` to see every API translation. Default: ``WARNING``.
``ASCEND_COMPAT_NO_PATCH``
    Set to ``1`` to prevent activation entirely (even explicit calls).
"""

from __future__ import annotations

import os
import warnings

__version__ = "0.9.1"

# Core infrastructure (always available — no side effects)
from ascend_compat._backend import (
    Backend,
    detect_backends,
    has_cuda,
    has_mlu,
    has_npu,
    has_rocm,
    has_xpu,
    preferred_backend,
)
from ascend_compat._logging import set_log_level

# Layer 2: CUDA shim (activation is explicit, not on import)
from ascend_compat.cuda_shim import (
    activate,
    deactivate,
    get_all_patch_stats,
    get_patch_stats,
    is_activated,
    reset_patch_stats,
)

# ---------------------------------------------------------------------------
# Backward compatibility: deprecation warning for v0.2.x users
# ---------------------------------------------------------------------------
# In v0.2.x, `import ascend_compat` auto-activated the shim. We removed that
# in v0.3.0+ because library imports shouldn't have global side effects.
# Emit a one-time deprecation warning if the user appears to be relying on
# the old behavior (i.e. they imported us but haven't called activate()).

import atexit as _atexit
import sys as _sys


def _check_activation_at_exit() -> None:
    """Emit a deprecation warning at exit if shim was never activated.

    This catches the v0.2.x pattern where users relied on import-time
    activation. We only warn if:
    1. The shim was imported but never activated
    2. We're not in a test runner (pytest sets 'pytest' in sys.modules)
    3. We haven't already warned
    """
    if is_activated():
        return
    if "pytest" in _sys.modules:
        return  # Don't warn during testing
    if os.environ.get("ASCEND_COMPAT_NO_PATCH", "").strip() == "1":
        return  # User explicitly disabled patches

    # Check if ascend_compat was imported in user code (not just transitively)
    import_in_main = False
    main_mod = _sys.modules.get("__main__")
    if main_mod is not None:
        src = getattr(main_mod, "__file__", "") or ""
        if src:
            try:
                with open(src) as f:
                    content = f.read()
                import_in_main = "ascend_compat" in content
            except (OSError, IOError):
                pass

    if import_in_main:
        warnings.warn(
            "cuda-morph was imported but activate() was never called. "
            "Since v0.3.0, auto-activation on import is removed. "
            "Add `ascend_compat.activate()` after import, use "
            "`cuda-morph run script.py`, or set "
            "ASCEND_COMPAT_AUTO_ACTIVATE=1. "
            "See MIGRATION.md for details.",
            DeprecationWarning,
            stacklevel=1,
        )


_atexit.register(_check_activation_at_exit)

# ---------------------------------------------------------------------------
# Conditional auto-activation
# ---------------------------------------------------------------------------
# We only auto-activate when the user has explicitly opted in via env var.
# This prevents the "library import has global side effects" problem.
# The CLI launcher (cuda-morph run) sets this var automatically.
if os.environ.get("ASCEND_COMPAT_AUTO_ACTIVATE", "").strip() == "1":
    activate()

__all__ = [
    # Version
    "__version__",
    # Backend introspection
    "Backend",
    "detect_backends",
    "preferred_backend",
    "has_npu",
    "has_mlu",
    "has_rocm",
    "has_xpu",
    "has_cuda",
    # Shim control
    "activate",
    "deactivate",
    "is_activated",
    "set_log_level",
    # Telemetry
    "get_patch_stats",
    "get_all_patch_stats",
    "reset_patch_stats",
]
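
A minimal end-to-end sketch of the explicit-activation flow the module
docstring above describes, assuming an Ascend host where ``torch_npu`` is
installed (on a CUDA- or CPU-only machine the same calls run, but little or
nothing is translated)::

    import torch
    import ascend_compat

    print(ascend_compat.is_activated())        # False — importing patches nothing
    ascend_compat.activate()

    print(ascend_compat.preferred_backend())   # e.g. Backend.NPU
    dev = torch.device("cuda")                 # patched → torch.device("npu")
    y = torch.ones(4, 4).cuda()                # Tensor.cuda() → Tensor.npu()

    print(ascend_compat.get_patch_stats())     # which patches were hit, how often
    ascend_compat.deactivate()                 # e.g. to restore stock torch in tests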

ascend_compat/__main__.py
@@ -0,0 +1,34 @@
"""Support ``python -m ascend_compat`` as a launcher.

Usage::

    # Run a script with all shims active:
    python -m ascend_compat run train.py --batch-size 32

    # Equivalent to calling ``ascend_compat.activate()`` at the top of
    # train.py, plus auto-applying the ecosystem patches.

    # Or use any CLI command:
    python -m ascend_compat doctor
    python -m ascend_compat check model.py
    python -m ascend_compat error 507035

The ``run`` subcommand is the primary addition here. It:
1. Activates the cuda_shim (torch.cuda → torch.npu)
2. Installs the flash_attn import hook
3. Applies ecosystem patches (transformers, deepspeed)
4. Executes the user's script with full compatibility
"""

from __future__ import annotations

import sys


def main() -> None:
    from ascend_compat.cli import main as cli_main
    cli_main()


if __name__ == "__main__":
    main()
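
The real launcher lives in ``ascend_compat/cli/run.py`` (see the file list at
the top of this diff). Conceptually, the four ``run`` steps listed in the
docstring amount to something like the sketch below, which sticks to the
documented public calls; ``run_with_shims`` is an illustrative name, and the
flash-attn import hook and DeepSpeed patch that the real CLI also installs
are omitted here::

    import runpy
    import sys

    import ascend_compat
    from ascend_compat.ecosystem import transformers_patch

    def run_with_shims(script: str, argv: list[str]) -> None:
        # Steps 1–3: torch.cuda shim plus the transformers ecosystem patch.
        ascend_compat.activate()
        transformers_patch.apply()
        # Step 4: execute the user's script as if it were launched directly.
        sys.argv = [script, *argv]
        runpy.run_path(script, run_name="__main__")

    run_with_shims("train.py", ["--batch-size", "32"])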

ascend_compat/_backend.py
@@ -0,0 +1,334 @@
"""Backend detection and capability probing.

This module is the single source of truth for "what hardware is available right
now?" Every other module in cuda-morph imports from here rather than
re-running its own detection logic.

Architecture note
-----------------
We intentionally *lazy-import* ``torch``, ``torch_npu``, etc. so that
cuda-morph can be imported even when PyTorch isn't installed (useful
for the CLI static-analysis tool ``cuda-morph check``).

Multi-backend support
---------------------
cuda-morph supports multiple domestic AI chip backends:

1. **Ascend NPU** via ``torch_npu`` (Huawei)
2. **Cambricon MLU** via ``torch_mlu`` (Cambricon)
3. **NVIDIA CUDA** via ``torch.cuda`` (reference/fallback)
4. **CPU** — always available, used for development & CI

Backend detection uses the pluggable registry in ``backends/``. Each
backend module implements a common protocol (``BackendInfo``). The detection
loop probes each registered backend in priority order and selects the first
one that reports hardware available.

Why a dedicated module?
-----------------------
Centralising detection avoids import-order bugs. For example, if ``device.py``
and ``memory.py`` both independently tried ``import torch_npu``, a race or
circular-import could surface. By funnelling everything through ``_backend``
we guarantee a single, well-ordered detection pass.
"""

from __future__ import annotations

import enum
import functools
from typing import Any, Dict, Optional, Type

from ascend_compat._logging import get_logger

logger = get_logger(__name__)


# ---------------------------------------------------------------------------
# Backend enumeration
# ---------------------------------------------------------------------------


class Backend(enum.Enum):
    """Available compute backends, ordered by preference."""

    NPU = "npu"    # Huawei Ascend via torch_npu
    MLU = "mlu"    # Cambricon via torch_mlu
    ROCM = "rocm"  # AMD via ROCm/HIP (presents as "cuda" device)
    XPU = "xpu"    # Intel via IPEX/Level Zero
    CUDA = "cuda"  # NVIDIA via torch.cuda
    CPU = "cpu"    # Always available


# Map backend device type strings to enum values
_BACKEND_DEVICE_TYPES: Dict[str, Backend] = {
    "npu": Backend.NPU,
    "mlu": Backend.MLU,
    "rocm": Backend.ROCM,
    "xpu": Backend.XPU,
    "cuda": Backend.CUDA,
    "cpu": Backend.CPU,
}


# ---------------------------------------------------------------------------
# Lazy module references (populated on first access)
# ---------------------------------------------------------------------------

_torch: Optional[Any] = None
_torch_npu: Optional[Any] = None


def _import_torch() -> Any:
    """Lazily import torch, caching the result."""
    global _torch  # noqa: PLW0603
    if _torch is None:
        try:
            import torch  # type: ignore[import-untyped]
            _torch = torch
        except ImportError:
            raise ImportError(
                "PyTorch is required but not installed. "
                "Install it with: pip install torch>=2.0"
            ) from None
    return _torch


def _import_torch_npu() -> Optional[Any]:
    """Lazily import torch_npu, returning None if unavailable.

    torch_npu is Huawei's official PyTorch adapter for Ascend NPUs.
    It monkey-patches torch to add NPU device support. If it isn't
    installed, we gracefully fall back to CUDA or CPU.

    See: https://gitee.com/ascend/pytorch
    """
    global _torch_npu  # noqa: PLW0603
    if _torch_npu is None:
        try:
            import torch_npu  # type: ignore[import-untyped]
            _torch_npu = torch_npu
            logger.debug("torch_npu imported successfully — Ascend backend available")
        except ImportError:
            logger.debug("torch_npu not found — Ascend backend unavailable")
            _torch_npu = False  # sentinel: tried and failed
    return _torch_npu if _torch_npu is not False else None


# ---------------------------------------------------------------------------
# Active backend tracking
# ---------------------------------------------------------------------------

_active_backend_info: Optional[Any] = None  # BackendInfo subclass, set by activate()


def get_active_backend_info() -> Optional[Any]:
    """Return the active backend's BackendInfo, or None if not set."""
    return _active_backend_info


def set_active_backend_info(info: Optional[Any]) -> None:
    """Set the active backend info (called by activate())."""
    global _active_backend_info  # noqa: PLW0603
    _active_backend_info = info


# ---------------------------------------------------------------------------
# Detection logic
# ---------------------------------------------------------------------------


@functools.lru_cache(maxsize=1)
def detect_backends() -> tuple[Backend, ...]:
    """Probe the system and return all available backends, best-first.

    The result is cached for the lifetime of the process because hardware
    doesn't change at runtime.

    Detection order:
    1. Check each registered backend in the pluggable registry
    2. Check NVIDIA CUDA
    3. CPU (always available)

    Returns:
        Tuple of :class:`Backend` values, ordered from most-preferred to
        least-preferred.
    """
    available: list[Backend] = []

    # 1. Check pluggable backends from the registry
    try:
        from ascend_compat.backends import BACKEND_REGISTRY
        for name, backend_cls in BACKEND_REGISTRY.items():
            try:
                if backend_cls.is_available():
                    device_type = backend_cls.device_type
                    backend_enum = _BACKEND_DEVICE_TYPES.get(device_type)
                    if backend_enum and backend_enum not in available:
                        available.append(backend_enum)
                        logger.info(
                            "%s detected (%d device(s))",
                            backend_cls.display_name,
                            backend_cls.device_count(),
                        )
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "Backend '%s' detection failed: %s", name, exc
                )
    except ImportError:
        # Fallback: probe directly if backends package fails to import
        logger.debug("backends package not available, using legacy detection")
        _detect_legacy(available)

    # 2. Check for NVIDIA CUDA (if not already found via registry)
    if Backend.CUDA not in available:
        torch = _import_torch()
        if torch.cuda.is_available():
            available.append(Backend.CUDA)
            logger.info(
                "NVIDIA CUDA detected (%d device(s))",
                torch.cuda.device_count(),
            )

    # 3. CPU is always available
    if Backend.CPU not in available:
        available.append(Backend.CPU)

    logger.debug("Detected backends (preference order): %s", available)
    return tuple(available)


def _detect_legacy(available: list[Backend]) -> None:
    """Legacy detection path (before pluggable backends existed).

    This is the fallback for when the ``backends`` subpackage can't be
    imported (e.g. during early development or if the package structure
    changes).
    """
    # Check for Ascend NPU
    npu_mod = _import_torch_npu()
    if npu_mod is not None:
        torch = _import_torch()
        try:
            if hasattr(torch, "npu") and torch.npu.is_available():
                available.append(Backend.NPU)
                logger.info(
                    "Ascend NPU detected (%d device(s))",
                    torch.npu.device_count(),
                )
        except Exception as exc:  # noqa: BLE001
            logger.warning("torch_npu installed but NPU detection failed: %s", exc)


@functools.lru_cache(maxsize=1)
def preferred_backend() -> Backend:
    """Return the single best backend for this system.

    This drives the default behaviour of ``cuda-morph`` — all CUDA
    calls are routed to whichever backend this function returns.
    """
    return detect_backends()[0]


# ---------------------------------------------------------------------------
# Convenience predicates
# ---------------------------------------------------------------------------


def has_npu() -> bool:
    """Return True if at least one Ascend NPU is usable."""
    return Backend.NPU in detect_backends()


def has_mlu() -> bool:
    """Return True if at least one Cambricon MLU is usable."""
    return Backend.MLU in detect_backends()


def has_rocm() -> bool:
    """Return True if AMD ROCm GPU is detected."""
    return Backend.ROCM in detect_backends()


def has_xpu() -> bool:
    """Return True if at least one Intel XPU is usable."""
    return Backend.XPU in detect_backends()


def has_cuda() -> bool:
    """Return True if at least one NVIDIA GPU is usable."""
    return Backend.CUDA in detect_backends()


def get_torch() -> Any:
    """Return the ``torch`` module (importing it if necessary).

    This is the canonical way for other cuda-morph modules to get a
    reference to torch without redundant try/except blocks.
    """
    return _import_torch()


def get_torch_npu() -> Optional[Any]:
    """Return the ``torch_npu`` module, or None if not installed."""
    return _import_torch_npu()


# ---------------------------------------------------------------------------
# Device-string translation
# ---------------------------------------------------------------------------


def translate_device_string(device: str) -> str:
    """Translate a CUDA device string to the appropriate backend string.

    Mapping rules:
    - If a domestic backend (NPU, MLU) is preferred, ``"cuda"`` → backend device type
    - If CUDA is preferred, the string is returned unchanged; with no accelerator at all, ``"cuda"`` falls back to ``"cpu"``.

    Args:
        device: A PyTorch device string, e.g. ``"cuda"``, ``"cuda:0"``,
            ``"cpu"``, ``"npu:1"``.

    Returns:
        The translated device string.

    Examples::

        # On an Ascend system:
        translate_device_string("cuda")    # → "npu"
        translate_device_string("cuda:2")  # → "npu:2"

        # On a Cambricon system:
        translate_device_string("cuda")    # → "mlu"
        translate_device_string("cuda:0")  # → "mlu:0"

        # On CPU:
        translate_device_string("cuda")    # → "cpu"
    """
    backend = preferred_backend()

    # Backends that need "cuda" → their device type translation
    _TRANSLATE_BACKENDS = {
        Backend.NPU: "npu",
        Backend.MLU: "mlu",
        Backend.XPU: "xpu",
        # ROCm does NOT need translation — it presents as "cuda" via HIP
    }

    if backend in _TRANSLATE_BACKENDS and device.startswith("cuda"):
        target_type = _TRANSLATE_BACKENDS[backend]
        translated = device.replace("cuda", target_type, 1)
        logger.debug("Device string translated: %r → %r", device, translated)
        return translated

    if backend == Backend.CPU and device.startswith("cuda"):
        # No accelerator at all — fall back to CPU so the code doesn't crash.
        translated = "cpu"
        logger.warning(
            "No GPU/NPU/MLU available — translating device %r → 'cpu'. "
            "Performance will be significantly lower.",
            device,
        )
        return translated

    return device
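
A short sketch of how the helpers above are meant to be consumed by the rest
of cuda-morph (and by user code), assuming PyTorch is importable; on a
machine with no accelerator everything degrades to ``Backend.CPU``::

    from ascend_compat._backend import (
        detect_backends,
        has_npu,
        preferred_backend,
        translate_device_string,
    )

    print(detect_backends())     # e.g. (<Backend.NPU: 'npu'>, <Backend.CPU: 'cpu'>)
    print(preferred_backend())   # first entry of the tuple above
    print(has_npu())             # True only when torch_npu reports a device

    # The translation the cuda_shim layer applies to hard-coded device strings:
    print(translate_device_string("cuda:1"))   # "npu:1" on Ascend, "cpu" if CPU-only
    print(translate_device_string("cpu"))      # non-CUDA strings pass through unchanged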

ascend_compat/_exceptions.py
@@ -0,0 +1,69 @@
"""Custom exception hierarchy for cuda-morph.

Provides distinct exception types so consumers can distinguish
cuda-morph errors from unrelated failures in their except clauses.

Usage::

    from ascend_compat._exceptions import ActivationError

    try:
        ascend_compat.activate()
    except ActivationError as e:
        print(f"Shim activation failed: {e}")
"""

from __future__ import annotations


class AscendCompatError(Exception):
    """Base exception for all cuda-morph errors.

    Catch this to handle any error raised by the library without
    catching unrelated exceptions.
    """


class ActivationError(AscendCompatError):
    """Raised when shim activation fails (e.g. patch application error).

    The shim guarantees atomic rollback — if this is raised, no patches
    were left in a half-applied state.
    """


class BackendNotFoundError(AscendCompatError):
    """Raised when a required backend or adapter is not available.

    Examples:
        - ``torch_npu`` is not installed but NPU operations are requested
        - ``npu_fusion_attention`` is missing from torch_npu
    """


class PatchError(AscendCompatError):
    """Raised when an individual patch cannot be applied or reverted."""


class CompatibilityError(AscendCompatError):
    """Raised when a version or compatibility check fails hard.

    Soft failures emit warnings; this is for fatal incompatibilities
    (e.g. known-bad torch_npu + PyTorch combinations).
    """


class PortError(AscendCompatError):
    """Raised when code porting/rewriting fails."""


class ValidationError(AscendCompatError):
    """Raised when operator verification fails."""


class SecurityError(AscendCompatError):
    """Raised when a security or integrity check fails."""


# Keep the public alias for backward compat
CudaMorphError = AscendCompatError
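
Because every class above derives from ``AscendCompatError``, callers can pick
how precisely to handle failures. A small sketch extending the ``Usage`` block
in the module docstring (which specific subclass ``activate()`` raises in a
given situation is up to the cuda_shim internals — the point here is only the
``except`` ordering)::

    import ascend_compat
    from ascend_compat._exceptions import ActivationError, AscendCompatError

    try:
        ascend_compat.activate()
    except ActivationError as exc:
        # Documented guarantee above: patches were rolled back atomically.
        print(f"Shim activation failed: {exc}")
    except AscendCompatError as exc:
        # Any other cuda-morph error (PatchError, CompatibilityError, ...),
        # caught without swallowing unrelated exceptions.
        print(f"cuda-morph error: {exc}")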