cuequivariance-ops-cu12 0.4.0__py3-none-manylinux_2_39_aarch64.whl → 0.5.1__py3-none-manylinux_2_39_aarch64.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective registries. It is provided for informational purposes only.

Potentially problematic release: this version of cuequivariance-ops-cu12 might be problematic.

Files changed (28)
  1. cuequivariance_ops/VERSION +1 -1
  2. cuequivariance_ops/__init__.py +3 -2
  3. cuequivariance_ops/equivariance/dtypes.hh +21 -0
  4. cuequivariance_ops/equivariance/indexed_linear.hh +36 -0
  5. cuequivariance_ops/equivariance/run_fmha.h +192 -0
  6. cuequivariance_ops/equivariance/run_fmha_cudafree.h +77 -0
  7. cuequivariance_ops/equivariance/tensor_product_uniform_1d_jit.hh +17 -35
  8. cuequivariance_ops/lib/libcue_ops.so +0 -0
  9. cuequivariance_ops/triton/__init__.py +29 -0
  10. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.10.0.json +37192 -0
  11. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.0.json +37133 -0
  12. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.6.json +37133 -0
  13. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.9.json +37132 -0
  14. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.9.0.json +74262 -0
  15. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.10.0.json +48482 -0
  16. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.0.json +55693 -0
  17. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.6.json +55692 -0
  18. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.9.json +55693 -0
  19. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.9.0.json +111382 -0
  20. cuequivariance_ops/triton/cache_manager.py +244 -0
  21. cuequivariance_ops/triton/fused_layer_norm_triton.py +324 -0
  22. cuequivariance_ops/triton/gated_gemm_triton.py +340 -0
  23. cuequivariance_ops/triton/tuning_decorator.py +272 -0
  24. {cuequivariance_ops_cu12-0.4.0.dist-info → cuequivariance_ops_cu12-0.5.1.dist-info}/METADATA +5 -1
  25. cuequivariance_ops_cu12-0.5.1.dist-info/RECORD +32 -0
  26. {cuequivariance_ops_cu12-0.4.0.dist-info → cuequivariance_ops_cu12-0.5.1.dist-info}/WHEEL +1 -1
  27. cuequivariance_ops_cu12-0.4.0.dist-info/RECORD +0 -13
  28. {cuequivariance_ops_cu12-0.4.0.dist-info → cuequivariance_ops_cu12-0.5.1.dist-info}/licenses/LICENSE +0 -0
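The ten .json files under triton/cache/ listed above ship pre-tuned kernel configurations keyed by CUDA compute capability (8.0, 8.6, 8.9, 9.0, 10.0). A hypothetical helper showing how the suffix for the current GPU could be derived; the mapping is inferred from the filenames, not a documented API:

import torch

def bundled_cache_suffix() -> str:
    # Hypothetical: derive the "major.minor" suffix used by the bundled
    # cache files (e.g. "...forward_kernel_wrapper.9.0.json") for the
    # current CUDA device.
    major, minor = torch.cuda.get_device_capability()
    return f"{major}.{minor}"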
cuequivariance_ops/triton/gated_gemm_triton.py
@@ -0,0 +1,340 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ #
+ # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ # property and proprietary rights in and to this material, related
+ # documentation and any modifications thereto. Any use, reproduction,
+ # disclosure or distribution of this material and related documentation
+ # without an express license agreement from NVIDIA CORPORATION or
+ # its affiliates is strictly prohibited.
+
+ import enum
+
+ import triton
+ import triton.language as tl
+
+
+ class Precision(enum.Enum):
+     DEFAULT = 0
+     TF32 = 1
+     TF32x3 = 2
+     IEEE = 3
+
+
+ @triton.jit
+ def fused_sigmoid_gated_dual_gemm_forward_kernel(
+     x1_ptr,
+     x2_ptr,
+     w1_ptr,
+     w2_ptr,
+     mask_ptr,
+     o_ptr,
+     M,
+     N,
+     K,
+     TILE_M: tl.constexpr,
+     TILE_N: tl.constexpr,
+     TILE_K: tl.constexpr,
+     PRECISION: tl.constexpr,
+     APPLY_MASK: tl.constexpr,
+     TRANSPOSE_OUT: tl.constexpr,
+     TWO_INPUTS: tl.constexpr,
+ ):
+     # fully gated GEMM kernel with optional mask at the end
+     pid_m = tl.program_id(axis=0)
+     pid_n = tl.program_id(axis=1)
+
+     start_m = pid_m * TILE_M
+     start_n = pid_n * TILE_N
+
+     offs_xm = start_m + tl.arange(0, TILE_M)
+     offs_wn = start_n + tl.arange(0, TILE_N)
+     offs_k = tl.arange(0, TILE_K)
+
+     x1_ptrs = x1_ptr + (offs_xm[:, None] * K + offs_k[None, :])
+     if TWO_INPUTS:
+         x2_ptrs = x2_ptr + (offs_xm[:, None] * K + offs_k[None, :])
+
+     w_tile_offs = offs_wn[None, :] * K + offs_k[:, None]
+
+     acc_1 = tl.zeros((TILE_M, TILE_N), dtype=tl.float32)
+     acc_2 = tl.zeros((TILE_M, TILE_N), dtype=tl.float32)
+
+     mask_m = offs_xm < M
+
+     if TWO_INPUTS:
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x1 = tl.load(x1_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w1_ptr.type.element_ty
+             )
+             w1_ptrs = w1_ptr + w_tile_offs
+             w1 = tl.load(w1_ptrs)
+             if PRECISION == 0:
+                 acc_1 = tl.dot(x1, w1, acc_1)
+             elif PRECISION == 1:
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x1_ptrs += TILE_K
+             w1_ptr += TILE_K
+
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x2 = tl.load(x2_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w2_ptr.type.element_ty
+             )
+             w2_ptrs = w2_ptr + w_tile_offs
+             w2 = tl.load(w2_ptrs)
+             if PRECISION == 0:
+                 acc_2 = tl.dot(x2, w2, acc_2)
+             elif PRECISION == 1:
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x2_ptrs += TILE_K
+             w2_ptr += TILE_K
+
+     else:
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x = tl.load(x1_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w1_ptr.type.element_ty
+             )
+
+             w1_ptrs = w1_ptr + w_tile_offs
+             w1 = tl.load(w1_ptrs)
+             if PRECISION == 0:
+                 acc_1 = tl.dot(x, w1, acc_1)
+             elif PRECISION == 1:
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             w2_ptrs = w2_ptr + w_tile_offs
+             w2 = tl.load(w2_ptrs)
+             if PRECISION == 0:
+                 acc_2 = tl.dot(x, w2, acc_2)
+             elif PRECISION == 1:
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x1_ptrs += TILE_K
+             w1_ptr += TILE_K
+             w2_ptr += TILE_K
+
+     offs_om = pid_m * TILE_M + tl.arange(0, TILE_M)
+     offs_on = pid_n * TILE_N + tl.arange(0, TILE_N)
+
+     acc_1 = 1.0 / (1.0 + tl.exp(-acc_1))
+     acc_gated = acc_1 * acc_2
+
+     if APPLY_MASK:
+         mask = tl.load(mask_ptr + offs_om, mask=mask_m, other=0.0)
+         acc_gated = acc_gated * mask[:, None]
+
+     acc_gated = acc_gated.to(o_ptr.dtype.element_ty)
+
+     if TRANSPOSE_OUT:
+         o_ptrs = o_ptr + offs_on[None, :] * M + offs_om[:, None]
+     else:
+         o_ptrs = o_ptr + offs_om[:, None] * N + offs_on[None, :]
+
+     o_mask = offs_om[:, None] < M
+     tl.store(o_ptrs, acc_gated, mask=o_mask)
+
+
+ @triton.jit
+ def fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel(
+     grad_xw1_ptr,
+     grad_xw2_ptr,
+     grad_mask_ptr,
+     grad_o_ptr,
+     x1_ptr,
+     x2_ptr,
+     w1_ptr,
+     w2_ptr,
+     mask_ptr,
+     M,
+     N,
+     K,
+     TILE_M: tl.constexpr,
+     TILE_N: tl.constexpr,
+     TILE_K: tl.constexpr,
+     PRECISION: tl.constexpr,
+     APPLY_MASK: tl.constexpr,
+     TRANSPOSE_OUT: tl.constexpr,
+     TWO_INPUTS: tl.constexpr,
+ ):
+     # recomputes the gated GEMM, then forms grads w.r.t. both pre-gate GEMM outputs
+     pid_m = tl.program_id(axis=0)
+     pid_n = tl.program_id(axis=1)
+
+     start_m = pid_m * TILE_M
+     start_n = pid_n * TILE_N
+
+     offs_xm = start_m + tl.arange(0, TILE_M)
+     offs_wn = start_n + tl.arange(0, TILE_N)
+     offs_k = tl.arange(0, TILE_K)
+
+     x1_ptrs = x1_ptr + (offs_xm[:, None] * K + offs_k[None, :])
+     if TWO_INPUTS:
+         x2_ptrs = x2_ptr + (offs_xm[:, None] * K + offs_k[None, :])
+     w_tile_offs = offs_wn[None, :] * K + offs_k[:, None]
+
+     acc_1 = tl.zeros((TILE_M, TILE_N), dtype=tl.float32)
+     acc_2 = tl.zeros((TILE_M, TILE_N), dtype=tl.float32)
+
+     mask_m = offs_xm < M
+
+     if TWO_INPUTS:
+         # recompute acc1 and acc2
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x1 = tl.load(x1_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w1_ptr.type.element_ty
+             )
+             w1_ptrs = w1_ptr + w_tile_offs
+             w1 = tl.load(w1_ptrs)
+
+             if PRECISION == 0:
+                 acc_1 = tl.dot(x1, w1, acc_1)
+             elif PRECISION == 1:
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x1_ptrs += TILE_K
+             w1_ptr += TILE_K
+
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x2 = tl.load(x2_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w2_ptr.type.element_ty
+             )
+             w2_ptrs = w2_ptr + w_tile_offs
+             w2 = tl.load(w2_ptrs)
+
+             if PRECISION == 0:
+                 acc_2 = tl.dot(x2, w2, acc_2)
+             elif PRECISION == 1:
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x2_ptrs += TILE_K
+             w2_ptr += TILE_K
+
+     else:
+         # recompute acc1 and acc2
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x = tl.load(x1_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w1_ptr.type.element_ty
+             )
+
+             w1_ptrs = w1_ptr + w_tile_offs
+             w1 = tl.load(w1_ptrs)
+             if PRECISION == 0:
+                 acc_1 = tl.dot(x, w1, acc_1)
+             elif PRECISION == 1:
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             w2_ptrs = w2_ptr + w_tile_offs
+             w2 = tl.load(w2_ptrs)
+             if PRECISION == 0:
+                 acc_2 = tl.dot(x, w2, acc_2)
+             elif PRECISION == 1:
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x1_ptrs += TILE_K
+             w1_ptr += TILE_K
+             w2_ptr += TILE_K
+
+     offs_om = pid_m * TILE_M + tl.arange(0, TILE_M)
+     offs_on = pid_n * TILE_N + tl.arange(0, TILE_N)
+     if TRANSPOSE_OUT:
+         grad_o_ptrs = grad_o_ptr + offs_on[None, :] * M + offs_om[:, None]
+     else:
+         grad_o_ptrs = grad_o_ptr + offs_om[:, None] * N + offs_on[None, :]
+
+     grad_o = tl.load(grad_o_ptrs, mask=mask_m[:, None], other=0.0).to(tl.float32)
+
+     acc_sig = 1.0 / (1.0 + tl.exp(-acc_1))
+
+     if APPLY_MASK:
+         tmp = acc_sig * acc_2
+         grad_mask = grad_o * tmp
+         grad_mask = tl.sum(grad_mask, axis=1)
+         grad_mask_ptrs = grad_mask_ptr + pid_n * M + offs_om
+         tl.store(grad_mask_ptrs, grad_mask.to(grad_mask_ptr.dtype.element_ty), mask=mask_m)
+
+         mask = tl.load(mask_ptr + offs_om, mask=mask_m, other=0.0)
+         grad_o = grad_o * mask[:, None]
+
+     tmp = (1.0 - acc_sig) * acc_sig
+
+     grad_xw1 = grad_o * acc_2 * tmp
+     grad_xw2 = grad_o * acc_sig
+
+     grad_xw1_ptrs = grad_xw1_ptr + offs_om[:, None] * N + offs_on[None, :]
+     grad_xw2_ptrs = grad_xw2_ptr + offs_om[:, None] * N + offs_on[None, :]
+     tl.store(grad_xw1_ptrs, grad_xw1.to(grad_xw1_ptr.dtype.element_ty), mask=mask_m[:, None])
+     tl.store(grad_xw2_ptrs, grad_xw2.to(grad_xw2_ptr.dtype.element_ty), mask=mask_m[:, None])
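For orientation, a minimal PyTorch sketch of what the two kernels above compute, inferred from the diff; the helper names and signatures here are ours, not part of the package, and the real kernels tile both GEMMs, honor the PRECISION modes, and can write a transposed output.

import torch

def fused_sigmoid_gated_dual_gemm_ref(x1, w1, w2, x2=None, mask=None, transpose_out=False):
    # w1/w2 are indexed as [N, K] tiles in the kernel (offs_wn * K + offs_k),
    # so each GEMM multiplies by the transposed weight; accumulation is fp32.
    if x2 is None:  # TWO_INPUTS == False: both GEMMs share the same input
        x2 = x1
    xw1 = x1.float() @ w1.float().t()
    xw2 = x2.float() @ w2.float().t()
    out = torch.sigmoid(xw1) * xw2  # sigmoid(x1 @ w1.T) gates x2 @ w2.T
    if mask is not None:  # APPLY_MASK: per-row mask applied after the gate
        out = out * mask.float()[:, None]
    if transpose_out:  # TRANSPOSE_OUT lays the result out as [N, M]
        out = out.t()
    return out.to(x1.dtype)

def backward_pregemm_ref(grad_o, xw1, xw2, mask=None):
    # Grads w.r.t. the two pre-gate GEMM outputs. Note the Triton kernel
    # stores grad_mask as per-N-tile partial sums (grad_mask_ptr + pid_n * M
    # + offs_om); here it is reduced over all of N at once.
    sig = torch.sigmoid(xw1.float())
    grad_o = grad_o.float()
    grad_mask = (grad_o * sig * xw2.float()).sum(dim=1) if mask is not None else None
    if mask is not None:
        grad_o = grad_o * mask.float()[:, None]
    grad_xw1 = grad_o * xw2.float() * sig * (1.0 - sig)  # sigmoid' = sig * (1 - sig)
    grad_xw2 = grad_o * sig
    return grad_xw1, grad_xw2, grad_mask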
cuequivariance_ops/triton/tuning_decorator.py
@@ -0,0 +1,272 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ #
+ # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ # property and proprietary rights in and to this material, related
+ # documentation and any modifications thereto. Any use, reproduction,
+ # disclosure or distribution of this material and related documentation
+ # without an express license agreement from NVIDIA CORPORATION or
+ # its affiliates is strictly prohibited.
+
+ import inspect
+ import logging  # Added logging import
+ from enum import Enum
+ from typing import Any, Callable
+
+ import torch
+ from tqdm import tqdm
+
+ from .cache_manager import get_cache_manager
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+
+ class BenchmarkMode(Enum):
+     FLUSH_CACHE = 0
+     FLUSH_CACHE_PEAK_PROXY = 1
+     ROT_BUFFER = 2
+     ROT_BUFFER_PEAK_PROXY = 3
+
+
+ def run_bench(
+     f, input_dict, warmup_iter=250, run_iter=250, bench_mode=BenchmarkMode.ROT_BUFFER
+ ):
+     initial_output = f(**input_dict)
+
+     if bench_mode in (BenchmarkMode.ROT_BUFFER, BenchmarkMode.ROT_BUFFER_PEAK_PROXY):
+         len_rot = 4
+         inputs_rot = [None] * len_rot
+         for r in range(len_rot):
+             r_inputs = []
+             for key, value in input_dict.items():
+                 if isinstance(value, torch.Tensor):
+                     if bench_mode == BenchmarkMode.ROT_BUFFER_PEAK_PROXY:
+                         r_inputs.append(
+                             (
+                                 key,
+                                 torch.ones_like(
+                                     value, requires_grad=value.requires_grad
+                                 ),
+                             )
+                         )
+                     else:
+                         r_inputs.append(
+                             (
+                                 key,
+                                 torch.randn_like(
+                                     value, requires_grad=value.requires_grad
+                                 ),
+                             )
+                         )
+                 else:
+                     r_inputs.append((key, value))
+             r_inputs = dict(r_inputs)
+             inputs_rot[r] = r_inputs
+
+         for it in range(warmup_iter):
+             _ = f(**inputs_rot[it % len_rot])
+
+         start = torch.cuda.Event(enable_timing=True)
+         end = torch.cuda.Event(enable_timing=True)
+         start.record()
+         for it in range(run_iter):
+             _ = f(**inputs_rot[it % len_rot])
+         end.record()
+         torch.cuda.synchronize()
+         elapsed = start.elapsed_time(end)
+
+     elif bench_mode in (
+         BenchmarkMode.FLUSH_CACHE,
+         BenchmarkMode.FLUSH_CACHE_PEAK_PROXY,
+     ):
+         cache_filler = torch.empty(1024 * 1024 * 256, dtype=torch.int8, device="cuda")
+
+         if bench_mode == BenchmarkMode.FLUSH_CACHE_PEAK_PROXY:
+             _inputs = []
+             for key, value in input_dict.items():
+                 if isinstance(value, torch.Tensor):
+                     _inputs.append(
+                         (key, torch.ones_like(value, requires_grad=value.requires_grad))
+                     )
+                 else:
+                     _inputs.append((key, value))
+             input_dict = dict(_inputs)
+
+         for _ in range(warmup_iter):
+             cache_filler.zero_()
+             _ = f(**input_dict)
+
+         starts = [torch.cuda.Event(enable_timing=True) for _ in range(run_iter)]
+         ends = [torch.cuda.Event(enable_timing=True) for _ in range(run_iter)]
+         for i in range(run_iter):
+             cache_filler.zero_()
+             starts[i].record()
+             _ = f(**input_dict)
+             ends[i].record()
+         torch.cuda.synchronize()
+         elapsed = sum(s.elapsed_time(e) for s, e in zip(starts, ends))
+
+     return elapsed / run_iter, initial_output
+
+
+ def input_to_key_default(**args) -> str:
+     key_parts = []
+     for arg in args.values():
+         if isinstance(arg, torch.Tensor):
+             key_parts.append(f"{list(arg.shape)}_{arg.dtype}")
+         elif isinstance(arg, bool):
+             key_parts.append("True" if arg else "False")
+         elif isinstance(arg, str):
+             key_parts.append(arg)
+         else:
+             key_parts.append(str(arg.__class__.__name__))
+
+     return "_".join(key_parts)
+
+
+ def combine_all_kwargs(
+     fn: Callable,
+     args: tuple,
+     kwargs: dict[str, Any],
+ ) -> dict[str, Any]:
+     # Get the function signature
+     sig = inspect.signature(fn)
+     params = sig.parameters
+     param_names = list(params.keys())
+
+     # Create dictionary of default values
+     defaults = {
+         name: param.default
+         for name, param in params.items()
+         if param.default is not inspect.Parameter.empty
+     }
+     # Create dictionary mapping positional args to parameter names
+     args_as_kwargs = {
+         param_names[i]: args[i] for i in range(min(len(args), len(param_names)))
+     }
+     # Create combined dictionary of all parameters
+     all_kwargs = defaults.copy()  # Start with defaults
+     all_kwargs.update(args_as_kwargs)  # Override with positional args
+     all_kwargs.update(kwargs)  # Override with explicit kwargs
+
+     return all_kwargs
+
+
+ def autotune_aot(
+     input_generator: Callable,
+     input_to_key: Callable | None,
+     input_configs: list[dict[str, Any]],
+     tunable_configs: list[dict[str, Any]],
+     prune_configs_fn: Callable[
+         [list[dict[str, Any]], dict[str, Any]], list[dict[str, Any]]
+     ]
+     | None,
+     bench_mode=BenchmarkMode.ROT_BUFFER,
+     warmup_iter=25,
+     run_iter=100,
+ ) -> Callable:
+     def decorator(fn: Callable) -> Callable:
+         def wrapper(*args, **kwargs):
+             all_kwargs = combine_all_kwargs(fn, args, kwargs)
+             nonlocal input_to_key
+             nonlocal input_configs
+
+             if input_to_key is None:
+                 input_to_key = input_to_key_default
+
+             # Check if the function is already cached
+             function_key = fn.__name__
+             input_key = input_to_key(**all_kwargs)
+             cache_manager = get_cache_manager()
+             best_cached_config = cache_manager.get(function_key, input_key)
+
+             aot_mode = cache_manager.aot_mode
+
+             if best_cached_config is None and aot_mode is not None:
+                 # start autotuning process
+                 # input_configs = input_configs + [None]
+                 if aot_mode == "ONDEMAND":
+                     input_configs = [None]
+
+                 try:
+                     # Initialize the progress bar
+                     progress_bar = tqdm(
+                         input_configs, desc="Autotuning Progress", unit="config"
+                     )
+
+                     for input_config in progress_bar:
+                         # generate input based on the config
+                         input_data = (
+                             input_generator(**input_config)
+                             if input_config is not None
+                             else all_kwargs
+                         )
+
+                         # Make a copy of all_kwargs to avoid modifying the original
+                         current_kwargs = all_kwargs.copy()
+                         current_kwargs.update(input_data)
+                         current_input_key = input_to_key(**current_kwargs)
+
+                         best_cached_config = cache_manager.get(
+                             function_key, current_input_key
+                         )
+
+                         if best_cached_config is not None:
+                             continue
+
+                         # prune the tunable configs based on the all_kwargs
+                         pruned_tunable_configs = (
+                             prune_configs_fn(tunable_configs, **all_kwargs)
+                             if prune_configs_fn is not None
+                             else tunable_configs
+                         )
+
+                         best_config = None
+                         best_time = float("inf")
+                         working_config = []
+                         for tunable in pruned_tunable_configs:
+                             try:
+                                 current_kwargs.update(tunable)
+                                 fn(**current_kwargs)
+                                 torch.cuda.synchronize()
+                                 working_config.append(tunable)
+                             except Exception:
+                                 pass
+
+                         if not working_config:
+                             continue
+
+                         for tunable in working_config:
+                             current_kwargs.update(tunable)
+                             elapse, _ = run_bench(
+                                 fn,
+                                 current_kwargs,
+                                 warmup_iter=warmup_iter,
+                                 run_iter=run_iter,
+                                 bench_mode=bench_mode,
+                             )
+                             if elapse < best_time:
+                                 best_time = elapse
+                                 best_config = tunable
+
+                         cache_manager.set(
+                             function_key,
+                             current_input_key,
+                             {"config": best_config, "time": best_time},
+                         )
+                         cache_manager.save_cache(function_key)
+                 except Exception as e:
+                     logger.warning(e)
+
+                 # After tuning, try to get the best config
+                 best_cached_config = cache_manager.get(function_key, input_key)
+
+             if best_cached_config is not None:
+                 all_kwargs.update(best_cached_config["config"])
+
+             return fn(**all_kwargs)
+
+         return wrapper
+
+     return decorator
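A sketch of how the decorator above might be applied; the decorated function, shapes, and config dictionaries are illustrative assumptions, not values shipped with the package:

import torch
from cuequivariance_ops.triton.tuning_decorator import BenchmarkMode, autotune_aot

def make_inputs(M, K):
    # Hypothetical generator: builds the tensors for one pre-tuned shape.
    return {"x": torch.randn(M, K, device="cuda")}

@autotune_aot(
    input_generator=make_inputs,
    input_to_key=None,  # falls back to input_to_key_default
    input_configs=[{"M": 1024, "K": 128}],  # shapes to tune ahead of time
    tunable_configs=[{"TILE_M": 64}, {"TILE_M": 128}],  # candidate launch params
    prune_configs_fn=None,
    bench_mode=BenchmarkMode.ROT_BUFFER,
)
def my_op(x, TILE_M=64):
    ...  # launch a Triton kernel with the chosen TILE_M

Each call derives a key from its inputs, consults the cache manager, autotunes on a miss (only the live inputs when aot_mode is "ONDEMAND"), persists the winner via save_cache, and finally invokes the function with the best recorded config merged into its keyword arguments.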
{cuequivariance_ops_cu12-0.4.0.dist-info → cuequivariance_ops_cu12-0.5.1.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: cuequivariance-ops-cu12
- Version: 0.4.0
+ Version: 0.5.1
  Summary: cuequivariance-ops - GPU Accelerated Extensions for Equivariant Primitives
  Author: NVIDIA Corporation
  License: # Software License Agreement
@@ -177,6 +177,10 @@ Classifier: Programming Language :: Python
  Project-URL: Homepage, https://github.com/nvidia/cuEquivariance
  Project-URL: Documentation, https://github.com/nvidia/cuEquivariance
  Requires-Python: >=3.10
+ Requires-Dist: nvidia-cublas-cu12>=12.5.0
+ Requires-Dist: tqdm
+ Requires-Dist: pynvml
+ Requires-Dist: platformdirs
  Provides-Extra: test
  Requires-Dist: numpy; extra == "test"
  Requires-Dist: pytest; extra == "test"
cuequivariance_ops_cu12-0.5.1.dist-info/RECORD
@@ -0,0 +1,32 @@
+ cuequivariance_ops/VERSION,sha256=q6lRYmyGkM5JPLyPAYIFu0aAA_YfvwD9PTxxzrq8AXc,6
+ cuequivariance_ops/__init__.py,sha256=wvvAMuXpOg5W4oE-AnHDWoHPzcamAWK_DiUXyg3hgW8,1332
+ cuequivariance_ops/_version.py,sha256=o9Flao_mTq2Y7TrrjnSCqEAgebmA0sGozsl15qVI13Y,730
+ cuequivariance_ops/common/common.hpp,sha256=2zDyE5lGugQL43vmM4_ylmp-Tz8OBFnPRsdFra_1BdM,2787
+ cuequivariance_ops/common/nvtx.hpp,sha256=Wi6z9b-yFUNq6ShJjjcsdxQRqCygd4xGegGJrqUI9Wk,708
+ cuequivariance_ops/equivariance/dtypes.hh,sha256=w0BYWZ0LYklODXhp7PR6VYE__DE1Syj0Ur11aFaq9VM,466
+ cuequivariance_ops/equivariance/fused_tensor_product.cuh,sha256=bOXR5UWU9gNYRfdh6k28NEkV3CUU2ijmh6y7c0ND0J4,8283
+ cuequivariance_ops/equivariance/indexed_linear.hh,sha256=lNqJNafJdPyMAUp6iwWvu6RyassSXh7JqyqJ4bfjoxQ,1402
+ cuequivariance_ops/equivariance/run_fmha.h,sha256=7l62dTQJbX7BbHLB7MmVP1t26Cfpmcu3h6eY048Hof0,9505
+ cuequivariance_ops/equivariance/run_fmha_cudafree.h,sha256=bF2_nrvSfrqSVZ0eOPcq4CJ-NKqmJ2VgQv1cstvHBkU,2695
+ cuequivariance_ops/equivariance/segmented_transpose.cuh,sha256=gfSZhRBwSqwVAgFCCiGtI-NJ8yDy9tV_iCg1G2KpctY,1766
+ cuequivariance_ops/equivariance/tensor_product_uniform_1d_jit.hh,sha256=7PPybCWczS58XKA-iFLoCM7MDEomO4-enF6RCBj5G5M,1922
+ cuequivariance_ops/lib/libcue_ops.so,sha256=rSk0Km-M-Zu703bOMV-OF8mykMPOgFTyx73OW_r8ofM,112269112
+ cuequivariance_ops/triton/__init__.py,sha256=LCHvxif4kwr0Squy7mjgx0NCUyM2AcOjkDg5CXZZtuA,1053
+ cuequivariance_ops/triton/cache_manager.py,sha256=sXlbuCKsoRMEc1wQVcdk7Vk18LJdej97Ve9FkxdQRYU,9154
+ cuequivariance_ops/triton/fused_layer_norm_triton.py,sha256=SyQf_eJvTKm3Foe8BI0sjWZdEtGp__LQ1qgImlHLc4c,11056
+ cuequivariance_ops/triton/gated_gemm_triton.py,sha256=PEJcgNVZUk8G5Z5ukD8Ksbe71kUWHUufjUINI5JGnV8,11405
+ cuequivariance_ops/triton/tuning_decorator.py,sha256=ruN_Ck5Np6a09slN-VbK0uF4IEgR9WxNIVK4bKmPKzo,9813
+ cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.10.0.json,sha256=4Gi4yJ_I-smVSPzEWZn_kZWktn6BI5sNFk5wJDE4aH8,1397798
+ cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.0.json,sha256=HUK_ayOTS5WrJy_W_sVxyBSADLubp69spp8lfjbkHX8,1392688
+ cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.6.json,sha256=HFZDB_XzoSSg1DToHV297NKLrCSycsA4QUQ_aOSelOs,1392431
+ cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.9.json,sha256=eU6hDvUU8YAxCGydnXd6Dnl9x6xo52KonOBxRbrvBgw,1392528
+ cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.9.0.json,sha256=3wruFbYFOLXQZXQg0FQPN11X8YYeHCowrARr61yTbew,2785263
+ cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.10.0.json,sha256=9hMqFldfcq4rFnKalSNm5vO_bUbIw66MO-BrqhSodRI,1754706
+ cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.0.json,sha256=grlXjI18H5d71mIcYY8sgo6s4Ssz3aUSCK4rMCP0dtc,2011725
+ cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.6.json,sha256=ndWI0VB8R0RjvEC2JPZz_GbQQ_xfg6zEfrvmSgSMMMw,2010879
+ cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.9.json,sha256=biWhNBUqJiYNuuC1hyimbUTndFINBrQpAHEcXFOluN0,2011532
+ cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.9.0.json,sha256=3_v5C0cW_Ab3rNC5cu-SBRUQ7Ala18fvzvytSPa-KHI,4025067
+ cuequivariance_ops_cu12-0.5.1.dist-info/METADATA,sha256=PKjZtsBFTYmyoPzltH0OwMWljL8FJRJgWSJSp8tTDFU,20954
+ cuequivariance_ops_cu12-0.5.1.dist-info/WHEEL,sha256=RxM28Avh4PDgHOLX-AZLV1MP0dIb1yycxVPEx6_SFW0,116
+ cuequivariance_ops_cu12-0.5.1.dist-info/RECORD,,
+ cuequivariance_ops_cu12-0.5.1.dist-info/licenses/LICENSE,sha256=rvp0QV9FuOdxz_CGWTd9DgId4xh2BByyXfBBnb0ejZM,18279
{cuequivariance_ops_cu12-0.4.0.dist-info → cuequivariance_ops_cu12-0.5.1.dist-info}/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: scikit-build-core 0.11.1
+ Generator: scikit-build-core 0.11.4
  Root-Is-Purelib: false
  Tag: py3-none-manylinux_2_39_aarch64
cuequivariance_ops_cu12-0.4.0.dist-info/RECORD
@@ -1,13 +0,0 @@
- cuequivariance_ops/VERSION,sha256=QLjrQACpE6d5EJBTXykdPTaYdBYqie88nj1OiHobnnk,6
- cuequivariance_ops/__init__.py,sha256=ba7jv_WICRROtLbDU2O1u0MHxp6VkVu0-UGKuQxf9iw,1255
- cuequivariance_ops/_version.py,sha256=o9Flao_mTq2Y7TrrjnSCqEAgebmA0sGozsl15qVI13Y,730
- cuequivariance_ops/common/common.hpp,sha256=2zDyE5lGugQL43vmM4_ylmp-Tz8OBFnPRsdFra_1BdM,2787
- cuequivariance_ops/common/nvtx.hpp,sha256=Wi6z9b-yFUNq6ShJjjcsdxQRqCygd4xGegGJrqUI9Wk,708
- cuequivariance_ops/equivariance/fused_tensor_product.cuh,sha256=bOXR5UWU9gNYRfdh6k28NEkV3CUU2ijmh6y7c0ND0J4,8283
- cuequivariance_ops/equivariance/segmented_transpose.cuh,sha256=gfSZhRBwSqwVAgFCCiGtI-NJ8yDy9tV_iCg1G2KpctY,1766
- cuequivariance_ops/equivariance/tensor_product_uniform_1d_jit.hh,sha256=oWhSS0ZmMHlye8eTucweoGBtzN1H0nN1GX_Rz-MsPqI,2002
- cuequivariance_ops/lib/libcue_ops.so,sha256=VQP3gnNy4jpVad__bbaZiakvx5c8J63Q2knrvfIRLJc,81794536
- cuequivariance_ops_cu12-0.4.0.dist-info/METADATA,sha256=na1Ly8dpRX4aVspiUYshcsTnjVmh0zsW13AtoteoEJ0,20842
- cuequivariance_ops_cu12-0.4.0.dist-info/WHEEL,sha256=teK9zuS7Jv7dMHQejkMfDwwTIgdimcBypnObHv4zSrs,116
- cuequivariance_ops_cu12-0.4.0.dist-info/RECORD,,
- cuequivariance_ops_cu12-0.4.0.dist-info/licenses/LICENSE,sha256=rvp0QV9FuOdxz_CGWTd9DgId4xh2BByyXfBBnb0ejZM,18279