ista_daslab_optimizers-1.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ista_daslab_optimizers/__init__.py +6 -0
- ista_daslab_optimizers/acdc/__init__.py +5 -0
- ista_daslab_optimizers/acdc/acdc.py +387 -0
- ista_daslab_optimizers/acdc/wd_scheduler.py +31 -0
- ista_daslab_optimizers/dense_mfac/__init__.py +5 -0
- ista_daslab_optimizers/dense_mfac/dense_core_mfac.py +164 -0
- ista_daslab_optimizers/dense_mfac/dense_mfac.py +93 -0
- ista_daslab_optimizers/fft_low_rank/dct_adamw.py +351 -0
- ista_daslab_optimizers/fft_low_rank/fft_projector.py +192 -0
- ista_daslab_optimizers/fft_low_rank/trion.py +242 -0
- ista_daslab_optimizers/ista_optimizer/__init__.py +5 -0
- ista_daslab_optimizers/ista_optimizer/ista_optimizer.py +36 -0
- ista_daslab_optimizers/micro_adam/__init__.py +5 -0
- ista_daslab_optimizers/micro_adam/micro_adam.py +402 -0
- ista_daslab_optimizers/sparse_mfac/__init__.py +7 -0
- ista_daslab_optimizers/sparse_mfac/sparse_core_mfac_w_ef.py +226 -0
- ista_daslab_optimizers/sparse_mfac/sparse_mfac.py +87 -0
- ista_daslab_optimizers/tools.py +218 -0
- ista_daslab_optimizers/utils/dct.py +45 -0
- ista_daslab_optimizers/utils/global_cache.py +45 -0
- ista_daslab_optimizers/utils/matrix_storage.py +58 -0
- ista_daslab_optimizers/utils/newton_schulz_triton.py +374 -0
- ista_daslab_optimizers/utils/quantizers.py +71 -0
- ista_daslab_optimizers/utils/schedulers.py +41 -0
- ista_daslab_optimizers-1.1.8.dist-info/METADATA +333 -0
- ista_daslab_optimizers-1.1.8.dist-info/RECORD +29 -0
- ista_daslab_optimizers-1.1.8.dist-info/WHEEL +5 -0
- ista_daslab_optimizers-1.1.8.dist-info/licenses/LICENSE +201 -0
- ista_daslab_optimizers-1.1.8.dist-info/top_level.txt +1 -0
ista_daslab_optimizers/utils/newton_schulz_triton.py
@@ -0,0 +1,374 @@
import torch
import triton
import triton.language as tl
from torch import Tensor


def _get_autotune_configs():
    return [
        triton.Config(
            {
                "BLOCK_SIZE_M": bm,
                "BLOCK_SIZE_N": bn,
                "BLOCK_SIZE_K": bk,
                "GROUP_SIZE_M": 8,
                "LOWER_UPPER": 1,
            },
            num_stages=stages,
            num_warps=warps,
        )
        for bm in [64, 128]
        for bn in [64, 128, 256]
        for bk in [64, 128]
        for stages, warps in [(3, 4), (3, 8), (4, 4)]
        if bm // bn <= 2 and bn // bm <= 2
    ]


@triton.jit
def _pid_to_block(
    pid,
    M,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """
    Helper function to map Triton program ID to (batch, row, col) of the output matrix.
    """
    # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(M, BLOCK_SIZE_N)

    # Map PID to a single matrix in batch
    batch_idx = pid // (num_pid_m * num_pid_n)
    pid = pid % (num_pid_m * num_pid_n)

    # Map PID to 2D grid of blocks
    pid_m = pid // num_pid_n
    pid_n = pid % num_pid_n
    pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)

    m_idx = pid_m * BLOCK_SIZE_M
    n_idx = pid_n * BLOCK_SIZE_N

    return batch_idx, m_idx, n_idx


@triton.autotune(
    configs=_get_autotune_configs(),
    key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"],
)
@triton.jit
def ns_line_1_kernel(
    A_ptr,
    C_ptr,
    M,
    K,
    a_stride_b,
    a_stride_r,
    a_stride_c,
    c_stride_b,
    c_stride_r,
    c_stride_c,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    LOWER_UPPER: tl.constexpr,
):
    """
    Input A has shape (M, K)
    Output C has shape (M, M)
    Compute C = A @ A.T
    """

    pid = tl.program_id(axis=0)
    batch_idx, m_idx, n_idx = _pid_to_block(
        pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
    )

    # Skip blocks that don't need to be computed
    skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx)
    skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx)
    if skip_block_below_diag or skip_block_above_diag:
        return

    # Index into one matrix of batch
    A_ptr += batch_idx * a_stride_b
    C_ptr += batch_idx * c_stride_b

    # Create pointer arrays for A and A.T
    offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c)
    at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Accumulate over blocks of K
    for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
        at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
        accumulator = tl.dot(a, at, accumulator)
        a_ptrs += BLOCK_SIZE_K * a_stride_c
        at_ptrs += BLOCK_SIZE_K * a_stride_c

    out_dtype = C_ptr.dtype.element_ty
    output = accumulator.to(out_dtype)

    # Store block of C
    offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c)
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
    tl.store(c_ptrs, output, mask=c_mask)

    # Store block of C mirrored across the diagonal
    c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c)
    c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
    tl.store(c_ptrs_t, output.T, mask=c_mask_t)


def ns_line_1(A: Tensor, *, out: Tensor = None):
    """
    Launch Triton kernel to compute C = A @ A.T
    """
    if A.ndim > 3 or A.ndim < 2:
        raise ValueError(f"Input tensor must be 2D or 3D, but got {A.ndim}D tensor.")

    M, K = A.shape[-2:]

    if out is None:
        out = torch.empty((*A.shape[:-1], M), device=A.device, dtype=A.dtype)
    assert out.size(-2) == M, "Output matrix has incorrect shape"
    assert out.size(-1) == M, "Output matrix has incorrect shape"

    batch_size = A.size(0) if A.ndim == 3 else 1
    input_batch_stride = A.stride(0) if A.ndim == 3 else 0
    output_batch_stride = out.stride(0) if out.ndim == 3 else 0

    grid = lambda meta: (
        batch_size
        * triton.cdiv(M, meta["BLOCK_SIZE_M"])
        * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
    )
    ns_line_1_kernel[grid](
        A_ptr=A,
        C_ptr=out,
        M=M,
        K=K,
        a_stride_b=input_batch_stride,
        a_stride_r=A.stride(-2),
        a_stride_c=A.stride(-1),
        c_stride_b=output_batch_stride,
        c_stride_r=out.stride(-2),
        c_stride_c=out.stride(-1),
    )

    return out


@triton.autotune(
    configs=_get_autotune_configs(),
    key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"],
)
@triton.jit
def ns_line_2_kernel(
    A_ptr,
    C_ptr,
    M,
    a_stride_b,
    a_stride_r,
    a_stride_c,
    c_stride_b,
    c_stride_r,
    c_stride_c,
    alpha,
    beta,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    LOWER_UPPER: tl.constexpr,
):
    """
    Input A is square matrix with shape (M, M)
    Output C has shape (M, M)
    Compute C = alpha * A @ A.T + beta * A
    """

    pid = tl.program_id(axis=0)
    batch_idx, m_idx, n_idx = _pid_to_block(
        pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
    )

    # Skip blocks that don't need to be computed
    skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx)
    skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx)
    if skip_block_below_diag or skip_block_above_diag:
        return

    # Index into one matrix of batch
    A_ptr += batch_idx * a_stride_b
    C_ptr += batch_idx * c_stride_b

    # Create pointer arrays for A and A.T
    offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c)
    at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Accumulate over blocks of K
    for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0)
        at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0)
        accumulator = tl.dot(a, at, accumulator)
        a_ptrs += BLOCK_SIZE_K * a_stride_c
        at_ptrs += BLOCK_SIZE_K * a_stride_c

    # Load block of A to add (corresponds to the current block of C)
    offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M)
    offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N)
    a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c)
    a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M)
    a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32)

    # Apply alpha and beta
    accumulator *= alpha
    accumulator += a_add * beta

    out_dtype = C_ptr.dtype.element_ty
    output = accumulator.to(out_dtype)

    # Store block of C
    offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c)
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
    tl.store(c_ptrs, output, mask=c_mask)

    # Store block of C mirrored across the diagonal
    c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c)
    c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
    tl.store(c_ptrs_t, output.T, mask=c_mask_t)


def ns_line_2(A: Tensor, alpha: float, beta: float, *, out: Tensor = None):
    """
    Launch Triton kernel to compute C = alpha * A @ A.T + beta * A
    """
    if A.ndim > 3 or A.ndim < 2:
        raise ValueError(f"Input tensor must be 2D or 3D, but got {A.ndim}D tensor.")

    M, K = A.shape[-2:]
    if M != K:
        raise ValueError(
            f"Input must be symmetric square matrix, but got shape {A.shape}"
        )

    if out is None:
        out = torch.empty((*A.shape[:-1], M), device=A.device, dtype=A.dtype)
    assert out.size(-2) == M, "Output matrix has incorrect shape"
    assert out.size(-1) == M, "Output matrix has incorrect shape"

    batch_size = A.size(0) if A.ndim == 3 else 1
    input_batch_stride = A.stride(0) if A.ndim == 3 else 0
    output_batch_stride = out.stride(0) if out.ndim == 3 else 0

    grid = lambda meta: (
        batch_size
        * triton.cdiv(M, meta["BLOCK_SIZE_M"])
        * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
    )
    ns_line_2_kernel[grid](
        A_ptr=A,
        C_ptr=out,
        M=M,
        a_stride_b=input_batch_stride,
        a_stride_r=A.stride(-2),
        a_stride_c=A.stride(-1),
        c_stride_b=output_batch_stride,
        c_stride_r=out.stride(-2),
        c_stride_c=out.stride(-1),
        alpha=alpha,
        beta=beta,
    )

    return out


@torch.compile(dynamic=False, fullgraph=True)
def zeropower_via_newtonschulz5(G: Tensor, epsilon: float = 1e-7):
    """
    Reference implementation of Newton-Schulz without Triton.
    """
    # Newton-Schulz constants
    ns_consts = [
        (4.0848, -6.8946, 2.9270),
        (3.9505, -6.3029, 2.6377),
        (3.7418, -5.5913, 2.3037),
        (2.8769, -3.1427, 1.2046),
        (2.8366, -3.0525, 1.2012),
    ]

    X = G.to(dtype=torch.bfloat16)
    if G.size(-2) > G.size(-1):
        X = X.mT

    # Ensure spectral norm is at most 1
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + epsilon)

    for a, b, c in ns_consts:
        A = X @ X.mT
        B = b * A + c * (A @ A)
        X = a * X + B @ X

    if G.size(-2) > G.size(-1):
        X = X.mT
    return X


@torch.compile(dynamic=False, fullgraph=True)
def newton_schulz_triton(G: Tensor, epsilon: float = 1e-7):
    """
    Triton implementation of Newton-Schulz iteration
    """
    # Newton-Schulz constants
    ns_consts = [
        (4.0848, -6.8946, 2.9270),
        (3.9505, -6.3029, 2.6377),
        (3.7418, -5.5913, 2.3037),
        (2.8769, -3.1427, 1.2046),
        (2.8366, -3.0525, 1.2012),
    ]

    X = G.to(dtype=torch.bfloat16)
    if G.size(-2) > G.size(-1):
        X = X.mT

    # Ensure spectral norm is at most 1
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + epsilon)

    # Allocate buffers
    X = X.contiguous()
    A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype)
    B = torch.empty_like(A)
    C = torch.empty_like(X)

    ns_line_3 = torch.baddbmm if X.ndim > 2 else torch.addmm

    # Perform the NS iterations
    for a, b, c in ns_consts:
        ns_line_1(X, out=A)  # A = X @ X.mT
        ns_line_2(A, alpha=c, beta=b, out=B)  # B = b * A + c * A @ A
        ns_line_3(X, B, X, beta=a, out=C)  # C = a * X + B @ X
        X, C = C, X  # Swap references to avoid unnecessary copies

    if G.size(-2) > G.size(-1):
        X = X.mT
    return X
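
Note (editor): the module above fuses only the first two lines of each Newton-Schulz iteration (the Gram product A = X Xᵀ and the polynomial term B = bA + cA²); the third line falls back to torch.addmm/baddbmm, and zeropower_via_newtonschulz5 is kept as a dense reference. The sketch below is not part of the package; it assumes a CUDA device, that the module is importable at ista_daslab_optimizers.utils.newton_schulz_triton, and that the installed PyTorch/Triton pair can compile the torch.compile-wrapped entry points. Tolerances are illustrative values chosen for bfloat16 outputs.

# Sanity-check sketch (assumptions: CUDA device; loose bf16 tolerances).
import torch

from ista_daslab_optimizers.utils.newton_schulz_triton import (
    ns_line_1,
    ns_line_2,
    newton_schulz_triton,
    zeropower_via_newtonschulz5,
)

torch.manual_seed(0)
G = torch.randn(512, 256, device="cuda", dtype=torch.bfloat16)

# ns_line_1 computes C = A @ A.T for a 2D or batched 3D input.
gram = ns_line_1(G)
torch.testing.assert_close(gram, G @ G.mT, rtol=2e-2, atol=1e-1)

# ns_line_2 computes C = alpha * A @ A.T + beta * A for a square input.
poly = ns_line_2(gram, alpha=0.5, beta=-1.0)
torch.testing.assert_close(poly, 0.5 * (gram @ gram.mT) - gram, rtol=2e-2, atol=1e-1)

# The full Triton iteration should track the dense reference closely.
X_triton = newton_schulz_triton(G)
X_ref = zeropower_via_newtonschulz5(G)
print("max |triton - reference|:", (X_triton - X_ref).abs().max().item())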

ista_daslab_optimizers/utils/quantizers.py
@@ -0,0 +1,71 @@
import torch
import numpy as np

class Quantizer4bit:
    def __init__(self, shape, device, dtype, bucket_size):
        assert np.prod(shape) % bucket_size == 0
        self.shape = shape
        self.device = device
        self.bucket_size = bucket_size
        self.numel = np.prod(shape)

        self.n_buckets = self.numel // self.bucket_size

        self.xq = torch.zeros(self.numel // 2, dtype=torch.uint8, device=self.device)
        self.x_min = torch.zeros(self.n_buckets, 1, dtype=dtype, device=self.device)
        self.x_max = torch.zeros(self.n_buckets, 1, dtype=dtype, device=self.device)

    def quantize(self, x):
        N, B = self.n_buckets, self.bucket_size
        N = self.numel // B
        self.x_min.copy_(x.view(N, B).min(dim=1).values.view(-1, 1))
        self.x_max.copy_(x.view(N, B).max(dim=1).values.view(-1, 1))
        u = (self.x_max - self.x_min) / 15.
        xq = ((x.view(N, B) - self.x_min) / u + 0.5).floor().to(torch.uint8).view(-1, 2)
        byte_left = xq[:, 0] << 4
        byte_right = xq[:, 1]
        self.xq.copy_(byte_left | byte_right)

    def quantize_inv(self):
        N, B = self.n_buckets, self.bucket_size
        u = (self.x_max - self.x_min) / 15.
        byte_left = (self.xq & 0xF0) >> 4
        byte_right = self.xq & 0x0F
        xq = torch.hstack(
            (
                byte_left.view(-1, 1),
                byte_right.view(-1, 1),
            )
        ).view(N, B)  # intercalate byte_left and byte_right
        x = xq * u + self.x_min
        return x.view(*self.shape)

class Quantizer8bit:
    def __init__(self, shape, device, dtype, bucket_size):
        assert np.prod(shape) % bucket_size == 0
        self.shape = shape
        self.device = device
        self.bucket_size = bucket_size
        self.numel = np.prod(shape)

        self.n_buckets = self.numel // self.bucket_size

        self.xq = torch.zeros(self.numel, dtype=torch.uint8, device=self.device)
        self.x_min = torch.zeros(self.n_buckets, 1, dtype=dtype, device=self.device)
        self.x_max = torch.zeros(self.n_buckets, 1, dtype=dtype, device=self.device)

    def quantize(self, x):
        N, B = self.n_buckets, self.bucket_size
        N = self.numel // B
        self.x_min.copy_(x.view(N, B).min(dim=1).values.view(-1, 1))
        self.x_max.copy_(x.view(N, B).max(dim=1).values.view(-1, 1))
        u = (self.x_max - self.x_min) / 15.
        xq = ((x.view(N, B) - self.x_min) / u + 0.5).floor().to(torch.uint8)
        self.xq.copy_(xq.view(-1))
        del xq, u

    def quantize_inv(self):
        N, B = self.n_buckets, self.bucket_size
        u = (self.x_max - self.x_min) / 15.
        x = self.xq.view(N, B) * u + self.x_min
        return x.view(*self.shape)
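
Note (editor): both quantizer classes keep pre-allocated state (xq, x_min, x_max) and encode each bucket with a min/max affine code; the 4-bit variant additionally packs two codes per uint8. The round-trip sketch below is not part of the package; the shape, bucket size, and CPU device are hypothetical, with the only constraint taken from the code being that numel is divisible by bucket_size (and by 2 for the 4-bit packer).

# Round-trip sketch (hypothetical shape/bucket_size; CPU for simplicity).
import torch

from ista_daslab_optimizers.utils.quantizers import Quantizer4bit, Quantizer8bit

shape, bucket_size = (4, 256), 128
x = torch.randn(shape, dtype=torch.float32)

q8 = Quantizer8bit(shape, device="cpu", dtype=torch.float32, bucket_size=bucket_size)
q8.quantize(x)
x8 = q8.quantize_inv()  # dequantized copy, same shape as x
err8 = (x - x8).abs().max().item()

q4 = Quantizer4bit(shape, device="cpu", dtype=torch.float32, bucket_size=bucket_size)
q4.quantize(x)
x4 = q4.quantize_inv()  # two codes packed per uint8
err4 = (x - x4).abs().max().item()

print(f"8-bit max error: {err8:.4f}, 4-bit max error: {err4:.4f}")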

ista_daslab_optimizers/utils/schedulers.py
@@ -0,0 +1,41 @@
import torch

def ema_standard_schedule(m, g, beta):
    """
    Implements the standard EMA: m_new = beta * m_old + (1 - beta) * g
    :param m: momentum buffer
    :param g: gradient
    :param beta: EMA coefficient
    """
    m.lerp_(g, 1 - beta)

def ema_delayed_decay_schedule(m, g, beta, beta_prev, t, T_decay, alpha):
    """
    This version was proposed by Mher Safaryan in June 2025, while a postdoc @ ISTA:

    beta_0 = 1 (tracks largest weight)
    alpha >= 0 (sub-schedule slope)

    if t == 1 or t % T_decay == 0:
        m_t = beta * m_{t-1} + (1 - beta) * g
        beta_t = 1 - beta
    else:
        m_t = (1 / (1 + alpha + beta_{t-1})) * m_{t-1} + ((alpha + beta_{t-1}) / (1 + alpha + beta_{t-1})) * g
        beta_t = (alpha + beta_{t-1}) / (1 + alpha + beta_{t-1})

    :param m: momentum buffer
    :param g: gradient buffer
    :param beta: EMA coefficient
    :param beta_prev: previous EMA coefficient
    :param t: current step index (starting at 1)
    :param T_decay: decay interval
    :param alpha: slope (use values between 0.001 and 0.007)
    :return: returns beta_t
    """

    if t == 1 or t % T_decay == 0:
        ema_standard_schedule(m, g, beta)
        return 1 - beta
    else:
        beta_t = (alpha + beta_prev) / (1 + alpha + beta_prev)
        ema_standard_schedule(m, g, 1 - beta_t)
        return beta_t
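
Note (editor): the delayed-decay schedule is stateful only through the returned beta_t, which the caller threads from one step to the next. The sketch below is not part of the package; the hyper-parameter values (beta, alpha, T_decay), the buffer size, and the stand-in gradient stream are hypothetical, with only the call pattern taken from the module.

# Call-pattern sketch (hypothetical values; g_stream stands in for per-step gradients).
import torch

from ista_daslab_optimizers.utils.schedulers import ema_delayed_decay_schedule

g_stream = (torch.randn(1000) for _ in range(10))

m = torch.zeros(1000)  # momentum buffer, updated in place
beta, alpha, T_decay = 0.9, 0.005, 5
beta_prev = 1.0  # beta_0 = 1, per the docstring

for t, g in enumerate(g_stream, start=1):
    beta_prev = ema_delayed_decay_schedule(
        m, g, beta=beta, beta_prev=beta_prev, t=t, T_decay=T_decay, alpha=alpha
    )

print("final beta_t:", beta_prev)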