ista-daslab-optimizers-cuda 1.0.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registries.
Files changed (23)
  1. ista_daslab_optimizers_cuda-1.0.0/LICENSE +21 -0
  2. ista_daslab_optimizers_cuda-1.0.0/MANIFEST.in +1 -0
  3. ista_daslab_optimizers_cuda-1.0.0/PKG-INFO +40 -0
  4. ista_daslab_optimizers_cuda-1.0.0/ista_daslab_optimizers_cuda.egg-info/PKG-INFO +40 -0
  5. ista_daslab_optimizers_cuda-1.0.0/ista_daslab_optimizers_cuda.egg-info/SOURCES.txt +32 -0
  6. ista_daslab_optimizers_cuda-1.0.0/ista_daslab_optimizers_cuda.egg-info/dependency_links.txt +1 -0
  7. ista_daslab_optimizers_cuda-1.0.0/ista_daslab_optimizers_cuda.egg-info/requires.txt +4 -0
  8. ista_daslab_optimizers_cuda-1.0.0/ista_daslab_optimizers_cuda.egg-info/top_level.txt +4 -0
  9. ista_daslab_optimizers_cuda-1.0.0/kernels/dense_mfac/dense_mfac.cpp +20 -0
  10. ista_daslab_optimizers_cuda-1.0.0/kernels/dense_mfac/dense_mfac_kernel.cu +216 -0
  11. ista_daslab_optimizers_cuda-1.0.0/kernels/micro_adam/micro_adam.cpp +62 -0
  12. ista_daslab_optimizers_cuda-1.0.0/kernels/micro_adam/micro_adam_asymm_block_quant.cu +64 -0
  13. ista_daslab_optimizers_cuda-1.0.0/kernels/micro_adam/micro_adam_asymm_block_quant_inv.cu +83 -0
  14. ista_daslab_optimizers_cuda-1.0.0/kernels/micro_adam/micro_adam_update.cu +165 -0
  15. ista_daslab_optimizers_cuda-1.0.0/kernels/sparse_mfac/sparse_mfac.cpp +84 -0
  16. ista_daslab_optimizers_cuda-1.0.0/kernels/sparse_mfac/sparse_mfac_LCG_kernel.cu +246 -0
  17. ista_daslab_optimizers_cuda-1.0.0/kernels/sparse_mfac/sparse_mfac_SP_kernel.cu +251 -0
  18. ista_daslab_optimizers_cuda-1.0.0/kernels/tools/tools.cpp +127 -0
  19. ista_daslab_optimizers_cuda-1.0.0/kernels/tools/tools_kernel.cu +315 -0
  20. ista_daslab_optimizers_cuda-1.0.0/kernels/utils.h +125 -0
  21. ista_daslab_optimizers_cuda-1.0.0/pyproject.toml +42 -0
  22. ista_daslab_optimizers_cuda-1.0.0/setup.cfg +4 -0
  23. ista_daslab_optimizers_cuda-1.0.0/setup.py +56 -0
ista_daslab_optimizers_cuda-1.0.0/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 IST Austria Distributed Algorithms and Systems Lab
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
ista_daslab_optimizers_cuda-1.0.0/MANIFEST.in
@@ -0,0 +1 @@
+ graft ./kernels
ista_daslab_optimizers_cuda-1.0.0/PKG-INFO
@@ -0,0 +1,40 @@
+ Metadata-Version: 2.4
+ Name: ista_daslab_optimizers_cuda
+ Version: 1.0.0
+ Summary: CUDA kernels for ISTA-DASLab-Optimizers project developed in the Distributed Algorithms and Systems group (DASLab) @ Institute of Science and Technology Austria (ISTA)
+ Author-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
+ Maintainer-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
+ License: MIT License
+
+ Copyright (c) 2026 IST Austria Distributed Algorithms and Systems Lab
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Project-URL: Repository, https://github.com/IST-DASLab/ISTA-DASLab-Optimizers-CUDA
+ Keywords: adaptive optimization,deep learning,low memory optimization
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: License :: OSI Approved :: Apache Software License
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: torch
+ Requires-Dist: torchaudio
+ Requires-Dist: torchvision
+ Requires-Dist: numpy
+ Dynamic: license-file
ista_daslab_optimizers_cuda-1.0.0/ista_daslab_optimizers_cuda.egg-info/PKG-INFO
@@ -0,0 +1,40 @@
+ Metadata-Version: 2.4
+ Name: ista_daslab_optimizers_cuda
+ Version: 1.0.0
+ Summary: CUDA kernels for ISTA-DASLab-Optimizers project developed in the Distributed Algorithms and Systems group (DASLab) @ Institute of Science and Technology Austria (ISTA)
+ Author-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
+ Maintainer-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
+ License: MIT License
+
+ Copyright (c) 2026 IST Austria Distributed Algorithms and Systems Lab
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Project-URL: Repository, https://github.com/IST-DASLab/ISTA-DASLab-Optimizers-CUDA
+ Keywords: adaptive optimization,deep learning,low memory optimization
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: License :: OSI Approved :: Apache Software License
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: torch
+ Requires-Dist: torchaudio
+ Requires-Dist: torchvision
+ Requires-Dist: numpy
+ Dynamic: license-file
ista_daslab_optimizers_cuda-1.0.0/ista_daslab_optimizers_cuda.egg-info/SOURCES.txt
@@ -0,0 +1,32 @@
+ LICENSE
+ MANIFEST.in
+ pyproject.toml
+ setup.py
+ ./kernels/dense_mfac/dense_mfac.cpp
+ ./kernels/dense_mfac/dense_mfac_kernel.cu
+ ./kernels/micro_adam/micro_adam.cpp
+ ./kernels/micro_adam/micro_adam_asymm_block_quant.cu
+ ./kernels/micro_adam/micro_adam_asymm_block_quant_inv.cu
+ ./kernels/micro_adam/micro_adam_update.cu
+ ./kernels/sparse_mfac/sparse_mfac.cpp
+ ./kernels/sparse_mfac/sparse_mfac_LCG_kernel.cu
+ ./kernels/sparse_mfac/sparse_mfac_SP_kernel.cu
+ ./kernels/tools/tools.cpp
+ ./kernels/tools/tools_kernel.cu
+ ista_daslab_optimizers_cuda.egg-info/PKG-INFO
+ ista_daslab_optimizers_cuda.egg-info/SOURCES.txt
+ ista_daslab_optimizers_cuda.egg-info/dependency_links.txt
+ ista_daslab_optimizers_cuda.egg-info/requires.txt
+ ista_daslab_optimizers_cuda.egg-info/top_level.txt
+ kernels/utils.h
+ kernels/dense_mfac/dense_mfac.cpp
+ kernels/dense_mfac/dense_mfac_kernel.cu
+ kernels/micro_adam/micro_adam.cpp
+ kernels/micro_adam/micro_adam_asymm_block_quant.cu
+ kernels/micro_adam/micro_adam_asymm_block_quant_inv.cu
+ kernels/micro_adam/micro_adam_update.cu
+ kernels/sparse_mfac/sparse_mfac.cpp
+ kernels/sparse_mfac/sparse_mfac_LCG_kernel.cu
+ kernels/sparse_mfac/sparse_mfac_SP_kernel.cu
+ kernels/tools/tools.cpp
+ kernels/tools/tools_kernel.cu
ista_daslab_optimizers_cuda-1.0.0/ista_daslab_optimizers_cuda.egg-info/requires.txt
@@ -0,0 +1,4 @@
+ torch
+ torchaudio
+ torchvision
+ numpy
ista_daslab_optimizers_cuda-1.0.0/ista_daslab_optimizers_cuda.egg-info/top_level.txt
@@ -0,0 +1,4 @@
+ ista_daslab_cuda_dense_mfac
+ ista_daslab_cuda_micro_adam
+ ista_daslab_cuda_sparse_mfac
+ ista_daslab_cuda_tools
ista_daslab_optimizers_cuda-1.0.0/kernels/dense_mfac/dense_mfac.cpp
@@ -0,0 +1,20 @@
+ #include <torch/extension.h>
+ #include <c10/cuda/CUDAGuard.h>
+
+ torch::Tensor hinv_setup_cuda(torch::Tensor tmp, torch::Tensor coef);
+ torch::Tensor hinv_mul_cuda(int rows, torch::Tensor giHig, torch::Tensor giHix);
+
+ torch::Tensor hinv_setup(torch::Tensor tmp, torch::Tensor coef) {
+     const at::cuda::OptionalCUDAGuard device_guard(device_of(tmp));
+     return hinv_setup_cuda(tmp, coef);
+ }
+
+ torch::Tensor hinv_mul(int rows, torch::Tensor giHig, torch::Tensor giHix) {
+     const at::cuda::OptionalCUDAGuard device_guard(device_of(giHig));
+     return hinv_mul_cuda(rows, giHig, giHix);
+ }
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+     m.def("hinv_setup", &hinv_setup, "Hinv setup (CUDA)");
+     m.def("hinv_mul", &hinv_mul, "Hinv mul (CUDA)");
+ }
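Note: this wrapper only sets the CUDA device guard and forwards to the kernels in dense_mfac_kernel.cu below. A minimal usage sketch from Python, assuming the extension builds as ista_daslab_cuda_dense_mfac (the name listed in top_level.txt); the shapes and the rows argument are illustrative, not prescribed by the package:

    import torch
    import ista_daslab_cuda_dense_mfac as dense_mfac  # module name from top_level.txt

    m = 1024  # hinv_setup assumes m % 32 == 0; hinv_mul additionally requires m <= 1024
    tmp = torch.randn(m, m, device="cuda")   # illustrative m x m input
    coef = torch.zeros(m, m, device="cuda")  # illustrative initial contents; written in place and returned
    coef = dense_mfac.hinv_setup(tmp, coef)

    giHig = torch.randn(m, m, device="cuda")
    giHix = torch.randn(1, m, device="cuda")      # the kernel reads and writes row 0
    giHix = dense_mfac.hinv_mul(m, giHig, giHix)  # rows=m is a placeholder value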
ista_daslab_optimizers_cuda-1.0.0/kernels/dense_mfac/dense_mfac_kernel.cu
@@ -0,0 +1,216 @@
+ #include <torch/extension.h>
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+
+ const int SIZE = 32;
+ const int MAX = 1024;
+
+ template <typename scalar_t>
+ __device__ __forceinline__ scalar_t GetElement(
+     const scalar_t* __restrict__ A, int m,
+     int row, int col
+ ) {
+     return A[row * m + col];
+ }
+
+ template <typename scalar_t>
+ __device__ __forceinline__ void SetElement(
+     scalar_t* __restrict__ A, int m,
+     int row, int col,
+     scalar_t value
+ ) {
+     A[row * m + col] = value;
+ }
+
+ /* Kernel for computing `coef` (required setup & update operations) */
+
+ template <typename scalar_t>
+ __global__ void HinvCoefKernelDiag(
+     int m,
+     const scalar_t* __restrict__ tmp,
+     scalar_t* __restrict__ coef
+ );
+
+ template <typename scalar_t>
+ __global__ void HinvCoefKernelMain(
+     int m,
+     const scalar_t* __restrict__ tmp,
+     scalar_t* __restrict__ coef,
+     int iter
+ );
+
+ // NOTE: for simplicity, we assume that `m` is always divisible by `SIZE`
+ torch::Tensor hinv_setup_cuda(torch::Tensor tmp, torch::Tensor coef) {
+     const auto m = tmp.size(0);
+     const dim3 threads(SIZE, SIZE);
+     const dim3 blocks(m / SIZE, m / SIZE);
+
+     AT_DISPATCH_FLOATING_TYPES(tmp.scalar_type(), "hinv_setup_cuda", ([&] {
+         HinvCoefKernelDiag<scalar_t><<<m / SIZE, threads>>>(
+             m, tmp.data_ptr<scalar_t>(), coef.data_ptr<scalar_t>()
+         );
+     })
+     );
+     for (int i = 0; i < m / SIZE - 1; i++) {
+         AT_DISPATCH_FLOATING_TYPES(tmp.scalar_type(), "hinv_setup_cuda", ([&] {
+             HinvCoefKernelMain<scalar_t><<<blocks, threads>>>(
+                 m, tmp.data_ptr<scalar_t>(), coef.data_ptr<scalar_t>(), i
+             );
+         })
+         );
+     }
+
+     return coef;
+ }
+
+ template <typename scalar_t>
+ __global__ void HinvCoefKernelMain(
+     int m,
+     const scalar_t* __restrict__ tmp,
+     scalar_t* __restrict__ coef,
+     int iter
+ ) {
+     // one thread per block element
+
+     // top-left of target block
+     int toi = blockIdx.x * SIZE;
+     int toj = blockIdx.y * SIZE;
+     // top-left of source block
+     int fromi = (blockIdx.y + iter) * SIZE;
+     int fromj = toj;
+
+     // only compute below (current) diagonal
+     if (fromi >= toi)
+         return;
+
+     // current relative position
+     int x = threadIdx.x;
+     int y = threadIdx.y;
+     // current absolute position
+     int i = toi + x;
+     int j = toj + y;
+
+     // parallel load relevant blocks of `coef` and `tmp`
+     __shared__ scalar_t from_coef[SIZE][SIZE];
+     __shared__ scalar_t to_tmp[SIZE][SIZE];
+     from_coef[x][y] = GetElement(coef, m, fromi + x, fromj + y);
+     to_tmp[x][y] = GetElement(tmp, m, i, fromi + y);
+     __syncthreads();
+
+     // parallel matmul
+     scalar_t res = GetElement(coef, m, i, j);
+     for (int k = 0; k < SIZE; k++)
+         res += to_tmp[x][k] * from_coef[k][y];
+     SetElement(coef, m, i, j, res);
+
+     // keep only next sequential blocks
+     if (toi != fromi + SIZE)
+         return;
+     __syncthreads();
+
+     // parallel load relevant blocks of `coef` and `tmp`
+     from_coef[x][y] = GetElement(coef, m, i, j);
+     to_tmp[x][y] = GetElement(tmp, m, i, toi + y);
+     __syncthreads();
+
+     // parallel sequential vector-matrix multiplies
+     res = from_coef[x][y];
+     for (int k = 0; k < SIZE; k++) {
+         if (k < x)
+             res += to_tmp[x][k] * from_coef[k][y];
+         if (k == x - 1) {
+             // parallel write block row
+             from_coef[x][y] = res;
+             SetElement(coef, m, i, j, res);
+         }
+         __syncthreads();
+     }
+ }
+
+ template <typename scalar_t>
+ __global__ void HinvCoefKernelDiag(
+     int m,
+     const scalar_t* __restrict__ tmp,
+     scalar_t* __restrict__ coef
+ ) {
+     // one thread per block element
+
+     // current relative position
+     int x = threadIdx.x;
+     int y = threadIdx.y;
+     // current absolute position
+     int i = blockIdx.x * SIZE + x;
+     int j = blockIdx.x * SIZE + y;
+
+     // parallel load relevant blocks of `coef` and `tmp`
+     __shared__ scalar_t from_coef[SIZE][SIZE];
+     __shared__ scalar_t to_tmp[SIZE][SIZE];
+     from_coef[x][y] = GetElement(coef, m, i, j);
+     to_tmp[x][y] = GetElement(tmp, m, i, j);
+     __syncthreads();
+
+     // parallel sequential vector-matrix multiplies
+     scalar_t res = 0;
+     for (int k = 0; k < SIZE; k++) {
+         if (k < x)
+             res += to_tmp[x][k] * from_coef[k][y];
+         // don't write diagonal elements
+         if (k == x - 1 && x != y) {
+             // parallel write block row
+             from_coef[x][y] = res;
+             SetElement(coef, m, i, j, res);
+         }
+         __syncthreads();
+     }
+ }
+
+
+ /* Kernel for computing `giHix` (required for multiplication) */
+
+ template <typename scalar_t>
+ __global__ void HinvMulKernel(
+     int rows,
+     int m,
+     const scalar_t* __restrict__ giHig,
+     scalar_t* __restrict__ giHix
+ );
+
+ // NOTE: currently only works for `m` <= 1024
+ torch::Tensor hinv_mul_cuda(int rows, torch::Tensor giHig, torch::Tensor giHix) {
+     const auto m = giHig.size(0);
+     AT_DISPATCH_FLOATING_TYPES(giHig.scalar_type(), "hinv_mul_cuda", ([&] {
+         HinvMulKernel<scalar_t><<<1, m>>>(
+             rows, m, giHig.data_ptr<scalar_t>(), giHix.data_ptr<scalar_t>()
+         );
+     })
+     );
+     return giHix;
+ }
+
+ template <typename scalar_t>
+ __global__ void HinvMulKernel(
+     int rows,
+     int m,
+     const scalar_t* __restrict__ giHig,
+     scalar_t* __restrict__ giHix
+ ) {
+     int i = threadIdx.x;
+
+     // parallel load relevant coefficients from `giHix` and `giHig`
+     __shared__ scalar_t denom[MAX];
+     __shared__ scalar_t tmp[MAX];
+     denom[i] = GetElement(giHig, m, i, i) + rows; // changed by ionut: fix_scaling
+     tmp[i] = GetElement(giHix, m, 0, i);
+     __syncthreads();
+
+     // compute parallel sequential linear combination
+     for (int j = 1; j < m; j++) {
+         if (j <= i) {
+             scalar_t sub = GetElement(giHig, m, j - 1, i) * tmp[j - 1] / denom[j - 1];
+             tmp[i] -= sub;
+         }
+         __syncthreads();
+     }
+
+     SetElement(giHix, m, 0, i, tmp[i]);
+ }
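For reference, HinvMulKernel runs one thread per entry i and applies, one step j at a time, tmp[i] -= giHig[j-1][i] * tmp[j-1] / denom[j-1] with denom[i] = giHig[i][i] + rows. A single-threaded NumPy sketch of the same recurrence, useful as a cross-check (the function name is mine, not part of the package):

    import numpy as np

    def hinv_mul_reference(rows, giHig, giHix):
        # Mirrors HinvMulKernel: giHig is (m, m), giHix is (1, m);
        # returns the updated row that the kernel writes back in place.
        m = giHig.shape[0]
        denom = np.diag(giHig) + rows  # the "fix_scaling" term from the kernel
        tmp = giHix[0].copy()
        for j in range(1, m):          # step j updates every entry i >= j
            tmp[j:] -= giHig[j - 1, j:] * (tmp[j - 1] / denom[j - 1])
        return tmp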
ista_daslab_optimizers_cuda-1.0.0/kernels/micro_adam/micro_adam.cpp
@@ -0,0 +1,62 @@
+ #include <torch/extension.h>
+ #include <c10/cuda/CUDAGuard.h>
+ #include <cuda_bf16.h>
+ #include "../utils.h"
+
+ typedef long long LL;
+
+ // CUDA methods
+ void compute_microadam_update_cuda(int blocks, int threads, int carveout,
+                                    LL t, float beta1, float beta2, float eps,
+                                    LL d_block_size, LL k_block_size,
+                                    LL d, LL m, LL k,
+                                    torch::Tensor indices, torch::Tensor values, torch::Tensor out);
+
+ void asymm_block_quant_cuda(LL d, LL q_block_size, torch::Tensor xq, torch::Tensor xmin, torch::Tensor xmax, torch::Tensor x);
+ void asymm_block_quant_inv_cuda(LL d, LL q_block_size, torch::Tensor xq, torch::Tensor xmin, torch::Tensor xmax, torch::Tensor x, float alpha);
+
+ // C++ methods
+ void compute_microadam_update(int blocks, int threads, int carveout,
+                               LL t, float beta1, float beta2, float eps,
+                               LL d_block_size, LL k_block_size,
+                               LL d, LL m, LL k,
+                               torch::Tensor indices, torch::Tensor values, torch::Tensor out) {
+     CHECK_INPUT(indices);
+     CHECK_INPUT(values);
+     CHECK_INPUT(out);
+     CHECK_THREADS(threads);
+
+     const at::cuda::OptionalCUDAGuard device_guard(device_of(indices));
+     compute_microadam_update_cuda(blocks, threads, carveout,
+                                   t, beta1, beta2, eps,
+                                   d_block_size, k_block_size,
+                                   d, m, k,
+                                   indices, values, out);
+ }
+
+ void asymm_block_quant(LL d, LL q_block_size, torch::Tensor xq, torch::Tensor xmin, torch::Tensor xmax, torch::Tensor x) {
+     CHECK_INPUT(xq);
+     CHECK_INPUT(xmin);
+     CHECK_INPUT(xmax);
+     CHECK_INPUT(x);
+
+     const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+     asymm_block_quant_cuda(d, q_block_size, xq, xmin, xmax, x);
+ }
+
+ void asymm_block_quant_inv(LL d, LL q_block_size, torch::Tensor xq, torch::Tensor xmin, torch::Tensor xmax, torch::Tensor x, float alpha) {
+     CHECK_INPUT(xq);
+     CHECK_INPUT(xmin);
+     CHECK_INPUT(xmax);
+     CHECK_INPUT(x);
+
+     const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+     asymm_block_quant_inv_cuda(d, q_block_size, xq, xmin, xmax, x, alpha);
+ }
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+     m.def("compute_microadam_update", &compute_microadam_update, "Computes the update from Compressed Adam.");
+
+     m.def("asymm_block_quant", &asymm_block_quant, "Asymmetrically quantizes a vector to 4 bits in blocks.");
+     m.def("asymm_block_quant_inv", &asymm_block_quant_inv, "Asymmetrically dequantizes a vector from 4 bits in blocks.");
+ }
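A minimal usage sketch for the two quantization entry points above, assuming the module builds as ista_daslab_cuda_micro_adam (per top_level.txt) and that d is divisible by q_block_size; the sizes are illustrative:

    import torch
    import ista_daslab_cuda_micro_adam as micro_adam  # module name from top_level.txt

    d, q_block_size = 4096, 256
    x = torch.randn(d, device="cuda", dtype=torch.bfloat16)
    xq = torch.zeros((d + 1) // 2, device="cuda", dtype=torch.uint8)  # two 4-bit codes per byte
    xmin = x.view(-1, q_block_size).min(dim=1).values.contiguous()    # per-block minima
    xmax = x.view(-1, q_block_size).max(dim=1).values.contiguous()    # per-block maxima

    micro_adam.asymm_block_quant(d, q_block_size, xq, xmin, xmax, x)  # pack x into xq
    out = torch.zeros(d, device="cuda", dtype=torch.bfloat16)
    micro_adam.asymm_block_quant_inv(d, q_block_size, xq, xmin, xmax, out, 1.0)  # out += 1.0 * Q_inv(xq)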
ista_daslab_optimizers_cuda-1.0.0/kernels/micro_adam/micro_adam_asymm_block_quant.cu
@@ -0,0 +1,64 @@
+ #include "../utils.h"
+
+ __global__ void asymm_block_quant_kernel_bf16_bf16(LL d, LL q_block_size, uint8_t *xq, bfloat16 *xmin, bfloat16 *xmax, bfloat16 *x) {
+     /*
+     This kernel computes xq = Q(x, x_min, x_max) for 4 bits (implements point 4 from PhD notebook page 55)
+     In contrast to the "globally" kernel, this kernel works with a single block
+     Make sure block_size is always divisible by 2
+
+     We have to read:
+     - q_block_size values from x
+     - one value from ranges
+     - q_block_size / 2 values from xq
+     */
+     bfloat162 *x2 = reinterpret_cast<bfloat162*>(x); // we will read two values from x at once
+
+     const LL B = gridDim.x; // number of blocks
+     const LL Bid = blockIdx.x; // block id
+     const LL T = blockDim.x; // number of threads
+     const LL Tid = threadIdx.x; // thread id
+
+     LL half_d = (d >> 1);
+     LL half_q_block_size = (q_block_size >> 1); // block size in xq
+     LL half_start_index = Bid * half_q_block_size; // start index in vector x
+     LL half_end_index = min(half_start_index + half_q_block_size, half_d); // end index in vector x
+     float m = __bfloat162float(xmin[Bid]);
+     float M = __bfloat162float(xmax[Bid]);
+     float u = (M - m) / 15.0f; // 15 = 16 - 1 = 2^4 - 1
+
+     bfloat162 vx2; // the value that will store x2[index]
+     uint8_t msb; // the MSB of an xq component
+     uint8_t lsb; // the LSB of an xq component
+
+     for(LL half_index = half_start_index + Tid; half_index < half_end_index; half_index += T) {
+         vx2 = x2[half_index];
+         msb = (uint8_t) floorf((__bfloat162float(vx2.x) - m) / u + 0.5f);
+         lsb = (uint8_t) floorf((__bfloat162float(vx2.y) - m) / u + 0.5f);
+         xq[half_index] = (msb << 4) | lsb;
+     }
+
+     if((d & 1) && (Bid == B-1) && (Tid == T-1)) {
+         msb = (uint8_t) floorf((__bfloat162float(x[d - 1]) - m) / u + 0.5f);
+         xq[half_d] = (msb << 4);
+     }
+ }
+
+ void asymm_block_quant_cuda(LL d, LL q_block_size, torch::Tensor xq, torch::Tensor xmin, torch::Tensor xmax, torch::Tensor x) {
+     torch::ScalarType bf16 = torch::ScalarType::BFloat16;
+     assert(xmin.scalar_type() == bf16 && xmax.scalar_type() == bf16 && x.scalar_type() == bf16);
+
+     LL blocks = 1 + (LL)(d / q_block_size);
+     uint8_t *ptr_xq = (uint8_t*) xq.data_ptr();
+
+     asymm_block_quant_kernel_bf16_bf16<<<blocks, 1024>>>(d,
+                                                          q_block_size,
+                                                          ptr_xq,
+                                                          (bfloat16*) xmin.data_ptr(),
+                                                          (bfloat16*) xmax.data_ptr(),
+                                                          (bfloat16*) x.data_ptr());
+
+     // error checks
+     GPU_ERROR_CHECK(cudaGetLastError());
+     GPU_ERROR_CHECK(cudaPeekAtLastError());
+     // GPU_ERROR_CHECK(cudaDeviceSynchronize());
+ }
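The packing convention above places the even-indexed element of each pair in the high nibble and the odd-indexed element in the low nibble, with per-block scale u = (xmax - xmin) / 15. A NumPy sketch of the same scheme for cross-checking (names are mine; it assumes xmax > xmin within each block, as the kernel does):

    import numpy as np

    def asymm_block_quant_reference(x, q_block_size):
        # Per block: code = floor((v - xmin) / u + 0.5) with u = (xmax - xmin) / 15;
        # even global index -> high nibble, odd -> low nibble, as in the kernel.
        d = len(x)
        xq = np.zeros((d + 1) // 2, dtype=np.uint8)
        for start in range(0, d, q_block_size):
            block = x[start:start + q_block_size]
            m = block.min()
            u = (block.max() - m) / 15.0
            codes = np.floor((block - m) / u + 0.5).astype(np.uint8)
            for k, i in enumerate(range(start, start + len(block))):
                xq[i // 2] |= (codes[k] << 4) if i % 2 == 0 else codes[k]
        return xq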
ista_daslab_optimizers_cuda-1.0.0/kernels/micro_adam/micro_adam_asymm_block_quant_inv.cu
@@ -0,0 +1,83 @@
+ #include "../utils.h"
+
+ __global__ void asymm_block_quant_inv_kernel_bf16_bf16(LL d, LL q_block_size, uint8_t *xq, bfloat16 *xmin, bfloat16 *xmax, bfloat16 *x, float alpha) {
+     /*
+     This kernel computes x += alpha * Q_inv(xq, xmin, xmax) for 4 bits (implements point 1 from PhD #9 notebook page 55)
+     Here, x is the output buffer and will already contain the dense gradient
+     The output buffer x has d components and xq has d/2 components because one uint8_t stores two 4-bit values
+     In contrast to the "globally" kernel, this kernel works with a single block
+     Make sure block_size is always divisible by 2
+
+     We have to read:
+     - q_block_size values from x
+     - one value from ranges
+     - q_block_size / 2 values from xq
+     */
+     bfloat162 *x2 = reinterpret_cast<bfloat162*>(x); // we will read two values from x at once
+
+     const LL B = (LL) gridDim.x; // number of blocks
+     const LL Bid = (LL) blockIdx.x; // block id
+     const LL T = (LL) blockDim.x; // number of threads
+     const LL Tid = (LL) threadIdx.x; // thread id
+
+     LL half_d = (d >> 1);
+     LL half_q_block_size = (q_block_size >> 1); // block size in xq
+     LL half_start_index = Bid * half_q_block_size; // start index in vector x
+     LL half_end_index = minLL(half_start_index + half_q_block_size, half_d); // end index in vector x
+     // if (Bid == 0 && Tid == 0) {
+     //     printf("\n\n\n\t\t\t&&&&&&&&&& half_d=%lld, half_q_block_size=%lld, half_start_index=%lld, half_end_index=%lld\n\n\n");
+     // }
+     float m = __bfloat162float(xmin[Bid]);
+     float M = __bfloat162float(xmax[Bid]);
+     float u = (M - m) / 15.0f; // 15 = 16 - 1 = 2^4 - 1
+     bfloat162 vx2; // the value that will store x2[index]
+     uint8_t vq; // the value that will store xq[index]
+     uint8_t msb; // the MSB of an xq component
+     uint8_t lsb; // the LSB of an xq component
+
+     for(LL half_index = half_start_index + Tid; half_index < half_end_index; half_index += T) {
+         vx2 = x2[half_index];
+         vq = xq[half_index];
+
+         msb = (vq & 0xF0) >> 4;
+         lsb = (vq & 0x0F);
+
+         // += operation happens here
+         vx2.x += __float2bfloat16((msb * u + m) * alpha);
+         vx2.y += __float2bfloat16((lsb * u + m) * alpha);
+         x2[half_index] = vx2;
+     }
+     if((d & 1) && (Bid == B-1) && (Tid == T-1)) {
+         bfloat16 vx = x[d - 1];
+         vq = xq[half_d];
+         msb = (vq & 0xF0) >> 4;
+         // += operation happens here
+         vx += __float2bfloat16((msb * u + m) * alpha);
+         x[d - 1] = vx;
+     }
+ }
+
+ void asymm_block_quant_inv_cuda(LL d, LL q_block_size, torch::Tensor xq, torch::Tensor xmin, torch::Tensor xmax, torch::Tensor x, float alpha) {
+     ASSERT_BF16(xmin);
+     ASSERT_BF16(xmax);
+     ASSERT_BF16(x);
+
+     LL blocks = 1 + (LL)(d / q_block_size);
+     dim3 B(blocks, 1, 1);
+     dim3 T(1024, 1, 1);
+
+     uint8_t *ptr_xq = (uint8_t*) xq.data_ptr();
+
+     asymm_block_quant_inv_kernel_bf16_bf16<<<B, T>>>(d,
+                                                      q_block_size,
+                                                      ptr_xq,
+                                                      (bfloat16*) xmin.data_ptr(),
+                                                      (bfloat16*) xmax.data_ptr(),
+                                                      (bfloat16*) x.data_ptr(),
+                                                      alpha);
+
+     // error checks
+     GPU_ERROR_CHECK(cudaGetLastError());
+     GPU_ERROR_CHECK(cudaPeekAtLastError());
+     // GPU_ERROR_CHECK(cudaDeviceSynchronize());
+ }
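And the matching dequantize-accumulate, x += alpha * Q_inv(xq, xmin, xmax), as a NumPy sketch (names are mine, not part of the package):

    import numpy as np

    def asymm_block_quant_inv_reference(xq, xmin, xmax, x, q_block_size, alpha):
        # Mirrors the kernel: the high nibble decodes the even-indexed element
        # of each byte, the low nibble the odd-indexed one; accumulates into x.
        d = len(x)
        for i in range(d):
            b = i // q_block_size  # quantization block id
            u = (float(xmax[b]) - float(xmin[b])) / 15.0
            code = (xq[i // 2] >> 4) if i % 2 == 0 else (xq[i // 2] & 0x0F)
            x[i] += alpha * (code * u + float(xmin[b]))
        return x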