aakaar 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
# Extra files to ship in the source distribution.
include src/*.h
include src/*.cuh
include README.md
aakaar-0.1.2/PKG-INFO ADDED
@@ -0,0 +1,52 @@
1
+ Metadata-Version: 2.4
2
+ Name: aakaar
3
+ Version: 0.1.2
4
+ Summary: A custom CUDA-accelerated ML library
5
+ Author: Aarav Aggarwal
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy
9
+ Requires-Dist: nvidia-curand-cu12
10
+ Requires-Dist: nvidia-cuda-runtime-cu12
11
+
12
+ # Aakaar
13
+
14
+ Aakaar is a custom, standalone deep learning tensor library built from the ground up using Python, C++, and raw CUDA. It is designed to provide a lightweight, transparent architecture for high-performance GPU computations without relying on heavy external frameworks like PyTorch or TensorFlow.
15
+
16
+ ## Core Architecture
17
+
18
+ Aakaar bypasses standard NumPy arrays by implementing a custom C++ Tensor object that resides directly in GPU VRAM. Python interacts with this data via Pybind11, acting as a lightweight remote control. This prevents severe performance bottlenecks over the PCI-e bus, keeping data on the GPU until explicitly requested back to the host CPU.
19
+
20
+ Current capabilities include:
21
+ * Custom GPU-native Tensor class lifecycle management.
22
+ * CUDA-accelerated uniform random number generation via cuRAND.
23
+ * Direct host-to-device and device-to-host memory mapping.
24
+
25
+ ## Installation
26
+
27
+ To build Aakaar from source, you must have the NVIDIA CUDA Toolkit (nvcc) and a compatible C++ compiler (e.g., g++) installed.
28
+
29
+ 1. Clone the repository:
30
+ ```bash
31
+ git clone [https://github.com/YOUR_USERNAME/aakaar.git](https://github.com/YOUR_USERNAME/aakaar.git)
32
+ cd aakaar
33
+ pip install setuptools wheel pybind11
34
+ pip install -e .
35
+ ```
36
+
37
+ quick start
38
+ ```bash
39
+ import aakaar
40
+
41
+ # Initialize the CUDA engine and allocate a GPU Tensor
42
+ print("Generating 100,000 random numbers on the GPU...")
43
+ data = aakaar.rand(100000, device="cpu", seed=1337)
44
+
45
+ # The data remains on the GPU as an Aakaar Tensor
46
+ print(type(data))
47
+ # <class 'aakaar._C.Tensor'>
48
+
49
+ # Bring the data across the PCI-e bus to the CPU for inspection
50
+ cpu_data = data.cpu()
51
+ print(cpu_data[:5])
52
+ ```
aakaar-0.1.2/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # Aakaar
2
+
3
+ Aakaar is a custom, standalone deep learning tensor library built from the ground up using Python, C++, and raw CUDA. It is designed to provide a lightweight, transparent architecture for high-performance GPU computations without relying on heavy external frameworks like PyTorch or TensorFlow.
4
+
5
+ ## Core Architecture
6
+
7
+ Aakaar bypasses standard NumPy arrays by implementing a custom C++ Tensor object that resides directly in GPU VRAM. Python interacts with this data via Pybind11, acting as a lightweight remote control. This prevents severe performance bottlenecks over the PCI-e bus, keeping data on the GPU until explicitly requested back to the host CPU.
8
+
9
+ Current capabilities include:
10
+ * Custom GPU-native Tensor class lifecycle management.
11
+ * CUDA-accelerated uniform random number generation via cuRAND.
12
+ * Direct host-to-device and device-to-host memory mapping.
13
+
14
+ ## Installation
15
+
16
+ To build Aakaar from source, you must have the NVIDIA CUDA Toolkit (nvcc) and a compatible C++ compiler (e.g., g++) installed.
17
+
18
+ 1. Clone the repository and install it in editable mode:
19
+ ```bash
20
+ git clone https://github.com/YOUR_USERNAME/aakaar.git
21
+ cd aakaar
22
+ pip install setuptools wheel pybind11
23
+ pip install -e .
24
+ ```
25
+
26
+ ## Quick Start
27
+ ```python
28
+ import aakaar
29
+
30
+ # Initialize the CUDA engine and allocate a GPU Tensor
31
+ print("Generating 100,000 random numbers on the GPU...")
32
+ data = aakaar.rand(100000, device="cuda", seed=1337)
33
+
34
+ # The data remains on the GPU as an Aakaar Tensor
35
+ print(type(data))
36
+ # <class 'aakaar._C.Tensor'>
37
+
38
+ # Bring the data across the PCI-e bus to the CPU for inspection
39
+ cpu_data = data.to_numpy()
40
+ print(cpu_data[:5])
41
+ ```
@@ -0,0 +1,16 @@
1
+ from . import _C
2
+ import numpy as np
3
+
4
def rand(size: int, device: str = "cpu", seed: int = 42):
    """Create a tensor of ``size`` elements and fill it with random numbers.

    Args:
        size: Number of elements, passed to the ``_C.Tensor`` constructor.
        device: ``"cuda"`` selects the CUDA generator; any other value
            selects the CPU generator.
        seed: Seed forwarded to the selected generator.

    Returns:
        The filled ``_C.Tensor``.
    """
    # The extension's constructor performs the allocation for the device.
    tensor = _C.Tensor(size, device)

    # Pick the backend that matches where the tensor was allocated.
    fill = _C.generate_random if device == "cuda" else _C.fill_cpu_random
    fill(tensor, seed)

    return tensor
@@ -0,0 +1,52 @@
1
+ Metadata-Version: 2.4
2
+ Name: aakaar
3
+ Version: 0.1.2
4
+ Summary: A custom CUDA-accelerated ML library
5
+ Author: Aarav Aggarwal
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy
9
+ Requires-Dist: nvidia-curand-cu12
10
+ Requires-Dist: nvidia-cuda-runtime-cu12
11
+
12
+ # Aakaar
13
+
14
+ Aakaar is a custom, standalone deep learning tensor library built from the ground up using Python, C++, and raw CUDA. It is designed to provide a lightweight, transparent architecture for high-performance GPU computations without relying on heavy external frameworks like PyTorch or TensorFlow.
15
+
16
+ ## Core Architecture
17
+
18
+ Aakaar bypasses standard NumPy arrays by implementing a custom C++ Tensor object that resides directly in GPU VRAM. Python interacts with this data via Pybind11, acting as a lightweight remote control. This prevents severe performance bottlenecks over the PCI-e bus, keeping data on the GPU until explicitly requested back to the host CPU.
19
+
20
+ Current capabilities include:
21
+ * Custom GPU-native Tensor class lifecycle management.
22
+ * CUDA-accelerated uniform random number generation via cuRAND.
23
+ * Direct host-to-device and device-to-host memory mapping.
24
+
25
+ ## Installation
26
+
27
+ To build Aakaar from source, you must have the NVIDIA CUDA Toolkit (nvcc) and a compatible C++ compiler (e.g., g++) installed.
28
+
29
+ 1. Clone the repository:
30
+ ```bash
31
+ git clone [https://github.com/YOUR_USERNAME/aakaar.git](https://github.com/YOUR_USERNAME/aakaar.git)
32
+ cd aakaar
33
+ pip install setuptools wheel pybind11
34
+ pip install -e .
35
+ ```
36
+
37
+ quick start
38
+ ```bash
39
+ import aakaar
40
+
41
+ # Initialize the CUDA engine and allocate a GPU Tensor
42
+ print("Generating 100,000 random numbers on the GPU...")
43
+ data = aakaar.rand(100000, device="cpu", seed=1337)
44
+
45
+ # The data remains on the GPU as an Aakaar Tensor
46
+ print(type(data))
47
+ # <class 'aakaar._C.Tensor'>
48
+
49
+ # Bring the data across the PCI-e bus to the CPU for inspection
50
+ cpu_data = data.cpu()
51
+ print(cpu_data[:5])
52
+ ```
@@ -0,0 +1,16 @@
1
+ MANIFEST.in
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ aakaar/__init__.py
6
+ aakaar.egg-info/PKG-INFO
7
+ aakaar.egg-info/SOURCES.txt
8
+ aakaar.egg-info/dependency_links.txt
9
+ aakaar.egg-info/requires.txt
10
+ aakaar.egg-info/top_level.txt
11
+ src/allocator.h
12
+ src/bindings.cpp
13
+ src/cpu_kernel.cpp
14
+ src/random_kernel.cu
15
+ src/rng.h
16
+ src/tensor.h
@@ -0,0 +1,3 @@
1
+ numpy
2
+ nvidia-curand-cu12
3
+ nvidia-cuda-runtime-cu12
@@ -0,0 +1 @@
1
+ aakaar
@@ -0,0 +1,16 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64", "wheel", "pybind11>=2.10"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "aakaar"
7
+ version = "0.1.2"
8
+ description = "A custom CUDA-accelerated ML library"
9
+ readme = "README.md"
10
+ authors = [{name = "Aarav Aggarwal"}]
11
+ requires-python = ">=3.9"
12
+ dependencies = [
13
+ "numpy",
14
+ "nvidia-curand-cu12",
15
+ "nvidia-cuda-runtime-cu12",
16
+ ]
aakaar-0.1.2/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
aakaar-0.1.2/setup.py ADDED
@@ -0,0 +1,71 @@
1
+ import os
2
+ import subprocess
3
+ import sysconfig # <--- ADD THIS
4
+ from setuptools import setup, Extension
5
+ from setuptools.command.build_ext import build_ext
6
+ import pybind11
7
+
8
class CUDABuildExtension(build_ext):
    """``build_ext`` variant that compiles ``.cu`` sources with ``nvcc``.

    For every extension, each ``.cu`` source is compiled to an object file by
    invoking ``nvcc`` directly. The objects are then handed to the regular
    setuptools build through ``extra_objects``, and the remaining sources are
    compiled by the normal C++ toolchain.
    """

    def build_extensions(self):
        """Compile the CUDA sources, then delegate to the stock build.

        Raises:
            subprocess.CalledProcessError: if an ``nvcc`` invocation fails.
        """
        nvcc_flags = [
            '-O3',
            '-gencode=arch=compute_75,code=sm_75',
            '-gencode=arch=compute_86,code=sm_86',
            '-gencode=arch=compute_89,code=sm_89',
            # The objects are linked into a shared library.
            '-Xcompiler=-fPIC'
        ]

        for ext in self.extensions:
            # Include directories from the Extension definition, plus Python's
            # own header directory (needed by pybind11).
            includes = [f"-I{d}" for d in ext.include_dirs]
            includes.append(f"-I{sysconfig.get_path('include')}")

            cu_sources = [s for s in ext.sources if s.endswith('.cu')]
            objects = []

            for cu_file in cu_sources:
                # Swap only the final extension, and keep build products in
                # the build directory instead of next to the sources.
                stem, _ = os.path.splitext(cu_file)
                obj_file = os.path.join(self.build_temp, stem + '.o')
                obj_dir = os.path.dirname(obj_file)
                if obj_dir:
                    os.makedirs(obj_dir, exist_ok=True)

                nvcc_cmd = ['nvcc', '-c', cu_file, '-o', obj_file] + nvcc_flags + includes
                print(f"Compiling CUDA: {' '.join(nvcc_cmd)}")
                subprocess.check_call(nvcc_cmd)
                objects.append(obj_file)

            # Remove only the CUDA files; every other source (whatever its
            # extension) stays with the regular compiler.
            ext.sources = [s for s in ext.sources if not s.endswith('.cu')]
            # Keep any objects the Extension already listed.
            ext.extra_objects = list(ext.extra_objects or []) + objects

        super().build_extensions()
41
+
42
# Root of the CUDA toolkit. CUDA_HOME / CUDA_PATH are honoured when set;
# otherwise the previous hard-coded location is used.
_cuda_home = (
    os.environ.get('CUDA_HOME')
    or os.environ.get('CUDA_PATH')
    or '/usr/local/cuda'
)

# Define the Python Extension
aakaar_ext = Extension(
    'aakaar._C',
    sources=['src/bindings.cpp', 'src/cpu_kernel.cpp', 'src/random_kernel.cu'],
    include_dirs=[
        pybind11.get_include(),
        os.path.join(_cuda_home, 'include')
    ],
    library_dirs=[
        os.path.join(_cuda_home, 'lib64')
    ],
    libraries=['curand', 'cudart'],
    language='c++'
)
56
+
57
# Package metadata. The version must match pyproject.toml (0.1.2).
# pybind11 is not listed in setup_requires: it is imported at the top of this
# script, so it has to be installed before setup() can run at all, and it is
# already declared in pyproject.toml's [build-system] requires.
setup(
    name='aakaar',
    version='0.1.2',
    author='Aarav Aggarwal',
    description='A custom standalone ML library featuring CUDA-accelerated operations.',
    packages=['aakaar'],
    ext_modules=[aakaar_ext],
    cmdclass={'build_ext': CUDABuildExtension},
    install_requires=[
        'numpy',
        'nvidia-curand-cu12',
        'nvidia-cuda-runtime-cu12'
    ]
)
@@ -0,0 +1,56 @@
1
+ #pragma once
2
+ #include <cuda_runtime.h>
3
+ #include <unordered_map>
4
+ #include <vector>
5
+ #include <stdexcept>
6
+ #include <iostream>
7
+
8
+ class CachingAllocator {
9
+ private:
10
+ // Maps the size of a tensor to a list of available GPU pointers
11
+ std::unordered_map<int, std::vector<float*>> free_blocks;
12
+
13
+ // Private constructor (Singleton pattern)
14
+ CachingAllocator() {}
15
+
16
+ public:
17
+ // Get the global instance of the allocator
18
+ static CachingAllocator& get_instance() {
19
+ static CachingAllocator instance;
20
+ return instance;
21
+ }
22
+
23
+ // Allocate memory (or reuse an old block)
24
+ float* allocate(int size) {
25
+ // 1. Check if we already have a block of this size in the pool
26
+ if (free_blocks.find(size) != free_blocks.end() && !free_blocks[size].empty()) {
27
+ // Pop a recycled pointer off the back of the list and return it instantly
28
+ float* ptr = free_blocks[size].back();
29
+ free_blocks[size].pop_back();
30
+ return ptr;
31
+ }
32
+
33
+ // 2. If no recycled block exists, we MUST ask the OS (cudaMalloc)
34
+ float* ptr;
35
+ cudaError_t err = cudaMalloc((void**)&ptr, size * sizeof(float));
36
+ if (err != cudaSuccess) {
37
+ throw std::runtime_error("Aakaar out of memory!");
38
+ }
39
+ return ptr;
40
+ }
41
+
42
+ // Free memory (put it in the pool, DON'T give it to the OS)
43
+ void free(float* ptr, int size) {
44
+ free_blocks[size].push_back(ptr);
45
+ }
46
+
47
+ // A utility to actually clear the cache and give memory back to the GPU
48
+ void empty_cache() {
49
+ for (auto& pair : free_blocks) {
50
+ for (float* ptr : pair.second) {
51
+ cudaFree(ptr); // ACTUALLY free the memory
52
+ }
53
+ }
54
+ free_blocks.clear();
55
+ }
56
+ };
@@ -0,0 +1,26 @@
1
+ #include <pybind11/pybind11.h>
2
+ #include <pybind11/stl.h>
3
+ #include <memory>
4
+ #include "tensor.h"
5
+ #include "allocator.h"
6
+
7
+ namespace py = pybind11;
8
+
9
+ void run_curand_uniform(std::shared_ptr<Tensor> t, unsigned long long seed);
10
+ void fill_cpu_random(std::shared_ptr<Tensor> t, unsigned long long seed);
11
+
12
+ void empty_cache() {
13
+ CachingAllocator::get_instance().empty_cache();
14
+ }
15
+
16
+ PYBIND11_MODULE(_C, m) {
17
+ py::class_<Tensor, std::shared_ptr<Tensor>>(m, "Tensor")
18
+ .def(py::init<int, std::string>())
19
+ .def_readonly("device", &Tensor::device)
20
+ .def("to_numpy", &Tensor::to_numpy);
21
+
22
+ m.def("generate_random", &run_curand_uniform, "Fill GPU Tensor with random numbers");
23
+ m.def("fill_cpu_random", &fill_cpu_random, "Fill CPU Tensor with random numbers");
24
+
25
+ m.def("empty_cache", &empty_cache, "Release cached GPU memory");
26
+ }
@@ -0,0 +1,11 @@
1
+ #include <random>
2
+ #include <memory>
3
+ #include "tensor.h"
4
+
5
+ void fill_cpu_random(std::shared_ptr<Tensor> t, unsigned long long seed) {
6
+ std::mt19937 gen(seed);
7
+ std::uniform_real_distribution<float> dis(0.0, 1.0);
8
+ for (int i = 0; i < t->size; ++i) {
9
+ t->data_ptr[i] = dis(gen);
10
+ }
11
+ }
@@ -0,0 +1,19 @@
1
+ #include <cuda_runtime.h>
2
+ #include <curand.h>
3
+ #include <memory>
4
+ #include "tensor.h"
5
+ #include "rng.h" // Import the random manager
6
+
7
+ void run_curand_uniform(std::shared_ptr<Tensor> t, unsigned long long seed) {
8
+ // Fetch the persistent global generator
9
+ curandGenerator_t generator = CUDARandomManager::get_instance().get_generator(seed);
10
+
11
+ // Generate random numbers directly into the existing tensor's buffer
12
+ curandStatus_t status = curandGenerateUniform(generator, t->data_ptr, t->size);
13
+ if (status != CURAND_STATUS_SUCCESS) {
14
+ throw std::runtime_error("cuRAND generation failed");
15
+ }
16
+
17
+ // Ensure the GPU finishes execution before returning control to Python
18
+ cudaDeviceSynchronize();
19
+ }
aakaar-0.1.2/src/rng.h ADDED
@@ -0,0 +1,40 @@
1
+ #pragma once
2
+ #include <curand.h>
3
+ #include <stdexcept>
4
+
5
+ class CUDARandomManager {
6
+ private:
7
+ curandGenerator_t generator;
8
+ bool initialized;
9
+
10
+ // Private constructor (Singleton)
11
+ CUDARandomManager() : initialized(false) {}
12
+
13
+ public:
14
+ static CUDARandomManager& get_instance() {
15
+ static CUDARandomManager instance;
16
+ return instance;
17
+ }
18
+
19
+ // Initialize the generator only once
20
+ // Initialize the generator only once
21
+ curandGenerator_t get_generator(unsigned long long seed) {
22
+ if (!initialized) {
23
+ curandStatus_t status = curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT);
24
+ if (status != CURAND_STATUS_SUCCESS) { // <--- FIXED HERE
25
+ throw std::runtime_error("Failed to create cuRAND generator");
26
+ }
27
+ initialized = true;
28
+ }
29
+
30
+ curandSetPseudoRandomGeneratorSeed(generator, seed);
31
+ return generator;
32
+ }
33
+
34
+ // Clean up the generator when the engine shuts down
35
+ ~CUDARandomManager() {
36
+ if (initialized) {
37
+ curandDestroyGenerator(generator);
38
+ }
39
+ }
40
+ };
@@ -0,0 +1,51 @@
1
+ #pragma once
2
#include <cuda_runtime.h>

#include <cstring>
#include <iostream>
#include <stdexcept>
#include <string>

#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>

#include "allocator.h" // Your CachingAllocator
9
+
10
+ namespace py = pybind11;
11
+
12
+ class Tensor {
13
+ public:
14
+ float* data_ptr;
15
+ int size;
16
+ std::string device;
17
+
18
+ Tensor(int s, std::string dev) : size(s), device(dev) {
19
+ if (device == "cuda") {
20
+ data_ptr = CachingAllocator::get_instance().allocate(size);
21
+ } else {
22
+ data_ptr = new float[size]; // Standard heap allocation
23
+ }
24
+ }
25
+
26
+ ~Tensor() {
27
+ if (device == "cuda") {
28
+ CachingAllocator::get_instance().free(data_ptr, size);
29
+ } else {
30
+ delete[] data_ptr;
31
+ }
32
+ }
33
+
34
+ // Copy data back to a numpy array on the host, regardless of device
35
+ py::array_t<float> to_numpy() {
36
+ py::array_t<float> result(size);
37
+ auto buf = result.mutable_data();
38
+
39
+ if (device == "cuda") {
40
+ cudaError_t err = cudaMemcpy(buf, data_ptr, size * sizeof(float),
41
+ cudaMemcpyDeviceToHost);
42
+ if (err != cudaSuccess) {
43
+ throw std::runtime_error("cudaMemcpy failed in to_numpy(): " +
44
+ std::string(cudaGetErrorString(err)));
45
+ }
46
+ } else {
47
+ std::memcpy(buf, data_ptr, size * sizeof(float));
48
+ }
49
+ return result;
50
+ }
51
+ };