aakaar 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aakaar-0.1.2/MANIFEST.in +5 -0
- aakaar-0.1.2/PKG-INFO +52 -0
- aakaar-0.1.2/README.md +41 -0
- aakaar-0.1.2/aakaar/__init__.py +16 -0
- aakaar-0.1.2/aakaar.egg-info/PKG-INFO +52 -0
- aakaar-0.1.2/aakaar.egg-info/SOURCES.txt +16 -0
- aakaar-0.1.2/aakaar.egg-info/dependency_links.txt +1 -0
- aakaar-0.1.2/aakaar.egg-info/requires.txt +3 -0
- aakaar-0.1.2/aakaar.egg-info/top_level.txt +1 -0
- aakaar-0.1.2/pyproject.toml +16 -0
- aakaar-0.1.2/setup.cfg +4 -0
- aakaar-0.1.2/setup.py +71 -0
- aakaar-0.1.2/src/allocator.h +56 -0
- aakaar-0.1.2/src/bindings.cpp +26 -0
- aakaar-0.1.2/src/cpu_kernel.cpp +11 -0
- aakaar-0.1.2/src/random_kernel.cu +19 -0
- aakaar-0.1.2/src/rng.h +40 -0
- aakaar-0.1.2/src/tensor.h +51 -0
aakaar-0.1.2/MANIFEST.in
ADDED
aakaar-0.1.2/PKG-INFO
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aakaar
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: A custom CUDA-accelerated ML library
|
|
5
|
+
Author: Aarav Aggarwal
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: nvidia-curand-cu12
|
|
10
|
+
Requires-Dist: nvidia-cuda-runtime-cu12
|
|
11
|
+
|
|
12
|
+
# Aakaar
|
|
13
|
+
|
|
14
|
+
Aakaar is a custom, standalone deep learning tensor library built from the ground up using Python, C++, and raw CUDA. It is designed to provide a lightweight, transparent architecture for high-performance GPU computations without relying on heavy external frameworks like PyTorch or TensorFlow.
|
|
15
|
+
|
|
16
|
+
## Core Architecture
|
|
17
|
+
|
|
18
|
+
Aakaar bypasses standard NumPy arrays by implementing a custom C++ Tensor object that resides directly in GPU VRAM. Python interacts with this data via Pybind11, acting as a lightweight remote control. This prevents severe performance bottlenecks over the PCI-e bus, keeping data on the GPU until explicitly requested back to the host CPU.
|
|
19
|
+
|
|
20
|
+
Current capabilities include:
|
|
21
|
+
* Custom GPU-native Tensor class lifecycle management.
|
|
22
|
+
* CUDA-accelerated uniform random number generation via cuRAND.
|
|
23
|
+
* Direct host-to-device and device-to-host memory mapping.
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
To build Aakaar from source, you must have the NVIDIA CUDA Toolkit (nvcc) and a compatible C++ compiler (e.g., g++) installed.
|
|
28
|
+
|
|
29
|
+
1. Clone the repository:
|
|
30
|
+
```bash
|
|
31
|
+
git clone https://github.com/YOUR_USERNAME/aakaar.git
|
|
32
|
+
cd aakaar
|
|
33
|
+
pip install setuptools wheel pybind11
|
|
34
|
+
pip install -e .
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
```python
|
|
39
|
+
import aakaar
|
|
40
|
+
|
|
41
|
+
# Initialize the CUDA engine and allocate a GPU Tensor
|
|
42
|
+
print("Generating 100,000 random numbers on the GPU...")
|
|
43
|
+
data = aakaar.rand(100000, device="cuda", seed=1337)
|
|
44
|
+
|
|
45
|
+
# The data remains on the GPU as an Aakaar Tensor
|
|
46
|
+
print(type(data))
|
|
47
|
+
# <class 'aakaar._C.Tensor'>
|
|
48
|
+
|
|
49
|
+
# Bring the data across the PCI-e bus to the CPU for inspection
|
|
50
|
+
cpu_data = data.to_numpy()
|
|
51
|
+
print(cpu_data[:5])
|
|
52
|
+
```
|
aakaar-0.1.2/README.md
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Aakaar
|
|
2
|
+
|
|
3
|
+
Aakaar is a custom, standalone deep learning tensor library built from the ground up using Python, C++, and raw CUDA. It is designed to provide a lightweight, transparent architecture for high-performance GPU computations without relying on heavy external frameworks like PyTorch or TensorFlow.
|
|
4
|
+
|
|
5
|
+
## Core Architecture
|
|
6
|
+
|
|
7
|
+
Aakaar bypasses standard NumPy arrays by implementing a custom C++ Tensor object that resides directly in GPU VRAM. Python interacts with this data via Pybind11, acting as a lightweight remote control. This prevents severe performance bottlenecks over the PCI-e bus, keeping data on the GPU until explicitly requested back to the host CPU.
|
|
8
|
+
|
|
9
|
+
Current capabilities include:
|
|
10
|
+
* Custom GPU-native Tensor class lifecycle management.
|
|
11
|
+
* CUDA-accelerated uniform random number generation via cuRAND.
|
|
12
|
+
* Direct host-to-device and device-to-host memory mapping.
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
To build Aakaar from source, you must have the NVIDIA CUDA Toolkit (nvcc) and a compatible C++ compiler (e.g., g++) installed.
|
|
17
|
+
|
|
18
|
+
1. Clone the repository:
|
|
19
|
+
```bash
|
|
20
|
+
git clone https://github.com/YOUR_USERNAME/aakaar.git
|
|
21
|
+
cd aakaar
|
|
22
|
+
pip install setuptools wheel pybind11
|
|
23
|
+
pip install -e .
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
```python
|
|
28
|
+
import aakaar
|
|
29
|
+
|
|
30
|
+
# Initialize the CUDA engine and allocate a GPU Tensor
|
|
31
|
+
print("Generating 100,000 random numbers on the GPU...")
|
|
32
|
+
data = aakaar.rand(100000, device="cuda", seed=1337)
|
|
33
|
+
|
|
34
|
+
# The data remains on the GPU as an Aakaar Tensor
|
|
35
|
+
print(type(data))
|
|
36
|
+
# <class 'aakaar._C.Tensor'>
|
|
37
|
+
|
|
38
|
+
# Bring the data across the PCI-e bus to the CPU for inspection
|
|
39
|
+
cpu_data = data.to_numpy()
|
|
40
|
+
print(cpu_data[:5])
|
|
41
|
+
```
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from . import _C
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
def rand(size: int, device: str = "cpu", seed: int = 42):
    """Return a new ``_C.Tensor`` of ``size`` elements filled with random values.

    The tensor is constructed for ``device``. When ``device`` is exactly
    ``"cuda"`` it is filled by ``_C.generate_random``; every other value is
    routed to ``_C.fill_cpu_random``. ``seed`` is forwarded unchanged to the
    selected fill routine.
    """
    # Allocation happens inside the C++ constructor.
    tensor = _C.Tensor(size, device)

    # Select the backend fill routine, then run it on the fresh tensor.
    fill = _C.generate_random if device == "cuda" else _C.fill_cpu_random
    fill(tensor, seed)

    return tensor
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aakaar
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: A custom CUDA-accelerated ML library
|
|
5
|
+
Author: Aarav Aggarwal
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: nvidia-curand-cu12
|
|
10
|
+
Requires-Dist: nvidia-cuda-runtime-cu12
|
|
11
|
+
|
|
12
|
+
# Aakaar
|
|
13
|
+
|
|
14
|
+
Aakaar is a custom, standalone deep learning tensor library built from the ground up using Python, C++, and raw CUDA. It is designed to provide a lightweight, transparent architecture for high-performance GPU computations without relying on heavy external frameworks like PyTorch or TensorFlow.
|
|
15
|
+
|
|
16
|
+
## Core Architecture
|
|
17
|
+
|
|
18
|
+
Aakaar bypasses standard NumPy arrays by implementing a custom C++ Tensor object that resides directly in GPU VRAM. Python interacts with this data via Pybind11, acting as a lightweight remote control. This prevents severe performance bottlenecks over the PCI-e bus, keeping data on the GPU until explicitly requested back to the host CPU.
|
|
19
|
+
|
|
20
|
+
Current capabilities include:
|
|
21
|
+
* Custom GPU-native Tensor class lifecycle management.
|
|
22
|
+
* CUDA-accelerated uniform random number generation via cuRAND.
|
|
23
|
+
* Direct host-to-device and device-to-host memory mapping.
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
To build Aakaar from source, you must have the NVIDIA CUDA Toolkit (nvcc) and a compatible C++ compiler (e.g., g++) installed.
|
|
28
|
+
|
|
29
|
+
1. Clone the repository:
|
|
30
|
+
```bash
|
|
31
|
+
git clone https://github.com/YOUR_USERNAME/aakaar.git
|
|
32
|
+
cd aakaar
|
|
33
|
+
pip install setuptools wheel pybind11
|
|
34
|
+
pip install -e .
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
```python
|
|
39
|
+
import aakaar
|
|
40
|
+
|
|
41
|
+
# Initialize the CUDA engine and allocate a GPU Tensor
|
|
42
|
+
print("Generating 100,000 random numbers on the GPU...")
|
|
43
|
+
data = aakaar.rand(100000, device="cuda", seed=1337)
|
|
44
|
+
|
|
45
|
+
# The data remains on the GPU as an Aakaar Tensor
|
|
46
|
+
print(type(data))
|
|
47
|
+
# <class 'aakaar._C.Tensor'>
|
|
48
|
+
|
|
49
|
+
# Bring the data across the PCI-e bus to the CPU for inspection
|
|
50
|
+
cpu_data = data.to_numpy()
|
|
51
|
+
print(cpu_data[:5])
|
|
52
|
+
```
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
MANIFEST.in
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
aakaar/__init__.py
|
|
6
|
+
aakaar.egg-info/PKG-INFO
|
|
7
|
+
aakaar.egg-info/SOURCES.txt
|
|
8
|
+
aakaar.egg-info/dependency_links.txt
|
|
9
|
+
aakaar.egg-info/requires.txt
|
|
10
|
+
aakaar.egg-info/top_level.txt
|
|
11
|
+
src/allocator.h
|
|
12
|
+
src/bindings.cpp
|
|
13
|
+
src/cpu_kernel.cpp
|
|
14
|
+
src/random_kernel.cu
|
|
15
|
+
src/rng.h
|
|
16
|
+
src/tensor.h
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
aakaar
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel", "pybind11>=2.10"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "aakaar"
|
|
7
|
+
version = "0.1.2"
|
|
8
|
+
description = "A custom CUDA-accelerated ML library"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [{name = "Aarav Aggarwal"}]
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"numpy",
|
|
14
|
+
"nvidia-curand-cu12",
|
|
15
|
+
"nvidia-cuda-runtime-cu12",
|
|
16
|
+
]
|
aakaar-0.1.2/setup.cfg
ADDED
aakaar-0.1.2/setup.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
import sysconfig # <--- ADD THIS
|
|
4
|
+
from setuptools import setup, Extension
|
|
5
|
+
from setuptools.command.build_ext import build_ext
|
|
6
|
+
import pybind11
|
|
7
|
+
|
|
8
|
+
class CUDABuildExtension(build_ext):
    """``build_ext`` command that pre-compiles ``.cu`` sources with nvcc.

    For every extension, each source ending in ``.cu`` is compiled to an
    object file by invoking nvcc in a subprocess. Those objects are then
    attached to the extension as ``extra_objects`` and the remaining sources
    are handed to the regular setuptools build.
    """

    def build_extensions(self):
        """Compile the CUDA sources of every extension, then run the normal build.

        Raises:
            subprocess.CalledProcessError: if an nvcc invocation exits non-zero.
        """
        # The compiler defaults to ``nvcc`` on PATH; the NVCC environment
        # variable can point at a different executable.
        nvcc = os.environ.get('NVCC', 'nvcc')
        nvcc_flags = [
            '-O3',
            '-gencode=arch=compute_75,code=sm_75',
            '-gencode=arch=compute_86,code=sm_86',
            '-gencode=arch=compute_89,code=sm_89',
            '-Xcompiler=-fPIC'
        ]

        for ext in self.extensions:
            # Include directories declared on the Extension, plus Python's own
            # header directory (needed by pybind11).
            includes = [f"-I{d}" for d in ext.include_dirs]
            includes.append(f"-I{sysconfig.get_path('include')}")

            cu_sources = [s for s in ext.sources if s.endswith('.cu')]
            # Keep every non-CUDA source for the host compiler, not only
            # ``*.cpp``; filtering on ``.cpp`` would silently drop e.g. ``.cc``.
            host_sources = [s for s in ext.sources if not s.endswith('.cu')]

            objects = []

            for cu_file in cu_sources:
                # Swap only the final extension (str.replace would rewrite any
                # ".cu" inside the path) and place the object under the build
                # directory instead of next to the source file.
                stem, _ = os.path.splitext(cu_file)
                obj_file = os.path.join(self.build_temp, stem + '.o')
                os.makedirs(os.path.dirname(obj_file) or '.', exist_ok=True)

                nvcc_cmd = [nvcc, '-c', cu_file, '-o', obj_file] + nvcc_flags + includes
                print(f"Compiling CUDA: {' '.join(nvcc_cmd)}")
                subprocess.check_call(nvcc_cmd)
                objects.append(obj_file)

            ext.sources = host_sources
            # Append rather than overwrite so pre-declared objects survive.
            ext.extra_objects = list(ext.extra_objects or []) + objects

        super().build_extensions()
|
|
41
|
+
|
|
42
|
+
# Define the Python Extension
|
|
43
|
+
# Root of the CUDA toolkit. CUDA_HOME (or CUDA_PATH) overrides the default
# location so the build also works when the toolkit lives elsewhere.
_cuda_home = (
    os.environ.get('CUDA_HOME')
    or os.environ.get('CUDA_PATH')
    or '/usr/local/cuda'
)

# The single native extension: pybind11 bindings, the CPU kernel and the CUDA
# kernel, linked against cuRAND and the CUDA runtime.
aakaar_ext = Extension(
    'aakaar._C',
    sources=['src/bindings.cpp', 'src/cpu_kernel.cpp', 'src/random_kernel.cu'],
    include_dirs=[
        pybind11.get_include(),
        os.path.join(_cuda_home, 'include')
    ],
    library_dirs=[
        os.path.join(_cuda_home, 'lib64')
    ],
    libraries=['curand', 'cudart'],
    language='c++'
)
|
|
56
|
+
|
|
57
|
+
setup(
    name='aakaar',
    # Must match `version` in pyproject.toml (0.1.2); it previously said 0.1.0.
    version='0.1.2',
    author='Aarav Aggarwal',
    description='A custom standalone ML library featuring CUDA-accelerated operations.',
    packages=['aakaar'],
    ext_modules=[aakaar_ext],
    # Route extension builds through the nvcc-aware command defined above.
    cmdclass={'build_ext': CUDABuildExtension},
    install_requires=[
        'numpy',
        'nvidia-curand-cu12',
        'nvidia-cuda-runtime-cu12'
    ],
    # pybind11 is imported at the top of this file, so it must already be
    # installed before setup() runs; pyproject.toml's build-system.requires
    # lists it as well.
    setup_requires=['pybind11']
)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include <cuda_runtime.h>
|
|
3
|
+
#include <unordered_map>
|
|
4
|
+
#include <vector>
|
|
5
|
+
#include <stdexcept>
|
|
6
|
+
#include <iostream>
|
|
7
|
+
|
|
8
|
+
class CachingAllocator {
|
|
9
|
+
private:
|
|
10
|
+
// Maps the size of a tensor to a list of available GPU pointers
|
|
11
|
+
std::unordered_map<int, std::vector<float*>> free_blocks;
|
|
12
|
+
|
|
13
|
+
// Private constructor (Singleton pattern)
|
|
14
|
+
CachingAllocator() {}
|
|
15
|
+
|
|
16
|
+
public:
|
|
17
|
+
// Get the global instance of the allocator
|
|
18
|
+
static CachingAllocator& get_instance() {
|
|
19
|
+
static CachingAllocator instance;
|
|
20
|
+
return instance;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
// Allocate memory (or reuse an old block)
|
|
24
|
+
float* allocate(int size) {
|
|
25
|
+
// 1. Check if we already have a block of this size in the pool
|
|
26
|
+
if (free_blocks.find(size) != free_blocks.end() && !free_blocks[size].empty()) {
|
|
27
|
+
// Pop a recycled pointer off the back of the list and return it instantly
|
|
28
|
+
float* ptr = free_blocks[size].back();
|
|
29
|
+
free_blocks[size].pop_back();
|
|
30
|
+
return ptr;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// 2. If no recycled block exists, we MUST ask the OS (cudaMalloc)
|
|
34
|
+
float* ptr;
|
|
35
|
+
cudaError_t err = cudaMalloc((void**)&ptr, size * sizeof(float));
|
|
36
|
+
if (err != cudaSuccess) {
|
|
37
|
+
throw std::runtime_error("Aakaar out of memory!");
|
|
38
|
+
}
|
|
39
|
+
return ptr;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
// Free memory (put it in the pool, DON'T give it to the OS)
|
|
43
|
+
void free(float* ptr, int size) {
|
|
44
|
+
free_blocks[size].push_back(ptr);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// A utility to actually clear the cache and give memory back to the GPU
|
|
48
|
+
void empty_cache() {
|
|
49
|
+
for (auto& pair : free_blocks) {
|
|
50
|
+
for (float* ptr : pair.second) {
|
|
51
|
+
cudaFree(ptr); // ACTUALLY free the memory
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
free_blocks.clear();
|
|
55
|
+
}
|
|
56
|
+
};
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#include <pybind11/pybind11.h>
|
|
2
|
+
#include <pybind11/stl.h>
|
|
3
|
+
#include <memory>
|
|
4
|
+
#include "tensor.h"
|
|
5
|
+
#include "allocator.h"
|
|
6
|
+
|
|
7
|
+
namespace py = pybind11;
|
|
8
|
+
|
|
9
|
+
void run_curand_uniform(std::shared_ptr<Tensor> t, unsigned long long seed);
|
|
10
|
+
void fill_cpu_random(std::shared_ptr<Tensor> t, unsigned long long seed);
|
|
11
|
+
|
|
12
|
+
void empty_cache() {
|
|
13
|
+
CachingAllocator::get_instance().empty_cache();
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
// Python bindings for the `_C` extension module: the Tensor class plus the
// three free functions used by the Python package.
PYBIND11_MODULE(_C, m) {
    // Tensor instances are held through std::shared_ptr, matching the
    // signatures of the fill functions registered below.
    py::class_<Tensor, std::shared_ptr<Tensor>> tensor_class(m, "Tensor");
    tensor_class.def(py::init<int, std::string>());
    tensor_class.def_readonly("device", &Tensor::device);
    tensor_class.def("to_numpy", &Tensor::to_numpy);

    // Random fill routines, one per backend.
    m.def("generate_random", &run_curand_uniform, "Fill GPU Tensor with random numbers");
    m.def("fill_cpu_random", &fill_cpu_random, "Fill CPU Tensor with random numbers");

    // Allocator maintenance.
    m.def("empty_cache", &empty_cache, "Release cached GPU memory");
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#include <random>
|
|
2
|
+
#include <memory>
|
|
3
|
+
#include "tensor.h"
|
|
4
|
+
|
|
5
|
+
/// Fills a host tensor with uniform random floats drawn from
/// `std::uniform_real_distribution<float>(0, 1)`.
///
/// A fresh `std::mt19937` is seeded on every call, so the same `seed` and
/// tensor size reproduce the same values.
///
/// @param t    Tensor to fill; its buffer must be host memory.
/// @param seed Seed for the engine. NOTE(review): std::mt19937 is a 32-bit
///             engine, so presumably only the low 32 bits of the seed
///             influence the sequence — confirm if wider seeds matter.
/// @throws std::invalid_argument if `t` is null or is a "cuda" tensor.
void fill_cpu_random(std::shared_ptr<Tensor> t, unsigned long long seed) {
    // A null holder would be dereferenced below.
    if (!t) {
        throw std::invalid_argument("fill_cpu_random: tensor is null");
    }
    // Tensor allocates device memory when device == "cuda"; writing to that
    // pointer from the host would be invalid.
    if (t->device == "cuda") {
        throw std::invalid_argument("fill_cpu_random: tensor is on the cuda device");
    }

    std::mt19937 gen(static_cast<std::mt19937::result_type>(seed));
    std::uniform_real_distribution<float> dis(0.0f, 1.0f);

    float* const out = t->data_ptr;
    const int count = t->size;
    for (int i = 0; i < count; ++i) {
        out[i] = dis(gen);
    }
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#include <cuda_runtime.h>
|
|
2
|
+
#include <curand.h>
|
|
3
|
+
#include <memory>
|
|
4
|
+
#include "tensor.h"
|
|
5
|
+
#include "rng.h" // Import the random manager
|
|
6
|
+
|
|
7
|
+
void run_curand_uniform(std::shared_ptr<Tensor> t, unsigned long long seed) {
|
|
8
|
+
// Fetch the persistent global generator
|
|
9
|
+
curandGenerator_t generator = CUDARandomManager::get_instance().get_generator(seed);
|
|
10
|
+
|
|
11
|
+
// Generate random numbers directly into the existing tensor's buffer
|
|
12
|
+
curandStatus_t status = curandGenerateUniform(generator, t->data_ptr, t->size);
|
|
13
|
+
if (status != CURAND_STATUS_SUCCESS) {
|
|
14
|
+
throw std::runtime_error("cuRAND generation failed");
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
// Ensure the GPU finishes execution before returning control to Python
|
|
18
|
+
cudaDeviceSynchronize();
|
|
19
|
+
}
|
aakaar-0.1.2/src/rng.h
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include <curand.h>
|
|
3
|
+
#include <stdexcept>
|
|
4
|
+
|
|
5
|
+
class CUDARandomManager {
|
|
6
|
+
private:
|
|
7
|
+
curandGenerator_t generator;
|
|
8
|
+
bool initialized;
|
|
9
|
+
|
|
10
|
+
// Private constructor (Singleton)
|
|
11
|
+
CUDARandomManager() : initialized(false) {}
|
|
12
|
+
|
|
13
|
+
public:
|
|
14
|
+
static CUDARandomManager& get_instance() {
|
|
15
|
+
static CUDARandomManager instance;
|
|
16
|
+
return instance;
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
// Initialize the generator only once
|
|
20
|
+
// Initialize the generator only once
|
|
21
|
+
curandGenerator_t get_generator(unsigned long long seed) {
|
|
22
|
+
if (!initialized) {
|
|
23
|
+
curandStatus_t status = curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT);
|
|
24
|
+
if (status != CURAND_STATUS_SUCCESS) { // <--- FIXED HERE
|
|
25
|
+
throw std::runtime_error("Failed to create cuRAND generator");
|
|
26
|
+
}
|
|
27
|
+
initialized = true;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
curandSetPseudoRandomGeneratorSeed(generator, seed);
|
|
31
|
+
return generator;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// Clean up the generator when the engine shuts down
|
|
35
|
+
~CUDARandomManager() {
|
|
36
|
+
if (initialized) {
|
|
37
|
+
curandDestroyGenerator(generator);
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
};
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include <cuda_runtime.h>
#include <cstring>
#include <string>
#include <stdexcept>
#include <iostream>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "allocator.h" // Your CachingAllocator
|
|
9
|
+
|
|
10
|
+
namespace py = pybind11;
|
|
11
|
+
|
|
12
|
+
class Tensor {
|
|
13
|
+
public:
|
|
14
|
+
float* data_ptr;
|
|
15
|
+
int size;
|
|
16
|
+
std::string device;
|
|
17
|
+
|
|
18
|
+
Tensor(int s, std::string dev) : size(s), device(dev) {
|
|
19
|
+
if (device == "cuda") {
|
|
20
|
+
data_ptr = CachingAllocator::get_instance().allocate(size);
|
|
21
|
+
} else {
|
|
22
|
+
data_ptr = new float[size]; // Standard heap allocation
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
~Tensor() {
|
|
27
|
+
if (device == "cuda") {
|
|
28
|
+
CachingAllocator::get_instance().free(data_ptr, size);
|
|
29
|
+
} else {
|
|
30
|
+
delete[] data_ptr;
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// Copy data back to a numpy array on the host, regardless of device
|
|
35
|
+
py::array_t<float> to_numpy() {
|
|
36
|
+
py::array_t<float> result(size);
|
|
37
|
+
auto buf = result.mutable_data();
|
|
38
|
+
|
|
39
|
+
if (device == "cuda") {
|
|
40
|
+
cudaError_t err = cudaMemcpy(buf, data_ptr, size * sizeof(float),
|
|
41
|
+
cudaMemcpyDeviceToHost);
|
|
42
|
+
if (err != cudaSuccess) {
|
|
43
|
+
throw std::runtime_error("cudaMemcpy failed in to_numpy(): " +
|
|
44
|
+
std::string(cudaGetErrorString(err)));
|
|
45
|
+
}
|
|
46
|
+
} else {
|
|
47
|
+
std::memcpy(buf, data_ptr, size * sizeof(float));
|
|
48
|
+
}
|
|
49
|
+
return result;
|
|
50
|
+
}
|
|
51
|
+
};
|