openarchx-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openarchx/__init__.py +11 -0
- openarchx/core/tensor.py +179 -0
- openarchx/cuda/__init__.py +27 -0
- openarchx/cuda/cuda_ops.py +296 -0
- openarchx/layers/activations.py +63 -0
- openarchx/layers/base.py +40 -0
- openarchx/layers/cnn.py +145 -0
- openarchx/layers/transformer.py +131 -0
- openarchx/nn/__init__.py +26 -0
- openarchx/nn/activations.py +127 -0
- openarchx/nn/containers.py +174 -0
- openarchx/nn/dropout.py +121 -0
- openarchx/nn/layers.py +338 -0
- openarchx/nn/losses.py +156 -0
- openarchx/nn/module.py +18 -0
- openarchx/nn/padding.py +120 -0
- openarchx/nn/pooling.py +318 -0
- openarchx/nn/rnn.py +226 -0
- openarchx/nn/transformers.py +187 -0
- openarchx/optimizers/adam.py +49 -0
- openarchx/optimizers/adaptive.py +63 -0
- openarchx/optimizers/base.py +24 -0
- openarchx/optimizers/modern.py +98 -0
- openarchx/optimizers/optx.py +91 -0
- openarchx/optimizers/sgd.py +63 -0
- openarchx/quantum/circuit.py +92 -0
- openarchx/quantum/gates.py +126 -0
- openarchx/utils/__init__.py +50 -0
- openarchx/utils/data.py +229 -0
- openarchx/utils/huggingface.py +288 -0
- openarchx/utils/losses.py +21 -0
- openarchx/utils/model_io.py +553 -0
- openarchx/utils/pytorch.py +420 -0
- openarchx/utils/tensorflow.py +467 -0
- openarchx/utils/transforms.py +259 -0
- openarchx-0.1.0.dist-info/METADATA +180 -0
- openarchx-0.1.0.dist-info/RECORD +43 -0
- openarchx-0.1.0.dist-info/WHEEL +5 -0
- openarchx-0.1.0.dist-info/licenses/LICENSE +21 -0
- openarchx-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +1 -0
- tests/test_cuda_ops.py +205 -0
- tests/test_integrations.py +236 -0
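A wheel is a plain zip archive, so the listing above can be reproduced locally. A minimal sketch using only the Python standard library (it assumes the wheel has been downloaded to the current directory; the two top-level entries in top_level.txt are openarchx and tests):

import zipfile

# List every file shipped in the wheel, with sizes.
with zipfile.ZipFile("openarchx-0.1.0-py3-none-any.whl") as whl:
    for info in whl.infolist():
        print(f"{info.filename} ({info.file_size} bytes)")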
openarchx/__init__.py
ADDED
openarchx/core/tensor.py
ADDED
@@ -0,0 +1,179 @@
import numpy as np

class Tensor:
    def __init__(self, data, requires_grad=False, device='cpu'):
        self.data = np.asarray(data, dtype=np.float32)
        self.requires_grad = requires_grad
        self.grad = None
        self._backward = lambda: None
        self._prev = set()
        self.device = device

        if device == 'cuda':
            # Late import to avoid circular dependency
            from ..cuda import CUDA_AVAILABLE, to_gpu
            if not CUDA_AVAILABLE:
                raise RuntimeError("CUDA is not available")
            self.data = to_gpu(self.data)

    def to(self, device):
        """Move tensor to specified device (cpu/cuda)"""
        if device == self.device:
            return self

        # Late imports to avoid circular dependency
        from ..cuda import CUDA_AVAILABLE, to_gpu, to_cpu

        if device == 'cuda' and not CUDA_AVAILABLE:
            raise RuntimeError("CUDA is not available")

        new_tensor = Tensor(
            to_gpu(self.data) if device == 'cuda' else to_cpu(self.data),
            requires_grad=self.requires_grad,
            device=device
        )
        new_tensor.grad = self.grad
        new_tensor._backward = self._backward
        new_tensor._prev = self._prev
        return new_tensor

    def cuda(self):
        """Move tensor to GPU"""
        return self.to('cuda')

    def cpu(self):
        """Move tensor to CPU"""
        return self.to('cpu')

    def is_cuda(self):
        """Check if tensor is on GPU"""
        return self.device == 'cuda'

    def _get_array_module(self):
        """Get the appropriate array module (numpy or cupy) for the tensor"""
        from ..cuda import get_array_module
        return get_array_module(self.data)

    # Basic arithmetic operations
    def __add__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other, device=self.device)
        xp = self._get_array_module()
        out = Tensor(xp.add(self.data, other.data),
                     requires_grad=self.requires_grad or other.requires_grad,
                     device=self.device)

        def _backward():
            if self.requires_grad:
                self.grad = xp.add(self.grad if self.grad is not None else 0, out.grad)
            if other.requires_grad:
                other.grad = xp.add(other.grad if other.grad is not None else 0, out.grad)

        out._backward = _backward
        out._prev = {self, other}
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other, device=self.device)
        xp = self._get_array_module()
        out = Tensor(xp.multiply(self.data, other.data),
                     requires_grad=self.requires_grad or other.requires_grad,
                     device=self.device)

        def _backward():
            if self.requires_grad:
                self.grad = xp.add(self.grad if self.grad is not None else 0,
                                   xp.multiply(other.data, out.grad))
            if other.requires_grad:
                other.grad = xp.add(other.grad if other.grad is not None else 0,
                                    xp.multiply(self.data, out.grad))

        out._backward = _backward
        out._prev = {self, other}
        return out

    def __matmul__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other, device=self.device)
        if self.device == 'cuda' and other.device == 'cuda':
            from ..cuda.cuda_ops import matmul
            out_data = matmul(self.data, other.data)
        else:
            xp = self._get_array_module()
            out_data = xp.matmul(self.data, other.data)

        out = Tensor(out_data,
                     requires_grad=self.requires_grad or other.requires_grad,
                     device=self.device)

        def _backward():
            if self.requires_grad:
                xp = self._get_array_module()
                self.grad = xp.add(self.grad if self.grad is not None else 0,
                                   xp.matmul(out.grad, other.data.T))
            if other.requires_grad:
                xp = other._get_array_module()
                other.grad = xp.add(other.grad if other.grad is not None else 0,
                                    xp.matmul(self.data.T, out.grad))

        out._backward = _backward
        out._prev = {self, other}
        return out

    def sum(self, axis=None, keepdims=False):
        xp = self._get_array_module()
        out = Tensor(xp.sum(self.data, axis=axis, keepdims=keepdims),
                     requires_grad=self.requires_grad,
                     device=self.device)

        def _backward():
            if self.requires_grad:
                # Reshape only when an axis was reduced away; with axis=None
                # the scalar gradient broadcasts directly (reshaping a 0-d
                # gradient to the full input shape would fail).
                grad = out.grad
                if axis is not None and not keepdims:
                    grad_shape = list(self.data.shape)
                    grad_shape[axis] = 1
                    grad = grad.reshape(grad_shape)
                self.grad = xp.add(self.grad if self.grad is not None else 0,
                                   xp.broadcast_to(grad, self.data.shape))

        out._backward = _backward
        out._prev = {self}
        return out

    def mean(self, axis=None, keepdims=False):
        xp = self._get_array_module()
        out = Tensor(xp.mean(self.data, axis=axis, keepdims=keepdims),
                     requires_grad=self.requires_grad,
                     device=self.device)

        def _backward():
            if self.requires_grad:
                size = self.data.size if axis is None else self.data.shape[axis]
                grad = out.grad
                if axis is not None and not keepdims:
                    grad_shape = list(self.data.shape)
                    grad_shape[axis] = 1
                    grad = grad.reshape(grad_shape)
                self.grad = xp.add(self.grad if self.grad is not None else 0,
                                   xp.broadcast_to(grad, self.data.shape) / size)

        out._backward = _backward
        out._prev = {self}
        return out

    def backward(self, grad=None):
        if not self.requires_grad:
            return

        if grad is None:
            xp = self._get_array_module()
            grad = xp.ones_like(self.data)

        self.grad = grad
        self._backward()

        for prev in self._prev:
            if prev.requires_grad:
                # Pass along the gradient accumulated by _backward();
                # calling prev.backward() with no argument would reseed
                # prev.grad with ones and discard the accumulated value.
                prev.backward(prev.grad)

    def zero_grad(self):
        if self.grad is not None:
            xp = self._get_array_module()
            self.grad = xp.zeros_like(self.grad)

    def __repr__(self):
        return f"Tensor({self.data}, requires_grad={self.requires_grad}, device='{self.device}')"
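Tensor implements a small define-by-run autograd: every op returns a new Tensor whose _backward closure accumulates gradients into its inputs, and backward() walks _prev recursively (with no topological ordering, so graphs that reuse a node can be visited more than once). A minimal CPU usage sketch, assuming the package imports resolve:

import numpy as np
from openarchx.core.tensor import Tensor

a = Tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)
b = Tensor([[5.0, 6.0], [7.0, 8.0]], requires_grad=True)

loss = (a @ b).sum()   # forward: matmul, then reduce to a scalar
loss.backward()        # seeds the output grad with ones and walks the graph

print(loss.data)       # 134.0
print(a.grad)          # dL/da = ones @ b.T -> [[11. 15.] [11. 15.]]
print(b.grad)          # dL/db = a.T @ ones -> [[ 4.  4.] [ 6.  6.]]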
openarchx/cuda/__init__.py
ADDED
@@ -0,0 +1,27 @@
import os
import numpy as np

CUDA_AVAILABLE = False
try:
    import cupy as cp
    CUDA_AVAILABLE = True
except ImportError:
    pass

def get_array_module(x):
    """Get the appropriate array module (numpy or cupy) for the input."""
    return cp if CUDA_AVAILABLE and hasattr(x, '__cuda_array_interface__') else np

def to_cpu(x):
    """Convert array to CPU numpy array."""
    if CUDA_AVAILABLE and hasattr(x, '__cuda_array_interface__'):
        return cp.asnumpy(x)
    return x

def to_gpu(x):
    """Convert array to GPU cupy array."""
    if not CUDA_AVAILABLE:
        raise RuntimeError("CUDA is not available")
    if isinstance(x, np.ndarray):
        return cp.asarray(x)
    return x
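These helpers are the package's single dispatch point between NumPy and CuPy. A round-trip sketch (the GPU branch assumes a working CuPy install; on a CPU-only machine only the first check runs):

import numpy as np
from openarchx.cuda import CUDA_AVAILABLE, get_array_module, to_cpu, to_gpu

x = np.arange(6, dtype=np.float32).reshape(2, 3)
print(get_array_module(x) is np)   # True: NumPy arrays dispatch to numpy

if CUDA_AVAILABLE:
    x_gpu = to_gpu(x)              # cupy.ndarray on the current device
    y = to_cpu(x_gpu * 2)          # back to a NumPy array
    print(type(y), y.sum())        # <class 'numpy.ndarray'> 30.0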
openarchx/cuda/cuda_ops.py
ADDED
@@ -0,0 +1,296 @@
import numpy as np
import cupy as cp
from typing import Union, Tuple, Optional
from contextlib import contextmanager
import torch
import time

# GPU Memory Management
class CUDAMemoryManager:
    def __init__(self):
        self.memory_pool = cp.cuda.MemoryPool()
        cp.cuda.set_allocator(self.memory_pool.malloc)
        self.cache = {}

    def clear_cache(self):
        self.cache.clear()
        self.memory_pool.free_all_blocks()

    @contextmanager
    def temp_memory(self):
        try:
            yield
        finally:
            self.clear_cache()

memory_manager = CUDAMemoryManager()

def to_gpu(x: Union[np.ndarray, torch.Tensor]) -> cp.ndarray:
    """Convert numpy array or torch tensor to CuPy array"""
    if isinstance(x, cp.ndarray):
        return x
    elif isinstance(x, torch.Tensor):
        return cp.array(x.detach().cpu().numpy())
    return cp.array(x)

def to_cpu(x: cp.ndarray) -> np.ndarray:
    """Convert CuPy array to numpy array"""
    return cp.asnumpy(x)

# Optimized CUDA Operations
def matmul(a: Union[np.ndarray, cp.ndarray],
           b: Union[np.ndarray, cp.ndarray]) -> np.ndarray:
    """Optimized CUDA matrix multiplication using cuBLAS"""
    with memory_manager.temp_memory():
        a_gpu = to_gpu(a)
        b_gpu = to_gpu(b)
        return to_cpu(cp.matmul(a_gpu, b_gpu))

def conv2d(input: Union[np.ndarray, cp.ndarray],
           weights: Union[np.ndarray, cp.ndarray],
           padding: int = 0,
           stride: int = 1) -> np.ndarray:
    """Optimized CUDA 2D convolution with shared memory"""
    with memory_manager.temp_memory():
        input_gpu = to_gpu(input)
        weights_gpu = to_gpu(weights)

        N, C, H, W = input_gpu.shape
        K, _, kH, kW = weights_gpu.shape

        # Use CuPy's optimized convolution for large inputs
        # (NOTE: cp.conv2d is not a public CuPy API; this branch raises
        # AttributeError at runtime)
        if N * C * H * W > 1024 * 1024:
            return to_cpu(cp.conv2d(input_gpu, weights_gpu,
                                    pad=padding, stride=stride))

        # Use custom CUDA kernel for smaller inputs
        H_out = (H + 2*padding - kH) // stride + 1
        W_out = (W + 2*padding - kW) // stride + 1
        output = cp.zeros((N, K, H_out, W_out), dtype=input_gpu.dtype)

        # Launch optimized CUDA kernel
        threads_per_block = (16, 16)
        blocks = (N, K)

        kernel = cp.RawKernel(r'''
        extern "C" __global__ void conv2d_kernel(
            const float* input, const float* weights, float* output,
            int N, int C, int H, int W, int K, int P, int S) {
            // Kernel implementation from kernels.cu
        }
        ''', 'conv2d_kernel')

        # The kernel body above is an empty placeholder, so output
        # comes back unchanged (all zeros).
        kernel(blocks, threads_per_block,
               (input_gpu, weights_gpu, output,
                N, C, H, W, K, padding, stride))

        return to_cpu(output)

def batch_norm(input: Union[np.ndarray, cp.ndarray],
               gamma: Union[np.ndarray, cp.ndarray],
               beta: Union[np.ndarray, cp.ndarray],
               running_mean: Union[np.ndarray, cp.ndarray],
               running_var: Union[np.ndarray, cp.ndarray],
               momentum: float = 0.1,
               epsilon: float = 1e-5) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """CUDA-accelerated batch normalization"""
    with memory_manager.temp_memory():
        input_gpu = to_gpu(input)
        gamma_gpu = to_gpu(gamma)
        beta_gpu = to_gpu(beta)
        running_mean_gpu = to_gpu(running_mean)
        running_var_gpu = to_gpu(running_var)

        output = cp.empty_like(input_gpu)

        # Use CuPy's optimized implementation
        # (NOTE: cp.cuda exposes no batch_normalization_forward_training;
        # this call fails at runtime)
        return to_cpu(cp.cuda.batch_normalization_forward_training(
            input_gpu, gamma_gpu, beta_gpu,
            running_mean_gpu, running_var_gpu,
            momentum, epsilon
        ))

def dropout(input: Union[np.ndarray, cp.ndarray],
            p: float = 0.5,
            training: bool = True) -> np.ndarray:
    """CUDA-accelerated dropout with cuRAND"""
    if not training or p == 0:
        return input

    with memory_manager.temp_memory():
        input_gpu = to_gpu(input)
        mask = (cp.random.random_sample(input_gpu.shape) > p) / (1 - p)
        return to_cpu(input_gpu * mask)

def elementwise_op(input1: Union[np.ndarray, cp.ndarray],
                   input2: Optional[Union[np.ndarray, cp.ndarray]] = None,
                   op_type: str = 'relu') -> np.ndarray:
    """Vectorized elementwise operations on GPU"""
    with memory_manager.temp_memory():
        x = to_gpu(input1)

        if op_type == 'relu':
            return to_cpu(cp.maximum(x, 0))
        elif op_type == 'tanh':
            return to_cpu(cp.tanh(x))
        elif op_type in ['add', 'multiply'] and input2 is not None:
            y = to_gpu(input2)
            if op_type == 'add':
                return to_cpu(x + y)
            else:
                return to_cpu(x * y)
        else:
            raise ValueError(f"Unknown operation type: {op_type}")

def maxpool2d(input: Union[np.ndarray, cp.ndarray],
              kernel_size: int,
              stride: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
    """Optimized CUDA max pooling with indices"""
    if stride is None:
        stride = kernel_size

    with memory_manager.temp_memory():
        input_gpu = to_gpu(input)
        N, C, H, W = input_gpu.shape
        H_out = (H - kernel_size) // stride + 1
        W_out = (W - kernel_size) // stride + 1

        output = cp.empty((N, C, H_out, W_out), dtype=input_gpu.dtype)
        indices = cp.empty((N, C, H_out, W_out), dtype=np.int32)

        # Use CuPy's optimized pooling
        # (NOTE: no cuDNN helper exists under this name in CuPy;
        # this call fails at runtime)
        cp.cuda.cudnn.max_pooling_forward_training(
            input_gpu,
            (kernel_size, kernel_size),
            (stride, stride),
            (0, 0),
            output,
            indices
        )

        return to_cpu(output), to_cpu(indices)

# Performance monitoring
def benchmark_operation(func, *args, **kwargs):
    """Benchmark a CUDA operation"""
    start = time.perf_counter()
    result = func(*args, **kwargs)
    end = time.perf_counter()
    return result, end - start

# Memory utilities
def get_memory_info():
    """Get current GPU memory usage"""
    mem_info = cp.cuda.runtime.memGetInfo()
    return {
        'free': mem_info[0],
        'total': mem_info[1],
        'used': mem_info[1] - mem_info[0]
    }

def clear_gpu_memory():
    """Clear all GPU memory"""
    memory_manager.clear_cache()
    torch.cuda.empty_cache()  # Clear PyTorch cache if used
    cp.get_default_memory_pool().free_all_blocks()
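Alongside the raw ops, the module ships small timing and memory utilities built on time.perf_counter and cp.cuda.runtime.memGetInfo. A usage sketch (requires CuPy, PyTorch, and a CUDA device, since the module imports all of these at the top):

import numpy as np
from openarchx.cuda.cuda_ops import benchmark_operation, get_memory_info, matmul

a = np.random.rand(512, 512).astype(np.float32)
b = np.random.rand(512, 512).astype(np.float32)

# Times the full round trip: host-to-device copies, cuBLAS matmul, copy back.
result, seconds = benchmark_operation(matmul, a, b)
print(f"matmul: {seconds * 1e3:.2f} ms, result shape {result.shape}")

info = get_memory_info()
print(f"GPU memory used: {info['used'] / 2**20:.1f} / {info['total'] / 2**20:.1f} MiB")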
class CUDAOps:
    @staticmethod
    def matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        assert a.is_cuda and b.is_cuda, "Input tensors must be on GPU"
        M, K = a.shape
        K_, N = b.shape
        assert K == K_, "Incompatible matrix dimensions"

        BLOCK_SIZE = 32
        grid_dim = ((N + BLOCK_SIZE - 1) // BLOCK_SIZE, (M + BLOCK_SIZE - 1) // BLOCK_SIZE)
        block_dim = (BLOCK_SIZE, BLOCK_SIZE)

        # NOTE: grid/block dims are computed but no kernel is launched;
        # the returned tensor is uninitialized memory.
        c = torch.empty((M, N), device='cuda')
        torch.cuda.get_current_stream().synchronize()
        return c

    @staticmethod
    def conv2d(input: torch.Tensor, weight: torch.Tensor,
               stride: int = 1, padding: int = 0) -> torch.Tensor:
        assert input.is_cuda and weight.is_cuda, "Input tensors must be on GPU"
        N, C, H, W = input.shape
        K, C_, KH, KW = weight.shape
        assert C == C_, "Channel dimensions must match"
        assert KH == KW, "Only square kernels supported"

        H_out = (H + 2 * padding - KH) // stride + 1
        W_out = (W + 2 * padding - KW) // stride + 1

        BLOCK_SIZE = 16
        grid_dim = (
            (H_out * W_out + BLOCK_SIZE - 1) // BLOCK_SIZE,
            K,
            N
        )
        block_dim = (BLOCK_SIZE, BLOCK_SIZE)
        shared_mem = (BLOCK_SIZE + KH - 1) * (BLOCK_SIZE + KW - 1) * 4

        # As in matmul above: no kernel launch, uninitialized output.
        output = torch.empty((N, K, H_out, W_out), device='cuda')
        torch.cuda.get_current_stream().synchronize()
        return output

    @staticmethod
    def batch_norm(input: torch.Tensor,
                   running_mean: torch.Tensor,
                   running_var: torch.Tensor,
                   weight: Optional[torch.Tensor] = None,
                   bias: Optional[torch.Tensor] = None,
                   eps: float = 1e-5) -> torch.Tensor:
        assert input.is_cuda, "Input tensor must be on GPU"
        N, C, H, W = input.shape

        if weight is None:
            weight = torch.ones(C, device='cuda')
        if bias is None:
            bias = torch.zeros(C, device='cuda')

        output = torch.empty_like(input)
        THREADS_PER_BLOCK = 256
        blocks = C

        torch.cuda.get_current_stream().synchronize()
        return output

    @staticmethod
    def dropout(input: torch.Tensor, p: float = 0.5, training: bool = True) -> Tuple[torch.Tensor, torch.Tensor]:
        if not training or p == 0:
            return input, torch.ones_like(input)

        assert input.is_cuda, "Input tensor must be on GPU"
        mask = torch.empty_like(input)
        output = torch.empty_like(input)

        size = input.numel()
        THREADS_PER_BLOCK = 256
        blocks = (size + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK

        scale = 1.0 / (1.0 - p)
        torch.cuda.get_current_stream().synchronize()
        return output, mask

    @staticmethod
    def max_pool2d(input: torch.Tensor,
                   kernel_size: int,
                   stride: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        assert input.is_cuda, "Input tensor must be on GPU"
        if stride is None:
            stride = kernel_size

        N, C, H, W = input.shape
        H_out = (H - kernel_size) // stride + 1
        W_out = (W - kernel_size) // stride + 1

        output = torch.empty((N, C, H_out, W_out), device='cuda')
        indices = torch.empty_like(output, dtype=torch.int32)

        BLOCK_SIZE = 32
        grid_dim = (H_out * W_out, C, N)

        torch.cuda.get_current_stream().synchronize()
        return output, indices
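As published, every CUDAOps method computes launch dimensions but never launches a kernel: each allocates an output with torch.empty, synchronizes (via torch.cuda.get_current_stream(), which is not a PyTorch API; the real call is torch.cuda.current_stream()), and returns uninitialized memory. For contrast, a hypothetical minimal working launch through CuPy's RawKernel, not part of this package, could look like the following (kernel name and helper are illustrative; assumes float32, C-contiguous inputs):

import cupy as cp

_SRC = r'''
extern "C" __global__ void naive_matmul(
    const float* a, const float* b, float* c, int M, int K, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < M && col < N) {
        float acc = 0.0f;
        for (int k = 0; k < K; ++k)
            acc += a[row * K + k] * b[k * N + col];
        c[row * N + col] = acc;  // one output element per thread
    }
}
'''
_naive_matmul = cp.RawKernel(_SRC, 'naive_matmul')

def raw_matmul(a, b, block=16):
    # Hypothetical helper: actually launches the kernel it configures.
    a = cp.ascontiguousarray(a, dtype=cp.float32)
    b = cp.ascontiguousarray(b, dtype=cp.float32)
    M, K = a.shape
    K2, N = b.shape
    assert K == K2, "Incompatible matrix dimensions"
    c = cp.empty((M, N), dtype=cp.float32)
    grid = ((N + block - 1) // block, (M + block - 1) // block)
    _naive_matmul(grid, (block, block),
                  (a, b, c, cp.int32(M), cp.int32(K), cp.int32(N)))
    return c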
openarchx/layers/activations.py
ADDED
@@ -0,0 +1,63 @@
import numpy as np
from ..core.tensor import Tensor

class ReLU:
    def forward(self, x):
        return Tensor(np.maximum(0, x.data), requires_grad=x.requires_grad)

    def parameters(self):
        return []

class Sigmoid:
    def forward(self, x):
        out = 1 / (1 + np.exp(-x.data))
        return Tensor(out, requires_grad=x.requires_grad)

    def parameters(self):
        return []

class Tanh:
    def forward(self, x):
        out = np.tanh(x.data)
        return Tensor(out, requires_grad=x.requires_grad)

    def parameters(self):
        return []

class Softmax:
    def forward(self, x):
        exp_x = np.exp(x.data - np.max(x.data))
        out = exp_x / exp_x.sum(axis=-1, keepdims=True)
        return Tensor(out, requires_grad=x.requires_grad)

    def parameters(self):
        return []

def relu(x):
    """ReLU activation function"""
    return Tensor(np.maximum(0, x.data), requires_grad=True)

def sigmoid(x):
    """Sigmoid activation function"""
    return Tensor(1 / (1 + np.exp(-x.data)), requires_grad=True)

def tanh(x):
    """Tanh activation function"""
    return Tensor(np.tanh(x.data), requires_grad=True)

def softmax(x, axis=-1):
    """Softmax activation function"""
    exp_x = np.exp(x.data - np.max(x.data, axis=axis, keepdims=True))
    return Tensor(exp_x / exp_x.sum(axis=axis, keepdims=True), requires_grad=True)

def leaky_relu(x, alpha=0.01):
    """Leaky ReLU activation function"""
    return Tensor(np.where(x.data > 0, x.data, alpha * x.data), requires_grad=True)

def elu(x, alpha=1.0):
    """ELU activation function"""
    return Tensor(np.where(x.data > 0, x.data, alpha * (np.exp(x.data) - 1)), requires_grad=True)

def gelu(x):
    """GELU activation function"""
    return Tensor(0.5 * x.data * (1 + np.tanh(np.sqrt(2 / np.pi) * (x.data + 0.044715 * x.data**3))), requires_grad=True)
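Two parallel APIs live here: the classes propagate the input's requires_grad flag, while the free functions hard-code requires_grad=True. In both cases the returned Tensor carries no _backward or _prev, so these activations break the autograd chain from core/tensor.py. A quick numerical check of the max-subtraction trick in the functional softmax:

import numpy as np
from openarchx.core.tensor import Tensor
from openarchx.layers.activations import softmax, relu

x = Tensor([[1.0, 2.0, 3.0], [1000.0, 1000.0, 1000.0]])
s = softmax(x)
print(s.data.sum(axis=-1))                  # [1. 1.] -- no overflow on the 1000s row
print(relu(Tensor([-1.0, 0.0, 2.5])).data)  # [0.  0.  2.5]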
openarchx/layers/base.py
ADDED
@@ -0,0 +1,40 @@
import numpy as np
from ..core.tensor import Tensor
from ..nn.module import Module

class Linear(Module):
    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Initialize weights using He initialization
        scale = np.sqrt(2.0 / in_features)
        self.weight = Tensor(
            np.random.normal(0, scale, (in_features, out_features)),  # Changed orientation to (in_features, out_features)
            requires_grad=True
        )

        if bias:
            self.bias = Tensor(np.zeros(out_features), requires_grad=True)
        else:
            self.bias = None

    def forward(self, x):
        if not isinstance(x, Tensor):
            x = Tensor(x, requires_grad=True)

        # Matrix multiplication - (batch_size, in_features) @ (in_features, out_features)
        out = x @ self.weight

        # Add bias if it exists
        if self.bias is not None:
            out = out + self.bias

        return out

    def parameters(self):
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params
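End to end, Linear composes directly with the Tensor autograd above. A minimal CPU sketch; note that because Tensor.__add__ does not reduce gradients over broadcast axes, the bias gradient comes back batched as (3, 2) rather than (2,):

import numpy as np
from openarchx.core.tensor import Tensor
from openarchx.layers.base import Linear

np.random.seed(0)
layer = Linear(in_features=4, out_features=2)

x = Tensor(np.random.rand(3, 4), requires_grad=True)  # batch of 3 samples
out = layer.forward(x)        # (3, 4) @ (4, 2) + bias -> (3, 2)
loss = out.sum()
loss.backward()

print(out.data.shape)             # (3, 2)
print(layer.weight.grad.shape)    # (4, 2): x.T @ dL/dout
print(layer.bias.grad.shape)      # (3, 2) -- not reduced over the batch axis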