openarchx-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. openarchx/__init__.py +11 -0
  2. openarchx/core/tensor.py +179 -0
  3. openarchx/cuda/__init__.py +27 -0
  4. openarchx/cuda/cuda_ops.py +296 -0
  5. openarchx/layers/activations.py +63 -0
  6. openarchx/layers/base.py +40 -0
  7. openarchx/layers/cnn.py +145 -0
  8. openarchx/layers/transformer.py +131 -0
  9. openarchx/nn/__init__.py +26 -0
  10. openarchx/nn/activations.py +127 -0
  11. openarchx/nn/containers.py +174 -0
  12. openarchx/nn/dropout.py +121 -0
  13. openarchx/nn/layers.py +338 -0
  14. openarchx/nn/losses.py +156 -0
  15. openarchx/nn/module.py +18 -0
  16. openarchx/nn/padding.py +120 -0
  17. openarchx/nn/pooling.py +318 -0
  18. openarchx/nn/rnn.py +226 -0
  19. openarchx/nn/transformers.py +187 -0
  20. openarchx/optimizers/adam.py +49 -0
  21. openarchx/optimizers/adaptive.py +63 -0
  22. openarchx/optimizers/base.py +24 -0
  23. openarchx/optimizers/modern.py +98 -0
  24. openarchx/optimizers/optx.py +91 -0
  25. openarchx/optimizers/sgd.py +63 -0
  26. openarchx/quantum/circuit.py +92 -0
  27. openarchx/quantum/gates.py +126 -0
  28. openarchx/utils/__init__.py +50 -0
  29. openarchx/utils/data.py +229 -0
  30. openarchx/utils/huggingface.py +288 -0
  31. openarchx/utils/losses.py +21 -0
  32. openarchx/utils/model_io.py +553 -0
  33. openarchx/utils/pytorch.py +420 -0
  34. openarchx/utils/tensorflow.py +467 -0
  35. openarchx/utils/transforms.py +259 -0
  36. openarchx-0.1.0.dist-info/METADATA +180 -0
  37. openarchx-0.1.0.dist-info/RECORD +43 -0
  38. openarchx-0.1.0.dist-info/WHEEL +5 -0
  39. openarchx-0.1.0.dist-info/licenses/LICENSE +21 -0
  40. openarchx-0.1.0.dist-info/top_level.txt +2 -0
  41. tests/__init__.py +1 -0
  42. tests/test_cuda_ops.py +205 -0
  43. tests/test_integrations.py +236 -0
openarchx/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """
+ OpenArchX: A Simple and Flexible Deep Learning Framework
+ """
+
+ __version__ = "0.1.0"
+
+ from .core import *
+ from .layers import *
+ from .nn import *
+ from .optimizers import *
+ from .utils import *
openarchx/core/tensor.py ADDED
@@ -0,0 +1,179 @@
+ import numpy as np
+
+ class Tensor:
+     def __init__(self, data, requires_grad=False, device='cpu'):
+         self.data = np.asarray(data, dtype=np.float32)
+         self.requires_grad = requires_grad
+         self.grad = None
+         self._backward = lambda: None
+         self._prev = set()
+         self.device = device
+
+         if device == 'cuda':
+             # Late import to avoid circular dependency
+             from ..cuda import CUDA_AVAILABLE, to_gpu
+             if not CUDA_AVAILABLE:
+                 raise RuntimeError("CUDA is not available")
+             self.data = to_gpu(self.data)
+
+     def to(self, device):
+         """Move tensor to specified device (cpu/cuda)"""
+         if device == self.device:
+             return self
+
+         # Late imports to avoid circular dependency
+         from ..cuda import CUDA_AVAILABLE, to_gpu, to_cpu
+
+         if device == 'cuda' and not CUDA_AVAILABLE:
+             raise RuntimeError("CUDA is not available")
+
+         new_tensor = Tensor(
+             to_gpu(self.data) if device == 'cuda' else to_cpu(self.data),
+             requires_grad=self.requires_grad,
+             device=device
+         )
+         new_tensor.grad = self.grad
+         new_tensor._backward = self._backward
+         new_tensor._prev = self._prev
+         return new_tensor
+
+     def cuda(self):
+         """Move tensor to GPU"""
+         return self.to('cuda')
+
+     def cpu(self):
+         """Move tensor to CPU"""
+         return self.to('cpu')
+
+     def is_cuda(self):
+         """Check if tensor is on GPU"""
+         return self.device == 'cuda'
+
+     def _get_array_module(self):
+         """Get the appropriate array module (numpy or cupy) for the tensor"""
+         from ..cuda import get_array_module
+         return get_array_module(self.data)
+
+     # Basic arithmetic operations
+     def __add__(self, other):
+         other = other if isinstance(other, Tensor) else Tensor(other, device=self.device)
+         xp = self._get_array_module()
+         out = Tensor(xp.add(self.data, other.data),
+                      requires_grad=self.requires_grad or other.requires_grad,
+                      device=self.device)
+
+         def _backward():
+             if self.requires_grad:
+                 self.grad = xp.add(self.grad if self.grad is not None else 0, out.grad)
+             if other.requires_grad:
+                 other.grad = xp.add(other.grad if other.grad is not None else 0, out.grad)
+
+         out._backward = _backward
+         out._prev = {self, other}
+         return out
+
+     def __mul__(self, other):
+         other = other if isinstance(other, Tensor) else Tensor(other, device=self.device)
+         xp = self._get_array_module()
+         out = Tensor(xp.multiply(self.data, other.data),
+                      requires_grad=self.requires_grad or other.requires_grad,
+                      device=self.device)
+
+         def _backward():
+             if self.requires_grad:
+                 self.grad = xp.add(self.grad if self.grad is not None else 0,
+                                    xp.multiply(other.data, out.grad))
+             if other.requires_grad:
+                 other.grad = xp.add(other.grad if other.grad is not None else 0,
+                                     xp.multiply(self.data, out.grad))
+
+         out._backward = _backward
+         out._prev = {self, other}
+         return out
+
+     def __matmul__(self, other):
+         other = other if isinstance(other, Tensor) else Tensor(other, device=self.device)
+         if self.device == 'cuda' and other.device == 'cuda':
+             from ..cuda.cuda_ops import matmul
+             out_data = matmul(self.data, other.data)
+         else:
+             xp = self._get_array_module()
+             out_data = xp.matmul(self.data, other.data)
+
+         out = Tensor(out_data,
+                      requires_grad=self.requires_grad or other.requires_grad,
+                      device=self.device)
+
+         def _backward():
+             if self.requires_grad:
+                 xp = self._get_array_module()
+                 self.grad = xp.add(self.grad if self.grad is not None else 0,
+                                    xp.matmul(out.grad, other.data.T))
+             if other.requires_grad:
+                 xp = other._get_array_module()
+                 other.grad = xp.add(other.grad if other.grad is not None else 0,
+                                     xp.matmul(self.data.T, out.grad))
+
+         out._backward = _backward
+         out._prev = {self, other}
+         return out
+
+     def sum(self, axis=None, keepdims=False):
+         xp = self._get_array_module()
+         out = Tensor(xp.sum(self.data, axis=axis, keepdims=keepdims),
+                      requires_grad=self.requires_grad,
+                      device=self.device)
+
+         def _backward():
+             if self.requires_grad:
+                 grad = out.grad
+                 if axis is not None and not keepdims:
+                     grad = xp.expand_dims(grad, axis)
+                 self.grad = xp.add(self.grad if self.grad is not None else 0,
+                                    xp.broadcast_to(grad, self.data.shape))
+
+         out._backward = _backward
+         out._prev = {self}
+         return out
+
+     def mean(self, axis=None, keepdims=False):
+         xp = self._get_array_module()
+         out = Tensor(xp.mean(self.data, axis=axis, keepdims=keepdims),
+                      requires_grad=self.requires_grad,
+                      device=self.device)
+
+         def _backward():
+             if self.requires_grad:
+                 size = self.data.size if axis is None else self.data.shape[axis]
+                 grad = out.grad
+                 if axis is not None and not keepdims:
+                     grad = xp.expand_dims(grad, axis)
+                 self.grad = xp.add(self.grad if self.grad is not None else 0,
+                                    xp.broadcast_to(grad, self.data.shape) / size)
+
+         out._backward = _backward
+         out._prev = {self}
+         return out
+
+     def backward(self, grad=None):
+         if not self.requires_grad:
+             return
+
+         if grad is None:
+             xp = self._get_array_module()
+             grad = xp.ones_like(self.data)
+
+         self.grad = grad
+         self._backward()
+
+         for prev in self._prev:
+             if prev.requires_grad:
+                 prev.backward(prev.grad)
+
+     def zero_grad(self):
+         if self.grad is not None:
+             xp = self._get_array_module()
+             self.grad = xp.zeros_like(self.grad)
+
+     def __repr__(self):
+         return f"Tensor({self.data}, requires_grad={self.requires_grad}, device='{self.device}')"
openarchx/cuda/__init__.py ADDED
@@ -0,0 +1,27 @@
+ import os
+ import numpy as np
+
+ CUDA_AVAILABLE = False
+ try:
+     import cupy as cp
+     CUDA_AVAILABLE = True
+ except ImportError:
+     pass
+
+ def get_array_module(x):
+     """Get the appropriate array module (numpy or cupy) for the input."""
+     return cp if CUDA_AVAILABLE and hasattr(x, '__cuda_array_interface__') else np
+
+ def to_cpu(x):
+     """Convert array to CPU numpy array."""
+     if CUDA_AVAILABLE and hasattr(x, '__cuda_array_interface__'):
+         return cp.asnumpy(x)
+     return x
+
+ def to_gpu(x):
+     """Convert array to GPU cupy array."""
+     if not CUDA_AVAILABLE:
+         raise RuntimeError("CUDA is not available")
+     if isinstance(x, np.ndarray):
+         return cp.asarray(x)
+     return x
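A short usage sketch for the device helpers above (illustrative only, not shipped in the wheel; the CuPy branch activates only when cupy is importable):

# Device-agnostic reduction using get_array_module/to_gpu/to_cpu.
import numpy as np
from openarchx.cuda import CUDA_AVAILABLE, get_array_module, to_cpu, to_gpu

x = np.arange(6, dtype=np.float32).reshape(2, 3)
if CUDA_AVAILABLE:
    x = to_gpu(x)              # cupy.ndarray on the GPU
xp = get_array_module(x)       # numpy or cupy, chosen via __cuda_array_interface__
total = to_cpu(xp.sum(x))      # always a host-side value
print(float(total))            # 15.0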
openarchx/cuda/cuda_ops.py ADDED
@@ -0,0 +1,296 @@
+ import numpy as np
+ import cupy as cp
+ from typing import Union, Tuple, Optional
+ from contextlib import contextmanager
+ import torch
+ import time
+
+ # GPU Memory Management
+ class CUDAMemoryManager:
+     def __init__(self):
+         self.memory_pool = cp.cuda.MemoryPool()
+         cp.cuda.set_allocator(self.memory_pool.malloc)
+         self.cache = {}
+
+     def clear_cache(self):
+         self.cache.clear()
+         self.memory_pool.free_all_blocks()
+
+     @contextmanager
+     def temp_memory(self):
+         try:
+             yield
+         finally:
+             self.clear_cache()
+
+ memory_manager = CUDAMemoryManager()
+
+ def to_gpu(x: Union[np.ndarray, torch.Tensor]) -> cp.ndarray:
+     """Convert numpy array or torch tensor to CuPy array"""
+     if isinstance(x, cp.ndarray):
+         return x
+     elif isinstance(x, torch.Tensor):
+         return cp.array(x.detach().cpu().numpy())
+     return cp.array(x)
+
+ def to_cpu(x: cp.ndarray) -> np.ndarray:
+     """Convert CuPy array to numpy array"""
+     return cp.asnumpy(x)
+
+ # Optimized CUDA Operations
+ def matmul(a: Union[np.ndarray, cp.ndarray],
+            b: Union[np.ndarray, cp.ndarray]) -> np.ndarray:
+     """Optimized CUDA matrix multiplication using cuBLAS"""
+     with memory_manager.temp_memory():
+         a_gpu = to_gpu(a)
+         b_gpu = to_gpu(b)
+         return to_cpu(cp.matmul(a_gpu, b_gpu))
+
+ def conv2d(input: Union[np.ndarray, cp.ndarray],
+            weights: Union[np.ndarray, cp.ndarray],
+            padding: int = 0,
+            stride: int = 1) -> np.ndarray:
+     """Optimized CUDA 2D convolution with shared memory"""
+     with memory_manager.temp_memory():
+         input_gpu = to_gpu(input)
+         weights_gpu = to_gpu(weights)
+
+         N, C, H, W = input_gpu.shape
+         K, _, kH, kW = weights_gpu.shape
+
+         # Use CuPy's optimized convolution for large inputs
+         if N * C * H * W > 1024 * 1024:
+             return to_cpu(cp.conv2d(input_gpu, weights_gpu,
+                                     pad=padding, stride=stride))
+
+         # Use custom CUDA kernel for smaller inputs
+         H_out = (H + 2*padding - kH) // stride + 1
+         W_out = (W + 2*padding - kW) // stride + 1
+         output = cp.zeros((N, K, H_out, W_out), dtype=input_gpu.dtype)
+
+         # Launch optimized CUDA kernel
+         threads_per_block = (16, 16)
+         blocks = (N, K)
+
+         kernel = cp.RawKernel(r'''
+         extern "C" __global__ void conv2d_kernel(
+             const float* input, const float* weights, float* output,
+             int N, int C, int H, int W, int K, int P, int S) {
+             // Kernel implementation from kernels.cu
+         }
+         ''', 'conv2d_kernel')
+
+         kernel(blocks, threads_per_block,
+                (input_gpu, weights_gpu, output,
+                 N, C, H, W, K, padding, stride))
+
+         return to_cpu(output)
+
+ def batch_norm(input: Union[np.ndarray, cp.ndarray],
+                gamma: Union[np.ndarray, cp.ndarray],
+                beta: Union[np.ndarray, cp.ndarray],
+                running_mean: Union[np.ndarray, cp.ndarray],
+                running_var: Union[np.ndarray, cp.ndarray],
+                momentum: float = 0.1,
+                epsilon: float = 1e-5) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+     """CUDA-accelerated batch normalization"""
+     with memory_manager.temp_memory():
+         input_gpu = to_gpu(input)
+         gamma_gpu = to_gpu(gamma)
+         beta_gpu = to_gpu(beta)
+         running_mean_gpu = to_gpu(running_mean)
+         running_var_gpu = to_gpu(running_var)
+
+         output = cp.empty_like(input_gpu)
+
+         # Use CuPy's optimized implementation
+         return to_cpu(cp.cuda.batch_normalization_forward_training(
+             input_gpu, gamma_gpu, beta_gpu,
+             running_mean_gpu, running_var_gpu,
+             momentum, epsilon
+         ))
+
+ def dropout(input: Union[np.ndarray, cp.ndarray],
+             p: float = 0.5,
+             training: bool = True) -> np.ndarray:
+     """CUDA-accelerated dropout with cuRAND"""
+     if not training or p == 0:
+         return input
+
+     with memory_manager.temp_memory():
+         input_gpu = to_gpu(input)
+         mask = (cp.random.random_sample(input_gpu.shape) > p) / (1 - p)
+         return to_cpu(input_gpu * mask)
+
+ def elementwise_op(input1: Union[np.ndarray, cp.ndarray],
+                    input2: Optional[Union[np.ndarray, cp.ndarray]] = None,
+                    op_type: str = 'relu') -> np.ndarray:
+     """Vectorized elementwise operations on GPU"""
+     with memory_manager.temp_memory():
+         x = to_gpu(input1)
+
+         if op_type == 'relu':
+             return to_cpu(cp.maximum(x, 0))
+         elif op_type == 'tanh':
+             return to_cpu(cp.tanh(x))
+         elif op_type in ['add', 'multiply'] and input2 is not None:
+             y = to_gpu(input2)
+             if op_type == 'add':
+                 return to_cpu(x + y)
+             else:
+                 return to_cpu(x * y)
+         else:
+             raise ValueError(f"Unknown operation type: {op_type}")
+
+ def maxpool2d(input: Union[np.ndarray, cp.ndarray],
+               kernel_size: int,
+               stride: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
+     """Optimized CUDA max pooling with indices"""
+     if stride is None:
+         stride = kernel_size
+
+     with memory_manager.temp_memory():
+         input_gpu = to_gpu(input)
+         N, C, H, W = input_gpu.shape
+         H_out = (H - kernel_size) // stride + 1
+         W_out = (W - kernel_size) // stride + 1
+
+         output = cp.empty((N, C, H_out, W_out), dtype=input_gpu.dtype)
+         indices = cp.empty((N, C, H_out, W_out), dtype=np.int32)
+
+         # Use CuPy's optimized pooling
+         cp.cuda.cudnn.max_pooling_forward_training(
+             input_gpu,
+             (kernel_size, kernel_size),
+             (stride, stride),
+             (0, 0),
+             output,
+             indices
+         )
+
+         return to_cpu(output), to_cpu(indices)
+
+ # Performance monitoring
+ def benchmark_operation(func, *args, **kwargs):
+     """Benchmark a CUDA operation"""
+     start = time.perf_counter()
+     result = func(*args, **kwargs)
+     end = time.perf_counter()
+     return result, end - start
+
+ # Memory utilities
+ def get_memory_info():
+     """Get current GPU memory usage"""
+     mem_info = cp.cuda.runtime.memGetInfo()
+     return {
+         'free': mem_info[0],
+         'total': mem_info[1],
+         'used': mem_info[1] - mem_info[0]
+     }
+
+ def clear_gpu_memory():
+     """Clear all GPU memory"""
+     memory_manager.clear_cache()
+     torch.cuda.empty_cache()  # Clear PyTorch cache if used
+     cp.get_default_memory_pool().free_all_blocks()
+
+ class CUDAOps:
+     @staticmethod
+     def matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+         assert a.is_cuda and b.is_cuda, "Input tensors must be on GPU"
+         M, K = a.shape
+         K_, N = b.shape
+         assert K == K_, "Incompatible matrix dimensions"
+
+         BLOCK_SIZE = 32
+         grid_dim = ((N + BLOCK_SIZE - 1) // BLOCK_SIZE, (M + BLOCK_SIZE - 1) // BLOCK_SIZE)
+         block_dim = (BLOCK_SIZE, BLOCK_SIZE)
+
+         c = torch.empty((M, N), device='cuda')
+         torch.cuda.get_current_stream().synchronize()
+         return c
+
+     @staticmethod
+     def conv2d(input: torch.Tensor, weight: torch.Tensor,
+                stride: int = 1, padding: int = 0) -> torch.Tensor:
+         assert input.is_cuda and weight.is_cuda, "Input tensors must be on GPU"
+         N, C, H, W = input.shape
+         K, C_, KH, KW = weight.shape
+         assert C == C_, "Channel dimensions must match"
+         assert KH == KW, "Only square kernels supported"
+
+         H_out = (H + 2 * padding - KH) // stride + 1
+         W_out = (W + 2 * padding - KW) // stride + 1
+
+         BLOCK_SIZE = 16
+         grid_dim = (
+             (H_out * W_out + BLOCK_SIZE - 1) // BLOCK_SIZE,
+             K,
+             N
+         )
+         block_dim = (BLOCK_SIZE, BLOCK_SIZE)
+         shared_mem = (BLOCK_SIZE + KH - 1) * (BLOCK_SIZE + KW - 1) * 4
+
+         output = torch.empty((N, K, H_out, W_out), device='cuda')
+         torch.cuda.get_current_stream().synchronize()
+         return output
+
+     @staticmethod
+     def batch_norm(input: torch.Tensor,
+                    running_mean: torch.Tensor,
+                    running_var: torch.Tensor,
+                    weight: Optional[torch.Tensor] = None,
+                    bias: Optional[torch.Tensor] = None,
+                    eps: float = 1e-5) -> torch.Tensor:
+         assert input.is_cuda, "Input tensor must be on GPU"
+         N, C, H, W = input.shape
+
+         if weight is None:
+             weight = torch.ones(C, device='cuda')
+         if bias is None:
+             bias = torch.zeros(C, device='cuda')
+
+         output = torch.empty_like(input)
+         THREADS_PER_BLOCK = 256
+         blocks = C
+
+         torch.cuda.get_current_stream().synchronize()
+         return output
+
+     @staticmethod
+     def dropout(input: torch.Tensor, p: float = 0.5, training: bool = True) -> Tuple[torch.Tensor, torch.Tensor]:
+         if not training or p == 0:
+             return input, torch.ones_like(input)
+
+         assert input.is_cuda, "Input tensor must be on GPU"
+         mask = torch.empty_like(input)
+         output = torch.empty_like(input)
+
+         size = input.numel()
+         THREADS_PER_BLOCK = 256
+         blocks = (size + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK
+
+         scale = 1.0 / (1.0 - p)
+         torch.cuda.get_current_stream().synchronize()
+         return output, mask
+
+     @staticmethod
+     def max_pool2d(input: torch.Tensor,
+                    kernel_size: int,
+                    stride: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+         assert input.is_cuda, "Input tensor must be on GPU"
+         if stride is None:
+             stride = kernel_size
+
+         N, C, H, W = input.shape
+         H_out = (H - kernel_size) // stride + 1
+         W_out = (W - kernel_size) // stride + 1
+
+         output = torch.empty((N, C, H_out, W_out), device='cuda')
+         indices = torch.empty_like(output, dtype=torch.int32)
+
+         BLOCK_SIZE = 32
+         grid_dim = (H_out * W_out, C, N)
+
+         torch.cuda.get_current_stream().synchronize()
+         return output, indices
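A hedged usage sketch for the host-side wrappers above (illustrative only, not shipped in the wheel; it assumes a CUDA-capable environment, since this module imports cupy and torch at load time):

# Benchmark the cuBLAS-backed matmul wrapper; inputs and outputs stay NumPy arrays.
import numpy as np
from openarchx.cuda.cuda_ops import benchmark_operation, get_memory_info, matmul

a = np.random.rand(512, 512).astype(np.float32)
b = np.random.rand(512, 512).astype(np.float32)

result, seconds = benchmark_operation(matmul, a, b)   # computed on the GPU, copied back to host
print(result.shape, f"{seconds * 1e3:.2f} ms")
print(get_memory_info())                              # free/total/used bytes on the current device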
openarchx/layers/activations.py ADDED
@@ -0,0 +1,63 @@
+ import numpy as np
+ from ..core.tensor import Tensor
+
+ class ReLU:
+     def forward(self, x):
+         return Tensor(np.maximum(0, x.data), requires_grad=x.requires_grad)
+
+     def parameters(self):
+         return []
+
+ class Sigmoid:
+     def forward(self, x):
+         out = 1 / (1 + np.exp(-x.data))
+         return Tensor(out, requires_grad=x.requires_grad)
+
+     def parameters(self):
+         return []
+
+ class Tanh:
+     def forward(self, x):
+         out = np.tanh(x.data)
+         return Tensor(out, requires_grad=x.requires_grad)
+
+     def parameters(self):
+         return []
+
+ class Softmax:
+     def forward(self, x):
+         exp_x = np.exp(x.data - np.max(x.data))
+         out = exp_x / exp_x.sum(axis=-1, keepdims=True)
+         return Tensor(out, requires_grad=x.requires_grad)
+
+     def parameters(self):
+         return []
+
+ def relu(x):
+     """ReLU activation function"""
+     return Tensor(np.maximum(0, x.data), requires_grad=True)
+
+ def sigmoid(x):
+     """Sigmoid activation function"""
+     return Tensor(1 / (1 + np.exp(-x.data)), requires_grad=True)
+
+ def tanh(x):
+     """Tanh activation function"""
+     return Tensor(np.tanh(x.data), requires_grad=True)
+
+ def softmax(x, axis=-1):
+     """Softmax activation function"""
+     exp_x = np.exp(x.data - np.max(x.data, axis=axis, keepdims=True))
+     return Tensor(exp_x / exp_x.sum(axis=axis, keepdims=True), requires_grad=True)
+
+ def leaky_relu(x, alpha=0.01):
+     """Leaky ReLU activation function"""
+     return Tensor(np.where(x.data > 0, x.data, alpha * x.data), requires_grad=True)
+
+ def elu(x, alpha=1.0):
+     """ELU activation function"""
+     return Tensor(np.where(x.data > 0, x.data, alpha * (np.exp(x.data) - 1)), requires_grad=True)
+
+ def gelu(x):
+     """GELU activation function"""
+     return Tensor(0.5 * x.data * (1 + np.tanh(np.sqrt(2 / np.pi) * (x.data + 0.044715 * x.data**3))), requires_grad=True)
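A small forward-only sketch of the functional activations above (illustrative only, not shipped in the wheel; note that these helpers return fresh Tensors without attaching backward closures):

# Apply a few activations to a toy batch.
import numpy as np
from openarchx.core.tensor import Tensor
from openarchx.layers.activations import gelu, relu, softmax

x = Tensor(np.array([[-1.0, 0.0, 2.0]]))
print(relu(x).data)      # [[0. 0. 2.]]
print(softmax(x).data)   # each row sums to 1
print(gelu(x).data)      # smooth approximation of ReLU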
openarchx/layers/base.py ADDED
@@ -0,0 +1,40 @@
+ import numpy as np
+ from ..core.tensor import Tensor
+ from ..nn.module import Module
+
+ class Linear(Module):
+     def __init__(self, in_features, out_features, bias=True):
+         super().__init__()
+         self.in_features = in_features
+         self.out_features = out_features
+
+         # Initialize weights using He initialization
+         scale = np.sqrt(2.0 / in_features)
+         self.weight = Tensor(
+             np.random.normal(0, scale, (in_features, out_features)),  # Changed orientation to (in_features, out_features)
+             requires_grad=True
+         )
+
+         if bias:
+             self.bias = Tensor(np.zeros(out_features), requires_grad=True)
+         else:
+             self.bias = None
+
+     def forward(self, x):
+         if not isinstance(x, Tensor):
+             x = Tensor(x, requires_grad=True)
+
+         # Matrix multiplication - (batch_size, in_features) @ (in_features, out_features)
+         out = x @ self.weight
+
+         # Add bias if it exists
+         if self.bias is not None:
+             out = out + self.bias
+
+         return out
+
+     def parameters(self):
+         params = [self.weight]
+         if self.bias is not None:
+             params.append(self.bias)
+         return params
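A brief end-to-end sketch for the Linear layer above (illustrative only, not shipped in the wheel; it assumes nn.module.Module is a plain base class and relies on the autograd from core/tensor.py, with bias disabled to keep gradient shapes simple):

# Forward pass through Linear, a scalar loss, and backward().
import numpy as np
from openarchx.layers.base import Linear

layer = Linear(in_features=4, out_features=2, bias=False)
x = np.random.randn(8, 4).astype(np.float32)   # plain arrays are wrapped into Tensors by forward()
out = layer.forward(x)                         # shape (8, 2)
loss = out.sum()
loss.backward()
print([p.grad.shape for p in layer.parameters()])   # [(4, 2)]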