openarchx-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openarchx/__init__.py +11 -0
- openarchx/core/tensor.py +179 -0
- openarchx/cuda/__init__.py +27 -0
- openarchx/cuda/cuda_ops.py +296 -0
- openarchx/layers/activations.py +63 -0
- openarchx/layers/base.py +40 -0
- openarchx/layers/cnn.py +145 -0
- openarchx/layers/transformer.py +131 -0
- openarchx/nn/__init__.py +26 -0
- openarchx/nn/activations.py +127 -0
- openarchx/nn/containers.py +174 -0
- openarchx/nn/dropout.py +121 -0
- openarchx/nn/layers.py +338 -0
- openarchx/nn/losses.py +156 -0
- openarchx/nn/module.py +18 -0
- openarchx/nn/padding.py +120 -0
- openarchx/nn/pooling.py +318 -0
- openarchx/nn/rnn.py +226 -0
- openarchx/nn/transformers.py +187 -0
- openarchx/optimizers/adam.py +49 -0
- openarchx/optimizers/adaptive.py +63 -0
- openarchx/optimizers/base.py +24 -0
- openarchx/optimizers/modern.py +98 -0
- openarchx/optimizers/optx.py +91 -0
- openarchx/optimizers/sgd.py +63 -0
- openarchx/quantum/circuit.py +92 -0
- openarchx/quantum/gates.py +126 -0
- openarchx/utils/__init__.py +50 -0
- openarchx/utils/data.py +229 -0
- openarchx/utils/huggingface.py +288 -0
- openarchx/utils/losses.py +21 -0
- openarchx/utils/model_io.py +553 -0
- openarchx/utils/pytorch.py +420 -0
- openarchx/utils/tensorflow.py +467 -0
- openarchx/utils/transforms.py +259 -0
- openarchx-0.1.0.dist-info/METADATA +180 -0
- openarchx-0.1.0.dist-info/RECORD +43 -0
- openarchx-0.1.0.dist-info/WHEEL +5 -0
- openarchx-0.1.0.dist-info/licenses/LICENSE +21 -0
- openarchx-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +1 -0
- tests/test_cuda_ops.py +205 -0
- tests/test_integrations.py +236 -0
openarchx/layers/cnn.py
ADDED
@@ -0,0 +1,145 @@
import numpy as np
from ..core.tensor import Tensor
from ..nn.module import Module

class Conv2d(Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = stride if isinstance(stride, tuple) else (stride, stride)
        self.padding = padding if isinstance(padding, tuple) else (padding, padding)

        # Initialize weights using He initialization
        scale = np.sqrt(2.0 / (in_channels * self.kernel_size[0] * self.kernel_size[1]))
        self.weight = Tensor(
            np.random.normal(0, scale,
                             (out_channels, in_channels, *self.kernel_size)),
            requires_grad=True
        )
        self.bias = Tensor(np.zeros(out_channels), requires_grad=True)

    def _extract_patches(self, x, k_h, k_w, stride_h, stride_w):
        """Extract patches from input tensor efficiently"""
        batch_size, channels, height, width = x.shape

        # Calculate output dimensions
        out_h = (height - k_h) // stride_h + 1
        out_w = (width - k_w) // stride_w + 1

        # Initialize patches array
        patches = np.zeros((batch_size, out_h * out_w, channels * k_h * k_w))

        # Extract patches
        patch_idx = 0
        for i in range(0, height - k_h + 1, stride_h):
            for j in range(0, width - k_w + 1, stride_w):
                # Extract patch for all batches and channels
                patch = x[:, :, i:i+k_h, j:j+k_w]
                # Reshape patch to (batch_size, channels * k_h * k_w)
                patches[:, patch_idx, :] = patch.reshape(batch_size, -1)
                patch_idx += 1

        return patches, out_h, out_w

    def forward(self, x):
        batch_size, C, H, W = x.data.shape
        pad_h, pad_w = self.padding
        stride_h, stride_w = self.stride
        k_h, k_w = self.kernel_size

        # Add padding if needed
        if pad_h > 0 or pad_w > 0:
            x_padded = np.pad(x.data, ((0,0), (0,0), (pad_h,pad_h), (pad_w,pad_w)), mode='constant')
        else:
            x_padded = x.data

        # Extract patches
        patches, H_out, W_out = self._extract_patches(x_padded, k_h, k_w, stride_h, stride_w)

        # Reshape weights to [out_channels, in_channels * k_h * k_w]
        w_reshaped = self.weight.data.reshape(self.out_channels, -1)

        # Compute convolution using matrix multiplication
        out = patches @ w_reshaped.T  # [batch_size, H_out * W_out, out_channels]
        out = out.transpose(0, 2, 1).reshape(batch_size, self.out_channels, H_out, W_out)

        # Add bias
        out += self.bias.data.reshape(1, -1, 1, 1)

        return Tensor(out, requires_grad=True)

class MaxPool2d(Module):
    def __init__(self, kernel_size, stride=None):
        super().__init__()
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = self.kernel_size if stride is None else (stride if isinstance(stride, tuple) else (stride, stride))

    def forward(self, x):
        batch_size, C, H, W = x.data.shape
        k_h, k_w = self.kernel_size
        stride_h, stride_w = self.stride

        # Calculate output dimensions
        H_out = (H - k_h) // stride_h + 1
        W_out = (W - k_w) // stride_w + 1

        # Initialize output array
        out = np.zeros((batch_size, C, H_out, W_out))

        # Perform max pooling
        for b in range(batch_size):
            for c in range(C):
                for h in range(H_out):
                    for w in range(W_out):
                        h_start = h * stride_h
                        w_start = w * stride_w
                        h_end = h_start + k_h
                        w_end = w_start + k_w

                        pool_region = x.data[b, c, h_start:h_end, w_start:w_end]
                        out[b, c, h, w] = np.max(pool_region)

        return Tensor(out, requires_grad=True)

class BatchNorm2d(Module):
    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum

        # Parameters
        self.gamma = Tensor(np.ones(num_features), requires_grad=True)
        self.beta = Tensor(np.zeros(num_features), requires_grad=True)

        # Running estimates
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)

        # Training mode flag
        self.training = True

    def forward(self, x):
        if self.training:
            # Calculate batch statistics
            batch_mean = x.data.mean(axis=(0,2,3), keepdims=True)
            batch_var = x.data.var(axis=(0,2,3), keepdims=True)

            # Update running statistics
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean.squeeze()
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var.squeeze()

            # Normalize
            x_normalized = (x.data - batch_mean) / np.sqrt(batch_var + self.eps)
        else:
            # Use running statistics
            x_normalized = (x.data - self.running_mean.reshape(1,-1,1,1)) / \
                           np.sqrt(self.running_var.reshape(1,-1,1,1) + self.eps)

        # Apply scale and shift
        out = self.gamma.data.reshape(1,-1,1,1) * x_normalized + \
              self.beta.data.reshape(1,-1,1,1)

        return Tensor(out, requires_grad=True)
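A minimal usage sketch for the layers above (an editor's illustration, not shipped in the wheel). It assumes only what the code relies on: a `Tensor` from openarchx/core/tensor.py that wraps a NumPy array exposed as `.data`, and NCHW-shaped inputs; the shapes are illustrative.

import numpy as np
from openarchx.core.tensor import Tensor
from openarchx.layers.cnn import Conv2d, MaxPool2d, BatchNorm2d

x = Tensor(np.random.randn(8, 3, 32, 32))                      # [batch, channels, height, width]
conv = Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
bn = BatchNorm2d(16)
pool = MaxPool2d(kernel_size=2)

out = pool.forward(bn.forward(conv.forward(x)))                # forward() called explicitly
print(out.data.shape)                                          # (8, 16, 16, 16)

Note that these forward passes compute their outputs with plain NumPy and return fresh `Tensor(..., requires_grad=True)` objects, so gradient flow back to `self.weight` or `self.gamma` depends on how `Tensor` records operations in core/tensor.py, which is not part of this excerpt.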
openarchx/layers/transformer.py
ADDED
@@ -0,0 +1,131 @@
import numpy as np
from ..core.tensor import Tensor
from ..nn.module import Module
from .base import Linear

def get_positional_encoding(seq_length, d_model):
    """Generate positional encodings for transformer input"""
    position = np.arange(seq_length)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))

    pos_encoding = np.zeros((seq_length, d_model))
    pos_encoding[:, 0::2] = np.sin(position * div_term)
    pos_encoding[:, 1::2] = np.cos(position * div_term)

    return Tensor(pos_encoding[np.newaxis, :, :])  # Add batch dimension

class PositionalEncoding(Module):
    def __init__(self, d_model, max_seq_length=5000):
        super().__init__()
        self.pos_encoding = get_positional_encoding(max_seq_length, d_model)

    def forward(self, x):
        return x + self.pos_encoding[:, :x.data.shape[1], :]

class MultiHeadAttention(Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.q_proj = Linear(embed_dim, embed_dim)
        self.k_proj = Linear(embed_dim, embed_dim)
        self.v_proj = Linear(embed_dim, embed_dim)
        self.out_proj = Linear(embed_dim, embed_dim)

    def split_heads(self, x, batch_size):
        # [batch_size, seq_len, embed_dim] -> [batch_size, seq_len, num_heads, head_dim]
        new_shape = (batch_size, -1, self.num_heads, self.head_dim)
        x = x.reshape(*new_shape)
        # [batch_size, seq_len, num_heads, head_dim] -> [batch_size, num_heads, seq_len, head_dim]
        return x.transpose(0, 2, 1, 3)

    def merge_heads(self, x, batch_size, seq_len):
        # [batch_size, num_heads, seq_len, head_dim] -> [batch_size, seq_len, num_heads, head_dim]
        x = x.transpose(0, 2, 1, 3)
        # [batch_size, seq_len, num_heads, head_dim] -> [batch_size, seq_len, embed_dim]
        return x.reshape(batch_size, seq_len, self.embed_dim)

    def forward(self, query, key, value, mask=None):
        batch_size = query.data.shape[0]
        q_len, k_len = query.data.shape[1], key.data.shape[1]

        # Linear projections and split heads
        q = self.split_heads(self.q_proj.forward(query), batch_size)  # [batch, heads, q_len, head_dim]
        k = self.split_heads(self.k_proj.forward(key), batch_size)    # [batch, heads, k_len, head_dim]
        v = self.split_heads(self.v_proj.forward(value), batch_size)  # [batch, heads, v_len, head_dim]

        # Scaled dot-product attention
        # [batch, heads, q_len, head_dim] @ [batch, heads, head_dim, k_len]
        scores = (q @ k.transpose(0, 1, 3, 2)) / np.sqrt(self.head_dim)

        if mask is not None:
            scores.data = scores.data + mask.data * -1e9

        # Apply softmax and attention
        attn = self._softmax(scores)  # [batch, heads, q_len, k_len]
        out = attn @ v                # [batch, heads, q_len, head_dim]

        # Merge heads and project
        out = self.merge_heads(out, batch_size, q_len)  # [batch, q_len, embed_dim]
        return self.out_proj.forward(out)

    def _softmax(self, x):
        exp_x = Tensor(np.exp(x.data - np.max(x.data, axis=-1, keepdims=True)))
        return exp_x / exp_x.sum(axis=-1, keepdims=True)

class LayerNorm(Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = Tensor(np.ones(normalized_shape), requires_grad=True)
        self.beta = Tensor(np.zeros(normalized_shape), requires_grad=True)

    def forward(self, x):
        mean = x.mean(axis=-1, keepdims=True)
        var = ((x - mean) ** 2).mean(axis=-1, keepdims=True)
        return self.gamma * (x - mean) / (var + self.eps).sqrt() + self.beta

    def parameters(self):
        return [self.gamma, self.beta]

class TransformerEncoderLayer(Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, nhead)
        self.pos_encoding = PositionalEncoding(d_model)
        self.linear1 = Linear(d_model, dim_feedforward)
        self.linear2 = Linear(dim_feedforward, d_model)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout = dropout

    def forward(self, src, src_mask=None):
        # Add positional encoding
        src = self.pos_encoding.forward(src)

        # Multi-head self-attention
        attn_output = self.self_attn.forward(src, src, src, mask=src_mask)
        attn_output = self._dropout(attn_output)
        out1 = self.norm1.forward(src + attn_output)

        # Position-wise feed-forward network
        ff_output = self.linear1.forward(out1)
        ff_output = self._relu(ff_output)
        ff_output = self._dropout(ff_output)
        ff_output = self.linear2.forward(ff_output)
        ff_output = self._dropout(ff_output)

        return self.norm2.forward(out1 + ff_output)

    def _dropout(self, x):
        if self.dropout > 0:
            mask = np.random.binomial(1, 1-self.dropout, x.data.shape)
            return Tensor(x.data * mask / (1-self.dropout))
        return x

    def _relu(self, x):
        return Tensor(np.maximum(0, x.data))
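A sketch of how the encoder layer above is meant to be composed (illustration only, not part of the package). Whether it runs end-to-end depends on `Tensor` supporting the operations the code uses (`+`, `@`, slicing, `reshape`, `transpose`, `mean`, `sqrt`); those live in openarchx/core/tensor.py, which is not part of this excerpt.

import numpy as np
from openarchx.core.tensor import Tensor
from openarchx.layers.transformer import TransformerEncoderLayer

src = Tensor(np.random.randn(2, 10, 64))    # [batch, seq_len, d_model]
layer = TransformerEncoderLayer(d_model=64, nhead=8, dim_feedforward=128, dropout=0.1)
out = layer.forward(src)                    # residual connections keep the input shape
print(out.data.shape)                       # expected (2, 10, 64)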
openarchx/nn/__init__.py
ADDED
@@ -0,0 +1,26 @@
# Core module
from .module import Module

# Activation functions
from .activations import (
    ReLU, LeakyReLU, Sigmoid, Tanh, GELU, SiLU,
    ELU, SELU, Softmax, LogSoftmax
)

# Core layers
from .layers import (
    Linear, Conv1d, Conv2d, LayerNorm,
    Embedding
)

# Pooling layers
from .pooling import (
    MaxPool1d, MaxPool2d, AvgPool1d, AvgPool2d,
    AdaptiveAvgPool2d, AdaptiveMaxPool2d
)

# Transformer components
from ..layers.transformer import PositionalEncoding

# Container modules
from .containers import Sequential, ModuleList
openarchx/nn/activations.py
ADDED
@@ -0,0 +1,127 @@
import numpy as np
from ..core.tensor import Tensor
from .module import Module

class ReLU(Module):
    def forward(self, x):
        return Tensor(np.maximum(0, x.data), requires_grad=True)

class LeakyReLU(Module):
    def __init__(self, negative_slope=0.01):
        super().__init__()
        self.negative_slope = negative_slope

    def forward(self, x):
        return Tensor(np.where(x.data > 0, x.data, self.negative_slope * x.data), requires_grad=True)

class PReLU(Module):
    def __init__(self, num_parameters=1, init=0.25):
        super().__init__()
        self.weight = Tensor(np.full(num_parameters, init), requires_grad=True)

    def forward(self, x):
        return Tensor(np.where(x.data > 0, x.data, self.weight.data * x.data), requires_grad=True)

class ELU(Module):
    def __init__(self, alpha=1.0):
        super().__init__()
        self.alpha = alpha

    def forward(self, x):
        return Tensor(np.where(x.data > 0, x.data, self.alpha * (np.exp(x.data) - 1)), requires_grad=True)

class GELU(Module):
    def forward(self, x):
        return Tensor(0.5 * x.data * (1 + np.tanh(np.sqrt(2 / np.pi) * (x.data + 0.044715 * x.data**3))), requires_grad=True)

class Sigmoid(Module):
    def forward(self, x):
        return Tensor(1 / (1 + np.exp(-x.data)), requires_grad=True)

class Tanh(Module):
    def forward(self, x):
        return Tensor(np.tanh(x.data), requires_grad=True)

class Softmax(Module):
    def __init__(self, dim=-1):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        exp_x = np.exp(x.data - np.max(x.data, axis=self.dim, keepdims=True))
        return Tensor(exp_x / np.sum(exp_x, axis=self.dim, keepdims=True), requires_grad=True)

class LogSoftmax(Module):
    def __init__(self, dim=-1):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        exp_x = np.exp(x.data - np.max(x.data, axis=self.dim, keepdims=True))
        softmax = exp_x / np.sum(exp_x, axis=self.dim, keepdims=True)
        return Tensor(np.log(softmax), requires_grad=True)

class SELU(Module):
    def __init__(self):
        super().__init__()
        self.alpha = 1.6732632423543772848170429916717
        self.scale = 1.0507009873554804934193349852946

    def forward(self, x):
        return Tensor(self.scale * np.where(x.data > 0, x.data,
                      self.alpha * (np.exp(x.data) - 1)), requires_grad=True)

class Hardtanh(Module):
    def __init__(self, min_val=-1.0, max_val=1.0):
        super().__init__()
        self.min_val = min_val
        self.max_val = max_val

    def forward(self, x):
        return Tensor(np.clip(x.data, self.min_val, self.max_val), requires_grad=True)

class SiLU(Module):  # Also known as Swish
    def forward(self, x):
        return Tensor(x.data * (1 / (1 + np.exp(-x.data))), requires_grad=True)

class Mish(Module):
    def forward(self, x):
        return Tensor(x.data * np.tanh(np.log(1 + np.exp(x.data))), requires_grad=True)

class ActX(Module):
    """
    Advanced activation function that combines multiple activation types with learnable parameters.
    ActX(x) = α * GELU(x) + β * SiLU(x) + γ * tanh(λx)
    where α, β, γ, and λ are learnable parameters
    """
    def __init__(self, num_parameters=1, init_alpha=0.5, init_beta=0.5, init_gamma=0.25, init_lambda=1.0):
        super().__init__()
        self.num_parameters = num_parameters

        # Initialize learnable parameters
        self.alpha = Tensor(np.full(num_parameters, init_alpha), requires_grad=True)
        self.beta = Tensor(np.full(num_parameters, init_beta), requires_grad=True)
        self.gamma = Tensor(np.full(num_parameters, init_gamma), requires_grad=True)
        self.lambda_param = Tensor(np.full(num_parameters, init_lambda), requires_grad=True)

    def forward(self, x):
        # GELU component
        gelu = 0.5 * x.data * (1 + np.tanh(np.sqrt(2 / np.pi) * (x.data + 0.044715 * x.data**3)))

        # SiLU (Swish) component
        silu = x.data * (1 / (1 + np.exp(-x.data)))

        # Tanh component with learnable frequency
        tanh = np.tanh(self.lambda_param.data.reshape(-1, 1, 1) * x.data)

        # Combine components with learnable weights
        alpha = self.alpha.data.reshape(-1, 1, 1)
        beta = self.beta.data.reshape(-1, 1, 1)
        gamma = self.gamma.data.reshape(-1, 1, 1)

        result = alpha * gelu + beta * silu + gamma * tanh

        return Tensor(result, requires_grad=True)

    def parameters(self):
        return [self.alpha, self.beta, self.gamma, self.lambda_param]
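The activations above operate purely on `x.data` with NumPy and wrap the result in a new `Tensor`, so a usage sketch needs nothing beyond the `Tensor` constructor (editor's illustration, not shipped in the wheel):

import numpy as np
from openarchx.core.tensor import Tensor
from openarchx.nn.activations import ReLU, GELU, ActX

x = Tensor(np.random.randn(4, 8))
print(ReLU().forward(x).data.shape)        # (4, 8)
print(GELU().forward(x).data.shape)        # (4, 8)

# ActX reshapes its learnable parameters to (-1, 1, 1), so inputs broadcast as
# [channels, height, width]-style arrays; with num_parameters=1 any 3D input works.
act = ActX(num_parameters=1)
y = act.forward(Tensor(np.random.randn(1, 4, 8)))
print(y.data.shape)                        # (1, 4, 8)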
openarchx/nn/containers.py
ADDED
@@ -0,0 +1,174 @@
import numpy as np
from ..core.tensor import Tensor
from .module import Module
from collections import OrderedDict

class Sequential(Module):
    def __init__(self, *args):
        super().__init__()
        self.modules = []
        for arg in args:
            if isinstance(arg, (list, tuple)):
                self.modules.extend(arg)
            elif isinstance(arg, dict):
                self.modules.extend(arg.values())
            else:
                self.modules.append(arg)

    def forward(self, x):
        for module in self.modules:
            x = module(x)
        return x

    def parameters(self):
        params = []
        for module in self.modules:
            params.extend(module.parameters())
        return params

class ModuleList(Module):
    def __init__(self, modules=None):
        super().__init__()
        self.modules = []
        if modules is not None:
            self.extend(modules)

    def __getitem__(self, idx):
        return self.modules[idx]

    def __setitem__(self, idx, module):
        self.modules[idx] = module

    def __len__(self):
        return len(self.modules)

    def append(self, module):
        self.modules.append(module)

    def extend(self, modules):
        if isinstance(modules, (list, tuple)):
            self.modules.extend(modules)
        else:
            self.modules.extend(list(modules))

    def parameters(self):
        params = []
        for module in self.modules:
            params.extend(module.parameters())
        return params

class ModuleDict(Module):
    def __init__(self, modules=None):
        super().__init__()
        self.modules = OrderedDict()
        if modules is not None:
            self.update(modules)

    def __getitem__(self, key):
        return self.modules[key]

    def __setitem__(self, key, module):
        self.modules[key] = module

    def __delitem__(self, key):
        del self.modules[key]

    def __len__(self):
        return len(self.modules)

    def __iter__(self):
        return iter(self.modules)

    def keys(self):
        return self.modules.keys()

    def items(self):
        return self.modules.items()

    def values(self):
        return self.modules.values()

    def update(self, modules):
        if isinstance(modules, dict):
            self.modules.update(modules)
        else:
            for key, module in modules:
                self.modules[key] = module

    def parameters(self):
        params = []
        for module in self.modules.values():
            params.extend(module.parameters())
        return params

class ParameterList(Module):
    def __init__(self, parameters=None):
        super().__init__()
        self.parameters_list = []
        if parameters is not None:
            self.extend(parameters)

    def __getitem__(self, idx):
        return self.parameters_list[idx]

    def __setitem__(self, idx, parameter):
        self.parameters_list[idx] = parameter

    def __len__(self):
        return len(self.parameters_list)

    def append(self, parameter):
        if not isinstance(parameter, Tensor):
            parameter = Tensor(parameter, requires_grad=True)
        self.parameters_list.append(parameter)

    def extend(self, parameters):
        for param in parameters:
            self.append(param)

    def parameters(self):
        return self.parameters_list

class ParameterDict(Module):
    def __init__(self, parameters=None):
        super().__init__()
        self.parameters_dict = OrderedDict()
        if parameters is not None:
            self.update(parameters)

    def __getitem__(self, key):
        return self.parameters_dict[key]

    def __setitem__(self, key, parameter):
        if not isinstance(parameter, Tensor):
            parameter = Tensor(parameter, requires_grad=True)
        self.parameters_dict[key] = parameter

    def __delitem__(self, key):
        del self.parameters_dict[key]

    def __len__(self):
        return len(self.parameters_dict)

    def __iter__(self):
        return iter(self.parameters_dict)

    def keys(self):
        return self.parameters_dict.keys()

    def items(self):
        return self.parameters_dict.items()

    def values(self):
        return self.parameters_dict.values()

    def update(self, parameters):
        if isinstance(parameters, dict):
            for key, param in parameters.items():
                self[key] = param
        else:
            for key, param in parameters:
                self[key] = param

    def parameters(self):
        return list(self.parameters_dict.values())
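A usage sketch for the containers above (illustration only, not part of the package). `Sequential.forward` invokes each contained module as `module(x)`, so this assumes `Module.__call__` in openarchx/nn/module.py (not shown in this excerpt) dispatches to `forward`, and that `Module.parameters()` defaults to an empty list for parameter-free modules.

import numpy as np
from openarchx.core.tensor import Tensor
from openarchx.nn.containers import Sequential
from openarchx.nn.activations import ReLU, Tanh

model = Sequential(ReLU(), Tanh())
y = model.forward(Tensor(np.random.randn(4, 8)))
print(y.data.shape)              # (4, 8)
print(len(model.parameters()))   # expected 0: ReLU/Tanh carry no learnable tensors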