openarchx-0.1.0-py3-none-any.whl
This diff shows the contents of a publicly released package version as it appears in its public registry. It is provided for informational purposes only.
- openarchx/__init__.py +11 -0
- openarchx/core/tensor.py +179 -0
- openarchx/cuda/__init__.py +27 -0
- openarchx/cuda/cuda_ops.py +296 -0
- openarchx/layers/activations.py +63 -0
- openarchx/layers/base.py +40 -0
- openarchx/layers/cnn.py +145 -0
- openarchx/layers/transformer.py +131 -0
- openarchx/nn/__init__.py +26 -0
- openarchx/nn/activations.py +127 -0
- openarchx/nn/containers.py +174 -0
- openarchx/nn/dropout.py +121 -0
- openarchx/nn/layers.py +338 -0
- openarchx/nn/losses.py +156 -0
- openarchx/nn/module.py +18 -0
- openarchx/nn/padding.py +120 -0
- openarchx/nn/pooling.py +318 -0
- openarchx/nn/rnn.py +226 -0
- openarchx/nn/transformers.py +187 -0
- openarchx/optimizers/adam.py +49 -0
- openarchx/optimizers/adaptive.py +63 -0
- openarchx/optimizers/base.py +24 -0
- openarchx/optimizers/modern.py +98 -0
- openarchx/optimizers/optx.py +91 -0
- openarchx/optimizers/sgd.py +63 -0
- openarchx/quantum/circuit.py +92 -0
- openarchx/quantum/gates.py +126 -0
- openarchx/utils/__init__.py +50 -0
- openarchx/utils/data.py +229 -0
- openarchx/utils/huggingface.py +288 -0
- openarchx/utils/losses.py +21 -0
- openarchx/utils/model_io.py +553 -0
- openarchx/utils/pytorch.py +420 -0
- openarchx/utils/tensorflow.py +467 -0
- openarchx/utils/transforms.py +259 -0
- openarchx-0.1.0.dist-info/METADATA +180 -0
- openarchx-0.1.0.dist-info/RECORD +43 -0
- openarchx-0.1.0.dist-info/WHEEL +5 -0
- openarchx-0.1.0.dist-info/licenses/LICENSE +21 -0
- openarchx-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +1 -0
- tests/test_cuda_ops.py +205 -0
- tests/test_integrations.py +236 -0
openarchx/nn/transformers.py
@@ -0,0 +1,187 @@
import numpy as np
from ..core.tensor import Tensor
from .module import Module
from .layers import Linear, LayerNorm
from .activations import ReLU

class MultiheadAttention(Module):
    def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"

        self.q_proj = Linear(embed_dim, embed_dim, bias=bias)
        self.k_proj = Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = Linear(embed_dim, embed_dim, bias=bias)

    def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
        batch_size = query.data.shape[0]

        # Linear projections and reshape
        q = self._reshape_for_attention(self.q_proj(query))
        k = self._reshape_for_attention(self.k_proj(key))
        v = self._reshape_for_attention(self.v_proj(value))

        # Scaled dot-product attention
        scaling = float(self.head_dim) ** -0.5
        attn = np.matmul(q.data, k.data.transpose(0, 1, 3, 2)) * scaling

        if attn_mask is not None:
            attn = np.where(attn_mask, -np.inf, attn)

        attn = self._softmax(attn)

        if self.dropout > 0:
            attn = np.where(np.random.random(attn.shape) > self.dropout, attn, 0) / (1 - self.dropout)

        output = np.matmul(attn, v.data)
        output = output.transpose(0, 2, 1, 3).reshape(batch_size, -1, self.embed_dim)

        return self.out_proj(Tensor(output, requires_grad=True))

    def _reshape_for_attention(self, x):
        batch_size, seq_len, _ = x.data.shape
        return Tensor(
            x.data.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
            .transpose(0, 2, 1, 3),
            requires_grad=True
        )

    def _softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

class TransformerEncoderLayer(Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = Linear(d_model, dim_feedforward)
        self.linear2 = Linear(dim_feedforward, d_model)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout = dropout
        self.activation = ReLU()

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # Self attention block
        src2 = self.self_attn(src, src, src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)
        src = src + self._dropout_layer(src2)
        src = self.norm1(src)

        # Feedforward block
        src2 = self.linear2(self._dropout_layer(self.activation(self.linear1(src))))
        src = src + self._dropout_layer(src2)
        src = self.norm2(src)

        return src

    def _dropout_layer(self, x):
        if self.dropout > 0:
            mask = np.random.random(x.data.shape) > self.dropout
            return Tensor(mask * x.data / (1 - self.dropout), requires_grad=True)
        return x

class TransformerDecoderLayer(Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = Linear(d_model, dim_feedforward)
        self.linear2 = Linear(dim_feedforward, d_model)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout = dropout
        self.activation = ReLU()

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
        # Self attention block
        tgt2 = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)
        tgt = tgt + self._dropout_layer(tgt2)
        tgt = self.norm1(tgt)

        # Cross attention block
        tgt2 = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)
        tgt = tgt + self._dropout_layer(tgt2)
        tgt = self.norm2(tgt)

        # Feedforward block
        tgt2 = self.linear2(self._dropout_layer(self.activation(self.linear1(tgt))))
        tgt = tgt + self._dropout_layer(tgt2)
        tgt = self.norm3(tgt)

        return tgt

    def _dropout_layer(self, x):
        if self.dropout > 0:
            mask = np.random.random(x.data.shape) > self.dropout
            return Tensor(mask * x.data / (1 - self.dropout), requires_grad=True)
        return x

class TransformerEncoder(Module):
    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = [encoder_layer for _ in range(num_layers)]
        self.norm = norm

    def forward(self, src, mask=None, src_key_padding_mask=None):
        output = src
        for layer in self.layers:
            output = layer(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask)

        if self.norm is not None:
            output = self.norm(output)

        return output

class TransformerDecoder(Module):
    def __init__(self, decoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = [decoder_layer for _ in range(num_layers)]
        self.norm = norm

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
        output = tgt
        for layer in self.layers:
            output = layer(output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask,
                           memory_key_padding_mask=memory_key_padding_mask)

        if self.norm is not None:
            output = self.norm(output)

        return output

class Transformer(Module):
    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super().__init__()

        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
        encoder_norm = LayerNorm(d_model)
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout)
        decoder_norm = LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)

        self.d_model = d_model
        self.nhead = nhead

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None,
                src_key_padding_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                              tgt_key_padding_mask=tgt_key_padding_mask,
                              memory_key_padding_mask=memory_key_padding_mask)
        return output
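The core of MultiheadAttention.forward above is plain NumPy: scale the query-key scores by head_dim ** -0.5, mask with np.where, apply a max-shifted softmax, and take the weighted sum of the values. Below is a standalone sketch of just that computation, useful for checking the shapes; it is an illustration of the formula the class implements, not part of the package, and the shapes are made up.

import numpy as np

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: (batch, heads, seq, head_dim), mirroring the output of
    # MultiheadAttention._reshape_for_attention
    scaling = float(q.shape[-1]) ** -0.5
    scores = np.matmul(q, k.transpose(0, 1, 3, 2)) * scaling      # (batch, heads, seq, seq)
    if mask is not None:
        scores = np.where(mask, -np.inf, scores)                  # masked positions get zero weight
    exp_scores = np.exp(scores - scores.max(axis=-1, keepdims=True))  # max-shifted softmax
    weights = exp_scores / exp_scores.sum(axis=-1, keepdims=True)
    return np.matmul(weights, v)                                  # (batch, heads, seq, head_dim)

batch, heads, seq, head_dim = 2, 8, 10, 64
q = np.random.randn(batch, heads, seq, head_dim)
k = np.random.randn(batch, heads, seq, head_dim)
v = np.random.randn(batch, heads, seq, head_dim)
print(scaled_dot_product_attention(q, k, v).shape)  # (2, 8, 10, 64)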
openarchx/optimizers/adam.py
@@ -0,0 +1,49 @@
import numpy as np

class Adam:
    def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0, clip_grad=None):
        self.parameters = parameters
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.clip_grad = clip_grad

        # Initialize momentum and velocity
        self.m = [np.zeros_like(param.data) for param in parameters]
        self.v = [np.zeros_like(param.data) for param in parameters]
        self.t = 0

    def clip_gradients(self):
        if self.clip_grad is not None:
            for param in self.parameters:
                if param.grad is not None:
                    np.clip(param.grad, -self.clip_grad, self.clip_grad, out=param.grad)

    def step(self):
        self.t += 1
        self.clip_gradients()

        for i, param in enumerate(self.parameters):
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data

                # Update momentum
                self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad

                # Update velocity
                self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (grad ** 2)

                # Bias correction
                m_hat = self.m[i] / (1 - self.beta1 ** self.t)
                v_hat = self.v[i] / (1 - self.beta2 ** self.t)

                # Update parameters
                param.data -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

    def zero_grad(self):
        for param in self.parameters:
            if param.grad is not None:
                param.grad.fill(0)
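Adam only reads and writes each parameter's .data and .grad arrays, so it can be exercised without the rest of the framework. A minimal sketch, assuming the wheel is installed and importable at the paths in the file listing; the Param class below is a hypothetical stand-in, not part of the package.

import numpy as np
from openarchx.optimizers.adam import Adam  # module path taken from the file listing above

class Param:
    """Hypothetical stand-in exposing the .data/.grad arrays the optimizer expects."""
    def __init__(self, data):
        self.data = np.asarray(data, dtype=float)
        self.grad = np.zeros_like(self.data)

w = Param([3.0, -2.0])
opt = Adam([w], lr=0.1)
for _ in range(200):
    w.grad[:] = 2.0 * w.data    # gradient of f(w) = ||w||^2
    opt.step()
    opt.zero_grad()
print(w.data)                    # approaches the minimum at [0, 0]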
openarchx/optimizers/adaptive.py
@@ -0,0 +1,63 @@
import numpy as np
from .base import Optimizer

class Adagrad(Optimizer):
    """Adaptive Gradient Algorithm"""
    def __init__(self, parameters, lr=0.01, eps=1e-8, weight_decay=0.0, clip_grad=None):
        super().__init__(parameters, lr, weight_decay, clip_grad)
        self.eps = eps
        self.G = [np.zeros_like(param.data) for param in parameters]

    def step(self):
        self.clip_gradients()
        for i, param in enumerate(self.parameters):
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data
                self.G[i] += np.square(grad)
                param.data -= self.lr * grad / (np.sqrt(self.G[i]) + self.eps)

class Adadelta(Optimizer):
    """Adaptive Delta Algorithm"""
    def __init__(self, parameters, rho=0.95, eps=1e-6, weight_decay=0.0, clip_grad=None):
        super().__init__(parameters, 1.0, weight_decay, clip_grad)  # lr=1.0 as it's not used
        self.rho = rho
        self.eps = eps
        self.G = [np.zeros_like(param.data) for param in parameters]
        self.delta = [np.zeros_like(param.data) for param in parameters]

    def step(self):
        self.clip_gradients()
        for i, param in enumerate(self.parameters):
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data

                self.G[i] = self.rho * self.G[i] + (1 - self.rho) * np.square(grad)
                rms_g = np.sqrt(self.G[i] + self.eps)
                rms_delta = np.sqrt(self.delta[i] + self.eps)

                update = -rms_delta / rms_g * grad
                param.data += update

                self.delta[i] = self.rho * self.delta[i] + (1 - self.rho) * np.square(update)

class RMSprop(Optimizer):
    """Root Mean Square Propagation"""
    def __init__(self, parameters, lr=0.01, alpha=0.99, eps=1e-8, weight_decay=0.0, clip_grad=None):
        super().__init__(parameters, lr, weight_decay, clip_grad)
        self.alpha = alpha
        self.eps = eps
        self.G = [np.zeros_like(param.data) for param in parameters]

    def step(self):
        self.clip_gradients()
        for i, param in enumerate(self.parameters):
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data
                self.G[i] = self.alpha * self.G[i] + (1 - self.alpha) * np.square(grad)
                param.data -= self.lr * grad / (np.sqrt(self.G[i]) + self.eps)
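The practical difference between Adagrad and RMSprop above is the accumulator G: Adagrad sums squared gradients forever, while RMSprop keeps an exponential moving average, so Adagrad's effective step size only ever shrinks. A rough side-by-side sketch on a one-dimensional quadratic; it assumes the wheel is installed, Param is a hypothetical stand-in, and the learning rate is inflated to make the gap visible.

import numpy as np
from openarchx.optimizers.adaptive import Adagrad, RMSprop  # path from the file listing

class Param:
    """Hypothetical stand-in exposing the .data/.grad arrays the optimizers use."""
    def __init__(self, data):
        self.data = np.asarray(data, dtype=float)
        self.grad = np.zeros_like(self.data)

w_ada, w_rms = Param([10.0]), Param([10.0])
ada, rms = Adagrad([w_ada], lr=0.5), RMSprop([w_rms], lr=0.5)
for _ in range(50):
    for w, opt in ((w_ada, ada), (w_rms, rms)):
        w.grad[:] = 2.0 * w.data          # gradient of f(w) = w**2
        opt.step()
# Adagrad's monotonically growing G leaves it well short of 0; RMSprop gets close.
print(w_ada.data, w_rms.data)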
openarchx/optimizers/base.py
@@ -0,0 +1,24 @@
import numpy as np
from abc import ABC, abstractmethod

class Optimizer(ABC):
    def __init__(self, parameters, lr=0.001, weight_decay=0.0, clip_grad=None):
        self.parameters = parameters
        self.lr = lr
        self.weight_decay = weight_decay
        self.clip_grad = clip_grad

    def clip_gradients(self):
        if self.clip_grad is not None:
            for param in self.parameters:
                if param.grad is not None:
                    np.clip(param.grad, -self.clip_grad, self.clip_grad, out=param.grad)

    @abstractmethod
    def step(self):
        pass

    def zero_grad(self):
        for param in self.parameters:
            if param.grad is not None:
                param.grad.fill(0)
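Optimizer is an abstract base class: a subclass only has to implement step(), and it inherits clip_gradients() and zero_grad(). A sketch of a hypothetical sign-gradient subclass (not part of the package), assuming the wheel is installed; Param is again a hypothetical stand-in.

import numpy as np
from openarchx.optimizers.base import Optimizer  # path from the file listing

class SignSGD(Optimizer):
    """Example subclass: takes a fixed-size step in the direction of the gradient sign."""
    def step(self):
        self.clip_gradients()                    # inherited; no-op unless clip_grad is set
        for param in self.parameters:
            if param.grad is not None:
                param.data -= self.lr * np.sign(param.grad)

class Param:
    def __init__(self, data):
        self.data = np.asarray(data, dtype=float)
        self.grad = np.zeros_like(self.data)

w = Param([1.0, -2.0])
opt = SignSGD([w], lr=0.1, clip_grad=1.0)
w.grad[:] = [5.0, -0.3]
opt.step()
opt.zero_grad()                                  # inherited from the base class
print(w.data)                                    # [0.9, -1.9]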
openarchx/optimizers/modern.py
@@ -0,0 +1,98 @@
import numpy as np
from .base import Optimizer

class RAdam(Optimizer):
    """Rectified Adam Optimizer"""
    def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0, clip_grad=None):
        super().__init__(parameters, lr, weight_decay, clip_grad)
        self.betas = betas
        self.eps = eps
        self.m = [np.zeros_like(param.data) for param in parameters]
        self.v = [np.zeros_like(param.data) for param in parameters]
        self.step_count = 0

    def step(self):
        self.step_count += 1
        self.clip_gradients()

        beta1, beta2 = self.betas
        bias_correction1 = 1 - beta1 ** self.step_count
        bias_correction2 = 1 - beta2 ** self.step_count

        rho_inf = 2 / (1 - beta2) - 1
        rho_t = rho_inf - 2 * self.step_count * beta2 ** self.step_count / bias_correction2

        for i, param in enumerate(self.parameters):
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data

                self.m[i] = beta1 * self.m[i] + (1 - beta1) * grad
                self.v[i] = beta2 * self.v[i] + (1 - beta2) * np.square(grad)

                m_hat = self.m[i] / bias_correction1

                if rho_t > 4:
                    r = np.sqrt(((rho_t - 4) * (rho_t - 2) * rho_inf) / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
                    v_hat = np.sqrt(self.v[i] / bias_correction2)
                    param.data -= self.lr * r * m_hat / (v_hat + self.eps)
                else:
                    param.data -= self.lr * m_hat

class AdaBelief(Optimizer):
    """AdaBelief Optimizer"""
    def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-16, weight_decay=0.0, clip_grad=None):
        super().__init__(parameters, lr, weight_decay, clip_grad)
        self.betas = betas
        self.eps = eps
        self.m = [np.zeros_like(param.data) for param in parameters]
        self.s = [np.zeros_like(param.data) for param in parameters]
        self.step_count = 0

    def step(self):
        self.step_count += 1
        self.clip_gradients()

        beta1, beta2 = self.betas
        bias_correction1 = 1 - beta1 ** self.step_count
        bias_correction2 = 1 - beta2 ** self.step_count

        for i, param in enumerate(self.parameters):
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data

                self.m[i] = beta1 * self.m[i] + (1 - beta1) * grad
                diff = grad - self.m[i]
                self.s[i] = beta2 * self.s[i] + (1 - beta2) * np.square(diff)

                m_hat = self.m[i] / bias_correction1
                s_hat = self.s[i] / bias_correction2

                param.data -= self.lr * m_hat / (np.sqrt(s_hat) + self.eps)

class Lion(Optimizer):
    """Lion Optimizer (Learning with Inner Optimization)"""
    def __init__(self, parameters, lr=0.0001, betas=(0.9, 0.99), weight_decay=0.0, clip_grad=None):
        super().__init__(parameters, lr, weight_decay, clip_grad)
        self.betas = betas
        self.m = [np.zeros_like(param.data) for param in parameters]

    def step(self):
        self.clip_gradients()
        beta1, beta2 = self.betas

        for i, param in enumerate(self.parameters):
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data

                update = beta1 * self.m[i] + (1 - beta1) * grad
                old_m = self.m[i].copy()
                self.m[i] = beta2 * self.m[i] + (1 - beta2) * grad

                # Update using sign of momentum
                param.data -= self.lr * np.sign(update)
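Lion's update above ignores gradient magnitude entirely: it takes the sign of an interpolated momentum, so every coordinate moves by exactly lr per step. A one-step sketch, assuming the wheel is installed; Param is a hypothetical stand-in.

import numpy as np
from openarchx.optimizers.modern import Lion  # path from the file listing

class Param:
    def __init__(self, data):
        self.data = np.asarray(data, dtype=float)
        self.grad = np.zeros_like(self.data)

w = Param([0.5, -1.5, 3.0])
before = w.data.copy()
opt = Lion([w], lr=0.01)
w.grad[:] = [0.2, -40.0, 1e-6]   # wildly different gradient magnitudes
opt.step()
print(before - w.data)            # [0.01, -0.01, 0.01]: every coordinate moves by exactly lr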
openarchx/optimizers/optx.py
@@ -0,0 +1,91 @@
import numpy as np

class OptX:
    def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0.0, clip_grad=None, rectify=True,
                 lookahead_steps=5, lookahead_alpha=0.5):
        self.parameters = parameters
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.clip_grad = clip_grad
        self.rectify = rectify
        self.lookahead_steps = lookahead_steps
        self.lookahead_alpha = lookahead_alpha

        # Initialize momentum and velocity
        self.m = [np.zeros_like(param.data) for param in parameters]
        self.v = [np.zeros_like(param.data) for param in parameters]
        self.slow_weights = [param.data.copy() for param in parameters]
        self.t = 0
        self.step_counter = 0

        # Gradient variance tracking
        self.grad_var = [np.zeros_like(param.data) for param in parameters]

    def clip_gradients(self):
        if self.clip_grad is not None:
            for param in self.parameters:
                if param.grad is not None:
                    np.clip(param.grad, -self.clip_grad, self.clip_grad, out=param.grad)

    def update_grad_variance(self, i, grad):
        # Update running variance of gradients
        if self.t > 1:
            self.grad_var[i] = 0.9 * self.grad_var[i] + 0.1 * (grad - self.m[i])**2

    def compute_adaptive_lr(self, i, v_hat):
        # Compute adaptive learning rate based on gradient variance
        if self.t > 1:
            variance_scaling = 1.0 / (1.0 + np.sqrt(self.grad_var[i]) + self.eps)
            return self.lr * variance_scaling
        return self.lr

    def step(self):
        self.t += 1
        self.step_counter += 1
        self.clip_gradients()

        for i, param in enumerate(self.parameters):
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data

                # Update momentum with bias correction
                self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
                m_hat = self.m[i] / (1 - self.beta1 ** self.t)

                # Update velocity with bias correction
                self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (grad ** 2)
                v_hat = self.v[i] / (1 - self.beta2 ** self.t)

                # Update gradient variance for adaptive scaling
                self.update_grad_variance(i, grad)

                # Compute adaptive learning rate
                adaptive_lr = self.compute_adaptive_lr(i, v_hat)

                # Compute update
                update = adaptive_lr * m_hat / (np.sqrt(v_hat) + self.eps)

                if self.rectify:
                    # Apply rectification to prevent overshooting
                    variance_ratio = np.sqrt(self.grad_var[i]) / (np.sqrt(v_hat) + self.eps)
                    update *= np.minimum(1.0, np.maximum(0.1, variance_ratio))

                # Apply update
                param.data -= update

                # Lookahead update
                if self.step_counter % self.lookahead_steps == 0:
                    # Move slow weights toward current parameters
                    self.slow_weights[i] += self.lookahead_alpha * (param.data - self.slow_weights[i])
                    # Update parameters to interpolated position
                    param.data = self.slow_weights[i].copy()

    def zero_grad(self):
        for param in self.parameters:
            if param.grad is not None:
                param.grad.fill(0)
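OptX combines an Adam-style update with variance-based rectification and a lookahead pass: every lookahead_steps steps the fast weights are pulled back to an interpolated slow copy kept in slow_weights. A short sketch, assuming the wheel is installed; Param is a hypothetical stand-in.

import numpy as np
from openarchx.optimizers.optx import OptX  # path from the file listing

class Param:
    """Hypothetical stand-in with the .data/.grad arrays OptX reads and writes."""
    def __init__(self, data):
        self.data = np.asarray(data, dtype=float)
        self.grad = np.zeros_like(self.data)

w = Param([4.0])
opt = OptX([w], lr=0.1, lookahead_steps=5, lookahead_alpha=0.5)
for t in range(1, 11):
    w.grad[:] = 2.0 * w.data     # gradient of f(w) = w**2
    opt.step()
    opt.zero_grad()
    # On steps 5 and 10 (multiples of lookahead_steps) the parameter is reset to the
    # midpoint between its fast value and the slow copy, since lookahead_alpha is 0.5.
    print(t, w.data, opt.slow_weights[0])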
openarchx/optimizers/sgd.py
@@ -0,0 +1,63 @@
import numpy as np
from .base import Optimizer

class BGD(Optimizer):
    """Batch Gradient Descent"""
    def step(self):
        self.clip_gradients()
        for param in self.parameters:
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data
                param.data -= self.lr * grad

class MBGD(Optimizer):
    """Mini-Batch Gradient Descent"""
    def __init__(self, parameters, lr=0.001, batch_size=32, weight_decay=0.0, clip_grad=None):
        super().__init__(parameters, lr, weight_decay, clip_grad)
        self.batch_size = batch_size

    def step(self):
        self.clip_gradients()
        for param in self.parameters:
            if param.grad is not None:
                grad = param.grad / self.batch_size
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data
                param.data -= self.lr * grad

class SGD(Optimizer):
    """SGD with Momentum"""
    def __init__(self, parameters, lr=0.001, momentum=0.9, weight_decay=0.0, clip_grad=None):
        super().__init__(parameters, lr, weight_decay, clip_grad)
        self.momentum = momentum
        self.v = [np.zeros_like(param.data) for param in parameters]

    def step(self):
        self.clip_gradients()
        for i, param in enumerate(self.parameters):
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data
                self.v[i] = self.momentum * self.v[i] - self.lr * grad
                param.data += self.v[i]

class NAG(Optimizer):
    """Nesterov Accelerated Gradient"""
    def __init__(self, parameters, lr=0.001, momentum=0.9, weight_decay=0.0, clip_grad=None):
        super().__init__(parameters, lr, weight_decay, clip_grad)
        self.momentum = momentum
        self.v = [np.zeros_like(param.data) for param in parameters]

    def step(self):
        self.clip_gradients()
        for i, param in enumerate(self.parameters):
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data
                v_prev = self.v[i].copy()
                self.v[i] = self.momentum * self.v[i] - self.lr * grad
                param.data += -self.momentum * v_prev + (1 + self.momentum) * self.v[i]
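SGD and NAG above maintain the same velocity buffer; NAG additionally applies the Nesterov correction -momentum * v_prev + (1 + momentum) * v when writing the parameters. A quick comparison on a one-dimensional quadratic, assuming the wheel is installed; Param is a hypothetical stand-in.

import numpy as np
from openarchx.optimizers.sgd import SGD, NAG  # path from the file listing

class Param:
    def __init__(self, data):
        self.data = np.asarray(data, dtype=float)
        self.grad = np.zeros_like(self.data)

w_sgd, w_nag = Param([5.0]), Param([5.0])
pairs = ((w_sgd, SGD([w_sgd], lr=0.05, momentum=0.9)),
         (w_nag, NAG([w_nag], lr=0.05, momentum=0.9)))
for _ in range(30):
    for w, opt in pairs:
        w.grad[:] = 2.0 * w.data   # gradient of f(w) = w**2
        opt.step()
print(w_sgd.data, w_nag.data)       # both decay toward 0 with momentum-driven oscillation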
openarchx/quantum/circuit.py
@@ -0,0 +1,92 @@
import numpy as np
from ..core.tensor import Tensor
from ..nn.module import Module
from .gates import QuantumGates, QuantumRegister

class QuantumCircuit(Module):
    def __init__(self, num_qubits):
        super().__init__()
        self.num_qubits = num_qubits
        self.register = None
        self.gates = []

    def reset(self):
        """Initialize a new quantum register"""
        self.register = QuantumRegister(self.num_qubits)
        self.gates = []

    def h(self, qubit):
        """Apply Hadamard gate"""
        self.gates.append(('H', qubit))
        return self

    def x(self, qubit):
        """Apply Pauli-X gate"""
        self.gates.append(('X', qubit))
        return self

    def y(self, qubit):
        """Apply Pauli-Y gate"""
        self.gates.append(('Y', qubit))
        return self

    def z(self, qubit):
        """Apply Pauli-Z gate"""
        self.gates.append(('Z', qubit))
        return self

    def rx(self, theta, qubit):
        """Apply parameterized rotation around X axis"""
        self.gates.append(('RX', theta, qubit))
        return self

    def ry(self, theta, qubit):
        """Apply parameterized rotation around Y axis"""
        self.gates.append(('RY', theta, qubit))
        return self

    def rz(self, theta, qubit):
        """Apply parameterized rotation around Z axis"""
        self.gates.append(('RZ', theta, qubit))
        return self

    def cnot(self, control, target):
        """Apply CNOT gate"""
        self.gates.append(('CNOT', control, target))
        return self

    def forward(self, input_params=None):
        """Execute quantum circuit and return measurement results"""
        self.reset()

        # Apply gates with batch support
        for gate in self.gates:
            if gate[0] in ['RX', 'RY', 'RZ']:
                theta = gate[1] if input_params is None else input_params[gate[1]]
                # Scale parameters to prevent gradient explosion
                theta = np.clip(theta, -np.pi, np.pi)
                self.register.apply_gate(getattr(QuantumGates, gate[0])(theta), gate[2])
            else:
                gate_op = getattr(QuantumGates, gate[0])()
                if gate[0] == 'CNOT':
                    self.register.apply_controlled_gate(gate_op, gate[1], gate[2])
                else:
                    self.register.apply_gate(gate_op, gate[1])

        # Return the final state with gradient tracking
        return Tensor(self.register.state, requires_grad=True)

class QuantumLayer(Module):
    def __init__(self, num_qubits, num_params):
        super().__init__()
        self.circuit = QuantumCircuit(num_qubits)
        self.params = Tensor(np.random.randn(num_params) * 0.1, requires_grad=True)

    def build_circuit(self):
        """Override this method to define the quantum circuit architecture"""
        raise NotImplementedError

    def forward(self, x):
        """Execute quantum circuit with current parameters"""
        self.build_circuit()
        return self.circuit.forward(self.params)
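QuantumCircuit's builder methods only queue (name, *args) tuples; forward() then maps each tuple onto the matrices in QuantumGates via getattr. Note that forward() begins by calling reset(), which re-creates the register and also clears self.gates, so gates queued before the call are discarded rather than replayed against the register. A small sketch of the builder API, assuming the wheel (including quantum/gates.py) is installed.

from openarchx.quantum.circuit import QuantumCircuit  # path from the file listing

qc = QuantumCircuit(2)
qc.h(0).cnot(0, 1)      # fluent API: would prepare a Bell state if the queue were replayed
print(qc.gates)          # [('H', 0), ('CNOT', 0, 1)]

state = qc.forward()     # reset() runs first, so this returns the freshly initialized
                         # register state wrapped in a Tensor, not the Bell state
print(state.data)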