openarchx-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. openarchx/__init__.py +11 -0
  2. openarchx/core/tensor.py +179 -0
  3. openarchx/cuda/__init__.py +27 -0
  4. openarchx/cuda/cuda_ops.py +296 -0
  5. openarchx/layers/activations.py +63 -0
  6. openarchx/layers/base.py +40 -0
  7. openarchx/layers/cnn.py +145 -0
  8. openarchx/layers/transformer.py +131 -0
  9. openarchx/nn/__init__.py +26 -0
  10. openarchx/nn/activations.py +127 -0
  11. openarchx/nn/containers.py +174 -0
  12. openarchx/nn/dropout.py +121 -0
  13. openarchx/nn/layers.py +338 -0
  14. openarchx/nn/losses.py +156 -0
  15. openarchx/nn/module.py +18 -0
  16. openarchx/nn/padding.py +120 -0
  17. openarchx/nn/pooling.py +318 -0
  18. openarchx/nn/rnn.py +226 -0
  19. openarchx/nn/transformers.py +187 -0
  20. openarchx/optimizers/adam.py +49 -0
  21. openarchx/optimizers/adaptive.py +63 -0
  22. openarchx/optimizers/base.py +24 -0
  23. openarchx/optimizers/modern.py +98 -0
  24. openarchx/optimizers/optx.py +91 -0
  25. openarchx/optimizers/sgd.py +63 -0
  26. openarchx/quantum/circuit.py +92 -0
  27. openarchx/quantum/gates.py +126 -0
  28. openarchx/utils/__init__.py +50 -0
  29. openarchx/utils/data.py +229 -0
  30. openarchx/utils/huggingface.py +288 -0
  31. openarchx/utils/losses.py +21 -0
  32. openarchx/utils/model_io.py +553 -0
  33. openarchx/utils/pytorch.py +420 -0
  34. openarchx/utils/tensorflow.py +467 -0
  35. openarchx/utils/transforms.py +259 -0
  36. openarchx-0.1.0.dist-info/METADATA +180 -0
  37. openarchx-0.1.0.dist-info/RECORD +43 -0
  38. openarchx-0.1.0.dist-info/WHEEL +5 -0
  39. openarchx-0.1.0.dist-info/licenses/LICENSE +21 -0
  40. openarchx-0.1.0.dist-info/top_level.txt +2 -0
  41. tests/__init__.py +1 -0
  42. tests/test_cuda_ops.py +205 -0
  43. tests/test_integrations.py +236 -0
openarchx/nn/transformers.py
@@ -0,0 +1,187 @@
+ import numpy as np
+ from ..core.tensor import Tensor
+ from .module import Module
+ from .layers import Linear, LayerNorm
+ from .activations import ReLU
+
+ class MultiheadAttention(Module):
+     def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True):
+         super().__init__()
+         self.embed_dim = embed_dim
+         self.num_heads = num_heads
+         self.dropout = dropout
+         self.head_dim = embed_dim // num_heads
+         assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+
+         self.q_proj = Linear(embed_dim, embed_dim, bias=bias)
+         self.k_proj = Linear(embed_dim, embed_dim, bias=bias)
+         self.v_proj = Linear(embed_dim, embed_dim, bias=bias)
+         self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
+
+     def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
+         batch_size = query.data.shape[0]
+
+         # Linear projections and reshape
+         q = self._reshape_for_attention(self.q_proj(query))
+         k = self._reshape_for_attention(self.k_proj(key))
+         v = self._reshape_for_attention(self.v_proj(value))
+
+         # Scaled dot-product attention
+         scaling = float(self.head_dim) ** -0.5
+         attn = np.matmul(q.data, k.data.transpose(0, 1, 3, 2)) * scaling
+
+         if attn_mask is not None:
+             attn = np.where(attn_mask, -np.inf, attn)
+
+         attn = self._softmax(attn)
+
+         if self.dropout > 0:
+             attn = np.where(np.random.random(attn.shape) > self.dropout, attn, 0) / (1 - self.dropout)
+
+         output = np.matmul(attn, v.data)
+         output = output.transpose(0, 2, 1, 3).reshape(batch_size, -1, self.embed_dim)
+
+         return self.out_proj(Tensor(output, requires_grad=True))
+
+     def _reshape_for_attention(self, x):
+         batch_size, seq_len, _ = x.data.shape
+         return Tensor(
+             x.data.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
+             .transpose(0, 2, 1, 3),
+             requires_grad=True
+         )
+
+     def _softmax(self, x):
+         exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
+         return exp_x / np.sum(exp_x, axis=-1, keepdims=True)
+
+ class TransformerEncoderLayer(Module):
+     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
+         super().__init__()
+         self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
+         self.linear1 = Linear(d_model, dim_feedforward)
+         self.linear2 = Linear(dim_feedforward, d_model)
+         self.norm1 = LayerNorm(d_model)
+         self.norm2 = LayerNorm(d_model)
+         self.dropout = dropout
+         self.activation = ReLU()
+
+     def forward(self, src, src_mask=None, src_key_padding_mask=None):
+         # Self attention block
+         src2 = self.self_attn(src, src, src, attn_mask=src_mask,
+                               key_padding_mask=src_key_padding_mask)
+         src = src + self._dropout_layer(src2)
+         src = self.norm1(src)
+
+         # Feedforward block
+         src2 = self.linear2(self._dropout_layer(self.activation(self.linear1(src))))
+         src = src + self._dropout_layer(src2)
+         src = self.norm2(src)
+
+         return src
+
+     def _dropout_layer(self, x):
+         if self.dropout > 0:
+             mask = np.random.random(x.data.shape) > self.dropout
+             return Tensor(mask * x.data / (1 - self.dropout), requires_grad=True)
+         return x
+
+ class TransformerDecoderLayer(Module):
+     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
+         super().__init__()
+         self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
+         self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
+         self.linear1 = Linear(d_model, dim_feedforward)
+         self.linear2 = Linear(dim_feedforward, d_model)
+         self.norm1 = LayerNorm(d_model)
+         self.norm2 = LayerNorm(d_model)
+         self.norm3 = LayerNorm(d_model)
+         self.dropout = dropout
+         self.activation = ReLU()
+
+     def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
+                 tgt_key_padding_mask=None, memory_key_padding_mask=None):
+         # Self attention block
+         tgt2 = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask,
+                               key_padding_mask=tgt_key_padding_mask)
+         tgt = tgt + self._dropout_layer(tgt2)
+         tgt = self.norm1(tgt)
+
+         # Cross attention block
+         tgt2 = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
+                                    key_padding_mask=memory_key_padding_mask)
+         tgt = tgt + self._dropout_layer(tgt2)
+         tgt = self.norm2(tgt)
+
+         # Feedforward block
+         tgt2 = self.linear2(self._dropout_layer(self.activation(self.linear1(tgt))))
+         tgt = tgt + self._dropout_layer(tgt2)
+         tgt = self.norm3(tgt)
+
+         return tgt
+
+     def _dropout_layer(self, x):
+         if self.dropout > 0:
+             mask = np.random.random(x.data.shape) > self.dropout
+             return Tensor(mask * x.data / (1 - self.dropout), requires_grad=True)
+         return x
+
+ class TransformerEncoder(Module):
+     def __init__(self, encoder_layer, num_layers, norm=None):
+         super().__init__()
+         self.layers = [encoder_layer for _ in range(num_layers)]
+         self.norm = norm
+
+     def forward(self, src, mask=None, src_key_padding_mask=None):
+         output = src
+         for layer in self.layers:
+             output = layer(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask)
+
+         if self.norm is not None:
+             output = self.norm(output)
+
+         return output
+
+ class TransformerDecoder(Module):
+     def __init__(self, decoder_layer, num_layers, norm=None):
+         super().__init__()
+         self.layers = [decoder_layer for _ in range(num_layers)]
+         self.norm = norm
+
+     def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
+                 tgt_key_padding_mask=None, memory_key_padding_mask=None):
+         output = tgt
+         for layer in self.layers:
+             output = layer(output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
+                            tgt_key_padding_mask=tgt_key_padding_mask,
+                            memory_key_padding_mask=memory_key_padding_mask)
+
+         if self.norm is not None:
+             output = self.norm(output)
+
+         return output
+
+ class Transformer(Module):
+     def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
+                  num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
+         super().__init__()
+
+         encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
+         encoder_norm = LayerNorm(d_model)
+         self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
+
+         decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout)
+         decoder_norm = LayerNorm(d_model)
+         self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
+
+         self.d_model = d_model
+         self.nhead = nhead
+
+     def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None,
+                 src_key_padding_mask=None, tgt_key_padding_mask=None,
+                 memory_key_padding_mask=None):
+         memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
+         output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
+                               tgt_key_padding_mask=tgt_key_padding_mask,
+                               memory_key_padding_mask=memory_key_padding_mask)
+         return output
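A minimal usage sketch for the Transformer class in this file. It assumes the wheel is installed and that openarchx.nn.transformers and openarchx.core.tensor expose the classes shown in this diff; inputs are batch-first (batch, seq, d_model) arrays wrapped in Tensor, as implied by the attention code's use of query.data.shape[0]. This is an illustration, not part of the package.

import numpy as np
from openarchx.core.tensor import Tensor
from openarchx.nn.transformers import Transformer

# Small model so the forward pass runs quickly
model = Transformer(d_model=64, nhead=4, num_encoder_layers=2,
                    num_decoder_layers=2, dim_feedforward=128, dropout=0.1)

src = Tensor(np.random.randn(2, 10, 64), requires_grad=True)   # (batch, src_seq, d_model)
tgt = Tensor(np.random.randn(2, 8, 64), requires_grad=True)    # (batch, tgt_seq, d_model)

out = model.forward(src, tgt)
print(out.data.shape)   # expected: (2, 8, 64)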
openarchx/optimizers/adam.py
@@ -0,0 +1,49 @@
+ import numpy as np
+
+ class Adam:
+     def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0, clip_grad=None):
+         self.parameters = parameters
+         self.lr = lr
+         self.beta1, self.beta2 = betas
+         self.eps = eps
+         self.weight_decay = weight_decay
+         self.clip_grad = clip_grad
+
+         # Initialize momentum and velocity
+         self.m = [np.zeros_like(param.data) for param in parameters]
+         self.v = [np.zeros_like(param.data) for param in parameters]
+         self.t = 0
+
+     def clip_gradients(self):
+         if self.clip_grad is not None:
+             for param in self.parameters:
+                 if param.grad is not None:
+                     np.clip(param.grad, -self.clip_grad, self.clip_grad, out=param.grad)
+
+     def step(self):
+         self.t += 1
+         self.clip_gradients()
+
+         for i, param in enumerate(self.parameters):
+             if param.grad is not None:
+                 grad = param.grad
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+
+                 # Update momentum
+                 self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
+
+                 # Update velocity
+                 self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (grad ** 2)
+
+                 # Bias correction
+                 m_hat = self.m[i] / (1 - self.beta1 ** self.t)
+                 v_hat = self.v[i] / (1 - self.beta2 ** self.t)
+
+                 # Update parameters
+                 param.data -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
+
+     def zero_grad(self):
+         for param in self.parameters:
+             if param.grad is not None:
+                 param.grad.fill(0)
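Adam only reads and writes param.data and param.grad as NumPy arrays, so a toy run against a quadratic objective can be sketched with a minimal stand-in parameter class. The Param class below is hypothetical and only illustrates the interface the optimizer expects; the import assumes the wheel is installed.

import numpy as np
from openarchx.optimizers.adam import Adam

class Param:                              # minimal stand-in: Adam only touches .data and .grad
    def __init__(self, shape):
        self.data = np.random.randn(*shape)
        self.grad = np.zeros(shape)

w = Param((3,))
opt = Adam([w], lr=0.1)

for _ in range(200):                      # minimize ||w||^2, whose gradient is 2 * w
    w.grad[:] = 2 * w.data
    opt.step()
    opt.zero_grad()

print(w.data)                             # values have moved from the random start toward 0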
openarchx/optimizers/adaptive.py
@@ -0,0 +1,63 @@
+ import numpy as np
+ from .base import Optimizer
+
+ class Adagrad(Optimizer):
+     """Adaptive Gradient Algorithm"""
+     def __init__(self, parameters, lr=0.01, eps=1e-8, weight_decay=0.0, clip_grad=None):
+         super().__init__(parameters, lr, weight_decay, clip_grad)
+         self.eps = eps
+         self.G = [np.zeros_like(param.data) for param in parameters]
+
+     def step(self):
+         self.clip_gradients()
+         for i, param in enumerate(self.parameters):
+             if param.grad is not None:
+                 grad = param.grad
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+                 self.G[i] += np.square(grad)
+                 param.data -= self.lr * grad / (np.sqrt(self.G[i]) + self.eps)
+
+ class Adadelta(Optimizer):
+     """Adaptive Delta Algorithm"""
+     def __init__(self, parameters, rho=0.95, eps=1e-6, weight_decay=0.0, clip_grad=None):
+         super().__init__(parameters, 1.0, weight_decay, clip_grad)  # lr=1.0 as it's not used
+         self.rho = rho
+         self.eps = eps
+         self.G = [np.zeros_like(param.data) for param in parameters]
+         self.delta = [np.zeros_like(param.data) for param in parameters]
+
+     def step(self):
+         self.clip_gradients()
+         for i, param in enumerate(self.parameters):
+             if param.grad is not None:
+                 grad = param.grad
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+
+                 self.G[i] = self.rho * self.G[i] + (1 - self.rho) * np.square(grad)
+                 rms_g = np.sqrt(self.G[i] + self.eps)
+                 rms_delta = np.sqrt(self.delta[i] + self.eps)
+
+                 update = -rms_delta / rms_g * grad
+                 param.data += update
+
+                 self.delta[i] = self.rho * self.delta[i] + (1 - self.rho) * np.square(update)
+
+ class RMSprop(Optimizer):
+     """Root Mean Square Propagation"""
+     def __init__(self, parameters, lr=0.01, alpha=0.99, eps=1e-8, weight_decay=0.0, clip_grad=None):
+         super().__init__(parameters, lr, weight_decay, clip_grad)
+         self.alpha = alpha
+         self.eps = eps
+         self.G = [np.zeros_like(param.data) for param in parameters]
+
+     def step(self):
+         self.clip_gradients()
+         for i, param in enumerate(self.parameters):
+             if param.grad is not None:
+                 grad = param.grad
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+                 self.G[i] = self.alpha * self.G[i] + (1 - self.alpha) * np.square(grad)
+                 param.data -= self.lr * grad / (np.sqrt(self.G[i]) + self.eps)
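A worked single step of the RMSprop update above, using hypothetical numbers and plain arithmetic (not package code), to show what the running average G does to the effective step size:

lr, alpha, eps = 0.01, 0.99, 1e-8
g, G = 2.0, 0.0                         # gradient, fresh accumulator

G = alpha * G + (1 - alpha) * g**2      # 0.99*0 + 0.01*4 = 0.04
step = lr * g / (G**0.5 + eps)          # 0.01 * 2 / 0.2 = 0.1
print(G, step)                          # 0.04, ~0.1

With a zero accumulator the very first step has magnitude about lr / sqrt(1 - alpha), ten times lr here; the bias correction in adam.py is what removes that warm-up spike.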
openarchx/optimizers/base.py
@@ -0,0 +1,24 @@
+ import numpy as np
+ from abc import ABC, abstractmethod
+
+ class Optimizer(ABC):
+     def __init__(self, parameters, lr=0.001, weight_decay=0.0, clip_grad=None):
+         self.parameters = parameters
+         self.lr = lr
+         self.weight_decay = weight_decay
+         self.clip_grad = clip_grad
+
+     def clip_gradients(self):
+         if self.clip_grad is not None:
+             for param in self.parameters:
+                 if param.grad is not None:
+                     np.clip(param.grad, -self.clip_grad, self.clip_grad, out=param.grad)
+
+     @abstractmethod
+     def step(self):
+         pass
+
+     def zero_grad(self):
+         for param in self.parameters:
+             if param.grad is not None:
+                 param.grad.fill(0)
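A minimal subclass sketch showing how this abstract base is meant to be extended: only step() must be provided, while clipping, zero_grad, lr and weight_decay come from the base class. PlainSGD here is illustrative only (the package ships its own variants in sgd.py), and the import assumes the wheel is installed.

from openarchx.optimizers.base import Optimizer

class PlainSGD(Optimizer):
    """Vanilla gradient descent built on the Optimizer base class."""
    def step(self):
        self.clip_gradients()
        for param in self.parameters:
            if param.grad is not None:
                grad = param.grad
                if self.weight_decay > 0:
                    grad = grad + self.weight_decay * param.data
                param.data -= self.lr * grad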
openarchx/optimizers/modern.py
@@ -0,0 +1,98 @@
+ import numpy as np
+ from .base import Optimizer
+
+ class RAdam(Optimizer):
+     """Rectified Adam Optimizer"""
+     def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0, clip_grad=None):
+         super().__init__(parameters, lr, weight_decay, clip_grad)
+         self.betas = betas
+         self.eps = eps
+         self.m = [np.zeros_like(param.data) for param in parameters]
+         self.v = [np.zeros_like(param.data) for param in parameters]
+         self.step_count = 0
+
+     def step(self):
+         self.step_count += 1
+         self.clip_gradients()
+
+         beta1, beta2 = self.betas
+         bias_correction1 = 1 - beta1 ** self.step_count
+         bias_correction2 = 1 - beta2 ** self.step_count
+
+         rho_inf = 2 / (1 - beta2) - 1
+         rho_t = rho_inf - 2 * self.step_count * beta2 ** self.step_count / bias_correction2
+
+         for i, param in enumerate(self.parameters):
+             if param.grad is not None:
+                 grad = param.grad
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+
+                 self.m[i] = beta1 * self.m[i] + (1 - beta1) * grad
+                 self.v[i] = beta2 * self.v[i] + (1 - beta2) * np.square(grad)
+
+                 m_hat = self.m[i] / bias_correction1
+
+                 if rho_t > 4:
+                     r = np.sqrt(((rho_t - 4) * (rho_t - 2) * rho_inf) / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
+                     v_hat = np.sqrt(self.v[i] / bias_correction2)
+                     param.data -= self.lr * r * m_hat / (v_hat + self.eps)
+                 else:
+                     param.data -= self.lr * m_hat
+
+ class AdaBelief(Optimizer):
+     """AdaBelief Optimizer"""
+     def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-16, weight_decay=0.0, clip_grad=None):
+         super().__init__(parameters, lr, weight_decay, clip_grad)
+         self.betas = betas
+         self.eps = eps
+         self.m = [np.zeros_like(param.data) for param in parameters]
+         self.s = [np.zeros_like(param.data) for param in parameters]
+         self.step_count = 0
+
+     def step(self):
+         self.step_count += 1
+         self.clip_gradients()
+
+         beta1, beta2 = self.betas
+         bias_correction1 = 1 - beta1 ** self.step_count
+         bias_correction2 = 1 - beta2 ** self.step_count
+
+         for i, param in enumerate(self.parameters):
+             if param.grad is not None:
+                 grad = param.grad
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+
+                 self.m[i] = beta1 * self.m[i] + (1 - beta1) * grad
+                 diff = grad - self.m[i]
+                 self.s[i] = beta2 * self.s[i] + (1 - beta2) * np.square(diff)
+
+                 m_hat = self.m[i] / bias_correction1
+                 s_hat = self.s[i] / bias_correction2
+
+                 param.data -= self.lr * m_hat / (np.sqrt(s_hat) + self.eps)
+
+ class Lion(Optimizer):
+     """Lion Optimizer (Learning with Inner Optimization)"""
+     def __init__(self, parameters, lr=0.0001, betas=(0.9, 0.99), weight_decay=0.0, clip_grad=None):
+         super().__init__(parameters, lr, weight_decay, clip_grad)
+         self.betas = betas
+         self.m = [np.zeros_like(param.data) for param in parameters]
+
+     def step(self):
+         self.clip_gradients()
+         beta1, beta2 = self.betas
+
+         for i, param in enumerate(self.parameters):
+             if param.grad is not None:
+                 grad = param.grad
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+
+                 update = beta1 * self.m[i] + (1 - beta1) * grad
+                 old_m = self.m[i].copy()
+                 self.m[i] = beta2 * self.m[i] + (1 - beta2) * grad
+
+                 # Update using sign of momentum
+                 param.data -= self.lr * np.sign(update)
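The distinguishing feature of the Lion implementation above is that the parameter step depends only on the sign of an interpolated momentum, so every non-zero coordinate moves by exactly lr regardless of gradient magnitude. A standalone NumPy sketch of that update rule with hypothetical values (illustration only, not package code):

import numpy as np

lr, beta1, beta2 = 1e-4, 0.9, 0.99
m = np.zeros(3)
grad = np.array([0.3, -2.0, 0.0005])

update = beta1 * m + (1 - beta1) * grad   # interpolation that decides the step direction
step = -lr * np.sign(update)              # each coordinate moves by exactly lr
m = beta2 * m + (1 - beta2) * grad        # momentum tracked with the second beta

print(step)                               # [-1.e-04  1.e-04 -1.e-04]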
openarchx/optimizers/optx.py
@@ -0,0 +1,91 @@
+ import numpy as np
+
+ class OptX:
+     def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-8,
+                  weight_decay=0.0, clip_grad=None, rectify=True,
+                  lookahead_steps=5, lookahead_alpha=0.5):
+         self.parameters = parameters
+         self.lr = lr
+         self.beta1, self.beta2 = betas
+         self.eps = eps
+         self.weight_decay = weight_decay
+         self.clip_grad = clip_grad
+         self.rectify = rectify
+         self.lookahead_steps = lookahead_steps
+         self.lookahead_alpha = lookahead_alpha
+
+         # Initialize momentum and velocity
+         self.m = [np.zeros_like(param.data) for param in parameters]
+         self.v = [np.zeros_like(param.data) for param in parameters]
+         self.slow_weights = [param.data.copy() for param in parameters]
+         self.t = 0
+         self.step_counter = 0
+
+         # Gradient variance tracking
+         self.grad_var = [np.zeros_like(param.data) for param in parameters]
+
+     def clip_gradients(self):
+         if self.clip_grad is not None:
+             for param in self.parameters:
+                 if param.grad is not None:
+                     np.clip(param.grad, -self.clip_grad, self.clip_grad, out=param.grad)
+
+     def update_grad_variance(self, i, grad):
+         # Update running variance of gradients
+         if self.t > 1:
+             self.grad_var[i] = 0.9 * self.grad_var[i] + 0.1 * (grad - self.m[i])**2
+
+     def compute_adaptive_lr(self, i, v_hat):
+         # Compute adaptive learning rate based on gradient variance
+         if self.t > 1:
+             variance_scaling = 1.0 / (1.0 + np.sqrt(self.grad_var[i]) + self.eps)
+             return self.lr * variance_scaling
+         return self.lr
+
+     def step(self):
+         self.t += 1
+         self.step_counter += 1
+         self.clip_gradients()
+
+         for i, param in enumerate(self.parameters):
+             if param.grad is not None:
+                 grad = param.grad
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+
+                 # Update momentum with bias correction
+                 self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
+                 m_hat = self.m[i] / (1 - self.beta1 ** self.t)
+
+                 # Update velocity with bias correction
+                 self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (grad ** 2)
+                 v_hat = self.v[i] / (1 - self.beta2 ** self.t)
+
+                 # Update gradient variance for adaptive scaling
+                 self.update_grad_variance(i, grad)
+
+                 # Compute adaptive learning rate
+                 adaptive_lr = self.compute_adaptive_lr(i, v_hat)
+
+                 # Compute update
+                 update = adaptive_lr * m_hat / (np.sqrt(v_hat) + self.eps)
+
+                 if self.rectify:
+                     # Apply rectification to prevent overshooting
+                     variance_ratio = np.sqrt(self.grad_var[i]) / (np.sqrt(v_hat) + self.eps)
+                     update *= np.minimum(1.0, np.maximum(0.1, variance_ratio))
+
+                 # Apply update
+                 param.data -= update
+
+                 # Lookahead update
+                 if self.step_counter % self.lookahead_steps == 0:
+                     # Move slow weights toward current parameters
+                     self.slow_weights[i] += self.lookahead_alpha * (param.data - self.slow_weights[i])
+                     # Update parameters to interpolated position
+                     param.data = self.slow_weights[i].copy()
+
+     def zero_grad(self):
+         for param in self.parameters:
+             if param.grad is not None:
+                 param.grad.fill(0)
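The lookahead branch at the end of OptX.step keeps a second, slowly moving copy of each parameter and, every lookahead_steps steps, pulls it toward the fast weights and restarts the fast weights from that interpolation. The mechanics in isolation, with a constant stand-in for the inner adaptive update (hypothetical values, not package code):

import numpy as np

alpha, k = 0.5, 5
slow = np.array([0.0])
fast = slow.copy()

for step in range(1, 11):
    fast -= 0.1                            # stand-in for the inner adaptive update
    if step % k == 0:
        slow += alpha * (fast - slow)      # move slow weights toward the fast weights
        fast = slow.copy()                 # restart fast weights from the interpolation

print(slow, fast)                          # both at -0.5 after 10 steps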
openarchx/optimizers/sgd.py
@@ -0,0 +1,63 @@
+ import numpy as np
+ from .base import Optimizer
+
+ class BGD(Optimizer):
+     """Batch Gradient Descent"""
+     def step(self):
+         self.clip_gradients()
+         for param in self.parameters:
+             if param.grad is not None:
+                 grad = param.grad
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+                 param.data -= self.lr * grad
+
+ class MBGD(Optimizer):
+     """Mini-Batch Gradient Descent"""
+     def __init__(self, parameters, lr=0.001, batch_size=32, weight_decay=0.0, clip_grad=None):
+         super().__init__(parameters, lr, weight_decay, clip_grad)
+         self.batch_size = batch_size
+
+     def step(self):
+         self.clip_gradients()
+         for param in self.parameters:
+             if param.grad is not None:
+                 grad = param.grad / self.batch_size
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+                 param.data -= self.lr * grad
+
+ class SGD(Optimizer):
+     """SGD with Momentum"""
+     def __init__(self, parameters, lr=0.001, momentum=0.9, weight_decay=0.0, clip_grad=None):
+         super().__init__(parameters, lr, weight_decay, clip_grad)
+         self.momentum = momentum
+         self.v = [np.zeros_like(param.data) for param in parameters]
+
+     def step(self):
+         self.clip_gradients()
+         for i, param in enumerate(self.parameters):
+             if param.grad is not None:
+                 grad = param.grad
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+                 self.v[i] = self.momentum * self.v[i] - self.lr * grad
+                 param.data += self.v[i]
+
+ class NAG(Optimizer):
+     """Nesterov Accelerated Gradient"""
+     def __init__(self, parameters, lr=0.001, momentum=0.9, weight_decay=0.0, clip_grad=None):
+         super().__init__(parameters, lr, weight_decay, clip_grad)
+         self.momentum = momentum
+         self.v = [np.zeros_like(param.data) for param in parameters]
+
+     def step(self):
+         self.clip_gradients()
+         for i, param in enumerate(self.parameters):
+             if param.grad is not None:
+                 grad = param.grad
+                 if self.weight_decay > 0:
+                     grad = grad + self.weight_decay * param.data
+                 v_prev = self.v[i].copy()
+                 self.v[i] = self.momentum * self.v[i] - self.lr * grad
+                 param.data += -self.momentum * v_prev + (1 + self.momentum) * self.v[i]
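A worked first step comparing the SGD and NAG updates above, with hypothetical numbers (plain arithmetic, not package code):

lr, mu, g = 0.1, 0.9, 1.0
v_prev = 0.0

v = mu * v_prev - lr * g                    # velocity after one step: -0.1
momentum_step = v                           # SGD with momentum moves the parameter by -0.1
nag_step = -mu * v_prev + (1 + mu) * v      # NAG's look-ahead form moves by -0.19
print(momentum_step, nag_step)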
openarchx/quantum/circuit.py
@@ -0,0 +1,92 @@
+ import numpy as np
+ from ..core.tensor import Tensor
+ from ..nn.module import Module
+ from .gates import QuantumGates, QuantumRegister
+
+ class QuantumCircuit(Module):
+     def __init__(self, num_qubits):
+         super().__init__()
+         self.num_qubits = num_qubits
+         self.register = None
+         self.gates = []
+
+     def reset(self):
+         """Initialize a new quantum register"""
+         self.register = QuantumRegister(self.num_qubits)
+         self.gates = []
+
+     def h(self, qubit):
+         """Apply Hadamard gate"""
+         self.gates.append(('H', qubit))
+         return self
+
+     def x(self, qubit):
+         """Apply Pauli-X gate"""
+         self.gates.append(('X', qubit))
+         return self
+
+     def y(self, qubit):
+         """Apply Pauli-Y gate"""
+         self.gates.append(('Y', qubit))
+         return self
+
+     def z(self, qubit):
+         """Apply Pauli-Z gate"""
+         self.gates.append(('Z', qubit))
+         return self
+
+     def rx(self, theta, qubit):
+         """Apply parameterized rotation around X axis"""
+         self.gates.append(('RX', theta, qubit))
+         return self
+
+     def ry(self, theta, qubit):
+         """Apply parameterized rotation around Y axis"""
+         self.gates.append(('RY', theta, qubit))
+         return self
+
+     def rz(self, theta, qubit):
+         """Apply parameterized rotation around Z axis"""
+         self.gates.append(('RZ', theta, qubit))
+         return self
+
+     def cnot(self, control, target):
+         """Apply CNOT gate"""
+         self.gates.append(('CNOT', control, target))
+         return self
+
+     def forward(self, input_params=None):
+         """Execute quantum circuit and return measurement results"""
+         self.reset()
+
+         # Apply gates with batch support
+         for gate in self.gates:
+             if gate[0] in ['RX', 'RY', 'RZ']:
+                 theta = gate[1] if input_params is None else input_params[gate[1]]
+                 # Scale parameters to prevent gradient explosion
+                 theta = np.clip(theta, -np.pi, np.pi)
+                 self.register.apply_gate(getattr(QuantumGates, gate[0])(theta), gate[2])
+             else:
+                 gate_op = getattr(QuantumGates, gate[0])()
+                 if gate[0] == 'CNOT':
+                     self.register.apply_controlled_gate(gate_op, gate[1], gate[2])
+                 else:
+                     self.register.apply_gate(gate_op, gate[1])
+
+         # Return the final state with gradient tracking
+         return Tensor(self.register.state, requires_grad=True)
+
+ class QuantumLayer(Module):
+     def __init__(self, num_qubits, num_params):
+         super().__init__()
+         self.circuit = QuantumCircuit(num_qubits)
+         self.params = Tensor(np.random.randn(num_params) * 0.1, requires_grad=True)
+
+     def build_circuit(self):
+         """Override this method to define the quantum circuit architecture"""
+         raise NotImplementedError
+
+     def forward(self, x):
+         """Execute quantum circuit with current parameters"""
+         self.build_circuit()
+         return self.circuit.forward(self.params)
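For reference, here is what a Hadamard on qubit 0 followed by CNOT(0, 1), the kind of sequence the h() and cnot() builders record, does to a two-qubit state vector. This sketch uses plain NumPy and is independent of the package's QuantumRegister/QuantumGates implementation.

import numpy as np

H = np.array([[1, 1], [1, -1]]) / np.sqrt(2)
I = np.eye(2)
CNOT = np.array([[1, 0, 0, 0],
                 [0, 1, 0, 0],
                 [0, 0, 0, 1],
                 [0, 0, 1, 0]])

state = np.array([1, 0, 0, 0], dtype=complex)   # |00>
state = np.kron(H, I) @ state                   # Hadamard on qubit 0
state = CNOT @ state                            # entangle qubit 0 with qubit 1
print(np.round(state, 3))                       # ~[0.707, 0, 0, 0.707], the Bell state (|00> + |11>)/sqrt(2)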