ilovetools 0.2.17__tar.gz → 0.2.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ilovetools-0.2.17/ilovetools.egg-info → ilovetools-0.2.19}/PKG-INFO +2 -2
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/__init__.py +2 -2
- ilovetools-0.2.19/ilovetools/ml/attention.py +525 -0
- ilovetools-0.2.19/ilovetools/ml/cnn.py +619 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19/ilovetools.egg-info}/PKG-INFO +2 -2
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools.egg-info/SOURCES.txt +4 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/pyproject.toml +2 -2
- {ilovetools-0.2.17 → ilovetools-0.2.19}/setup.py +2 -2
- ilovetools-0.2.19/tests/test_attention.py +466 -0
- ilovetools-0.2.19/tests/test_cnn.py +394 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/LICENSE +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/MANIFEST.in +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/README.md +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ai/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ai/embeddings.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ai/inference.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ai/llm_helpers.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/audio/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/automation/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/automation/file_organizer.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/conversion/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/conversion/config_converter.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/conversion/config_converter_fixed_header.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/data/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/data/feature_engineering.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/data/preprocessing.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/database/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/datetime/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/email/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/email/template_engine.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/files/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/image/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/activations.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/anomaly_detection.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/clustering.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/cross_validation.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/dimensionality.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/ensemble.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/feature_selection.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/gradient_descent.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/imbalanced.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/interpretation.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/loss_functions.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/metrics.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/neural_network.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/normalization.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/optimizers.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/pipeline.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/regularization.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/timeseries.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/tuning.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/security/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/security/password_checker.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/text/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/utils/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/utils/cache_system.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/utils/logger.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/utils/rate_limiter.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/utils/retry.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/validation/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/validation/data_validator.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/web/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/web/scraper.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/web/url_shortener.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools.egg-info/dependency_links.txt +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools.egg-info/requires.txt +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools.egg-info/top_level.txt +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/requirements.txt +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/setup.cfg +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/__init__.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_activations.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_gradient_descent.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_loss_functions.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_neural_network.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_normalization.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_optimizers.py +0 -0
- {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_regularization.py +0 -0
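
For orientation before the hunks below: the sketch that follows shows how the attention helpers added in 0.2.19 (full diff of ilovetools/ml/attention.py further down) might be called. It is an editor's sketch, not part of the diff; the import path simply mirrors the file location in the source tree and has not been verified against the package's public API.

# Editor's sketch, not part of the diff. Assumes the new module is importable
# from its path in the source tree (ilovetools/ml/attention.py).
import numpy as np
from ilovetools.ml.attention import scaled_dot_product_attention

q = np.random.randn(2, 5, 64)   # (batch, seq_len, d_k)
k = np.random.randn(2, 5, 64)
v = np.random.randn(2, 5, 64)
out, weights = scaled_dot_product_attention(q, k, v)
print(out.shape, weights.shape)  # (2, 5, 64) (2, 5, 5)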
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ilovetools
-Version: 0.2.17
+Version: 0.2.19
 Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
 Home-page: https://github.com/AliMehdi512/ilovetools
 Author: Ali Mehdi
@@ -11,7 +11,7 @@ Project-URL: Repository, https://github.com/AliMehdi512/ilovetools
 Project-URL: Issues, https://github.com/AliMehdi512/ilovetools/issues
 Project-URL: Bug Reports, https://github.com/AliMehdi512/ilovetools/issues
 Project-URL: Source, https://github.com/AliMehdi512/ilovetools
-Keywords: utilities,tools,ai,ml,data-processing,automation,
+Keywords: utilities,tools,ai,ml,data-processing,automation,cnn,convolutional-neural-networks,conv2d,pooling,computer-vision,image-processing
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
ilovetools-0.2.19/ilovetools/ml/attention.py (new file)
@@ -0,0 +1,525 @@
+"""
+Attention Mechanisms for Neural Networks
+
+This module provides various attention mechanisms used in deep learning:
+- Scaled Dot-Product Attention
+- Multi-Head Attention
+- Self-Attention
+- Cross-Attention
+- Causal/Masked Attention
+- Positional Encoding
+- Attention Masks
+
+All attention functions support batched operations and are optimized for Transformers.
+"""
+
+import numpy as np
+from typing import Tuple, Optional, Union
+
+
+# ============================================================================
+# SCALED DOT-PRODUCT ATTENTION
+# ============================================================================
+
+def scaled_dot_product_attention(
+    query: np.ndarray,
+    key: np.ndarray,
+    value: np.ndarray,
+    mask: Optional[np.ndarray] = None,
+    dropout_rate: float = 0.0
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Scaled Dot-Product Attention
+
+    The fundamental attention mechanism used in Transformers.
+
+    Formula: Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) * V
+
+    Args:
+        query: Query tensor of shape (..., seq_len_q, d_k)
+        key: Key tensor of shape (..., seq_len_k, d_k)
+        value: Value tensor of shape (..., seq_len_v, d_v)
+        mask: Optional mask tensor of shape (..., seq_len_q, seq_len_k)
+              Values should be 0 (keep) or -inf (mask out)
+        dropout_rate: Dropout rate for attention weights (default: 0.0)
+
+    Returns:
+        Tuple of (attention_output, attention_weights)
+        - attention_output: shape (..., seq_len_q, d_v)
+        - attention_weights: shape (..., seq_len_q, seq_len_k)
+
+    Example:
+        >>> # Single head attention
+        >>> q = np.random.randn(32, 10, 64)  # (batch, seq_len, d_k)
+        >>> k = np.random.randn(32, 10, 64)
+        >>> v = np.random.randn(32, 10, 64)
+        >>> output, weights = scaled_dot_product_attention(q, k, v)
+        >>> print(output.shape)  # (32, 10, 64)
+        >>> print(weights.shape)  # (32, 10, 10)
+    """
+    # Get dimension for scaling
+    d_k = query.shape[-1]
+
+    # Compute attention scores: Q * K^T / sqrt(d_k)
+    scores = np.matmul(query, key.swapaxes(-2, -1)) / np.sqrt(d_k)
+
+    # Apply mask if provided
+    if mask is not None:
+        scores = scores + mask
+
+    # Apply softmax to get attention weights
+    attention_weights = softmax(scores, axis=-1)
+
+    # Apply dropout if specified
+    if dropout_rate > 0.0:
+        attention_weights = dropout(attention_weights, dropout_rate)
+
+    # Compute weighted sum of values
+    output = np.matmul(attention_weights, value)
+
+    return output, attention_weights
+
+
+def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
+    """
+    Numerically stable softmax
+
+    Args:
+        x: Input array
+        axis: Axis along which to compute softmax
+
+    Returns:
+        Softmax probabilities
+    """
+    # Subtract max for numerical stability
+    x_shifted = x - np.max(x, axis=axis, keepdims=True)
+    exp_x = np.exp(x_shifted)
+    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)
+
+
+def dropout(x: np.ndarray, rate: float) -> np.ndarray:
+    """
+    Apply dropout (for training)
+
+    Args:
+        x: Input array
+        rate: Dropout rate (probability of dropping)
+
+    Returns:
+        Array with dropout applied
+    """
+    if rate <= 0.0 or rate >= 1.0:
+        return x
+
+    mask = np.random.binomial(1, 1 - rate, size=x.shape)
+    return x * mask / (1 - rate)
+
+
+# ============================================================================
+# MULTI-HEAD ATTENTION
+# ============================================================================
+
+def multi_head_attention(
+    query: np.ndarray,
+    key: np.ndarray,
+    value: np.ndarray,
+    num_heads: int,
+    d_model: int,
+    mask: Optional[np.ndarray] = None,
+    dropout_rate: float = 0.0
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Multi-Head Attention
+
+    Applies multiple attention heads in parallel, allowing the model to attend
+    to information from different representation subspaces.
+
+    Formula: MultiHead(Q,K,V) = Concat(head_1,...,head_h)W^O
+             where head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)
+
+    Args:
+        query: Query tensor of shape (batch, seq_len_q, d_model)
+        key: Key tensor of shape (batch, seq_len_k, d_model)
+        value: Value tensor of shape (batch, seq_len_v, d_model)
+        num_heads: Number of attention heads
+        d_model: Model dimension (must be divisible by num_heads)
+        mask: Optional mask tensor
+        dropout_rate: Dropout rate for attention weights
+
+    Returns:
+        Tuple of (output, attention_weights)
+        - output: shape (batch, seq_len_q, d_model)
+        - attention_weights: shape (batch, num_heads, seq_len_q, seq_len_k)
+
+    Example:
+        >>> q = np.random.randn(32, 10, 512)  # (batch, seq_len, d_model)
+        >>> k = np.random.randn(32, 10, 512)
+        >>> v = np.random.randn(32, 10, 512)
+        >>> output, weights = multi_head_attention(q, k, v, num_heads=8, d_model=512)
+        >>> print(output.shape)  # (32, 10, 512)
+        >>> print(weights.shape)  # (32, 8, 10, 10)
+    """
+    if d_model % num_heads != 0:
+        raise ValueError(f"d_model ({d_model}) must be divisible by num_heads ({num_heads})")
+
+    batch_size = query.shape[0]
+    seq_len_q = query.shape[1]
+    seq_len_k = key.shape[1]
+
+    # Dimension per head
+    d_k = d_model // num_heads
+
+    # Initialize projection weights (in practice, these would be learned)
+    W_q = np.random.randn(d_model, d_model) * 0.01
+    W_k = np.random.randn(d_model, d_model) * 0.01
+    W_v = np.random.randn(d_model, d_model) * 0.01
+    W_o = np.random.randn(d_model, d_model) * 0.01
+
+    # Linear projections
+    Q = np.matmul(query, W_q)  # (batch, seq_len_q, d_model)
+    K = np.matmul(key, W_k)    # (batch, seq_len_k, d_model)
+    V = np.matmul(value, W_v)  # (batch, seq_len_v, d_model)
+
+    # Split into multiple heads
+    # Reshape: (batch, seq_len, d_model) -> (batch, seq_len, num_heads, d_k)
+    Q = Q.reshape(batch_size, seq_len_q, num_heads, d_k)
+    K = K.reshape(batch_size, seq_len_k, num_heads, d_k)
+    V = V.reshape(batch_size, seq_len_k, num_heads, d_k)
+
+    # Transpose: (batch, seq_len, num_heads, d_k) -> (batch, num_heads, seq_len, d_k)
+    Q = Q.transpose(0, 2, 1, 3)
+    K = K.transpose(0, 2, 1, 3)
+    V = V.transpose(0, 2, 1, 3)
+
+    # Apply scaled dot-product attention for each head
+    attention_output, attention_weights = scaled_dot_product_attention(
+        Q, K, V, mask=mask, dropout_rate=dropout_rate
+    )
+
+    # Concatenate heads
+    # Transpose back: (batch, num_heads, seq_len_q, d_k) -> (batch, seq_len_q, num_heads, d_k)
+    attention_output = attention_output.transpose(0, 2, 1, 3)
+
+    # Reshape: (batch, seq_len_q, num_heads, d_k) -> (batch, seq_len_q, d_model)
+    attention_output = attention_output.reshape(batch_size, seq_len_q, d_model)
+
+    # Final linear projection
+    output = np.matmul(attention_output, W_o)
+
+    return output, attention_weights
+
+
+# ============================================================================
+# SELF-ATTENTION
+# ============================================================================
+
+def self_attention(
+    x: np.ndarray,
+    d_model: int,
+    mask: Optional[np.ndarray] = None,
+    dropout_rate: float = 0.0
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Self-Attention
+
+    Special case where query, key, and value all come from the same input.
+    Used in BERT and GPT.
+
+    Args:
+        x: Input tensor of shape (batch, seq_len, d_model)
+        d_model: Model dimension
+        mask: Optional mask tensor
+        dropout_rate: Dropout rate
+
+    Returns:
+        Tuple of (output, attention_weights)
+
+    Example:
+        >>> x = np.random.randn(32, 10, 512)
+        >>> output, weights = self_attention(x, d_model=512)
+        >>> print(output.shape)  # (32, 10, 512)
+    """
+    # Use same input for Q, K, V
+    return scaled_dot_product_attention(x, x, x, mask=mask, dropout_rate=dropout_rate)
+
+
+def multi_head_self_attention(
+    x: np.ndarray,
+    num_heads: int,
+    d_model: int,
+    mask: Optional[np.ndarray] = None,
+    dropout_rate: float = 0.0
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Multi-Head Self-Attention
+
+    Combines multi-head attention with self-attention.
+    Standard building block in Transformers.
+
+    Args:
+        x: Input tensor of shape (batch, seq_len, d_model)
+        num_heads: Number of attention heads
+        d_model: Model dimension
+        mask: Optional mask tensor
+        dropout_rate: Dropout rate
+
+    Returns:
+        Tuple of (output, attention_weights)
+
+    Example:
+        >>> x = np.random.randn(32, 10, 512)
+        >>> output, weights = multi_head_self_attention(x, num_heads=8, d_model=512)
+        >>> print(output.shape)  # (32, 10, 512)
+    """
+    return multi_head_attention(x, x, x, num_heads, d_model, mask, dropout_rate)
+
+
+# ============================================================================
+# CROSS-ATTENTION
+# ============================================================================
+
+def cross_attention(
+    query: np.ndarray,
+    context: np.ndarray,
+    d_model: int,
+    mask: Optional[np.ndarray] = None,
+    dropout_rate: float = 0.0
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Cross-Attention
+
+    Attention between two different sequences. Query comes from one sequence,
+    while key and value come from another (context).
+
+    Used in encoder-decoder architectures and multimodal models.
+
+    Args:
+        query: Query tensor of shape (batch, seq_len_q, d_model)
+        context: Context tensor of shape (batch, seq_len_c, d_model)
+        d_model: Model dimension
+        mask: Optional mask tensor
+        dropout_rate: Dropout rate
+
+    Returns:
+        Tuple of (output, attention_weights)
+
+    Example:
+        >>> # Decoder attending to encoder
+        >>> decoder_out = np.random.randn(32, 10, 512)
+        >>> encoder_out = np.random.randn(32, 20, 512)
+        >>> output, weights = cross_attention(decoder_out, encoder_out, d_model=512)
+        >>> print(output.shape)  # (32, 10, 512)
+        >>> print(weights.shape)  # (32, 10, 20)
+    """
+    # Query from first sequence, Key and Value from context
+    return scaled_dot_product_attention(
+        query, context, context, mask=mask, dropout_rate=dropout_rate
+    )
+
+
+# ============================================================================
+# ATTENTION MASKS
+# ============================================================================
+
+def create_padding_mask(seq: np.ndarray, pad_token: int = 0) -> np.ndarray:
+    """
+    Create padding mask for sequences with padding tokens
+
+    Args:
+        seq: Sequence tensor of shape (batch, seq_len)
+        pad_token: Token ID used for padding (default: 0)
+
+    Returns:
+        Mask tensor of shape (batch, 1, 1, seq_len)
+        Values are 0 (keep) or -inf (mask out)
+
+    Example:
+        >>> seq = np.array([[1, 2, 3, 0, 0], [1, 2, 0, 0, 0]])
+        >>> mask = create_padding_mask(seq, pad_token=0)
+        >>> print(mask.shape)  # (2, 1, 1, 5)
+    """
+    # Create mask: 1 for padding tokens, 0 for real tokens
+    mask = (seq == pad_token).astype(np.float32)
+
+    # Add dimensions for broadcasting
+    # (batch, seq_len) -> (batch, 1, 1, seq_len)
+    mask = mask[:, np.newaxis, np.newaxis, :]
+
+    # Convert to -inf for masked positions
+    mask = mask * -1e9
+
+    return mask
+
+
+def create_causal_mask(seq_len: int) -> np.ndarray:
+    """
+    Create causal (look-ahead) mask for autoregressive models
+
+    Prevents positions from attending to subsequent positions.
+    Used in GPT and other autoregressive models.
+
+    Args:
+        seq_len: Sequence length
+
+    Returns:
+        Mask tensor of shape (1, 1, seq_len, seq_len)
+        Upper triangle is -inf (masked), lower triangle is 0 (keep)
+
+    Example:
+        >>> mask = create_causal_mask(5)
+        >>> print(mask.shape)  # (1, 1, 5, 5)
+        >>> # Position i can only attend to positions <= i
+    """
+    # Create upper triangular matrix of 1s
+    mask = np.triu(np.ones((seq_len, seq_len)), k=1)
+
+    # Add batch and head dimensions
+    mask = mask[np.newaxis, np.newaxis, :, :]
+
+    # Convert to -inf for masked positions
+    mask = mask * -1e9
+
+    return mask
+
+
+def create_look_ahead_mask(seq_len: int) -> np.ndarray:
+    """
+    Alias for create_causal_mask
+
+    Args:
+        seq_len: Sequence length
+
+    Returns:
+        Causal mask tensor
+    """
+    return create_causal_mask(seq_len)
+
+
+# ============================================================================
+# POSITIONAL ENCODING
+# ============================================================================
+
+def positional_encoding(
+    seq_len: int,
+    d_model: int,
+    n: int = 10000
+) -> np.ndarray:
+    """
+    Sinusoidal Positional Encoding
+
+    Adds position information to embeddings using sine and cosine functions.
+    Used in original Transformer paper.
+
+    Formula:
+        PE(pos, 2i) = sin(pos / n^(2i/d_model))
+        PE(pos, 2i+1) = cos(pos / n^(2i/d_model))
+
+    Args:
+        seq_len: Maximum sequence length
+        d_model: Model dimension (embedding size)
+        n: Base for positional encoding (default: 10000)
+
+    Returns:
+        Positional encoding tensor of shape (seq_len, d_model)
+
+    Example:
+        >>> pos_enc = positional_encoding(seq_len=100, d_model=512)
+        >>> print(pos_enc.shape)  # (100, 512)
+        >>>
+        >>> # Add to embeddings
+        >>> embeddings = np.random.randn(32, 100, 512)
+        >>> embeddings_with_pos = embeddings + pos_enc
+    """
+    # Create position indices
+    position = np.arange(seq_len)[:, np.newaxis]  # (seq_len, 1)
+
+    # Create dimension indices
+    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(n) / d_model))
+
+    # Initialize positional encoding matrix
+    pos_encoding = np.zeros((seq_len, d_model))
+
+    # Apply sine to even indices
+    pos_encoding[:, 0::2] = np.sin(position * div_term)
+
+    # Apply cosine to odd indices
+    pos_encoding[:, 1::2] = np.cos(position * div_term)
+
+    return pos_encoding
+
+
+def learned_positional_encoding(
+    seq_len: int,
+    d_model: int
+) -> np.ndarray:
+    """
+    Learned Positional Encoding
+
+    Alternative to sinusoidal encoding where positions are learned parameters.
+    Used in BERT and GPT.
+
+    Args:
+        seq_len: Maximum sequence length
+        d_model: Model dimension
+
+    Returns:
+        Initialized positional encoding tensor of shape (seq_len, d_model)
+
+    Example:
+        >>> pos_enc = learned_positional_encoding(seq_len=512, d_model=768)
+        >>> print(pos_enc.shape)  # (512, 768)
+    """
+    # Initialize with small random values (would be learned during training)
+    return np.random.randn(seq_len, d_model) * 0.01
+
+
+# ============================================================================
+# UTILITY FUNCTIONS
+# ============================================================================
+
+def attention_score_visualization(
+    attention_weights: np.ndarray,
+    tokens: Optional[list] = None
+) -> dict:
+    """
+    Prepare attention weights for visualization
+
+    Args:
+        attention_weights: Attention weights of shape (batch, num_heads, seq_len, seq_len)
+                           or (batch, seq_len, seq_len)
+        tokens: Optional list of token strings for labeling
+
+    Returns:
+        Dictionary with visualization data
+
+    Example:
+        >>> weights = np.random.rand(1, 8, 10, 10)
+        >>> tokens = ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']
+        >>> viz_data = attention_score_visualization(weights, tokens)
+    """
+    # Average across heads if multi-head
+    if attention_weights.ndim == 4:
+        avg_weights = np.mean(attention_weights, axis=1)  # (batch, seq_len, seq_len)
+    else:
+        avg_weights = attention_weights
+
+    # Take first batch
+    weights_2d = avg_weights[0]
+
+    return {
+        'weights': weights_2d,
+        'tokens': tokens,
+        'shape': weights_2d.shape,
+        'max_attention': np.max(weights_2d),
+        'min_attention': np.min(weights_2d)
+    }
+
+
+# Aliases for convenience
+sdp_attention = scaled_dot_product_attention
+mha = multi_head_attention
+self_attn = self_attention
+cross_attn = cross_attention
+pos_encoding = positional_encoding
+causal_mask = create_causal_mask
+padding_mask = create_padding_mask