ilovetools 0.2.17__tar.gz → 0.2.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {ilovetools-0.2.17/ilovetools.egg-info → ilovetools-0.2.19}/PKG-INFO +2 -2
  2. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/__init__.py +2 -2
  3. ilovetools-0.2.19/ilovetools/ml/attention.py +525 -0
  4. ilovetools-0.2.19/ilovetools/ml/cnn.py +619 -0
  5. {ilovetools-0.2.17 → ilovetools-0.2.19/ilovetools.egg-info}/PKG-INFO +2 -2
  6. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools.egg-info/SOURCES.txt +4 -0
  7. {ilovetools-0.2.17 → ilovetools-0.2.19}/pyproject.toml +2 -2
  8. {ilovetools-0.2.17 → ilovetools-0.2.19}/setup.py +2 -2
  9. ilovetools-0.2.19/tests/test_attention.py +466 -0
  10. ilovetools-0.2.19/tests/test_cnn.py +394 -0
  11. {ilovetools-0.2.17 → ilovetools-0.2.19}/LICENSE +0 -0
  12. {ilovetools-0.2.17 → ilovetools-0.2.19}/MANIFEST.in +0 -0
  13. {ilovetools-0.2.17 → ilovetools-0.2.19}/README.md +0 -0
  14. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ai/__init__.py +0 -0
  15. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ai/embeddings.py +0 -0
  16. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ai/inference.py +0 -0
  17. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ai/llm_helpers.py +0 -0
  18. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/audio/__init__.py +0 -0
  19. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/automation/__init__.py +0 -0
  20. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/automation/file_organizer.py +0 -0
  21. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/conversion/__init__.py +0 -0
  22. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/conversion/config_converter.py +0 -0
  23. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/conversion/config_converter_fixed_header.py +0 -0
  24. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/data/__init__.py +0 -0
  25. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/data/feature_engineering.py +0 -0
  26. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/data/preprocessing.py +0 -0
  27. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/database/__init__.py +0 -0
  28. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/datetime/__init__.py +0 -0
  29. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/email/__init__.py +0 -0
  30. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/email/template_engine.py +0 -0
  31. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/files/__init__.py +0 -0
  32. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/image/__init__.py +0 -0
  33. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/__init__.py +0 -0
  34. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/activations.py +0 -0
  35. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/anomaly_detection.py +0 -0
  36. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/clustering.py +0 -0
  37. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/cross_validation.py +0 -0
  38. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/dimensionality.py +0 -0
  39. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/ensemble.py +0 -0
  40. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/feature_selection.py +0 -0
  41. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/gradient_descent.py +0 -0
  42. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/imbalanced.py +0 -0
  43. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/interpretation.py +0 -0
  44. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/loss_functions.py +0 -0
  45. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/metrics.py +0 -0
  46. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/neural_network.py +0 -0
  47. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/normalization.py +0 -0
  48. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/optimizers.py +0 -0
  49. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/pipeline.py +0 -0
  50. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/regularization.py +0 -0
  51. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/timeseries.py +0 -0
  52. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/ml/tuning.py +0 -0
  53. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/security/__init__.py +0 -0
  54. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/security/password_checker.py +0 -0
  55. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/text/__init__.py +0 -0
  56. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/utils/__init__.py +0 -0
  57. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/utils/cache_system.py +0 -0
  58. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/utils/logger.py +0 -0
  59. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/utils/rate_limiter.py +0 -0
  60. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/utils/retry.py +0 -0
  61. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/validation/__init__.py +0 -0
  62. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/validation/data_validator.py +0 -0
  63. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/web/__init__.py +0 -0
  64. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/web/scraper.py +0 -0
  65. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools/web/url_shortener.py +0 -0
  66. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools.egg-info/dependency_links.txt +0 -0
  67. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools.egg-info/requires.txt +0 -0
  68. {ilovetools-0.2.17 → ilovetools-0.2.19}/ilovetools.egg-info/top_level.txt +0 -0
  69. {ilovetools-0.2.17 → ilovetools-0.2.19}/requirements.txt +0 -0
  70. {ilovetools-0.2.17 → ilovetools-0.2.19}/setup.cfg +0 -0
  71. {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/__init__.py +0 -0
  72. {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_activations.py +0 -0
  73. {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_gradient_descent.py +0 -0
  74. {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_loss_functions.py +0 -0
  75. {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_neural_network.py +0 -0
  76. {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_normalization.py +0 -0
  77. {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_optimizers.py +0 -0
  78. {ilovetools-0.2.17 → ilovetools-0.2.19}/tests/test_regularization.py +0 -0
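Version 0.2.19 adds two new ML modules, ilovetools/ml/attention.py and ilovetools/ml/cnn.py, together with their test suites (tests/test_attention.py, tests/test_cnn.py). A minimal post-upgrade smoke test, assuming the sdist layout maps to the usual ilovetools.ml import path (an assumption; the diff itself only lists file paths):

# Hypothetical smoke test for the 0.2.19 release; the import path is assumed
# from the file layout above.
#   pip install --upgrade ilovetools==0.2.19
from ilovetools.ml import attention, cnn  # cnn imported only to confirm it loads

print(hasattr(attention, "scaled_dot_product_attention"))  # True per attention.py below
print(hasattr(attention, "multi_head_attention"))          # True per attention.py below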
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ilovetools
- Version: 0.2.17
+ Version: 0.2.19
  Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
  Home-page: https://github.com/AliMehdi512/ilovetools
  Author: Ali Mehdi
@@ -11,7 +11,7 @@ Project-URL: Repository, https://github.com/AliMehdi512/ilovetools
  Project-URL: Issues, https://github.com/AliMehdi512/ilovetools/issues
  Project-URL: Bug Reports, https://github.com/AliMehdi512/ilovetools/issues
  Project-URL: Source, https://github.com/AliMehdi512/ilovetools
- Keywords: utilities,tools,ai,ml,data-processing,automation,batch-normalization,layer-normalization,group-normalization,instance-normalization,weight-normalization,deep-learning,transformers
+ Keywords: utilities,tools,ai,ml,data-processing,automation,cnn,convolutional-neural-networks,conv2d,pooling,computer-vision,image-processing
  Classifier: Development Status :: 3 - Alpha
  Classifier: Intended Audience :: Developers
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
@@ -2,8 +2,8 @@
  ilovetools - A comprehensive Python utility library
  """
 
- __version__ = "0.2.16"
- # release marker: 0.2.16
+ __version__ = "0.2.18"
+ # release marker: 0.2.18
  __author__ = "Ali Mehdi"
  __email__ = "ali.mehdi.dev579@gmail.com"
 
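Note that the runtime marker above moves to 0.2.18 while the sdist itself is 0.2.19, so ilovetools.__version__ lags the distribution version by one patch release. A quick way to see both values, assuming a standard pip install:

# Compare the module-level marker with the installed distribution metadata.
from importlib.metadata import version
import ilovetools

print(ilovetools.__version__)   # "0.2.18", per the hunk above
print(version("ilovetools"))    # "0.2.19", from the package metadata (PKG-INFO)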
@@ -0,0 +1,525 @@
+ """
+ Attention Mechanisms for Neural Networks
+
+ This module provides various attention mechanisms used in deep learning:
+ - Scaled Dot-Product Attention
+ - Multi-Head Attention
+ - Self-Attention
+ - Cross-Attention
+ - Causal/Masked Attention
+ - Positional Encoding
+ - Attention Masks
+
+ All attention functions support batched operations and are optimized for Transformers.
+ """
+
+ import numpy as np
+ from typing import Tuple, Optional, Union
+
+
+ # ============================================================================
+ # SCALED DOT-PRODUCT ATTENTION
+ # ============================================================================
+
+ def scaled_dot_product_attention(
+     query: np.ndarray,
+     key: np.ndarray,
+     value: np.ndarray,
+     mask: Optional[np.ndarray] = None,
+     dropout_rate: float = 0.0
+ ) -> Tuple[np.ndarray, np.ndarray]:
+     """
+     Scaled Dot-Product Attention
+
+     The fundamental attention mechanism used in Transformers.
+
+     Formula: Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) * V
+
+     Args:
+         query: Query tensor of shape (..., seq_len_q, d_k)
+         key: Key tensor of shape (..., seq_len_k, d_k)
+         value: Value tensor of shape (..., seq_len_v, d_v)
+         mask: Optional mask tensor of shape (..., seq_len_q, seq_len_k)
+               Values should be 0 (keep) or -inf (mask out)
+         dropout_rate: Dropout rate for attention weights (default: 0.0)
+
+     Returns:
+         Tuple of (attention_output, attention_weights)
+         - attention_output: shape (..., seq_len_q, d_v)
+         - attention_weights: shape (..., seq_len_q, seq_len_k)
+
+     Example:
+         >>> # Single head attention
+         >>> q = np.random.randn(32, 10, 64) # (batch, seq_len, d_k)
+         >>> k = np.random.randn(32, 10, 64)
+         >>> v = np.random.randn(32, 10, 64)
+         >>> output, weights = scaled_dot_product_attention(q, k, v)
+         >>> print(output.shape) # (32, 10, 64)
+         >>> print(weights.shape) # (32, 10, 10)
+     """
+     # Get dimension for scaling
+     d_k = query.shape[-1]
+
+     # Compute attention scores: Q * K^T / sqrt(d_k)
+     scores = np.matmul(query, key.swapaxes(-2, -1)) / np.sqrt(d_k)
+
+     # Apply mask if provided
+     if mask is not None:
+         scores = scores + mask
+
+     # Apply softmax to get attention weights
+     attention_weights = softmax(scores, axis=-1)
+
+     # Apply dropout if specified
+     if dropout_rate > 0.0:
+         attention_weights = dropout(attention_weights, dropout_rate)
+
+     # Compute weighted sum of values
+     output = np.matmul(attention_weights, value)
+
+     return output, attention_weights
+
+
+ def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
+     """
+     Numerically stable softmax
+
+     Args:
+         x: Input array
+         axis: Axis along which to compute softmax
+
+     Returns:
+         Softmax probabilities
+     """
+     # Subtract max for numerical stability
+     x_shifted = x - np.max(x, axis=axis, keepdims=True)
+     exp_x = np.exp(x_shifted)
+     return exp_x / np.sum(exp_x, axis=axis, keepdims=True)
+
+
+ def dropout(x: np.ndarray, rate: float) -> np.ndarray:
+     """
+     Apply dropout (for training)
+
+     Args:
+         x: Input array
+         rate: Dropout rate (probability of dropping)
+
+     Returns:
+         Array with dropout applied
+     """
+     if rate <= 0.0 or rate >= 1.0:
+         return x
+
+     mask = np.random.binomial(1, 1 - rate, size=x.shape)
+     return x * mask / (1 - rate)
+
+
+ # ============================================================================
+ # MULTI-HEAD ATTENTION
+ # ============================================================================
+
+ def multi_head_attention(
+     query: np.ndarray,
+     key: np.ndarray,
+     value: np.ndarray,
+     num_heads: int,
+     d_model: int,
+     mask: Optional[np.ndarray] = None,
+     dropout_rate: float = 0.0
+ ) -> Tuple[np.ndarray, np.ndarray]:
+     """
+     Multi-Head Attention
+
+     Applies multiple attention heads in parallel, allowing the model to attend
+     to information from different representation subspaces.
+
+     Formula: MultiHead(Q,K,V) = Concat(head_1,...,head_h)W^O
+              where head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)
+
+     Args:
+         query: Query tensor of shape (batch, seq_len_q, d_model)
+         key: Key tensor of shape (batch, seq_len_k, d_model)
+         value: Value tensor of shape (batch, seq_len_v, d_model)
+         num_heads: Number of attention heads
+         d_model: Model dimension (must be divisible by num_heads)
+         mask: Optional mask tensor
+         dropout_rate: Dropout rate for attention weights
+
+     Returns:
+         Tuple of (output, attention_weights)
+         - output: shape (batch, seq_len_q, d_model)
+         - attention_weights: shape (batch, num_heads, seq_len_q, seq_len_k)
+
+     Example:
+         >>> q = np.random.randn(32, 10, 512) # (batch, seq_len, d_model)
+         >>> k = np.random.randn(32, 10, 512)
+         >>> v = np.random.randn(32, 10, 512)
+         >>> output, weights = multi_head_attention(q, k, v, num_heads=8, d_model=512)
+         >>> print(output.shape) # (32, 10, 512)
+         >>> print(weights.shape) # (32, 8, 10, 10)
+     """
+     if d_model % num_heads != 0:
+         raise ValueError(f"d_model ({d_model}) must be divisible by num_heads ({num_heads})")
+
+     batch_size = query.shape[0]
+     seq_len_q = query.shape[1]
+     seq_len_k = key.shape[1]
+
+     # Dimension per head
+     d_k = d_model // num_heads
+
+     # Initialize projection weights (in practice, these would be learned)
+     W_q = np.random.randn(d_model, d_model) * 0.01
+     W_k = np.random.randn(d_model, d_model) * 0.01
+     W_v = np.random.randn(d_model, d_model) * 0.01
+     W_o = np.random.randn(d_model, d_model) * 0.01
+
+     # Linear projections
+     Q = np.matmul(query, W_q) # (batch, seq_len_q, d_model)
+     K = np.matmul(key, W_k) # (batch, seq_len_k, d_model)
+     V = np.matmul(value, W_v) # (batch, seq_len_v, d_model)
+
+     # Split into multiple heads
+     # Reshape: (batch, seq_len, d_model) -> (batch, seq_len, num_heads, d_k)
+     Q = Q.reshape(batch_size, seq_len_q, num_heads, d_k)
+     K = K.reshape(batch_size, seq_len_k, num_heads, d_k)
+     V = V.reshape(batch_size, seq_len_k, num_heads, d_k)
+
+     # Transpose: (batch, seq_len, num_heads, d_k) -> (batch, num_heads, seq_len, d_k)
+     Q = Q.transpose(0, 2, 1, 3)
+     K = K.transpose(0, 2, 1, 3)
+     V = V.transpose(0, 2, 1, 3)
+
+     # Apply scaled dot-product attention for each head
+     attention_output, attention_weights = scaled_dot_product_attention(
+         Q, K, V, mask=mask, dropout_rate=dropout_rate
+     )
+
+     # Concatenate heads
+     # Transpose back: (batch, num_heads, seq_len_q, d_k) -> (batch, seq_len_q, num_heads, d_k)
+     attention_output = attention_output.transpose(0, 2, 1, 3)
+
+     # Reshape: (batch, seq_len_q, num_heads, d_k) -> (batch, seq_len_q, d_model)
+     attention_output = attention_output.reshape(batch_size, seq_len_q, d_model)
+
+     # Final linear projection
+     output = np.matmul(attention_output, W_o)
+
+     return output, attention_weights
+
+
+ # ============================================================================
+ # SELF-ATTENTION
+ # ============================================================================
+
+ def self_attention(
+     x: np.ndarray,
+     d_model: int,
+     mask: Optional[np.ndarray] = None,
+     dropout_rate: float = 0.0
+ ) -> Tuple[np.ndarray, np.ndarray]:
+     """
+     Self-Attention
+
+     Special case where query, key, and value all come from the same input.
+     Used in BERT and GPT.
+
+     Args:
+         x: Input tensor of shape (batch, seq_len, d_model)
+         d_model: Model dimension
+         mask: Optional mask tensor
+         dropout_rate: Dropout rate
+
+     Returns:
+         Tuple of (output, attention_weights)
+
+     Example:
+         >>> x = np.random.randn(32, 10, 512)
+         >>> output, weights = self_attention(x, d_model=512)
+         >>> print(output.shape) # (32, 10, 512)
+     """
+     # Use same input for Q, K, V
+     return scaled_dot_product_attention(x, x, x, mask=mask, dropout_rate=dropout_rate)
+
+
+ def multi_head_self_attention(
+     x: np.ndarray,
+     num_heads: int,
+     d_model: int,
+     mask: Optional[np.ndarray] = None,
+     dropout_rate: float = 0.0
+ ) -> Tuple[np.ndarray, np.ndarray]:
+     """
+     Multi-Head Self-Attention
+
+     Combines multi-head attention with self-attention.
+     Standard building block in Transformers.
+
+     Args:
+         x: Input tensor of shape (batch, seq_len, d_model)
+         num_heads: Number of attention heads
+         d_model: Model dimension
+         mask: Optional mask tensor
+         dropout_rate: Dropout rate
+
+     Returns:
+         Tuple of (output, attention_weights)
+
+     Example:
+         >>> x = np.random.randn(32, 10, 512)
+         >>> output, weights = multi_head_self_attention(x, num_heads=8, d_model=512)
+         >>> print(output.shape) # (32, 10, 512)
+     """
+     return multi_head_attention(x, x, x, num_heads, d_model, mask, dropout_rate)
+
+
+ # ============================================================================
+ # CROSS-ATTENTION
+ # ============================================================================
+
+ def cross_attention(
+     query: np.ndarray,
+     context: np.ndarray,
+     d_model: int,
+     mask: Optional[np.ndarray] = None,
+     dropout_rate: float = 0.0
+ ) -> Tuple[np.ndarray, np.ndarray]:
+     """
+     Cross-Attention
+
+     Attention between two different sequences. Query comes from one sequence,
+     while key and value come from another (context).
+
+     Used in encoder-decoder architectures and multimodal models.
+
+     Args:
+         query: Query tensor of shape (batch, seq_len_q, d_model)
+         context: Context tensor of shape (batch, seq_len_c, d_model)
+         d_model: Model dimension
+         mask: Optional mask tensor
+         dropout_rate: Dropout rate
+
+     Returns:
+         Tuple of (output, attention_weights)
+
+     Example:
+         >>> # Decoder attending to encoder
+         >>> decoder_out = np.random.randn(32, 10, 512)
+         >>> encoder_out = np.random.randn(32, 20, 512)
+         >>> output, weights = cross_attention(decoder_out, encoder_out, d_model=512)
+         >>> print(output.shape) # (32, 10, 512)
+         >>> print(weights.shape) # (32, 10, 20)
+     """
+     # Query from first sequence, Key and Value from context
+     return scaled_dot_product_attention(
+         query, context, context, mask=mask, dropout_rate=dropout_rate
+     )
+
+
+ # ============================================================================
+ # ATTENTION MASKS
+ # ============================================================================
+
+ def create_padding_mask(seq: np.ndarray, pad_token: int = 0) -> np.ndarray:
+     """
+     Create padding mask for sequences with padding tokens
+
+     Args:
+         seq: Sequence tensor of shape (batch, seq_len)
+         pad_token: Token ID used for padding (default: 0)
+
+     Returns:
+         Mask tensor of shape (batch, 1, 1, seq_len)
+         Values are 0 (keep) or -inf (mask out)
+
+     Example:
+         >>> seq = np.array([[1, 2, 3, 0, 0], [1, 2, 0, 0, 0]])
+         >>> mask = create_padding_mask(seq, pad_token=0)
+         >>> print(mask.shape) # (2, 1, 1, 5)
+     """
+     # Create mask: 1 for padding tokens, 0 for real tokens
+     mask = (seq == pad_token).astype(np.float32)
+
+     # Add dimensions for broadcasting
+     # (batch, seq_len) -> (batch, 1, 1, seq_len)
+     mask = mask[:, np.newaxis, np.newaxis, :]
+
+     # Convert to -inf for masked positions
+     mask = mask * -1e9
+
+     return mask
+
+
+ def create_causal_mask(seq_len: int) -> np.ndarray:
+     """
+     Create causal (look-ahead) mask for autoregressive models
+
+     Prevents positions from attending to subsequent positions.
+     Used in GPT and other autoregressive models.
+
+     Args:
+         seq_len: Sequence length
+
+     Returns:
+         Mask tensor of shape (1, 1, seq_len, seq_len)
+         Upper triangle is -inf (masked), lower triangle is 0 (keep)
+
+     Example:
+         >>> mask = create_causal_mask(5)
+         >>> print(mask.shape) # (1, 1, 5, 5)
+         >>> # Position i can only attend to positions <= i
+     """
+     # Create upper triangular matrix of 1s
+     mask = np.triu(np.ones((seq_len, seq_len)), k=1)
+
+     # Add batch and head dimensions
+     mask = mask[np.newaxis, np.newaxis, :, :]
+
+     # Convert to -inf for masked positions
+     mask = mask * -1e9
+
+     return mask
+
+
+ def create_look_ahead_mask(seq_len: int) -> np.ndarray:
+     """
+     Alias for create_causal_mask
+
+     Args:
+         seq_len: Sequence length
+
+     Returns:
+         Causal mask tensor
+     """
+     return create_causal_mask(seq_len)
+
+
+ # ============================================================================
+ # POSITIONAL ENCODING
+ # ============================================================================
+
+ def positional_encoding(
+     seq_len: int,
+     d_model: int,
+     n: int = 10000
+ ) -> np.ndarray:
+     """
+     Sinusoidal Positional Encoding
+
+     Adds position information to embeddings using sine and cosine functions.
+     Used in original Transformer paper.
+
+     Formula:
+         PE(pos, 2i) = sin(pos / n^(2i/d_model))
+         PE(pos, 2i+1) = cos(pos / n^(2i/d_model))
+
+     Args:
+         seq_len: Maximum sequence length
+         d_model: Model dimension (embedding size)
+         n: Base for positional encoding (default: 10000)
+
+     Returns:
+         Positional encoding tensor of shape (seq_len, d_model)
+
+     Example:
+         >>> pos_enc = positional_encoding(seq_len=100, d_model=512)
+         >>> print(pos_enc.shape) # (100, 512)
+         >>>
+         >>> # Add to embeddings
+         >>> embeddings = np.random.randn(32, 100, 512)
+         >>> embeddings_with_pos = embeddings + pos_enc
+     """
+     # Create position indices
+     position = np.arange(seq_len)[:, np.newaxis] # (seq_len, 1)
+
+     # Create dimension indices
+     div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(n) / d_model))
+
+     # Initialize positional encoding matrix
+     pos_encoding = np.zeros((seq_len, d_model))
+
+     # Apply sine to even indices
+     pos_encoding[:, 0::2] = np.sin(position * div_term)
+
+     # Apply cosine to odd indices
+     pos_encoding[:, 1::2] = np.cos(position * div_term)
+
+     return pos_encoding
+
+
+ def learned_positional_encoding(
+     seq_len: int,
+     d_model: int
+ ) -> np.ndarray:
+     """
+     Learned Positional Encoding
+
+     Alternative to sinusoidal encoding where positions are learned parameters.
+     Used in BERT and GPT.
+
+     Args:
+         seq_len: Maximum sequence length
+         d_model: Model dimension
+
+     Returns:
+         Initialized positional encoding tensor of shape (seq_len, d_model)
+
+     Example:
+         >>> pos_enc = learned_positional_encoding(seq_len=512, d_model=768)
+         >>> print(pos_enc.shape) # (512, 768)
+     """
+     # Initialize with small random values (would be learned during training)
+     return np.random.randn(seq_len, d_model) * 0.01
+
+
+ # ============================================================================
+ # UTILITY FUNCTIONS
+ # ============================================================================
+
+ def attention_score_visualization(
+     attention_weights: np.ndarray,
+     tokens: Optional[list] = None
+ ) -> dict:
+     """
+     Prepare attention weights for visualization
+
+     Args:
+         attention_weights: Attention weights of shape (batch, num_heads, seq_len, seq_len)
+                            or (batch, seq_len, seq_len)
+         tokens: Optional list of token strings for labeling
+
+     Returns:
+         Dictionary with visualization data
+
+     Example:
+         >>> weights = np.random.rand(1, 8, 10, 10)
+         >>> tokens = ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']
+         >>> viz_data = attention_score_visualization(weights, tokens)
+     """
+     # Average across heads if multi-head
+     if attention_weights.ndim == 4:
+         avg_weights = np.mean(attention_weights, axis=1) # (batch, seq_len, seq_len)
+     else:
+         avg_weights = attention_weights
+
+     # Take first batch
+     weights_2d = avg_weights[0]
+
+     return {
+         'weights': weights_2d,
+         'tokens': tokens,
+         'shape': weights_2d.shape,
+         'max_attention': np.max(weights_2d),
+         'min_attention': np.min(weights_2d)
+     }
+
+
+ # Aliases for convenience
+ sdp_attention = scaled_dot_product_attention
+ mha = multi_head_attention
+ self_attn = self_attention
+ cross_attn = cross_attention
+ pos_encoding = positional_encoding
+ causal_mask = create_causal_mask
+ padding_mask = create_padding_mask
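The docstrings above already include per-function examples; the sketch below chains several of the new helpers into a single GPT-style masked self-attention pass, using only functions defined in this file and assuming the module is importable as ilovetools.ml.attention. Shapes follow the docstrings; the inputs and the projection weights inside multi_head_attention are random, so the numbers are illustrative only.

import numpy as np
from ilovetools.ml.attention import (
    positional_encoding,
    create_causal_mask,
    multi_head_self_attention,
)

batch, seq_len, d_model, num_heads = 2, 10, 512, 8

# Token embeddings plus sinusoidal positions (broadcasts over the batch axis)
x = np.random.randn(batch, seq_len, d_model)
x = x + positional_encoding(seq_len, d_model)

# Causal mask: position i may only attend to positions <= i
mask = create_causal_mask(seq_len)  # shape (1, 1, 10, 10); 0 = keep, -1e9 = masked

output, weights = multi_head_self_attention(
    x, num_heads=num_heads, d_model=d_model, mask=mask
)
print(output.shape)   # (2, 10, 512)
print(weights.shape)  # (2, 8, 10, 10)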