ilovetools 0.2.32__tar.gz → 0.2.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. {ilovetools-0.2.32/ilovetools.egg-info → ilovetools-0.2.33}/PKG-INFO +2 -2
  2. ilovetools-0.2.33/ilovetools/ml/embedding.py +484 -0
  3. {ilovetools-0.2.32 → ilovetools-0.2.33/ilovetools.egg-info}/PKG-INFO +2 -2
  4. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools.egg-info/SOURCES.txt +2 -0
  5. {ilovetools-0.2.32 → ilovetools-0.2.33}/pyproject.toml +2 -2
  6. {ilovetools-0.2.32 → ilovetools-0.2.33}/setup.py +2 -2
  7. ilovetools-0.2.33/tests/test_embedding.py +335 -0
  8. {ilovetools-0.2.32 → ilovetools-0.2.33}/LICENSE +0 -0
  9. {ilovetools-0.2.32 → ilovetools-0.2.33}/MANIFEST.in +0 -0
  10. {ilovetools-0.2.32 → ilovetools-0.2.33}/README.md +0 -0
  11. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/__init__.py +0 -0
  12. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ai/__init__.py +0 -0
  13. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ai/embeddings.py +0 -0
  14. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ai/inference.py +0 -0
  15. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ai/llm_helpers.py +0 -0
  16. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/audio/__init__.py +0 -0
  17. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/automation/__init__.py +0 -0
  18. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/automation/file_organizer.py +0 -0
  19. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/conversion/__init__.py +0 -0
  20. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/conversion/config_converter.py +0 -0
  21. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/conversion/config_converter_fixed_header.py +0 -0
  22. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/data/__init__.py +0 -0
  23. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/data/feature_engineering.py +0 -0
  24. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/data/preprocessing.py +0 -0
  25. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/database/__init__.py +0 -0
  26. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/datetime/__init__.py +0 -0
  27. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/email/__init__.py +0 -0
  28. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/email/template_engine.py +0 -0
  29. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/files/__init__.py +0 -0
  30. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/image/__init__.py +0 -0
  31. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/__init__.py +0 -0
  32. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/activations.py +0 -0
  33. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/anomaly_detection.py +0 -0
  34. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/attention.py +0 -0
  35. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/augmentation.py +0 -0
  36. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/clustering.py +0 -0
  37. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/cnn.py +0 -0
  38. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/convolution.py +0 -0
  39. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/cross_validation.py +0 -0
  40. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/dimensionality.py +0 -0
  41. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/dropout.py +0 -0
  42. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/ensemble.py +0 -0
  43. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/feature_selection.py +0 -0
  44. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/gradient_descent.py +0 -0
  45. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/imbalanced.py +0 -0
  46. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/interpretation.py +0 -0
  47. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/loss_functions.py +0 -0
  48. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/losses.py +0 -0
  49. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/lr_schedulers.py +0 -0
  50. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/metrics.py +0 -0
  51. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/neural_network.py +0 -0
  52. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/normalization.py +0 -0
  53. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/normalization_advanced.py +0 -0
  54. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/optimizers.py +0 -0
  55. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/pipeline.py +0 -0
  56. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/pooling.py +0 -0
  57. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/positional_encoding.py +0 -0
  58. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/recurrent.py +0 -0
  59. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/regularization.py +0 -0
  60. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/rnn.py +0 -0
  61. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/schedulers.py +0 -0
  62. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/timeseries.py +0 -0
  63. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/tuning.py +0 -0
  64. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/weight_init.py +0 -0
  65. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/security/__init__.py +0 -0
  66. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/security/password_checker.py +0 -0
  67. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/text/__init__.py +0 -0
  68. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/utils/__init__.py +0 -0
  69. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/utils/cache_system.py +0 -0
  70. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/utils/logger.py +0 -0
  71. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/utils/rate_limiter.py +0 -0
  72. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/utils/retry.py +0 -0
  73. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/validation/__init__.py +0 -0
  74. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/validation/data_validator.py +0 -0
  75. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/web/__init__.py +0 -0
  76. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/web/scraper.py +0 -0
  77. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/web/url_shortener.py +0 -0
  78. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools.egg-info/dependency_links.txt +0 -0
  79. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools.egg-info/requires.txt +0 -0
  80. {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools.egg-info/top_level.txt +0 -0
  81. {ilovetools-0.2.32 → ilovetools-0.2.33}/requirements.txt +0 -0
  82. {ilovetools-0.2.32 → ilovetools-0.2.33}/setup.cfg +0 -0
  83. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/__init__.py +0 -0
  84. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_activations.py +0 -0
  85. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_attention.py +0 -0
  86. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_augmentation.py +0 -0
  87. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_cnn.py +0 -0
  88. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_convolution.py +0 -0
  89. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_dropout.py +0 -0
  90. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_gradient_descent.py +0 -0
  91. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_loss_functions.py +0 -0
  92. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_losses.py +0 -0
  93. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_lr_schedulers.py +0 -0
  94. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_neural_network.py +0 -0
  95. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_normalization.py +0 -0
  96. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_normalization_advanced.py +0 -0
  97. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_optimizers.py +0 -0
  98. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_pooling.py +0 -0
  99. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_positional_encoding.py +0 -0
  100. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_pypi_installation.py +0 -0
  101. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_recurrent.py +0 -0
  102. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_regularization.py +0 -0
  103. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_rnn.py +0 -0
  104. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_schedulers.py +0 -0
  105. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_weight_init.py +0 -0
  106. {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/verify_positional_encoding.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ilovetools
- Version: 0.2.32
+ Version: 0.2.33
  Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
  Home-page: https://github.com/AliMehdi512/ilovetools
  Author: Ali Mehdi
@@ -11,7 +11,7 @@ Project-URL: Repository, https://github.com/AliMehdi512/ilovetools
  Project-URL: Issues, https://github.com/AliMehdi512/ilovetools/issues
  Project-URL: Bug Reports, https://github.com/AliMehdi512/ilovetools/issues
  Project-URL: Source, https://github.com/AliMehdi512/ilovetools
- Keywords: utilities,tools,ai,ml,data-processing,automation,recurrent-neural-networks,rnn,lstm,gru,bilstm,bigru,long-short-term-memory,gated-recurrent-unit,bidirectional-rnn,bidirectional-lstm,bidirectional-gru,sequence-modeling,sequence-to-sequence,seq2seq,nlp,natural-language-processing,time-series,time-series-forecasting,speech-recognition,machine-translation,text-classification,sentiment-analysis,named-entity-recognition,ner,pos-tagging,vanishing-gradient,exploding-gradient,gates,forget-gate,input-gate,output-gate,update-gate,reset-gate,cell-state,hidden-state,deep-learning,neural-networks,pytorch,tensorflow,keras
+ Keywords: utilities,tools,ai,ml,data-processing,automation,embeddings,word-embeddings,word2vec,glove,fasttext,positional-encoding,sinusoidal-encoding,learned-embeddings,token-embeddings,character-embeddings,segment-embeddings,token-type-embeddings,semantic-similarity,cosine-similarity,embedding-space,embedding-matrix,lookup-table,distributed-representations,dense-vectors,nlp,natural-language-processing,transformers,bert,gpt,attention,vocabulary,tokenization,pretrained-embeddings,transfer-learning,deep-learning,neural-networks,pytorch,tensorflow,keras
  Classifier: Development Status :: 3 - Alpha
  Classifier: Intended Audience :: Developers
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
@@ -0,0 +1,484 @@
+ """
+ Embedding Layers Suite
+
+ This module implements various embedding layers for neural networks.
+ Embeddings convert discrete tokens (words, characters) into dense continuous vectors.
+
+ Implemented Embedding Types:
+ 1. Embedding - Standard learned embedding layer
+ 2. PositionalEncoding - Sinusoidal positional encoding for Transformers
+ 3. LearnedPositionalEmbedding - Learned positional embeddings
+ 4. TokenTypeEmbedding - Segment/token type embeddings (BERT-style)
+ 5. CharacterEmbedding - Character-level embeddings
+
+ Key Benefits:
+ - Dense vector representations
+ - Semantic similarity capture
+ - Dimensionality reduction (vocab_size → embedding_dim)
+ - Learned from data
+ - Transfer learning support
+
+ References:
+ - Word2Vec: Mikolov et al., "Efficient Estimation of Word Representations in Vector Space" (2013)
+ - GloVe: Pennington et al., "GloVe: Global Vectors for Word Representation" (2014)
+ - Positional Encoding: Vaswani et al., "Attention Is All You Need" (2017)
+ - FastText: Bojanowski et al., "Enriching Word Vectors with Subword Information" (2017)
+
+ Author: Ali Mehdi
+ Date: January 22, 2026
+ """
+
+ import numpy as np
+ from typing import Optional, Tuple
+
+
+ # ============================================================================
+ # STANDARD EMBEDDING LAYER
+ # ============================================================================
+
+ class Embedding:
+     """
+     Standard Embedding Layer.
+
+     Converts token indices to dense vectors via lookup table.
+
+     Formula:
+         output = embedding_matrix[token_indices]
+
+     Args:
+         vocab_size: Size of vocabulary (number of unique tokens)
+         embedding_dim: Dimension of embedding vectors
+         padding_idx: Index for padding token (optional, won't be updated during training)
+         max_norm: If given, embeddings are normalized to have max L2 norm
+         scale_grad_by_freq: Scale gradients by token frequency (default: False)
+
+     Example:
+         >>> emb = Embedding(vocab_size=10000, embedding_dim=300)
+         >>> tokens = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])  # (batch, seq_len)
+         >>> output = emb.forward(tokens)
+         >>> print(output.shape)  # (2, 4, 300)
+
+     Use Case:
+         Word embeddings, token embeddings, any discrete to continuous mapping
+
+     Reference:
+         Mikolov et al., "Efficient Estimation of Word Representations" (2013)
+     """
+
+     def __init__(self, vocab_size: int, embedding_dim: int,
+                  padding_idx: Optional[int] = None,
+                  max_norm: Optional[float] = None,
+                  scale_grad_by_freq: bool = False):
+         self.vocab_size = vocab_size
+         self.embedding_dim = embedding_dim
+         self.padding_idx = padding_idx
+         self.max_norm = max_norm
+         self.scale_grad_by_freq = scale_grad_by_freq
+
+         # Initialize embedding matrix (Xavier/Glorot initialization)
+         self.weight = np.random.randn(vocab_size, embedding_dim) * np.sqrt(2.0 / (vocab_size + embedding_dim))
+
+         # Set padding embedding to zeros
+         if padding_idx is not None:
+             self.weight[padding_idx] = 0.0
+
+         self.cache = None
+
+     def forward(self, indices: np.ndarray) -> np.ndarray:
+         """
+         Forward pass.
+
+         Args:
+             indices: Token indices, shape (batch, seq_len) or (batch, seq_len, ...)
+
+         Returns:
+             Embedded vectors, shape (*indices.shape, embedding_dim)
+         """
+         # Lookup embeddings
+         output = self.weight[indices]
+
+         # Apply max norm if specified
+         if self.max_norm is not None:
+             norms = np.linalg.norm(output, axis=-1, keepdims=True)
+             output = output * np.minimum(1.0, self.max_norm / (norms + 1e-8))
+
+         self.cache = indices
+         return output
+
+     def load_pretrained(self, pretrained_embeddings: np.ndarray):
+         """
+         Load pretrained embeddings.
+
+         Args:
+             pretrained_embeddings: Pretrained embedding matrix (vocab_size, embedding_dim)
+         """
+         if pretrained_embeddings.shape != self.weight.shape:
+             raise ValueError(f"Shape mismatch: expected {self.weight.shape}, got {pretrained_embeddings.shape}")
+
+         self.weight = pretrained_embeddings.copy()
+
+         # Reset padding embedding
+         if self.padding_idx is not None:
+             self.weight[self.padding_idx] = 0.0
+
+     def freeze(self):
+         """Freeze embeddings (don't update during training)."""
+         self.frozen = True
+
+     def unfreeze(self):
+         """Unfreeze embeddings (allow updates during training)."""
+         self.frozen = False
+
+     def __call__(self, indices: np.ndarray) -> np.ndarray:
+         return self.forward(indices)
+
+
+ # ============================================================================
+ # POSITIONAL ENCODING (SINUSOIDAL)
+ # ============================================================================
+
+ class PositionalEncoding:
+     """
+     Sinusoidal Positional Encoding for Transformers.
+
+     Adds position information to token embeddings using sine and cosine functions.
+
+     Formula:
+         PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
+         PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
+
+     Args:
+         embedding_dim: Dimension of embeddings (must be even)
+         max_len: Maximum sequence length (default: 5000)
+         dropout: Dropout rate (default: 0.1)
+
+     Example:
+         >>> pos_enc = PositionalEncoding(embedding_dim=512, max_len=1000)
+         >>> x = np.random.randn(32, 100, 512)  # (batch, seq_len, embedding_dim)
+         >>> output = pos_enc.forward(x)
+         >>> print(output.shape)  # (32, 100, 512)
+
+     Use Case:
+         Transformers, attention mechanisms, sequence position encoding
+
+     Reference:
+         Vaswani et al., "Attention Is All You Need" (2017)
+     """
+
+     def __init__(self, embedding_dim: int, max_len: int = 5000, dropout: float = 0.1):
+         if embedding_dim % 2 != 0:
+             raise ValueError(f"embedding_dim must be even, got {embedding_dim}")
+
+         self.embedding_dim = embedding_dim
+         self.max_len = max_len
+         self.dropout = dropout
+
+         # Create positional encoding matrix
+         pe = np.zeros((max_len, embedding_dim))
+         position = np.arange(0, max_len).reshape(-1, 1)
+         div_term = np.exp(np.arange(0, embedding_dim, 2) * -(np.log(10000.0) / embedding_dim))
+
+         pe[:, 0::2] = np.sin(position * div_term)
+         pe[:, 1::2] = np.cos(position * div_term)
+
+         self.pe = pe  # (max_len, embedding_dim)
+
+     def forward(self, x: np.ndarray, training: bool = True) -> np.ndarray:
+         """
+         Forward pass.
+
+         Args:
+             x: Input embeddings (batch, seq_len, embedding_dim)
+             training: Whether in training mode (apply dropout)
+
+         Returns:
+             Embeddings with positional encoding added
+         """
+         batch_size, seq_len, embedding_dim = x.shape
+
+         if seq_len > self.max_len:
+             raise ValueError(f"Sequence length {seq_len} exceeds max_len {self.max_len}")
+
+         if embedding_dim != self.embedding_dim:
+             raise ValueError(f"Expected embedding_dim {self.embedding_dim}, got {embedding_dim}")
+
+         # Add positional encoding
+         output = x + self.pe[:seq_len, :]
+
+         # Apply dropout if training
+         if training and self.dropout > 0:
+             mask = np.random.binomial(1, 1 - self.dropout, size=output.shape)
+             output = output * mask / (1 - self.dropout)
+
+         return output
+
+     def __call__(self, x: np.ndarray, training: bool = True) -> np.ndarray:
+         return self.forward(x, training)
+
+
+ # ============================================================================
+ # LEARNED POSITIONAL EMBEDDING
+ # ============================================================================
+
+ class LearnedPositionalEmbedding:
+     """
+     Learned Positional Embeddings.
+
+     Alternative to sinusoidal encoding, learns position embeddings from data.
+
+     Args:
+         max_len: Maximum sequence length
+         embedding_dim: Dimension of embeddings
+
+     Example:
+         >>> pos_emb = LearnedPositionalEmbedding(max_len=512, embedding_dim=768)
+         >>> x = np.random.randn(32, 100, 768)
+         >>> output = pos_emb.forward(x)
+         >>> print(output.shape)  # (32, 100, 768)
+
+     Use Case:
+         BERT-style models, learned position representations
+
+     Reference:
+         Devlin et al., "BERT: Pre-training of Deep Bidirectional Transformers" (2019)
+     """
+
+     def __init__(self, max_len: int, embedding_dim: int):
+         self.max_len = max_len
+         self.embedding_dim = embedding_dim
+
+         # Initialize position embeddings
+         self.position_embeddings = np.random.randn(max_len, embedding_dim) * 0.02
+
+     def forward(self, x: np.ndarray) -> np.ndarray:
+         """
+         Forward pass.
+
+         Args:
+             x: Input embeddings (batch, seq_len, embedding_dim)
+
+         Returns:
+             Embeddings with learned positional embeddings added
+         """
+         batch_size, seq_len, embedding_dim = x.shape
+
+         if seq_len > self.max_len:
+             raise ValueError(f"Sequence length {seq_len} exceeds max_len {self.max_len}")
+
+         if embedding_dim != self.embedding_dim:
+             raise ValueError(f"Expected embedding_dim {self.embedding_dim}, got {embedding_dim}")
+
+         # Add position embeddings
+         output = x + self.position_embeddings[:seq_len, :]
+
+         return output
+
+     def __call__(self, x: np.ndarray) -> np.ndarray:
+         return self.forward(x)
+
+
+ # ============================================================================
+ # TOKEN TYPE EMBEDDING (SEGMENT EMBEDDING)
+ # ============================================================================
+
+ class TokenTypeEmbedding:
+     """
+     Token Type (Segment) Embeddings.
+
+     Used in BERT to distinguish between different segments (e.g., sentence A vs B).
+
+     Args:
+         num_types: Number of token types (default: 2 for BERT)
+         embedding_dim: Dimension of embeddings
+
+     Example:
+         >>> token_type_emb = TokenTypeEmbedding(num_types=2, embedding_dim=768)
+         >>> token_type_ids = np.array([[0, 0, 0, 1, 1, 1]])  # (batch, seq_len)
+         >>> output = token_type_emb.forward(token_type_ids)
+         >>> print(output.shape)  # (1, 6, 768)
+
+     Use Case:
+         BERT, sentence pair tasks, multi-segment inputs
+
+     Reference:
+         Devlin et al., "BERT" (2019)
+     """
+
+     def __init__(self, num_types: int = 2, embedding_dim: int = 768):
+         self.num_types = num_types
+         self.embedding_dim = embedding_dim
+
+         # Initialize token type embeddings
+         self.token_type_embeddings = np.random.randn(num_types, embedding_dim) * 0.02
+
+     def forward(self, token_type_ids: np.ndarray) -> np.ndarray:
+         """
+         Forward pass.
+
+         Args:
+             token_type_ids: Token type indices (batch, seq_len)
+
+         Returns:
+             Token type embeddings (batch, seq_len, embedding_dim)
+         """
+         return self.token_type_embeddings[token_type_ids]
+
+     def __call__(self, token_type_ids: np.ndarray) -> np.ndarray:
+         return self.forward(token_type_ids)
+
+
+ # ============================================================================
+ # CHARACTER EMBEDDING
+ # ============================================================================
+
+ class CharacterEmbedding:
+     """
+     Character-level Embeddings.
+
+     Embeds individual characters, useful for handling OOV words.
+
+     Args:
+         num_chars: Number of unique characters
+         embedding_dim: Dimension of character embeddings
+         padding_idx: Index for padding character
+
+     Example:
+         >>> char_emb = CharacterEmbedding(num_chars=128, embedding_dim=50)
+         >>> char_ids = np.random.randint(0, 128, (32, 20, 15))  # (batch, words, chars)
+         >>> output = char_emb.forward(char_ids)
+         >>> print(output.shape)  # (32, 20, 50)
+
+     Use Case:
+         Character-level models, handling rare words, morphologically rich languages
+
+     Reference:
+         Kim et al., "Character-Aware Neural Language Models" (2016)
+     """
+
+     def __init__(self, num_chars: int, embedding_dim: int, padding_idx: Optional[int] = None):
+         self.num_chars = num_chars
+         self.embedding_dim = embedding_dim
+         self.padding_idx = padding_idx
+
+         # Initialize character embeddings
+         self.weight = np.random.randn(num_chars, embedding_dim) * 0.02
+
+         if padding_idx is not None:
+             self.weight[padding_idx] = 0.0
+
+     def forward(self, char_ids: np.ndarray) -> np.ndarray:
+         """
+         Forward pass.
+
+         Args:
+             char_ids: Character indices (batch, num_words, max_word_len)
+
+         Returns:
+             Word embeddings from characters (batch, num_words, embedding_dim)
+         """
+         batch_size, num_words, max_word_len = char_ids.shape
+
+         # Lookup character embeddings
+         char_embeddings = self.weight[char_ids]  # (batch, num_words, max_word_len, embedding_dim)
+
+         # Aggregate character embeddings (mean pooling)
+         word_embeddings = np.mean(char_embeddings, axis=2)  # (batch, num_words, embedding_dim)
+
+         return word_embeddings
+
+     def __call__(self, char_ids: np.ndarray) -> np.ndarray:
+         return self.forward(char_ids)
+
+
+ # ============================================================================
+ # UTILITY FUNCTIONS
+ # ============================================================================
+
+ def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+     """
+     Compute cosine similarity between two vectors.
+
+     Formula:
+         cos(θ) = (a · b) / (||a|| ||b||)
+
+     Args:
+         a: First vector
+         b: Second vector
+
+     Returns:
+         Cosine similarity (-1 to 1)
+
+     Example:
+         >>> a = np.array([1, 2, 3])
+         >>> b = np.array([4, 5, 6])
+         >>> sim = cosine_similarity(a, b)
+         >>> print(f"Similarity: {sim:.4f}")
+     """
+     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)
+
+
+ def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
+     """
+     Compute Euclidean distance between two vectors.
+
+     Formula:
+         d = ||a - b||
+
+     Args:
+         a: First vector
+         b: Second vector
+
+     Returns:
+         Euclidean distance
+     """
+     return np.linalg.norm(a - b)
+
+
+ def most_similar(embedding_matrix: np.ndarray, query_vector: np.ndarray,
+                  top_k: int = 5, exclude_idx: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
+     """
+     Find most similar vectors to query vector.
+
+     Args:
+         embedding_matrix: Matrix of embeddings (vocab_size, embedding_dim)
+         query_vector: Query vector (embedding_dim,)
+         top_k: Number of most similar vectors to return
+         exclude_idx: Index to exclude (e.g., query word itself)
+
+     Returns:
+         Tuple of (indices, similarities)
+
+     Example:
+         >>> emb = Embedding(vocab_size=10000, embedding_dim=300)
+         >>> query = emb.weight[100]  # Get embedding for word at index 100
+         >>> indices, sims = most_similar(emb.weight, query, top_k=5, exclude_idx=100)
+         >>> print(f"Most similar indices: {indices}")
+         >>> print(f"Similarities: {sims}")
+     """
+     # Compute cosine similarities
+     norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
+     query_norm = np.linalg.norm(query_vector)
+
+     similarities = np.dot(embedding_matrix, query_vector) / (norms.flatten() * query_norm + 1e-8)
+
+     # Exclude specified index
+     if exclude_idx is not None:
+         similarities[exclude_idx] = -np.inf
+
+     # Get top-k
+     top_indices = np.argsort(similarities)[::-1][:top_k]
+     top_similarities = similarities[top_indices]
+
+     return top_indices, top_similarities
+
+
+ __all__ = [
+     'Embedding',
+     'PositionalEncoding',
+     'LearnedPositionalEmbedding',
+     'TokenTypeEmbedding',
+     'CharacterEmbedding',
+     'cosine_similarity',
+     'euclidean_distance',
+     'most_similar',
+ ]
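
To show how the pieces in this new file fit together, here is a minimal usage sketch (not part of the package): it builds BERT-style input embeddings by summing token, learned-position, and segment embeddings, then queries the embedding space with most_similar. It assumes the module is importable as ilovetools.ml.embedding, matching its path in SOURCES.txt; the vocabulary size, dimensions, and token ids below are illustrative, not taken from the package.

    import numpy as np
    from ilovetools.ml.embedding import (
        Embedding, LearnedPositionalEmbedding, TokenTypeEmbedding, most_similar
    )

    # Illustrative sizes, not taken from the package
    vocab_size, embedding_dim, max_len = 1000, 64, 128

    tok_emb = Embedding(vocab_size=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
    pos_emb = LearnedPositionalEmbedding(max_len=max_len, embedding_dim=embedding_dim)
    seg_emb = TokenTypeEmbedding(num_types=2, embedding_dim=embedding_dim)

    token_ids = np.array([[5, 17, 42, 7, 0, 0]])    # (batch=1, seq_len=6), 0 = padding
    segment_ids = np.array([[0, 0, 0, 1, 1, 1]])    # sentence A vs sentence B

    # BERT-style input: token + position + segment embeddings
    x = pos_emb(tok_emb(token_ids)) + seg_emb(segment_ids)
    print(x.shape)  # (1, 6, 64)

    # Nearest neighbours of token 5 in the (randomly initialised) embedding space
    indices, sims = most_similar(tok_emb.weight, tok_emb.weight[5], top_k=3, exclude_idx=5)
    print(indices, sims)

PositionalEncoding could be swapped in for LearnedPositionalEmbedding here; it adds the sinusoidal table instead of a learned one and applies dropout when called with training=True.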
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ilovetools
- Version: 0.2.32
+ Version: 0.2.33
  Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
  Home-page: https://github.com/AliMehdi512/ilovetools
  Author: Ali Mehdi
@@ -11,7 +11,7 @@ Project-URL: Repository, https://github.com/AliMehdi512/ilovetools
  Project-URL: Issues, https://github.com/AliMehdi512/ilovetools/issues
  Project-URL: Bug Reports, https://github.com/AliMehdi512/ilovetools/issues
  Project-URL: Source, https://github.com/AliMehdi512/ilovetools
- Keywords: utilities,tools,ai,ml,data-processing,automation,recurrent-neural-networks,rnn,lstm,gru,bilstm,bigru,long-short-term-memory,gated-recurrent-unit,bidirectional-rnn,bidirectional-lstm,bidirectional-gru,sequence-modeling,sequence-to-sequence,seq2seq,nlp,natural-language-processing,time-series,time-series-forecasting,speech-recognition,machine-translation,text-classification,sentiment-analysis,named-entity-recognition,ner,pos-tagging,vanishing-gradient,exploding-gradient,gates,forget-gate,input-gate,output-gate,update-gate,reset-gate,cell-state,hidden-state,deep-learning,neural-networks,pytorch,tensorflow,keras
+ Keywords: utilities,tools,ai,ml,data-processing,automation,embeddings,word-embeddings,word2vec,glove,fasttext,positional-encoding,sinusoidal-encoding,learned-embeddings,token-embeddings,character-embeddings,segment-embeddings,token-type-embeddings,semantic-similarity,cosine-similarity,embedding-space,embedding-matrix,lookup-table,distributed-representations,dense-vectors,nlp,natural-language-processing,transformers,bert,gpt,attention,vocabulary,tokenization,pretrained-embeddings,transfer-learning,deep-learning,neural-networks,pytorch,tensorflow,keras
  Classifier: Development Status :: 3 - Alpha
  Classifier: Intended Audience :: Developers
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
@@ -40,6 +40,7 @@ ilovetools/ml/convolution.py
  ilovetools/ml/cross_validation.py
  ilovetools/ml/dimensionality.py
  ilovetools/ml/dropout.py
+ ilovetools/ml/embedding.py
  ilovetools/ml/ensemble.py
  ilovetools/ml/feature_selection.py
  ilovetools/ml/gradient_descent.py
@@ -83,6 +84,7 @@ tests/test_augmentation.py
  tests/test_cnn.py
  tests/test_convolution.py
  tests/test_dropout.py
+ tests/test_embedding.py
  tests/test_gradient_descent.py
  tests/test_loss_functions.py
  tests/test_losses.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "ilovetools"
- version = "0.2.32"
+ version = "0.2.33"
  description = "A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -12,7 +12,7 @@ license = "MIT"
  authors = [
      {name = "Ali Mehdi", email = "ali.mehdi.dev579@gmail.com"}
  ]
- keywords = ["utilities", "tools", "ai", "ml", "data-processing", "automation", "recurrent-neural-networks", "rnn", "lstm", "gru", "bilstm", "bigru", "long-short-term-memory", "gated-recurrent-unit", "bidirectional-rnn", "bidirectional-lstm", "bidirectional-gru", "sequence-modeling", "sequence-to-sequence", "seq2seq", "nlp", "natural-language-processing", "time-series", "time-series-forecasting", "speech-recognition", "machine-translation", "text-classification", "sentiment-analysis", "named-entity-recognition", "ner", "pos-tagging", "vanishing-gradient", "exploding-gradient", "gates", "forget-gate", "input-gate", "output-gate", "update-gate", "reset-gate", "cell-state", "hidden-state", "deep-learning", "neural-networks", "pytorch", "tensorflow", "keras"]
+ keywords = ["utilities", "tools", "ai", "ml", "data-processing", "automation", "embeddings", "word-embeddings", "word2vec", "glove", "fasttext", "positional-encoding", "sinusoidal-encoding", "learned-embeddings", "token-embeddings", "character-embeddings", "segment-embeddings", "token-type-embeddings", "semantic-similarity", "cosine-similarity", "embedding-space", "embedding-matrix", "lookup-table", "distributed-representations", "dense-vectors", "nlp", "natural-language-processing", "transformers", "bert", "gpt", "attention", "vocabulary", "tokenization", "pretrained-embeddings", "transfer-learning", "deep-learning", "neural-networks", "pytorch", "tensorflow", "keras"]
  classifiers = [
      "Development Status :: 3 - Alpha",
      "Intended Audience :: Developers",
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
  setup(
      name="ilovetools",
-     version="0.2.32",
+     version="0.2.33",
      author="Ali Mehdi",
      author_email="ali.mehdi.dev579@gmail.com",
      description="A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs",
@@ -58,7 +58,7 @@ setup(
              "soundfile>=0.12.0",
          ],
      },
-     keywords="utilities, tools, ai, ml, data-processing, automation, python-library, neural-networks, recurrent-neural-networks, rnn, lstm, gru, bilstm, bigru, sequence-modeling, nlp, time-series, speech-recognition, machine-translation, vanishing-gradient, long-short-term-memory, gated-recurrent-unit, bidirectional-rnn, deep-learning, pytorch, tensorflow, keras",
+     keywords="utilities, tools, ai, ml, data-processing, automation, python-library, neural-networks, embeddings, word-embeddings, word2vec, glove, fasttext, positional-encoding, learned-embeddings, token-embeddings, character-embeddings, semantic-similarity, embedding-space, nlp, natural-language-processing, transformers, bert, deep-learning, pytorch, tensorflow, keras",
      project_urls={
          "Bug Reports": "https://github.com/AliMehdi512/ilovetools/issues",
          "Source": "https://github.com/AliMehdi512/ilovetools",