ilovetools 0.2.32__tar.gz → 0.2.33__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ilovetools-0.2.32/ilovetools.egg-info → ilovetools-0.2.33}/PKG-INFO +2 -2
- ilovetools-0.2.33/ilovetools/ml/embedding.py +484 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33/ilovetools.egg-info}/PKG-INFO +2 -2
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools.egg-info/SOURCES.txt +2 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/pyproject.toml +2 -2
- {ilovetools-0.2.32 → ilovetools-0.2.33}/setup.py +2 -2
- ilovetools-0.2.33/tests/test_embedding.py +335 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/LICENSE +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/MANIFEST.in +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/README.md +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ai/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ai/embeddings.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ai/inference.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ai/llm_helpers.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/audio/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/automation/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/automation/file_organizer.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/conversion/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/conversion/config_converter.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/conversion/config_converter_fixed_header.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/data/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/data/feature_engineering.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/data/preprocessing.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/database/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/datetime/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/email/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/email/template_engine.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/files/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/image/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/activations.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/anomaly_detection.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/attention.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/augmentation.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/clustering.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/cnn.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/convolution.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/cross_validation.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/dimensionality.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/dropout.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/ensemble.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/feature_selection.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/gradient_descent.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/imbalanced.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/interpretation.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/loss_functions.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/losses.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/lr_schedulers.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/metrics.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/neural_network.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/normalization.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/normalization_advanced.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/optimizers.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/pipeline.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/pooling.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/positional_encoding.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/recurrent.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/regularization.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/rnn.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/schedulers.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/timeseries.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/tuning.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/ml/weight_init.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/security/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/security/password_checker.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/text/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/utils/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/utils/cache_system.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/utils/logger.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/utils/rate_limiter.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/utils/retry.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/validation/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/validation/data_validator.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/web/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/web/scraper.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools/web/url_shortener.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools.egg-info/dependency_links.txt +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools.egg-info/requires.txt +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools.egg-info/top_level.txt +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/requirements.txt +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/setup.cfg +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/__init__.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_activations.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_attention.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_augmentation.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_cnn.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_convolution.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_dropout.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_gradient_descent.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_loss_functions.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_losses.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_lr_schedulers.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_neural_network.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_normalization.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_normalization_advanced.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_optimizers.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_pooling.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_positional_encoding.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_pypi_installation.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_recurrent.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_regularization.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_rnn.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_schedulers.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/test_weight_init.py +0 -0
- {ilovetools-0.2.32 → ilovetools-0.2.33}/tests/verify_positional_encoding.py +0 -0

{ilovetools-0.2.32/ilovetools.egg-info → ilovetools-0.2.33}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ilovetools
-Version: 0.2.32
+Version: 0.2.33
 Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
 Home-page: https://github.com/AliMehdi512/ilovetools
 Author: Ali Mehdi
@@ -11,7 +11,7 @@ Project-URL: Repository, https://github.com/AliMehdi512/ilovetools
 Project-URL: Issues, https://github.com/AliMehdi512/ilovetools/issues
 Project-URL: Bug Reports, https://github.com/AliMehdi512/ilovetools/issues
 Project-URL: Source, https://github.com/AliMehdi512/ilovetools
-Keywords: utilities,tools,ai,ml,data-processing,automation,
+Keywords: utilities,tools,ai,ml,data-processing,automation,embeddings,word-embeddings,word2vec,glove,fasttext,positional-encoding,sinusoidal-encoding,learned-embeddings,token-embeddings,character-embeddings,segment-embeddings,token-type-embeddings,semantic-similarity,cosine-similarity,embedding-space,embedding-matrix,lookup-table,distributed-representations,dense-vectors,nlp,natural-language-processing,transformers,bert,gpt,attention,vocabulary,tokenization,pretrained-embeddings,transfer-learning,deep-learning,neural-networks,pytorch,tensorflow,keras
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
 Classifier: Topic :: Software Development :: Libraries :: Python Modules

ilovetools-0.2.33/ilovetools/ml/embedding.py (new file)

@@ -0,0 +1,484 @@
+"""
+Embedding Layers Suite
+
+This module implements various embedding layers for neural networks.
+Embeddings convert discrete tokens (words, characters) into dense continuous vectors.
+
+Implemented Embedding Types:
+1. Embedding - Standard learned embedding layer
+2. PositionalEncoding - Sinusoidal positional encoding for Transformers
+3. LearnedPositionalEmbedding - Learned positional embeddings
+4. TokenTypeEmbedding - Segment/token type embeddings (BERT-style)
+5. CharacterEmbedding - Character-level embeddings
+
+Key Benefits:
+- Dense vector representations
+- Semantic similarity capture
+- Dimensionality reduction (vocab_size → embedding_dim)
+- Learned from data
+- Transfer learning support
+
+References:
+- Word2Vec: Mikolov et al., "Efficient Estimation of Word Representations in Vector Space" (2013)
+- GloVe: Pennington et al., "GloVe: Global Vectors for Word Representation" (2014)
+- Positional Encoding: Vaswani et al., "Attention Is All You Need" (2017)
+- FastText: Bojanowski et al., "Enriching Word Vectors with Subword Information" (2017)
+
+Author: Ali Mehdi
+Date: January 22, 2026
+"""
+
+import numpy as np
+from typing import Optional, Tuple
+
+
+# ============================================================================
+# STANDARD EMBEDDING LAYER
+# ============================================================================
+
+class Embedding:
+    """
+    Standard Embedding Layer.
+
+    Converts token indices to dense vectors via lookup table.
+
+    Formula:
+        output = embedding_matrix[token_indices]
+
+    Args:
+        vocab_size: Size of vocabulary (number of unique tokens)
+        embedding_dim: Dimension of embedding vectors
+        padding_idx: Index for padding token (optional, won't be updated during training)
+        max_norm: If given, embeddings are normalized to have max L2 norm
+        scale_grad_by_freq: Scale gradients by token frequency (default: False)
+
+    Example:
+        >>> emb = Embedding(vocab_size=10000, embedding_dim=300)
+        >>> tokens = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])  # (batch, seq_len)
+        >>> output = emb.forward(tokens)
+        >>> print(output.shape)  # (2, 4, 300)
+
+    Use Case:
+        Word embeddings, token embeddings, any discrete to continuous mapping
+
+    Reference:
+        Mikolov et al., "Efficient Estimation of Word Representations" (2013)
+    """
+
+    def __init__(self, vocab_size: int, embedding_dim: int,
+                 padding_idx: Optional[int] = None,
+                 max_norm: Optional[float] = None,
+                 scale_grad_by_freq: bool = False):
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.max_norm = max_norm
+        self.scale_grad_by_freq = scale_grad_by_freq
+
+        # Initialize embedding matrix (Xavier/Glorot initialization)
+        self.weight = np.random.randn(vocab_size, embedding_dim) * np.sqrt(2.0 / (vocab_size + embedding_dim))
+
+        # Set padding embedding to zeros
+        if padding_idx is not None:
+            self.weight[padding_idx] = 0.0
+
+        self.cache = None
+
+    def forward(self, indices: np.ndarray) -> np.ndarray:
+        """
+        Forward pass.
+
+        Args:
+            indices: Token indices, shape (batch, seq_len) or (batch, seq_len, ...)
+
+        Returns:
+            Embedded vectors, shape (*indices.shape, embedding_dim)
+        """
+        # Lookup embeddings
+        output = self.weight[indices]
+
+        # Apply max norm if specified
+        if self.max_norm is not None:
+            norms = np.linalg.norm(output, axis=-1, keepdims=True)
+            output = output * np.minimum(1.0, self.max_norm / (norms + 1e-8))
+
+        self.cache = indices
+        return output
+
+    def load_pretrained(self, pretrained_embeddings: np.ndarray):
+        """
+        Load pretrained embeddings.
+
+        Args:
+            pretrained_embeddings: Pretrained embedding matrix (vocab_size, embedding_dim)
+        """
+        if pretrained_embeddings.shape != self.weight.shape:
+            raise ValueError(f"Shape mismatch: expected {self.weight.shape}, got {pretrained_embeddings.shape}")
+
+        self.weight = pretrained_embeddings.copy()
+
+        # Reset padding embedding
+        if self.padding_idx is not None:
+            self.weight[self.padding_idx] = 0.0
+
+    def freeze(self):
+        """Freeze embeddings (don't update during training)."""
+        self.frozen = True
+
+    def unfreeze(self):
+        """Unfreeze embeddings (allow updates during training)."""
+        self.frozen = False
+
+    def __call__(self, indices: np.ndarray) -> np.ndarray:
+        return self.forward(indices)
+
+
+# ============================================================================
+# POSITIONAL ENCODING (SINUSOIDAL)
+# ============================================================================
+
+class PositionalEncoding:
+    """
+    Sinusoidal Positional Encoding for Transformers.
+
+    Adds position information to token embeddings using sine and cosine functions.
+
+    Formula:
+        PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
+        PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
+
+    Args:
+        embedding_dim: Dimension of embeddings (must be even)
+        max_len: Maximum sequence length (default: 5000)
+        dropout: Dropout rate (default: 0.1)
+
+    Example:
+        >>> pos_enc = PositionalEncoding(embedding_dim=512, max_len=1000)
+        >>> x = np.random.randn(32, 100, 512)  # (batch, seq_len, embedding_dim)
+        >>> output = pos_enc.forward(x)
+        >>> print(output.shape)  # (32, 100, 512)
+
+    Use Case:
+        Transformers, attention mechanisms, sequence position encoding
+
+    Reference:
+        Vaswani et al., "Attention Is All You Need" (2017)
+    """
+
+    def __init__(self, embedding_dim: int, max_len: int = 5000, dropout: float = 0.1):
+        if embedding_dim % 2 != 0:
+            raise ValueError(f"embedding_dim must be even, got {embedding_dim}")
+
+        self.embedding_dim = embedding_dim
+        self.max_len = max_len
+        self.dropout = dropout
+
+        # Create positional encoding matrix
+        pe = np.zeros((max_len, embedding_dim))
+        position = np.arange(0, max_len).reshape(-1, 1)
+        div_term = np.exp(np.arange(0, embedding_dim, 2) * -(np.log(10000.0) / embedding_dim))
+
+        pe[:, 0::2] = np.sin(position * div_term)
+        pe[:, 1::2] = np.cos(position * div_term)
+
+        self.pe = pe  # (max_len, embedding_dim)
+
+    def forward(self, x: np.ndarray, training: bool = True) -> np.ndarray:
+        """
+        Forward pass.
+
+        Args:
+            x: Input embeddings (batch, seq_len, embedding_dim)
+            training: Whether in training mode (apply dropout)
+
+        Returns:
+            Embeddings with positional encoding added
+        """
+        batch_size, seq_len, embedding_dim = x.shape
+
+        if seq_len > self.max_len:
+            raise ValueError(f"Sequence length {seq_len} exceeds max_len {self.max_len}")
+
+        if embedding_dim != self.embedding_dim:
+            raise ValueError(f"Expected embedding_dim {self.embedding_dim}, got {embedding_dim}")
+
+        # Add positional encoding
+        output = x + self.pe[:seq_len, :]
+
+        # Apply dropout if training
+        if training and self.dropout > 0:
+            mask = np.random.binomial(1, 1 - self.dropout, size=output.shape)
+            output = output * mask / (1 - self.dropout)
+
+        return output
+
+    def __call__(self, x: np.ndarray, training: bool = True) -> np.ndarray:
+        return self.forward(x, training)
+
+
+# ============================================================================
+# LEARNED POSITIONAL EMBEDDING
+# ============================================================================
+
+class LearnedPositionalEmbedding:
+    """
+    Learned Positional Embeddings.
+
+    Alternative to sinusoidal encoding, learns position embeddings from data.
+
+    Args:
+        max_len: Maximum sequence length
+        embedding_dim: Dimension of embeddings
+
+    Example:
+        >>> pos_emb = LearnedPositionalEmbedding(max_len=512, embedding_dim=768)
+        >>> x = np.random.randn(32, 100, 768)
+        >>> output = pos_emb.forward(x)
+        >>> print(output.shape)  # (32, 100, 768)
+
+    Use Case:
+        BERT-style models, learned position representations
+
+    Reference:
+        Devlin et al., "BERT: Pre-training of Deep Bidirectional Transformers" (2019)
+    """
+
+    def __init__(self, max_len: int, embedding_dim: int):
+        self.max_len = max_len
+        self.embedding_dim = embedding_dim
+
+        # Initialize position embeddings
+        self.position_embeddings = np.random.randn(max_len, embedding_dim) * 0.02
+
+    def forward(self, x: np.ndarray) -> np.ndarray:
+        """
+        Forward pass.
+
+        Args:
+            x: Input embeddings (batch, seq_len, embedding_dim)
+
+        Returns:
+            Embeddings with learned positional embeddings added
+        """
+        batch_size, seq_len, embedding_dim = x.shape
+
+        if seq_len > self.max_len:
+            raise ValueError(f"Sequence length {seq_len} exceeds max_len {self.max_len}")
+
+        if embedding_dim != self.embedding_dim:
+            raise ValueError(f"Expected embedding_dim {self.embedding_dim}, got {embedding_dim}")
+
+        # Add position embeddings
+        output = x + self.position_embeddings[:seq_len, :]
+
+        return output
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        return self.forward(x)
+
+
+# ============================================================================
+# TOKEN TYPE EMBEDDING (SEGMENT EMBEDDING)
+# ============================================================================
+
+class TokenTypeEmbedding:
+    """
+    Token Type (Segment) Embeddings.
+
+    Used in BERT to distinguish between different segments (e.g., sentence A vs B).
+
+    Args:
+        num_types: Number of token types (default: 2 for BERT)
+        embedding_dim: Dimension of embeddings
+
+    Example:
+        >>> token_type_emb = TokenTypeEmbedding(num_types=2, embedding_dim=768)
+        >>> token_type_ids = np.array([[0, 0, 0, 1, 1, 1]])  # (batch, seq_len)
+        >>> output = token_type_emb.forward(token_type_ids)
+        >>> print(output.shape)  # (1, 6, 768)
+
+    Use Case:
+        BERT, sentence pair tasks, multi-segment inputs
+
+    Reference:
+        Devlin et al., "BERT" (2019)
+    """
+
+    def __init__(self, num_types: int = 2, embedding_dim: int = 768):
+        self.num_types = num_types
+        self.embedding_dim = embedding_dim
+
+        # Initialize token type embeddings
+        self.token_type_embeddings = np.random.randn(num_types, embedding_dim) * 0.02
+
+    def forward(self, token_type_ids: np.ndarray) -> np.ndarray:
+        """
+        Forward pass.
+
+        Args:
+            token_type_ids: Token type indices (batch, seq_len)
+
+        Returns:
+            Token type embeddings (batch, seq_len, embedding_dim)
+        """
+        return self.token_type_embeddings[token_type_ids]
+
+    def __call__(self, token_type_ids: np.ndarray) -> np.ndarray:
+        return self.forward(token_type_ids)
+
+
+# ============================================================================
+# CHARACTER EMBEDDING
+# ============================================================================
+
+class CharacterEmbedding:
+    """
+    Character-level Embeddings.
+
+    Embeds individual characters, useful for handling OOV words.
+
+    Args:
+        num_chars: Number of unique characters
+        embedding_dim: Dimension of character embeddings
+        padding_idx: Index for padding character
+
+    Example:
+        >>> char_emb = CharacterEmbedding(num_chars=128, embedding_dim=50)
+        >>> char_ids = np.random.randint(0, 128, (32, 20, 15))  # (batch, words, chars)
+        >>> output = char_emb.forward(char_ids)
+        >>> print(output.shape)  # (32, 20, 50)
+
+    Use Case:
+        Character-level models, handling rare words, morphologically rich languages
+
+    Reference:
+        Kim et al., "Character-Aware Neural Language Models" (2016)
+    """
+
+    def __init__(self, num_chars: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        self.num_chars = num_chars
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+
+        # Initialize character embeddings
+        self.weight = np.random.randn(num_chars, embedding_dim) * 0.02
+
+        if padding_idx is not None:
+            self.weight[padding_idx] = 0.0
+
+    def forward(self, char_ids: np.ndarray) -> np.ndarray:
+        """
+        Forward pass.
+
+        Args:
+            char_ids: Character indices (batch, num_words, max_word_len)
+
+        Returns:
+            Word embeddings from characters (batch, num_words, embedding_dim)
+        """
+        batch_size, num_words, max_word_len = char_ids.shape
+
+        # Lookup character embeddings
+        char_embeddings = self.weight[char_ids]  # (batch, num_words, max_word_len, embedding_dim)
+
+        # Aggregate character embeddings (mean pooling)
+        word_embeddings = np.mean(char_embeddings, axis=2)  # (batch, num_words, embedding_dim)
+
+        return word_embeddings
+
+    def __call__(self, char_ids: np.ndarray) -> np.ndarray:
+        return self.forward(char_ids)
+
+
+# ============================================================================
+# UTILITY FUNCTIONS
+# ============================================================================
+
+def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    """
+    Compute cosine similarity between two vectors.
+
+    Formula:
+        cos(θ) = (a · b) / (||a|| ||b||)
+
+    Args:
+        a: First vector
+        b: Second vector
+
+    Returns:
+        Cosine similarity (-1 to 1)
+
+    Example:
+        >>> a = np.array([1, 2, 3])
+        >>> b = np.array([4, 5, 6])
+        >>> sim = cosine_similarity(a, b)
+        >>> print(f"Similarity: {sim:.4f}")
+    """
+    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)
+
+
+def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
+    """
+    Compute Euclidean distance between two vectors.
+
+    Formula:
+        d = ||a - b||
+
+    Args:
+        a: First vector
+        b: Second vector
+
+    Returns:
+        Euclidean distance
+    """
+    return np.linalg.norm(a - b)
+
+
+def most_similar(embedding_matrix: np.ndarray, query_vector: np.ndarray,
+                 top_k: int = 5, exclude_idx: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Find most similar vectors to query vector.
+
+    Args:
+        embedding_matrix: Matrix of embeddings (vocab_size, embedding_dim)
+        query_vector: Query vector (embedding_dim,)
+        top_k: Number of most similar vectors to return
+        exclude_idx: Index to exclude (e.g., query word itself)
+
+    Returns:
+        Tuple of (indices, similarities)
+
+    Example:
+        >>> emb = Embedding(vocab_size=10000, embedding_dim=300)
+        >>> query = emb.weight[100]  # Get embedding for word at index 100
+        >>> indices, sims = most_similar(emb.weight, query, top_k=5, exclude_idx=100)
+        >>> print(f"Most similar indices: {indices}")
+        >>> print(f"Similarities: {sims}")
+    """
+    # Compute cosine similarities
+    norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
+    query_norm = np.linalg.norm(query_vector)
+
+    similarities = np.dot(embedding_matrix, query_vector) / (norms.flatten() * query_norm + 1e-8)
+
+    # Exclude specified index
+    if exclude_idx is not None:
+        similarities[exclude_idx] = -np.inf
+
+    # Get top-k
+    top_indices = np.argsort(similarities)[::-1][:top_k]
+    top_similarities = similarities[top_indices]
+
+    return top_indices, top_similarities
+
+
+__all__ = [
+    'Embedding',
+    'PositionalEncoding',
+    'LearnedPositionalEmbedding',
+    'TokenTypeEmbedding',
+    'CharacterEmbedding',
+    'cosine_similarity',
+    'euclidean_distance',
+    'most_similar',
+]
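
The new module wires together a lookup-table embedding, positional information, and nearest-neighbour search over the embedding matrix. A minimal usage sketch of the added APIs, in the module's own doctest style; the import path ilovetools.ml.embedding is inferred from the SOURCES.txt entry below, and the vocabulary size and dimensions are arbitrary illustration values:

>>> import numpy as np
>>> from ilovetools.ml.embedding import Embedding, PositionalEncoding, most_similar
>>> emb = Embedding(vocab_size=1000, embedding_dim=64, padding_idx=0)  # index 0 reserved for padding
>>> tokens = np.array([[5, 17, 42, 0]])                                # (batch=1, seq_len=4)
>>> x = emb(tokens)                                                    # lookup -> (1, 4, 64)
>>> pos = PositionalEncoding(embedding_dim=64, max_len=128, dropout=0.0)
>>> x = pos(x, training=False)                                         # add sinusoidal positions
>>> idx, sims = most_similar(emb.weight, emb.weight[42], top_k=3, exclude_idx=42)
>>> idx.shape, sims.shape
((3,), (3,))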

{ilovetools-0.2.32 → ilovetools-0.2.33/ilovetools.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ilovetools
-Version: 0.2.32
+Version: 0.2.33
 Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
 Home-page: https://github.com/AliMehdi512/ilovetools
 Author: Ali Mehdi
@@ -11,7 +11,7 @@ Project-URL: Repository, https://github.com/AliMehdi512/ilovetools
 Project-URL: Issues, https://github.com/AliMehdi512/ilovetools/issues
 Project-URL: Bug Reports, https://github.com/AliMehdi512/ilovetools/issues
 Project-URL: Source, https://github.com/AliMehdi512/ilovetools
-Keywords: utilities,tools,ai,ml,data-processing,automation,
+Keywords: utilities,tools,ai,ml,data-processing,automation,embeddings,word-embeddings,word2vec,glove,fasttext,positional-encoding,sinusoidal-encoding,learned-embeddings,token-embeddings,character-embeddings,segment-embeddings,token-type-embeddings,semantic-similarity,cosine-similarity,embedding-space,embedding-matrix,lookup-table,distributed-representations,dense-vectors,nlp,natural-language-processing,transformers,bert,gpt,attention,vocabulary,tokenization,pretrained-embeddings,transfer-learning,deep-learning,neural-networks,pytorch,tensorflow,keras
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
 Classifier: Topic :: Software Development :: Libraries :: Python Modules

{ilovetools-0.2.32 → ilovetools-0.2.33}/ilovetools.egg-info/SOURCES.txt

@@ -40,6 +40,7 @@ ilovetools/ml/convolution.py
 ilovetools/ml/cross_validation.py
 ilovetools/ml/dimensionality.py
 ilovetools/ml/dropout.py
+ilovetools/ml/embedding.py
 ilovetools/ml/ensemble.py
 ilovetools/ml/feature_selection.py
 ilovetools/ml/gradient_descent.py
@@ -83,6 +84,7 @@ tests/test_augmentation.py
 tests/test_cnn.py
 tests/test_convolution.py
 tests/test_dropout.py
+tests/test_embedding.py
 tests/test_gradient_descent.py
 tests/test_loss_functions.py
 tests/test_losses.py

{ilovetools-0.2.32 → ilovetools-0.2.33}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ilovetools"
-version = "0.2.32"
+version = "0.2.33"
 description = "A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs"
 readme = "README.md"
 requires-python = ">=3.8"
@@ -12,7 +12,7 @@ license = "MIT"
 authors = [
     {name = "Ali Mehdi", email = "ali.mehdi.dev579@gmail.com"}
 ]
-keywords = ["utilities", "tools", "ai", "ml", "data-processing", "automation", "
+keywords = ["utilities", "tools", "ai", "ml", "data-processing", "automation", "embeddings", "word-embeddings", "word2vec", "glove", "fasttext", "positional-encoding", "sinusoidal-encoding", "learned-embeddings", "token-embeddings", "character-embeddings", "segment-embeddings", "token-type-embeddings", "semantic-similarity", "cosine-similarity", "embedding-space", "embedding-matrix", "lookup-table", "distributed-representations", "dense-vectors", "nlp", "natural-language-processing", "transformers", "bert", "gpt", "attention", "vocabulary", "tokenization", "pretrained-embeddings", "transfer-learning", "deep-learning", "neural-networks", "pytorch", "tensorflow", "keras"]
 classifiers = [
     "Development Status :: 3 - Alpha",
     "Intended Audience :: Developers",

{ilovetools-0.2.32 → ilovetools-0.2.33}/setup.py

@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="ilovetools",
-    version="0.2.32",
+    version="0.2.33",
     author="Ali Mehdi",
     author_email="ali.mehdi.dev579@gmail.com",
     description="A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs",
@@ -58,7 +58,7 @@ setup(
             "soundfile>=0.12.0",
         ],
     },
-    keywords="utilities, tools, ai, ml, data-processing, automation, python-library, neural-networks,
+    keywords="utilities, tools, ai, ml, data-processing, automation, python-library, neural-networks, embeddings, word-embeddings, word2vec, glove, fasttext, positional-encoding, learned-embeddings, token-embeddings, character-embeddings, semantic-similarity, embedding-space, nlp, natural-language-processing, transformers, bert, deep-learning, pytorch, tensorflow, keras",
     project_urls={
         "Bug Reports": "https://github.com/AliMehdi512/ilovetools/issues",
        "Source": "https://github.com/AliMehdi512/ilovetools",
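
The remaining hunks only bump the version to 0.2.33 and extend the keyword lists, so a quick downstream check that the new module actually ships with the release might look like this (a sketch, assuming the release is published under the existing ilovetools name on the registry):

>>> # after upgrading: pip install --upgrade ilovetools
>>> from ilovetools.ml import embedding
>>> sorted(embedding.__all__)[:3]
['CharacterEmbedding', 'Embedding', 'LearnedPositionalEmbedding']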