ilovetools 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ilovetools/__init__.py +42 -0
- ilovetools/ai/__init__.py +13 -0
- ilovetools/ai/embeddings.py +270 -0
- ilovetools/ai/inference.py +5 -0
- ilovetools/ai/llm_helpers.py +141 -0
- ilovetools/audio/__init__.py +5 -0
- ilovetools/automation/__init__.py +5 -0
- ilovetools/conversion/__init__.py +5 -0
- ilovetools/data/__init__.py +27 -0
- ilovetools/data/feature_engineering.py +497 -0
- ilovetools/data/preprocessing.py +234 -0
- ilovetools/database/__init__.py +5 -0
- ilovetools/datetime/__init__.py +5 -0
- ilovetools/files/__init__.py +5 -0
- ilovetools/image/__init__.py +5 -0
- ilovetools/ml/__init__.py +603 -0
- ilovetools/ml/clustering.py +1107 -0
- ilovetools/ml/cross_validation.py +612 -0
- ilovetools/ml/dimensionality.py +1001 -0
- ilovetools/ml/ensemble.py +872 -0
- ilovetools/ml/feature_selection.py +971 -0
- ilovetools/ml/imbalanced.py +797 -0
- ilovetools/ml/interpretation.py +915 -0
- ilovetools/ml/metrics.py +601 -0
- ilovetools/ml/pipeline.py +711 -0
- ilovetools/ml/timeseries.py +984 -0
- ilovetools/ml/tuning.py +781 -0
- ilovetools/security/__init__.py +5 -0
- ilovetools/text/__init__.py +5 -0
- ilovetools/utils/__init__.py +5 -0
- ilovetools/validation/__init__.py +5 -0
- ilovetools/web/__init__.py +5 -0
- ilovetools-0.2.3.dist-info/METADATA +143 -0
- ilovetools-0.2.3.dist-info/RECORD +38 -0
- ilovetools-0.2.3.dist-info/WHEEL +5 -0
- ilovetools-0.2.3.dist-info/licenses/LICENSE +21 -0
- ilovetools-0.2.3.dist-info/top_level.txt +2 -0
- tests/__init__.py +3 -0
ilovetools/ml/dimensionality.py
@@ -0,0 +1,1001 @@
"""
Dimensionality reduction utilities
Each function has TWO names: full descriptive name + abbreviated alias
"""

from typing import List, Dict, Any, Tuple, Optional
import math

__all__ = [
    # Full names
    'pca_transform',
    'explained_variance_ratio',
    'scree_plot_data',
    'cumulative_variance',
    'pca_inverse_transform',
    'truncated_svd',
    'kernel_pca_transform',
    'incremental_pca_transform',
    'feature_projection',
    'dimensionality_reduction_ratio',
    'reconstruction_error',
    'optimal_components',
    'whitening_transform',
    'component_loadings',
    'biplot_data',
    # Abbreviated aliases
    'pca',
    'exp_var',
    'scree_plot',
    'cum_var',
    'pca_inverse',
    'svd',
    'kpca',
    'ipca',
    'project',
    'dim_ratio',
    'recon_error',
    'opt_components',
    'whiten',
    'loadings',
    'biplot',
]

def pca_transform(
    X: List[List[float]],
    n_components: int
) -> Tuple[List[List[float]], Dict[str, Any]]:
    """
    Principal Component Analysis (PCA) transformation.

    Alias: pca()

    Args:
        X: Feature data (samples x features)
        n_components: Number of components to keep

    Returns:
        tuple: (X_transformed, pca_info)

    Examples:
        >>> from ilovetools.ml import pca  # Short alias

        >>> X = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
        >>> X_pca, info = pca(X, n_components=2)
        >>> print(len(X_pca[0]))
        2
        >>> print('explained_variance' in info)
        True

        >>> from ilovetools.ml import pca_transform  # Full name
        >>> X_pca, info = pca_transform(X, n_components=2)

    Notes:
        - Linear dimensionality reduction
        - Maximizes variance
        - Orthogonal components
        - Fast and interpretable
    """
    # Center the data
    n_samples = len(X)
    n_features = len(X[0])

    # Calculate mean
    means = [sum(X[i][j] for i in range(n_samples)) / n_samples
             for j in range(n_features)]

    # Center data
    X_centered = [
        [X[i][j] - means[j] for j in range(n_features)]
        for i in range(n_samples)
    ]

    # Calculate covariance matrix (simplified)
    cov_matrix = []
    for i in range(n_features):
        row = []
        for j in range(n_features):
            cov = sum(X_centered[k][i] * X_centered[k][j]
                      for k in range(n_samples)) / (n_samples - 1)
            row.append(cov)
        cov_matrix.append(row)

    # Simplified eigenvalue/eigenvector computation (power iteration)
    # In production, use numpy.linalg.eig
    components = []
    eigenvalues = []

    for _ in range(min(n_components, n_features)):
        # Initialize vector (uniform starting point for power iteration)
        v = [1.0 / math.sqrt(n_features)] * n_features

        # Power iteration (simplified)
        for _ in range(100):
            # Multiply by covariance matrix
            Av = [sum(cov_matrix[i][j] * v[j] for j in range(n_features))
                  for i in range(n_features)]

            # Normalize
            norm = math.sqrt(sum(x**2 for x in Av))
            if norm > 0:
                v = [x / norm for x in Av]

        components.append(v)

        # Approximate eigenvalue (Rayleigh quotient of the unit vector v)
        eigenvalue = sum(sum(cov_matrix[i][j] * v[i] * v[j]
                             for j in range(n_features))
                         for i in range(n_features))
        eigenvalues.append(max(0, eigenvalue))

        # Deflate the covariance matrix so the next power iteration
        # converges to the next orthogonal component instead of
        # repeating the dominant one
        for i in range(n_features):
            for j in range(n_features):
                cov_matrix[i][j] -= eigenvalue * v[i] * v[j]

    # Transform data
    X_transformed = []
    for sample in X_centered:
        transformed = [
            sum(sample[j] * components[i][j] for j in range(n_features))
            for i in range(len(components))
        ]
        X_transformed.append(transformed)

    # Calculate explained variance
    total_var = sum(eigenvalues)
    explained_var = [ev / total_var if total_var > 0 else 0
                     for ev in eigenvalues]

    pca_info = {
        'components': components,
        'eigenvalues': eigenvalues,
        'explained_variance': explained_var,
        'means': means,
        'n_components': n_components,
    }

    return X_transformed, pca_info


# Create alias
pca = pca_transform

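# A minimal cross-check sketch (illustrative only; it assumes NumPy is
# available, which this module deliberately avoids importing): the components
# returned above can be compared, up to sign, with a dense eigendecomposition.
#
#     import numpy as np
#     X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=float)
#     Xc = X - X.mean(axis=0)
#     evals, evecs = np.linalg.eigh(np.cov(Xc, rowvar=False))
#     order = np.argsort(evals)[::-1]        # eigh returns ascending order
#     top2 = evecs[:, order[:2]].T           # rows comparable to info['components']
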
def explained_variance_ratio(
    eigenvalues: List[float]
) -> List[float]:
    """
    Calculate explained variance ratio.

    Alias: exp_var()

    Args:
        eigenvalues: List of eigenvalues

    Returns:
        list: Explained variance ratios

    Examples:
        >>> from ilovetools.ml import exp_var  # Short alias

        >>> eigenvalues = [10.0, 5.0, 2.0, 1.0]
        >>> ratios = exp_var(eigenvalues)
        >>> print(ratios[0] > ratios[1])
        True
        >>> print(sum(ratios))
        1.0

        >>> from ilovetools.ml import explained_variance_ratio  # Full name
        >>> ratios = explained_variance_ratio(eigenvalues)

    Notes:
        - Shows component importance
        - Sums to 1.0
        - Higher = more important
        - Use for component selection
    """
    total = sum(eigenvalues)
    if total == 0:
        return [0.0] * len(eigenvalues)
    return [ev / total for ev in eigenvalues]


# Create alias
exp_var = explained_variance_ratio

def scree_plot_data(
    eigenvalues: List[float]
) -> Dict[str, List]:
    """
    Generate scree plot data.

    Alias: scree_plot()

    Args:
        eigenvalues: List of eigenvalues

    Returns:
        dict: Scree plot data

    Examples:
        >>> from ilovetools.ml import scree_plot  # Short alias

        >>> eigenvalues = [10.0, 5.0, 2.0, 1.0, 0.5]
        >>> data = scree_plot(eigenvalues)
        >>> print(len(data['components']))
        5
        >>> print(data['eigenvalues'][0] > data['eigenvalues'][1])
        True

        >>> from ilovetools.ml import scree_plot_data  # Full name
        >>> data = scree_plot_data(eigenvalues)

    Notes:
        - Visualize component importance
        - Find elbow point
        - Decide number of components
        - Essential for PCA
    """
    n_components = len(eigenvalues)
    variance_ratios = explained_variance_ratio(eigenvalues)

    return {
        'components': list(range(1, n_components + 1)),
        'eigenvalues': eigenvalues,
        'variance_ratios': variance_ratios,
        'n_components': n_components,
    }


# Create alias
scree_plot = scree_plot_data

def cumulative_variance(
    variance_ratios: List[float]
) -> List[float]:
    """
    Calculate cumulative variance explained.

    Alias: cum_var()

    Args:
        variance_ratios: Explained variance ratios

    Returns:
        list: Cumulative variance

    Examples:
        >>> from ilovetools.ml import cum_var  # Short alias

        >>> variance_ratios = [0.5, 0.3, 0.15, 0.05]
        >>> cumulative = cum_var(variance_ratios)
        >>> print(cumulative[-1])
        1.0
        >>> print(cumulative[0])
        0.5

        >>> from ilovetools.ml import cumulative_variance  # Full name
        >>> cumulative = cumulative_variance(variance_ratios)

    Notes:
        - Track total variance
        - Aim for 95%+
        - Choose optimal components
        - Essential metric
    """
    cumulative = []
    total = 0
    for ratio in variance_ratios:
        total += ratio
        cumulative.append(total)
    return cumulative


# Create alias
cum_var = cumulative_variance

def pca_inverse_transform(
    X_transformed: List[List[float]],
    pca_info: Dict[str, Any]
) -> List[List[float]]:
    """
    Inverse PCA transformation.

    Alias: pca_inverse()

    Args:
        X_transformed: Transformed data
        pca_info: PCA information from pca_transform

    Returns:
        list: Reconstructed data

    Examples:
        >>> from ilovetools.ml import pca, pca_inverse  # Short aliases

        >>> X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
        >>> X_pca, info = pca(X, n_components=2)
        >>> X_reconstructed = pca_inverse(X_pca, info)
        >>> print(len(X_reconstructed[0]))
        3

        >>> from ilovetools.ml import pca_inverse_transform  # Full name
        >>> X_reconstructed = pca_inverse_transform(X_pca, info)

    Notes:
        - Reconstruct original data
        - Measure information loss
        - Validate PCA quality
        - Useful for compression
    """
    components = pca_info['components']
    means = pca_info['means']
    n_features = len(means)

    X_reconstructed = []
    for sample in X_transformed:
        # Multiply by components (transpose)
        reconstructed = [
            sum(sample[i] * components[i][j]
                for i in range(len(sample)))
            for j in range(n_features)
        ]

        # Add back mean
        reconstructed = [reconstructed[j] + means[j]
                         for j in range(n_features)]

        X_reconstructed.append(reconstructed)

    return X_reconstructed


# Create alias
pca_inverse = pca_inverse_transform

def truncated_svd(
    X: List[List[float]],
    n_components: int
) -> Tuple[List[List[float]], Dict[str, Any]]:
    """
    Truncated Singular Value Decomposition.

    Alias: svd()

    Args:
        X: Feature data
        n_components: Number of components

    Returns:
        tuple: (X_transformed, svd_info)

    Examples:
        >>> from ilovetools.ml import svd  # Short alias

        >>> X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
        >>> X_svd, info = svd(X, n_components=2)
        >>> print(len(X_svd[0]))
        2

        >>> from ilovetools.ml import truncated_svd  # Full name
        >>> X_svd, info = truncated_svd(X, n_components=2)

    Notes:
        - Works with sparse data
        - No centering required
        - Used in LSA, recommenders
        - Faster than PCA
    """
    # Simplified SVD (similar to PCA but without centering)
    n_samples = len(X)
    n_features = len(X[0])

    # Calculate X^T X
    XTX = []
    for i in range(n_features):
        row = []
        for j in range(n_features):
            val = sum(X[k][i] * X[k][j] for k in range(n_samples))
            row.append(val)
        XTX.append(row)

    # Power iteration for singular vectors
    components = []
    singular_values = []

    for _ in range(min(n_components, n_features)):
        v = [1.0 / math.sqrt(n_features)] * n_features

        for _ in range(100):
            Av = [sum(XTX[i][j] * v[j] for j in range(n_features))
                  for i in range(n_features)]
            norm = math.sqrt(sum(x**2 for x in Av))
            if norm > 0:
                v = [x / norm for x in Av]

        components.append(v)

        # Singular value
        sv = math.sqrt(max(0, sum(sum(XTX[i][j] * v[i] * v[j]
                                      for j in range(n_features))
                                  for i in range(n_features))))
        singular_values.append(sv)

        # Deflate X^T X so the next iteration converges to the next
        # singular vector instead of repeating the dominant one
        for i in range(n_features):
            for j in range(n_features):
                XTX[i][j] -= (sv * sv) * v[i] * v[j]

    # Transform
    X_transformed = []
    for sample in X:
        transformed = [
            sum(sample[j] * components[i][j] for j in range(n_features))
            for i in range(len(components))
        ]
        X_transformed.append(transformed)

    svd_info = {
        'components': components,
        'singular_values': singular_values,
        'n_components': n_components,
    }

    return X_transformed, svd_info


# Create alias
svd = truncated_svd

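# Library-backed equivalent for reference (illustrative sketch; NumPy is an
# assumption here, not a dependency of ilovetools): the same truncated
# projection is U[:, :k] * S[:k] from a full SVD.
#
#     import numpy as np
#     U, S, Vt = np.linalg.svd(np.array(X, dtype=float), full_matrices=False)
#     X_svd_np = U[:, :2] * S[:2]            # matches truncated_svd up to sign
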
def kernel_pca_transform(
    X: List[List[float]],
    n_components: int,
    kernel: str = 'rbf',
    gamma: float = 1.0
) -> Tuple[List[List[float]], Dict[str, Any]]:
    """
    Kernel PCA transformation (non-linear).

    Alias: kpca()

    Args:
        X: Feature data
        n_components: Number of components
        kernel: Kernel type ('rbf' or 'linear'; other values fall back to linear)
        gamma: Kernel coefficient

    Returns:
        tuple: (X_transformed, kpca_info)

    Examples:
        >>> from ilovetools.ml import kpca  # Short alias

        >>> X = [[1, 2], [2, 3], [3, 4], [4, 5]]
        >>> X_kpca, info = kpca(X, n_components=2, kernel='rbf')
        >>> print(len(X_kpca[0]))
        2

        >>> from ilovetools.ml import kernel_pca_transform  # Full name
        >>> X_kpca, info = kernel_pca_transform(X, n_components=2)

    Notes:
        - Non-linear PCA
        - Captures complex patterns
        - Uses kernel trick
        - More powerful than PCA
    """
    n_samples = len(X)

    # Compute kernel matrix
    K = []
    for i in range(n_samples):
        row = []
        for j in range(n_samples):
            if kernel == 'rbf':
                # RBF kernel
                diff = sum((X[i][k] - X[j][k])**2 for k in range(len(X[0])))
                k_val = math.exp(-gamma * diff)
            elif kernel == 'linear':
                # Linear kernel
                k_val = sum(X[i][k] * X[j][k] for k in range(len(X[0])))
            else:
                # Default to linear
                k_val = sum(X[i][k] * X[j][k] for k in range(len(X[0])))
            row.append(k_val)
        K.append(row)

    # Center kernel matrix
    row_means = [sum(K[i]) / n_samples for i in range(n_samples)]
    total_mean = sum(row_means) / n_samples

    K_centered = [
        [K[i][j] - row_means[i] - row_means[j] + total_mean
         for j in range(n_samples)]
        for i in range(n_samples)
    ]

    # Eigendecomposition (simplified power iteration on a working copy,
    # deflated after each extracted eigenvector so successive iterations
    # do not all return the dominant direction)
    K_work = [row[:] for row in K_centered]
    eigenvectors = []
    eigenvalues = []

    for _ in range(min(n_components, n_samples)):
        v = [1.0 / math.sqrt(n_samples)] * n_samples

        for _ in range(100):
            Kv = [sum(K_work[i][j] * v[j] for j in range(n_samples))
                  for i in range(n_samples)]
            norm = math.sqrt(sum(x**2 for x in Kv))
            if norm > 0:
                v = [x / norm for x in Kv]

        eigenvectors.append(v)

        eigenvalue = sum(sum(K_work[i][j] * v[i] * v[j]
                             for j in range(n_samples))
                         for i in range(n_samples))
        eigenvalues.append(max(0, eigenvalue))

        # Deflate the working copy before extracting the next eigenvector
        for i in range(n_samples):
            for j in range(n_samples):
                K_work[i][j] -= eigenvalue * v[i] * v[j]

    # Transform (project the centered kernel onto the eigenvectors)
    X_transformed = [
        [sum(K_centered[i][j] * eigenvectors[k][j] / math.sqrt(eigenvalues[k])
             if eigenvalues[k] > 0 else 0
             for j in range(n_samples))
         for k in range(len(eigenvectors))]
        for i in range(n_samples)
    ]

    kpca_info = {
        'eigenvectors': eigenvectors,
        'eigenvalues': eigenvalues,
        'kernel': kernel,
        'gamma': gamma,
        'X_fit': X,
    }

    return X_transformed, kpca_info


# Create alias
kpca = kernel_pca_transform

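# For comparison (illustrative sketch, assuming scikit-learn is installed;
# it is not a dependency of this package):
#
#     from sklearn.decomposition import KernelPCA
#     X_kpca_skl = KernelPCA(n_components=2, kernel='rbf', gamma=1.0).fit_transform(X)
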
def incremental_pca_transform(
    X: List[List[float]],
    n_components: int,
    batch_size: int = 100
) -> Tuple[List[List[float]], Dict[str, Any]]:
    """
    Incremental PCA for large datasets.

    Alias: ipca()

    Args:
        X: Feature data
        n_components: Number of components
        batch_size: Batch size for processing

    Returns:
        tuple: (X_transformed, ipca_info)

    Examples:
        >>> from ilovetools.ml import ipca  # Short alias

        >>> X = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
        >>> X_ipca, info = ipca(X, n_components=2, batch_size=2)
        >>> print(len(X_ipca[0]))
        2

        >>> from ilovetools.ml import incremental_pca_transform  # Full name
        >>> X_ipca, info = incremental_pca_transform(X, n_components=2)

    Notes:
        - Memory efficient
        - For large datasets
        - Batch processing
        - Similar to PCA
        - This simplified version currently falls back to pca_transform
          (batch_size is ignored)
    """
    # For simplicity, use regular PCA
    # In production, implement true incremental PCA
    return pca_transform(X, n_components)


# Create alias
ipca = incremental_pca_transform

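# A sketch of what a genuinely batch-wise version could use (illustrative;
# scikit-learn is an assumption here, not a dependency of this package):
#
#     from sklearn.decomposition import IncrementalPCA
#     model = IncrementalPCA(n_components=2, batch_size=2)
#     X_out = model.fit_transform(X)         # processes X in mini-batches
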
def feature_projection(
    X: List[List[float]],
    components: List[List[float]]
) -> List[List[float]]:
    """
    Project features onto components.

    Alias: project()

    Args:
        X: Feature data
        components: Component vectors

    Returns:
        list: Projected data

    Examples:
        >>> from ilovetools.ml import project  # Short alias

        >>> X = [[1, 2, 3], [4, 5, 6]]
        >>> components = [[0.5, 0.5, 0.5], [0.7, 0.2, 0.1]]
        >>> X_proj = project(X, components)
        >>> print(len(X_proj[0]))
        2

        >>> from ilovetools.ml import feature_projection  # Full name
        >>> X_proj = feature_projection(X, components)

    Notes:
        - Generic projection
        - Works with any components
        - Flexible utility
        - Core operation
    """
    n_features = len(X[0])
    X_projected = []

    for sample in X:
        projected = [
            sum(sample[j] * components[i][j] for j in range(n_features))
            for i in range(len(components))
        ]
        X_projected.append(projected)

    return X_projected


# Create alias
project = feature_projection

def dimensionality_reduction_ratio(
    original_dims: int,
    reduced_dims: int
) -> Dict[str, float]:
    """
    Calculate dimensionality reduction ratio.

    Alias: dim_ratio()

    Args:
        original_dims: Original number of dimensions
        reduced_dims: Reduced number of dimensions

    Returns:
        dict: Reduction statistics

    Examples:
        >>> from ilovetools.ml import dim_ratio  # Short alias

        >>> stats = dim_ratio(1000, 50)
        >>> print(stats['reduction_ratio'])
        0.95
        >>> print(stats['compression_factor'])
        20.0

        >>> from ilovetools.ml import dimensionality_reduction_ratio
        >>> stats = dimensionality_reduction_ratio(1000, 50)

    Notes:
        - Measure reduction
        - Compression factor
        - Space savings
        - Performance metric
    """
    reduction_ratio = (original_dims - reduced_dims) / original_dims
    compression_factor = original_dims / reduced_dims if reduced_dims > 0 else 0

    return {
        'original_dims': original_dims,
        'reduced_dims': reduced_dims,
        'reduction_ratio': reduction_ratio,
        'compression_factor': compression_factor,
        'retained_ratio': 1 - reduction_ratio,
    }


# Create alias
dim_ratio = dimensionality_reduction_ratio

def reconstruction_error(
    X_original: List[List[float]],
    X_reconstructed: List[List[float]]
) -> Dict[str, float]:
    """
    Calculate reconstruction error.

    Alias: recon_error()

    Args:
        X_original: Original data
        X_reconstructed: Reconstructed data

    Returns:
        dict: Error metrics

    Examples:
        >>> from ilovetools.ml import recon_error  # Short alias

        >>> X_orig = [[1, 2, 3], [4, 5, 6]]
        >>> X_recon = [[1.1, 2.1, 2.9], [3.9, 5.1, 6.1]]
        >>> error = recon_error(X_orig, X_recon)
        >>> print(error['mse'] > 0)
        True

        >>> from ilovetools.ml import reconstruction_error  # Full name
        >>> error = reconstruction_error(X_orig, X_recon)

    Notes:
        - Measure information loss
        - Lower = better
        - Validate reduction
        - Quality metric
    """
    n_samples = len(X_original)
    n_features = len(X_original[0])

    # Mean Squared Error
    mse = sum(
        sum((X_original[i][j] - X_reconstructed[i][j])**2
            for j in range(n_features))
        for i in range(n_samples)
    ) / (n_samples * n_features)

    # Root Mean Squared Error
    rmse = math.sqrt(mse)

    # Mean Absolute Error
    mae = sum(
        sum(abs(X_original[i][j] - X_reconstructed[i][j])
            for j in range(n_features))
        for i in range(n_samples)
    ) / (n_samples * n_features)

    return {
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
    }


# Create alias
recon_error = reconstruction_error

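# Typical round trip for judging information loss, using only the helpers
# defined in this module:
#
#     X = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
#     X_pca, info = pca_transform(X, n_components=2)
#     X_back = pca_inverse_transform(X_pca, info)
#     print(reconstruction_error(X, X_back)['rmse'])   # lower is better
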
def optimal_components(
    variance_ratios: List[float],
    threshold: float = 0.95
) -> Dict[str, Any]:
    """
    Find optimal number of components.

    Alias: opt_components()

    Args:
        variance_ratios: Explained variance ratios
        threshold: Variance threshold (default 0.95)

    Returns:
        dict: Optimal component info

    Examples:
        >>> from ilovetools.ml import opt_components  # Short alias

        >>> variance_ratios = [0.5, 0.3, 0.15, 0.04, 0.01]
        >>> result = opt_components(variance_ratios, threshold=0.95)
        >>> print(result['n_components'])
        3

        >>> from ilovetools.ml import optimal_components  # Full name
        >>> result = optimal_components(variance_ratios)

    Notes:
        - Automatic selection
        - Based on variance
        - Common threshold: 95%
        - Saves manual tuning
    """
    cumulative = cumulative_variance(variance_ratios)

    n_components = 0
    for i, cum_var in enumerate(cumulative):
        if cum_var >= threshold:
            n_components = i + 1
            break

    if n_components == 0:
        n_components = len(variance_ratios)

    return {
        'n_components': n_components,
        'threshold': threshold,
        'variance_explained': cumulative[n_components - 1] if n_components > 0 else 0,
        'cumulative_variance': cumulative,
    }


# Create alias
opt_components = optimal_components

def whitening_transform(
    X: List[List[float]],
    pca_info: Dict[str, Any]
) -> List[List[float]]:
    """
    Whitening transformation (decorrelate and normalize).

    Alias: whiten()

    Args:
        X: Feature data
        pca_info: PCA information

    Returns:
        list: Whitened data

    Examples:
        >>> from ilovetools.ml import pca, whiten  # Short aliases

        >>> X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
        >>> X_pca, info = pca(X, n_components=2)
        >>> X_white = whiten(X, info)
        >>> print(len(X_white[0]))
        2

        >>> from ilovetools.ml import whitening_transform  # Full name
        >>> X_white = whitening_transform(X, info)

    Notes:
        - Decorrelate features
        - Unit variance
        - Improves learning
        - Common preprocessing
    """
    components = pca_info['components']
    eigenvalues = pca_info['eigenvalues']
    means = pca_info['means']
    n_features = len(means)

    # Center data
    X_centered = [
        [X[i][j] - means[j] for j in range(n_features)]
        for i in range(len(X))
    ]

    # Transform and normalize by sqrt(eigenvalue)
    X_whitened = []
    for sample in X_centered:
        whitened = [
            (sum(sample[j] * components[i][j] for j in range(n_features))
             / math.sqrt(eigenvalues[i])) if eigenvalues[i] > 0 else 0
            for i in range(len(components))
        ]
        X_whitened.append(whitened)

    return X_whitened


# Create alias
whiten = whitening_transform

def component_loadings(
    components: List[List[float]],
    eigenvalues: List[float],
    feature_names: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Calculate component loadings (correlations).

    Alias: loadings()

    Args:
        components: Principal components
        eigenvalues: Eigenvalues
        feature_names: Optional feature names

    Returns:
        dict: Loading information

    Examples:
        >>> from ilovetools.ml import loadings  # Short alias

        >>> components = [[0.7, 0.7], [0.7, -0.7]]
        >>> eigenvalues = [2.0, 0.5]
        >>> result = loadings(components, eigenvalues)
        >>> print(len(result['loadings']))
        2

        >>> from ilovetools.ml import component_loadings  # Full name
        >>> result = component_loadings(components, eigenvalues)

    Notes:
        - Interpret components
        - Feature contributions
        - Correlation with PCs
        - Essential for interpretation
    """
    n_components = len(components)
    n_features = len(components[0])

    if feature_names is None:
        feature_names = [f'Feature_{i}' for i in range(n_features)]

    # Calculate loadings (component * sqrt(eigenvalue))
    loadings_matrix = [
        [components[i][j] * math.sqrt(eigenvalues[i])
         for j in range(n_features)]
        for i in range(n_components)
    ]

    return {
        'loadings': loadings_matrix,
        'feature_names': feature_names,
        'n_components': n_components,
        'n_features': n_features,
    }


# Create alias
loadings = component_loadings

def biplot_data(
    X_transformed: List[List[float]],
    pca_info: Dict[str, Any],
    feature_names: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Generate biplot data (samples + loadings).

    Alias: biplot()

    Args:
        X_transformed: Transformed data
        pca_info: PCA information
        feature_names: Optional feature names

    Returns:
        dict: Biplot data

    Examples:
        >>> from ilovetools.ml import pca, biplot  # Short aliases

        >>> X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
        >>> X_pca, info = pca(X, n_components=2)
        >>> data = biplot(X_pca, info, ['A', 'B', 'C'])
        >>> print('scores' in data)
        True
        >>> print('loadings' in data)
        True

        >>> from ilovetools.ml import biplot_data  # Full name
        >>> data = biplot_data(X_pca, info)

    Notes:
        - Visualize samples and features
        - Interpret relationships
        - Essential for PCA
        - Combines scores and loadings
    """
    components = pca_info['components']
    eigenvalues = pca_info['eigenvalues']

    # Get loadings
    loading_info = component_loadings(components, eigenvalues, feature_names)

    return {
        'scores': X_transformed,
        'loadings': loading_info['loadings'],
        'feature_names': loading_info['feature_names'],
        'explained_variance': pca_info['explained_variance'],
    }


# Create alias
biplot = biplot_data
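

# A minimal demo sketch tying the helpers above together (illustrative only);
# it runs when the module is executed directly and uses nothing beyond the
# functions defined in this file.
if __name__ == '__main__':
    X_demo = [[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0]]
    X_low, info = pca_transform(X_demo, n_components=1)
    ratios = explained_variance_ratio(info['eigenvalues'])
    choice = optimal_components(ratios, threshold=0.95)
    X_back = pca_inverse_transform(X_low, info)
    err = reconstruction_error(X_demo, X_back)
    print('explained variance ratios:', ratios)
    print('components for 95% variance:', choice['n_components'])
    print('reconstruction RMSE:', err['rmse'])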