itertoolkit 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bm_preprocessing/__init__.py +14 -0
- bm_preprocessing/importer/DM/__init__.py +7 -0
- bm_preprocessing/importer/DM/agg.py +6 -0
- bm_preprocessing/importer/DM/dbscan.py +6 -0
- bm_preprocessing/importer/DM/finals.py +6 -0
- bm_preprocessing/importer/DM/gsp.py +6 -0
- bm_preprocessing/importer/DM/test.py +6 -0
- bm_preprocessing/importer/Finals/__init__.py +7 -0
- bm_preprocessing/importer/Finals/kaadhal.py +6 -0
- bm_preprocessing/importer/Finals/raaka.py +6 -0
- bm_preprocessing/importer/Finals/seedan.py +6 -0
- bm_preprocessing/importer/Finals/vikram.py +6 -0
- bm_preprocessing/importer/IR/__init__.py +6 -0
- bm_preprocessing/importer/IR/finals.py +6 -0
- bm_preprocessing/importer/IR/pagerank.py +6 -0
- bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
- bm_preprocessing/importer/IR/test.py +6 -0
- bm_preprocessing/importer/PY/__init__.py +4 -0
- bm_preprocessing/importer/PY/lib_doc.py +6 -0
- bm_preprocessing/importer/PY/python_doc.py +6 -0
- bm_preprocessing/importer/__init__.py +8 -0
- bm_preprocessing/importer/_module_printer.py +23 -0
- bm_preprocessing/src/DM/__init__.py +1 -0
- bm_preprocessing/src/DM/agg.py +267 -0
- bm_preprocessing/src/DM/dbscan.py +218 -0
- bm_preprocessing/src/DM/finals.py +19 -0
- bm_preprocessing/src/DM/gsp.py +378 -0
- bm_preprocessing/src/DM/test.py +19 -0
- bm_preprocessing/src/Finals/__init__.py +1 -0
- bm_preprocessing/src/Finals/kaadhal.py +1453 -0
- bm_preprocessing/src/Finals/raaka.py +1338 -0
- bm_preprocessing/src/Finals/seedan.py +1173 -0
- bm_preprocessing/src/Finals/vikram.py +520 -0
- bm_preprocessing/src/IR/__init__.py +1 -0
- bm_preprocessing/src/IR/finals.py +14 -0
- bm_preprocessing/src/IR/pagerank.py +109 -0
- bm_preprocessing/src/IR/recommenders_pca.py +487 -0
- bm_preprocessing/src/IR/test.py +14 -0
- bm_preprocessing/src/PY/__init__.py +1 -0
- bm_preprocessing/src/PY/lib_doc.py +295 -0
- bm_preprocessing/src/PY/python_doc.py +177 -0
- bm_preprocessing/src/__init__.py +1 -0
- itertoolkit-1.5.0.dist-info/METADATA +120 -0
- itertoolkit-1.5.0.dist-info/RECORD +45 -0
- itertoolkit-1.5.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,487 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
import networkx as nx
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.decomposition import PCA as SklearnPCA
|
|
6
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
7
|
+
from sklearn.metrics import mean_absolute_error, mean_squared_error
|
|
8
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
9
|
+
|
|
10
|
+
# Session-wide defaults: figure size, and DataFrame printing that shows every
# column within a 120-character line.
plt.rcParams.update({"figure.figsize": (8, 5)})
pd.options.display.max_columns = None
pd.options.display.width = 120
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Item catalogue, one record per item: (item_id, title, keyword description).
# The description text is what the TF-IDF vectorizer consumes further down.
items_cb = pd.DataFrame.from_records(
    [
        (1, "Red Mars", "science fiction mars colony politics space future"),
        (2, "Jurassic Park", "science thriller dinosaurs genetics adventure park"),
        (3, "Lost World", "adventure dinosaurs island science thriller sequel"),
        (4, "2001", "science fiction space artificial intelligence mission"),
        (5, "Foundation", "classic science fiction empire foundation future space"),
        (
            6,
            "Difference Engine",
            "alternate history computing steam engine science fiction",
        ),
        (
            7,
            "Machine Learning",
            "artificial intelligence data algorithms prediction learning",
        ),
        (
            8,
            "Neuromancer",
            "cyberpunk hackers network dystopian future artificial intelligence",
        ),
        (9, "2010", "science fiction space mission sequel future"),
    ],
    columns=["item_id", "title", "description"],
)
|
|
42
|
+
|
|
43
|
+
# Example user ratings inspired by the books/movies personalization example in the slides
|
|
44
|
+
# Explicit ratings, one record per (user_id, item_id, rating) triple.
ratings_cb = pd.DataFrame.from_records(
    [
        (1, 1, 5),
        (1, 4, 4),
        (1, 5, 5),
        (1, 8, 4),
        (2, 2, 5),
        (2, 3, 4),
        (2, 7, 3),
    ],
    columns=["user_id", "item_id", "rating"],
)
|
|
51
|
+
|
|
52
|
+
# `display` is only injected as a builtin by IPython/Jupyter. Fall back to
# `print` so this module also runs as a plain Python script instead of
# raising NameError.
try:
    display
except NameError:
    display = print

print("Items used for content-based recommendation:")
display(items_cb)

print("Sample user ratings:")
display(ratings_cb)
|
|
57
|
+
|
|
58
|
+
"""### Step 1: Build item profiles using TF-IDF"""
|
|
59
|
+
|
|
60
|
+
# Item profiles: one TF-IDF row per item description (English stop words
# removed by the vectorizer).
item_descriptions = items_cb["description"]
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(item_descriptions)

print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Cosine similarity between items, labelled by title on both axes
item_titles = items_cb["title"]
item_similarity_cb = cosine_similarity(tfidf_matrix)
item_similarity_cb_df = pd.DataFrame(
    data=item_similarity_cb,
    index=item_titles,
    columns=item_titles,
)

item_similarity_cb_df.round(2)
|
|
72
|
+
|
|
73
|
+
"""### Step 2: Build a user profile from items rated highly"""
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def build_user_profile(user_id, ratings_df, items_df, tfidf_matrix, min_rating=4):
    """Build a rating-weighted content profile for one user.

    The profile is the weighted average of the item vectors (rows of
    ``tfidf_matrix``) of every item the user rated at ``min_rating`` or above,
    with the rating used as the weight.

    Args:
        user_id: Value matched against ``ratings_df["user_id"]``.
        ratings_df: DataFrame with ``user_id``, ``item_id`` and ``rating``
            columns.
        items_df: DataFrame with an ``item_id`` column. Row *i* (by position)
            must describe the same item as row *i* of ``tfidf_matrix``.
        tfidf_matrix: Item-feature matrix; a SciPy sparse matrix or a dense
            array.
        min_rating: Minimum rating for an item to count as liked.

    Returns:
        A ``(1, n_features)`` NumPy array, or ``None`` when the user has no
        liked item that appears in ``items_df``.
    """
    liked = ratings_df[
        (ratings_df["user_id"] == user_id) & (ratings_df["rating"] >= min_rating)
    ]
    # Exactly one weight per item, even if the user rated an item repeatedly.
    weight_by_item = liked.groupby("item_id")["rating"].mean()

    # Row *positions*, not index labels: the matrix is indexed positionally, so
    # this stays correct when items_df carries a non-default index.
    item_ids = items_df["item_id"].to_numpy()
    liked_pos = np.flatnonzero(
        items_df["item_id"].isin(weight_by_item.index).to_numpy()
    )
    if liked_pos.size == 0:
        return None

    # Weights follow the row order of items_df so they line up with liked_rows.
    weights = weight_by_item.loc[item_ids[liked_pos]].to_numpy(dtype=float)

    liked_rows = tfidf_matrix[liked_pos]
    if hasattr(liked_rows, "toarray"):  # SciPy sparse -> dense
        liked_rows = liked_rows.toarray()

    profile = np.average(np.asarray(liked_rows, dtype=float), axis=0, weights=weights)
    return profile.reshape(1, -1)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
user1_profile = build_user_profile(1, ratings_cb, items_cb, tfidf_matrix, min_rating=4)
# build_user_profile returns None when the user has no liked items; report
# that instead of failing on `.shape`.
print(
    "User 1 profile shape:",
    None if user1_profile is None else user1_profile.shape,
)
|
|
100
|
+
|
|
101
|
+
"""### Step 3: Match user profile to item profiles"""
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def recommend_content_based(
    user_id, ratings_df, items_df, tfidf_matrix, top_n=5, min_rating=4
):
    """Recommend unseen items whose content best matches the user's profile.

    Args:
        user_id: Value matched against ``ratings_df["user_id"]``.
        ratings_df: DataFrame with ``user_id``, ``item_id`` and ``rating``
            columns.
        items_df: DataFrame with ``item_id`` and ``title`` columns, row-aligned
            with ``tfidf_matrix``.
        tfidf_matrix: Item-feature matrix used both for the profile and for
            scoring.
        top_n: Maximum number of recommendations to return.
        min_rating: Rating threshold forwarded to ``build_user_profile``
            (previously fixed at 4, which remains the default).

    Returns:
        DataFrame with ``item_id``, ``title`` and ``score`` columns, sorted by
        descending score and excluding items the user already rated. It is
        empty when no profile can be built for the user.
    """
    user_profile = build_user_profile(
        user_id, ratings_df, items_df, tfidf_matrix, min_rating=min_rating
    )
    if user_profile is None:
        return pd.DataFrame(columns=["item_id", "title", "score"])

    # Score every item against the profile on a copy, so the caller's frame
    # is left untouched.
    scored = items_df.copy()
    scored["score"] = cosine_similarity(user_profile, tfidf_matrix).flatten()

    # Never recommend something the user has already rated, whatever the rating.
    already_rated = ratings_df.loc[ratings_df["user_id"] == user_id, "item_id"]
    unseen = scored[~scored["item_id"].isin(already_rated)]

    ranked = unseen.sort_values("score", ascending=False)
    return ranked[["item_id", "title", "score"]].head(top_n)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# Top-5 content-based recommendations for user 1.
cb_recommendations = recommend_content_based(
    user_id=1,
    ratings_df=ratings_cb,
    items_df=items_cb,
    tfidf_matrix=tfidf_matrix,
    top_n=5,
)
cb_recommendations
|
|
127
|
+
|
|
128
|
+
"""
|
|
129
|
+
### Evaluation metrics
|
|
130
|
+
The recommender slides mention metrics such as:
|
|
131
|
+
- RMSE
|
|
132
|
+
- Precision at top-k
|
|
133
|
+
- Rank correlation
|
|
134
|
+
|
|
135
|
+
For a compact lab exam notebook, for this content-based section we demonstrate:
|
|
136
|
+
- **Precision@K**
|
|
137
|
+
- **Recall@K**
|
|
138
|
+
|
|
139
|
+
Here we simulate a tiny held-out relevant set for User 1.
|
|
140
|
+
"""
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def precision_at_k(recommended, relevant, k):
    """Return the fraction of the top-``k`` recommendations that are relevant.

    Args:
        recommended: Ranked sequence of recommended item ids, best first.
        relevant: Iterable of item ids considered relevant.
        k: Cut-off rank.

    Returns:
        ``hits / k``, where ``hits`` counts the distinct relevant ids among the
        first ``k`` recommendations; ``0`` when ``k`` is falsy or negative.
    """
    # A negative k would slice from the end and produce a negative precision.
    if not k or k < 0:
        return 0
    hits = len(set(recommended[:k]) & set(relevant))
    return hits / k
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def recall_at_k(recommended, relevant, k):
    """Return the fraction of relevant items found in the top-``k`` list.

    Args:
        recommended: Ranked sequence of recommended item ids, best first.
        relevant: Iterable of item ids considered relevant; duplicates are
            counted once.
        k: Cut-off rank.

    Returns:
        ``hits / number of distinct relevant ids``, or ``0`` when there are no
        relevant ids.
    """
    # De-duplicate first: hits are counted over a set, so the denominator must
    # be too. This also avoids truth-testing array-like inputs.
    relevant_ids = set(relevant)
    if not relevant_ids:
        return 0
    hits = len(set(recommended[:k]) & relevant_ids)
    return hits / len(relevant_ids)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
recommended_items = cb_recommendations["item_id"].tolist()
# Simulated held-out set of items assumed relevant for user 1.
relevant_items = [9, 6]

print("Recommended item IDs:", recommended_items)
print("Relevant item IDs (held-out assumption):", relevant_items)
# Both metrics are evaluated at cut-off 3 and rounded to three decimals.
for metric_label, metric_fn in (
    ("Precision@3 =", precision_at_k),
    ("Recall@3 =", recall_at_k),
):
    print(metric_label, round(metric_fn(recommended_items, relevant_items, 3), 3))
|
|
162
|
+
|
|
163
|
+
"""### Visualizations"""
|
|
164
|
+
|
|
165
|
+
# Figure 1: heat map of the item-item cosine similarity matrix.
heatmap_cols = item_similarity_cb_df.columns
heatmap_rows = item_similarity_cb_df.index
plt.imshow(item_similarity_cb_df.values, aspect="auto")
plt.colorbar(label="Cosine similarity")
plt.xticks(range(len(heatmap_cols)), heatmap_cols, rotation=90)
plt.yticks(range(len(heatmap_rows)), heatmap_rows)
plt.title("Content-Based Item Similarity Matrix")
plt.tight_layout()
plt.show()

# Figure 2: similarity score of each recommended title for user 1.
recommended_titles = cb_recommendations["title"]
recommended_scores = cb_recommendations["score"]
plt.bar(recommended_titles, recommended_scores)
plt.xticks(rotation=45, ha="right")
plt.ylabel("Similarity score")
plt.title("Top Content-Based Recommendations for User 1")
plt.tight_layout()
plt.show()
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# User x item rating matrix for the user-based CF example. NaN marks the
# rating to predict; the similarity code further down treats 0 as "not rated".
cf_matrix = pd.DataFrame.from_dict(
    {
        "U1": [2, 3, np.nan, 1, 3, 8],
        "U2": [0, 3, 1, 4, 6, 7],
        "U3": [3, 0, 0, 3, 4, 6],
        "U4": [9, 5, 1, 5, 0, 7],
        "U5": [3, 4, 6, 7, 9, 9],
        "U6": [4, 0, 1, 4, 8, 0],
        "U7": [2, 4, 0, 0, 0, 8],
    },
    orient="index",
    columns=["I1", "I2", "I3", "I4", "I5", "I6"],
)

cf_matrix
|
|
200
|
+
|
|
201
|
+
"""### Step 1: Define Pearson similarity on co-rated items only"""
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def pearson_similarity(user_a, user_b):
    """Pearson correlation between two users over their co-rated items.

    An item counts as co-rated when both Series hold a value for it that is
    neither missing nor zero. Means are taken over the co-rated items only.

    Args:
        user_a: pandas Series of one user's ratings.
        user_b: pandas Series of another user's ratings, on the same index.

    Returns:
        The correlation coefficient, or ``0.0`` when fewer than two items are
        co-rated or either user's co-rated ratings have zero variance.
    """
    rated_by_a = user_a.notna() & (user_a != 0)
    rated_by_b = user_b.notna() & (user_b != 0)
    co_rated = rated_by_a & rated_by_b

    xs, ys = user_a[co_rated], user_b[co_rated]
    if len(xs) < 2:
        return 0.0

    dx = xs - xs.mean()
    dy = ys - ys.mean()

    norm_product = np.sqrt((dx**2).sum()) * np.sqrt((dy**2).sum())
    if norm_product == 0:
        return 0.0
    return (dx * dy).sum() / norm_product
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
target_user = "U1"

# Pearson similarity between the target user and every other user.
target_row = cf_matrix.loc[target_user]
sims = {
    u: pearson_similarity(target_row, cf_matrix.loc[u])
    for u in cf_matrix.index
    if u != target_user
}

user_similarity_df = pd.DataFrame.from_dict(
    sims, orient="index", columns=["pearson_similarity"]
)
user_similarity_df = user_similarity_df.sort_values(
    "pearson_similarity", ascending=False
)
user_similarity_df

"""### Step 2: Select top neighbors who rated the target item"""

target_item = "I3"

# Keep only users who rated the target item (a value that is neither missing
# nor zero).
target_item_ratings = cf_matrix[target_item]
neighbors = [
    (u, sim, target_item_ratings[u])
    for u, sim in sims.items()
    if pd.notna(target_item_ratings[u]) and target_item_ratings[u] != 0
]

neighbors_df = pd.DataFrame(
    neighbors, columns=["neighbor", "similarity", "rating_on_I3"]
).sort_values("similarity", ascending=False)
neighbors_df
|
|
250
|
+
|
|
251
|
+
"""### Step 3: Predict the missing rating"""
|
|
252
|
+
|
|
253
|
+
# `display` is only injected as a builtin by IPython/Jupyter. Fall back to
# `print` so this module also runs as a plain Python script instead of
# raising NameError.
try:
    display
except NameError:
    display = print

# Use top-k positively similar neighbors for a simple exam-friendly prediction
top_k = neighbors_df[neighbors_df["similarity"] > 0].head(3).copy()

# Similarity-weighted average of the neighbours' ratings; undefined (NaN) when
# there is no positively similar neighbour to average over.
if len(top_k) == 0 or top_k["similarity"].sum() == 0:
    predicted_rating_user_based = np.nan
else:
    predicted_rating_user_based = np.sum(
        top_k["similarity"] * top_k["rating_on_I3"]
    ) / np.sum(top_k["similarity"])

print("Top neighbors used:")
display(top_k)

print("Predicted rating for U1 on I3 =", round(predicted_rating_user_based, 3))
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
# 3 users x 4 items matrix for the item-based CF worked example; NaN is the
# rating (U1 on Item3) that the example predicts.
item_cf_small = pd.DataFrame.from_dict(
    {
        "U1": [4, 5, np.nan, 2],
        "U2": [5, 3, 4, 3],
        "U3": [2, 4, 5, 1],
    },
    orient="index",
    columns=["Item1", "Item2", "Item3", "Item4"],
)

item_cf_small
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def cosine_sim(x, y):
    """Return the cosine similarity between two equal-length vectors.

    Args:
        x: Sequence of numbers.
        y: Sequence of numbers, the same length as ``x``.

    Returns:
        ``dot(x, y) / (||x|| * ||y||)``, or ``0.0`` when either vector has zero
        norm (the similarity is undefined there, and dividing would yield NaN
        together with a RuntimeWarning).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        return 0.0
    return np.dot(x, y) / norm_product
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
# Similarity of each item to Item3, using the ratings of U2 and U3 only (the
# two users in item_cf_small with an Item3 rating).
sim_13 = cosine_sim([5, 2], [4, 5])  # expected approx 0.87
sim_23 = cosine_sim([3, 4], [4, 5])  # expected approx 1.00
sim_43 = cosine_sim([3, 1], [4, 5])  # expected approx 0.84

for sim_label, sim_value in (
    ("sim(Item1, Item3) =", sim_13),
    ("sim(Item2, Item3) =", sim_23),
    ("sim(Item4, Item3) =", sim_43),
):
    print(sim_label, round(sim_value, 2))

"""### Step 2: Use U1's existing ratings to predict U1 on Item3"""

u1_ratings = {"Item1": 4, "Item2": 5, "Item4": 2}

# Similarity-weighted average of U1's known ratings, accumulated in the order
# Item1, Item2, Item4.
numerator = 0.0
denominator = 0.0
for rated_item, rated_sim in (
    ("Item1", sim_13),
    ("Item2", sim_23),
    ("Item4", sim_43),
):
    numerator += rated_sim * u1_ratings[rated_item]
    denominator += rated_sim

predicted_u1_item3 = numerator / denominator

print("Numerator =", round(numerator, 2))
print("Denominator =", round(denominator, 2))
print("Predicted rating for U1 on Item3 =", round(predicted_u1_item3, 2))
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# Tiny leave-one-out evaluation for item-based CF on the 3x4 matrix:
# collect every (user, item) cell that holds a rating, in row-major order.
known_entries = [
    (u, i)
    for u in item_cf_small.index
    for i in item_cf_small.columns
    if pd.notna(item_cf_small.loc[u, i])
]
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def predict_item_based_leave_one_out(df, user, target_item):
    """Predict ``df.loc[user, target_item]`` with that rating hidden.

    The rating is masked on a copy of ``df``. Each other item the user rated
    is then compared with the target item by cosine similarity over the users
    who rated both, and the prediction is the similarity-weighted average of
    the user's ratings on the positively similar items.

    Args:
        df: User x item rating DataFrame with NaN for missing ratings.
        user: Row label of the user to predict for.
        target_item: Column label of the item to predict.

    Returns:
        The predicted rating, or NaN when no other item has a positive
        similarity computed from at least two co-rating users.

    Raises:
        KeyError: If ``user`` or ``target_item`` is not a label of ``df``.
    """
    # Fail loudly on unknown labels; assigning through .loc below would
    # otherwise silently add a new row or column to the copy.
    if user not in df.index or target_item not in df.columns:
        raise KeyError((user, target_item))

    # Work on a float copy: writing NaN into an integer column relies on an
    # implicit upcast that pandas deprecated in 2.1.
    masked = df.astype(float)
    masked.loc[user, target_item] = np.nan

    weighted = []  # (similarity, user's rating) per usable neighbour item
    for other_item in masked.columns:
        if other_item == target_item or pd.isna(masked.loc[user, other_item]):
            continue
        co_rated = masked[[other_item, target_item]].dropna()
        if len(co_rated) < 2:
            continue
        sim = cosine_sim(co_rated[other_item].values, co_rated[target_item].values)
        if sim > 0:
            weighted.append((sim, masked.loc[user, other_item]))

    if not weighted:
        return np.nan

    num = sum(s * r for s, r in weighted)
    den = sum(s for s, _ in weighted)
    return num / den if den != 0 else np.nan
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
# Pair each known rating with its leave-one-out prediction, then drop the
# pairs for which no prediction could be made (NaN).
loo_pairs = [
    (
        item_cf_small.loc[user, item],
        predict_item_based_leave_one_out(item_cf_small, user, item),
    )
    for user, item in known_entries
]
loo_pairs = [(actual, pred) for actual, pred in loo_pairs if not np.isnan(pred)]
y_true = [actual for actual, _ in loo_pairs]
y_pred = [pred for _, pred in loo_pairs]

print("Predictions used in evaluation:", len(y_true))
print("MAE =", round(mean_absolute_error(y_true, y_pred), 4))
print("RMSE =", round(np.sqrt(mean_squared_error(y_true, y_pred)), 4))
|
|
357
|
+
|
|
358
|
+
"""### Visualizations"""
|
|
359
|
+
|
|
360
|
+
# user-based similarity bar chart
neighbor_labels = user_similarity_df.index
neighbor_sims = user_similarity_df["pearson_similarity"]
plt.bar(neighbor_labels, neighbor_sims)
plt.title("User Similarity with U1 (Pearson)")
plt.xlabel("Neighbor user")
plt.ylabel("Similarity")
plt.tight_layout()
plt.show()

# item-based similarity bar chart (each item against Item3)
sim_values = pd.Series(
    {
        "Item1 vs Item3": sim_13,
        "Item2 vs Item3": sim_23,
        "Item4 vs Item3": sim_43,
    }
)

plt.bar(sim_values.index, sim_values.values)
plt.ylabel("Cosine similarity")
plt.title("Solved PPT: Similarity with Item3")
plt.xticks(rotation=30, ha="right")
plt.tight_layout()
plt.show()
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
# Two-feature toy data set for the manual PCA walkthrough: ten samples,
# assembled column-wise (first list is X1, second is X2).
X = np.column_stack(
    (
        [2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1],
        [2.4, 0.7, 2.9, 2.2, 3.0, 2.7, 1.6, 1.1, 1.6, 0.9],
    )
).astype(float)

pca_df = pd.DataFrame(X, columns=["X1", "X2"])
pca_df
|
|
398
|
+
|
|
399
|
+
"""### Step 1: Center the data"""
|
|
400
|
+
|
|
401
|
+
# Subtract the per-feature mean so every column of X_centered has mean zero.
mean_vector = np.mean(X, axis=0)
X_centered = np.subtract(X, mean_vector)

print("Mean vector:", mean_vector)
pd.DataFrame(X_centered, columns=["X1_centered", "X2_centered"]).head()

"""### Step 2: Covariance matrix computation"""

# rowvar=False: columns are the variables, rows are the observations.
feature_names = ["X1", "X2"]
cov_matrix = np.cov(X_centered, rowvar=False)
cov_df = pd.DataFrame(cov_matrix, index=feature_names, columns=feature_names)
cov_df
|
|
412
|
+
|
|
413
|
+
"""### Step 3: Eigenvalue and eigenvector computation"""
|
|
414
|
+
|
|
415
|
+
# A covariance matrix is symmetric, so use eigh: it guarantees real
# eigenvalues and orthonormal eigenvectors, where the general-purpose eig
# does not. eigh returns eigenvalues in ascending order and may pick a
# different eigenvector sign than eig; the descending sort happens in the
# next step, and the sign of a principal axis is arbitrary.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

print("Eigenvalues:")
print(eigenvalues)

print("\nEigenvectors:")
print(eigenvectors)
|
|
422
|
+
|
|
423
|
+
"""### Step 4: Sort eigenvalues in descending order"""
|
|
424
|
+
|
|
425
|
+
# Indices that order the eigenvalues from largest to smallest; eigenvectors
# are the matching columns, so they are reordered along axis 1.
sorted_idx = np.flip(np.argsort(eigenvalues))
eigenvalues_sorted = eigenvalues[sorted_idx]
eigenvectors_sorted = eigenvectors[:, sorted_idx]

print("Sorted eigenvalues:", eigenvalues_sorted)
print("\nSorted eigenvectors:\n", eigenvectors_sorted)

"""### Step 5: Select principal components"""

# Keep only the first principal component for maximum variance direction
# (sliced as 0:1 so W remains a 2-D column matrix).
W = eigenvectors_sorted[:, 0:1]
print("Selected principal component vector:\n", W)

"""### Step 6: Project data onto principal components"""

X_pca_manual = np.matmul(X_centered, W)
pd.DataFrame(X_pca_manual, columns=["PC1"]).head()
|
|
442
|
+
|
|
443
|
+
# Cross-check with scikit-learn's PCA fitted on the same centered data.
sk_pca = SklearnPCA(n_components=2)
X_pca_sklearn = sk_pca.fit_transform(X_centered)

sk_variance_ratio = sk_pca.explained_variance_ratio_
print("Explained variance ratio:", sk_variance_ratio)
print("Cumulative explained variance:", np.cumsum(sk_variance_ratio))


# Manual equivalent: each eigenvalue's share of the total variance.
explained_variance_ratio = eigenvalues_sorted / eigenvalues_sorted.sum()
cumulative_explained_variance = explained_variance_ratio.cumsum()

# reconstruct from first PC
X_reconstructed = np.matmul(X_pca_manual, W.T) + mean_vector
reconstruction_error = np.square(X - X_reconstructed).mean()

print("Explained variance ratio:", explained_variance_ratio)
print("Cumulative explained variance:", cumulative_explained_variance)
print("Reconstruction error using 1 PC:", reconstruction_error)
|
|
460
|
+
|
|
461
|
+
"""### Visualizations"""
|
|
462
|
+
|
|
463
|
+
# Figure 1: the raw two-feature data.
x1_values, x2_values = X[:, 0], X[:, 1]
plt.scatter(x1_values, x2_values)
plt.xlabel("X1")
plt.ylabel("X2")
plt.title("Original Data")
plt.grid(True)
plt.show()

# Figure 2: the 1-D projection, drawn along a flat line at y = 0.
pc1_scores = X_pca_manual[:, 0]
plt.scatter(pc1_scores, np.zeros(len(X_pca_manual)))
plt.xlabel("PC1")
plt.title("Projection of Data onto First Principal Component")
plt.yticks([])
plt.grid(True)
plt.show()

# Figure 3: scree plot of the per-component explained variance ratio.
component_labels = ["PC1", "PC2"]
plt.bar(component_labels, explained_variance_ratio)
plt.ylabel("Explained Variance Ratio")
plt.title("Scree Plot")
plt.show()

# Figure 4: cumulative explained variance across components.
plt.plot(component_labels, cumulative_explained_variance, marker="o")
plt.ylabel("Cumulative Explained Variance")
plt.title("Cumulative Explained Variance Plot")
plt.ylim(0, 1.05)
plt.grid(True)
plt.show()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Python-related source snippets."""
|