itertoolkit 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. bm_preprocessing/__init__.py +14 -0
  2. bm_preprocessing/importer/DM/__init__.py +7 -0
  3. bm_preprocessing/importer/DM/agg.py +6 -0
  4. bm_preprocessing/importer/DM/dbscan.py +6 -0
  5. bm_preprocessing/importer/DM/finals.py +6 -0
  6. bm_preprocessing/importer/DM/gsp.py +6 -0
  7. bm_preprocessing/importer/DM/test.py +6 -0
  8. bm_preprocessing/importer/Finals/__init__.py +7 -0
  9. bm_preprocessing/importer/Finals/kaadhal.py +6 -0
  10. bm_preprocessing/importer/Finals/raaka.py +6 -0
  11. bm_preprocessing/importer/Finals/seedan.py +6 -0
  12. bm_preprocessing/importer/Finals/vikram.py +6 -0
  13. bm_preprocessing/importer/IR/__init__.py +6 -0
  14. bm_preprocessing/importer/IR/finals.py +6 -0
  15. bm_preprocessing/importer/IR/pagerank.py +6 -0
  16. bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
  17. bm_preprocessing/importer/IR/test.py +6 -0
  18. bm_preprocessing/importer/PY/__init__.py +4 -0
  19. bm_preprocessing/importer/PY/lib_doc.py +6 -0
  20. bm_preprocessing/importer/PY/python_doc.py +6 -0
  21. bm_preprocessing/importer/__init__.py +8 -0
  22. bm_preprocessing/importer/_module_printer.py +23 -0
  23. bm_preprocessing/src/DM/__init__.py +1 -0
  24. bm_preprocessing/src/DM/agg.py +267 -0
  25. bm_preprocessing/src/DM/dbscan.py +218 -0
  26. bm_preprocessing/src/DM/finals.py +19 -0
  27. bm_preprocessing/src/DM/gsp.py +378 -0
  28. bm_preprocessing/src/DM/test.py +19 -0
  29. bm_preprocessing/src/Finals/__init__.py +1 -0
  30. bm_preprocessing/src/Finals/kaadhal.py +1453 -0
  31. bm_preprocessing/src/Finals/raaka.py +1338 -0
  32. bm_preprocessing/src/Finals/seedan.py +1173 -0
  33. bm_preprocessing/src/Finals/vikram.py +520 -0
  34. bm_preprocessing/src/IR/__init__.py +1 -0
  35. bm_preprocessing/src/IR/finals.py +14 -0
  36. bm_preprocessing/src/IR/pagerank.py +109 -0
  37. bm_preprocessing/src/IR/recommenders_pca.py +487 -0
  38. bm_preprocessing/src/IR/test.py +14 -0
  39. bm_preprocessing/src/PY/__init__.py +1 -0
  40. bm_preprocessing/src/PY/lib_doc.py +295 -0
  41. bm_preprocessing/src/PY/python_doc.py +177 -0
  42. bm_preprocessing/src/__init__.py +1 -0
  43. itertoolkit-1.5.0.dist-info/METADATA +120 -0
  44. itertoolkit-1.5.0.dist-info/RECORD +45 -0
  45. itertoolkit-1.5.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,487 @@
1
+ import matplotlib.pyplot as plt
2
+ import networkx as nx
3
+ import numpy as np
4
+ import pandas as pd
5
+ from sklearn.decomposition import PCA as SklearnPCA
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics import mean_absolute_error, mean_squared_error
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+
10
+ plt.rcParams["figure.figsize"] = (8, 5)
11
+ pd.set_option("display.max_columns", None)
12
+ pd.set_option("display.width", 120)
13
+
14
+
15
+ items_cb = pd.DataFrame(
16
+ {
17
+ "item_id": [1, 2, 3, 4, 5, 6, 7, 8, 9],
18
+ "title": [
19
+ "Red Mars",
20
+ "Jurassic Park",
21
+ "Lost World",
22
+ "2001",
23
+ "Foundation",
24
+ "Difference Engine",
25
+ "Machine Learning",
26
+ "Neuromancer",
27
+ "2010",
28
+ ],
29
+ "description": [
30
+ "science fiction mars colony politics space future",
31
+ "science thriller dinosaurs genetics adventure park",
32
+ "adventure dinosaurs island science thriller sequel",
33
+ "science fiction space artificial intelligence mission",
34
+ "classic science fiction empire foundation future space",
35
+ "alternate history computing steam engine science fiction",
36
+ "artificial intelligence data algorithms prediction learning",
37
+ "cyberpunk hackers network dystopian future artificial intelligence",
38
+ "science fiction space mission sequel future",
39
+ ],
40
+ }
41
+ )
42
+
43
+ # Example user ratings inspired by the books/movies personalization example in the slides
44
+ ratings_cb = pd.DataFrame(
45
+ {
46
+ "user_id": [1, 1, 1, 1, 2, 2, 2],
47
+ "item_id": [1, 4, 5, 8, 2, 3, 7],
48
+ "rating": [5, 4, 5, 4, 5, 4, 3],
49
+ }
50
+ )
51
+
52
+ print("Items used for content-based recommendation:")
53
+ display(items_cb)
54
+
55
+ print("Sample user ratings:")
56
+ display(ratings_cb)
57
+
58
+ """### Step 1: Build item profiles using TF-IDF"""
59
+
60
+ vectorizer = TfidfVectorizer(stop_words="english")
61
+ tfidf_matrix = vectorizer.fit_transform(items_cb["description"])
62
+
63
+ print("TF-IDF matrix shape:", tfidf_matrix.shape)
64
+
65
+ # Cosine similarity between items
66
+ item_similarity_cb = cosine_similarity(tfidf_matrix)
67
+ item_similarity_cb_df = pd.DataFrame(
68
+ item_similarity_cb, index=items_cb["title"], columns=items_cb["title"]
69
+ )
70
+
71
+ item_similarity_cb_df.round(2)
72
+
73
+ """### Step 2: Build a user profile from items rated highly"""
74
+
75
+
76
def build_user_profile(user_id, ratings_df, items_df, tfidf_matrix, min_rating=4):
    """Build a rating-weighted content profile for one user.

    Args:
        user_id: Value matched against ``ratings_df["user_id"]``.
        ratings_df: DataFrame with ``user_id``, ``item_id`` and ``rating``
            columns.
        items_df: DataFrame with an ``item_id`` column; row *i* of
            ``items_df`` must correspond to row *i* of ``tfidf_matrix``.
        tfidf_matrix: Item-feature matrix (scipy sparse or dense array).
        min_rating: Only ratings ``>= min_rating`` count as "liked".

    Returns:
        A ``(1, n_features)`` array holding the weighted average of the liked
        items' feature rows (weights are the ratings), or ``None`` when the
        user has no liked item present in ``items_df``.
    """
    # Filter the user's liked ratings once and reuse the result.
    liked = ratings_df[
        (ratings_df["user_id"] == user_id) & (ratings_df["rating"] >= min_rating)
    ]
    # Collapse repeated ratings of the same item so there is one weight per item.
    rating_by_item = liked.groupby("item_id")["rating"].mean()

    # Work with row *positions*, not index labels: the matrix is positional,
    # so this stays correct even when items_df has a non-default index.
    item_ids = items_df["item_id"].to_numpy()
    positions = np.flatnonzero(np.isin(item_ids, rating_by_item.index.to_numpy()))
    if positions.size == 0:
        return None

    weights = rating_by_item.loc[item_ids[positions]].to_numpy(dtype=float)
    rows = tfidf_matrix[positions]
    dense_rows = rows.toarray() if hasattr(rows, "toarray") else np.asarray(rows)
    profile = np.average(dense_rows, axis=0, weights=weights)
    return profile.reshape(1, -1)
96
+
97
+
98
+ user1_profile = build_user_profile(1, ratings_cb, items_cb, tfidf_matrix, min_rating=4)
99
+ print("User 1 profile shape:", user1_profile.shape)
100
+
101
+ """### Step 3: Match user profile to item profiles"""
102
+
103
+
104
def recommend_content_based(
    user_id, ratings_df, items_df, tfidf_matrix, top_n=5, min_rating=4
):
    """Recommend unseen items whose content best matches a user's profile.

    Args:
        user_id: User to recommend for.
        ratings_df: DataFrame with ``user_id``, ``item_id`` and ``rating``
            columns.
        items_df: DataFrame with ``item_id`` and ``title`` columns, row-aligned
            with ``tfidf_matrix``.
        tfidf_matrix: Item-feature matrix used for the similarity scores.
        top_n: Maximum number of recommendations to return.
        min_rating: Rating threshold passed to ``build_user_profile``; the
            default of 4 matches the previously hard-coded value.

    Returns:
        DataFrame with ``item_id``, ``title`` and ``score`` columns, sorted by
        descending score. It is empty when no profile can be built.
    """
    user_profile = build_user_profile(
        user_id, ratings_df, items_df, tfidf_matrix, min_rating=min_rating
    )
    if user_profile is None:
        return pd.DataFrame(columns=["item_id", "title", "score"])

    # assign() returns a copy, so the caller's items_df is never modified.
    scored = items_df.assign(
        score=cosine_similarity(user_profile, tfidf_matrix).flatten()
    )

    # Never recommend something the user has already rated (at any rating).
    already_rated = ratings_df.loc[ratings_df["user_id"] == user_id, "item_id"]
    unseen = scored[~scored["item_id"].isin(already_rated)]

    return unseen.sort_values("score", ascending=False)[
        ["item_id", "title", "score"]
    ].head(top_n)
121
+
122
+
123
+ cb_recommendations = recommend_content_based(
124
+ 1, ratings_cb, items_cb, tfidf_matrix, top_n=5
125
+ )
126
+ cb_recommendations
127
+
128
+ """
129
+ ### Evaluation metrics
130
+ The recommender slides mention metrics such as:
131
+ - RMSE
132
+ - Precision at top-k
133
+ - Rank correlation
134
+
135
+ For a compact lab exam notebook, for this content-based section we demonstrate:
136
+ - **Precision@K**
137
+ - **Recall@K**
138
+
139
+ Here we simulate a tiny held-out relevant set for User 1.
140
+ """
141
+
142
+
143
def precision_at_k(recommended, relevant, k):
    """Return the fraction of the top-``k`` recommendations that are relevant.

    Args:
        recommended: Ranked sequence of recommended item ids (best first).
        relevant: Iterable of relevant item ids.
        k: Cut-off rank.

    Returns:
        ``hits / k``; ``0`` when ``k`` is zero, negative or ``None``.
    """
    # A negative k would slice from the end of the list and yield a negative
    # "precision", so treat every non-positive cut-off as "nothing recommended".
    if not k or k < 0:
        return 0
    hits = len(set(recommended[:k]) & set(relevant))
    return hits / k
147
+
148
+
149
def recall_at_k(recommended, relevant, k):
    """Return the fraction of relevant items found in the top-``k`` list.

    Args:
        recommended: Ranked sequence of recommended item ids (best first).
        relevant: Iterable of relevant item ids; duplicates are ignored.
        k: Cut-off rank; negative values are treated as 0.

    Returns:
        ``hits / number of distinct relevant items``; ``0`` when there are no
        relevant items.
    """
    # Hits are counted on sets, so the denominator must be the number of
    # *distinct* relevant items; otherwise duplicates deflate the recall.
    relevant_set = set(relevant)
    if not relevant_set:
        return 0
    if k is not None and k < 0:
        k = 0  # a negative cut-off must not slice from the end of the list
    hits = len(set(recommended[:k]) & relevant_set)
    return hits / len(relevant_set)
153
+
154
+
155
+ recommended_items = cb_recommendations["item_id"].tolist()
156
+ relevant_items = [9, 6]
157
+
158
+ print("Recommended item IDs:", recommended_items)
159
+ print("Relevant item IDs (held-out assumption):", relevant_items)
160
+ print("Precision@3 =", round(precision_at_k(recommended_items, relevant_items, 3), 3))
161
+ print("Recall@3 =", round(recall_at_k(recommended_items, relevant_items, 3), 3))
162
+
163
+ """### Visualizations"""
164
+
165
+ plt.imshow(item_similarity_cb_df.values, aspect="auto")
166
+ plt.colorbar(label="Cosine similarity")
167
+ plt.xticks(
168
+ range(len(item_similarity_cb_df.columns)),
169
+ item_similarity_cb_df.columns,
170
+ rotation=90,
171
+ )
172
+ plt.yticks(range(len(item_similarity_cb_df.index)), item_similarity_cb_df.index)
173
+ plt.title("Content-Based Item Similarity Matrix")
174
+ plt.tight_layout()
175
+ plt.show()
176
+
177
+ plt.bar(cb_recommendations["title"], cb_recommendations["score"])
178
+ plt.xticks(rotation=45, ha="right")
179
+ plt.ylabel("Similarity score")
180
+ plt.title("Top Content-Based Recommendations for User 1")
181
+ plt.tight_layout()
182
+ plt.show()
183
+
184
+
185
+ cf_matrix = pd.DataFrame(
186
+ [
187
+ [2, 3, np.nan, 1, 3, 8],
188
+ [0, 3, 1, 4, 6, 7],
189
+ [3, 0, 0, 3, 4, 6],
190
+ [9, 5, 1, 5, 0, 7],
191
+ [3, 4, 6, 7, 9, 9],
192
+ [4, 0, 1, 4, 8, 0],
193
+ [2, 4, 0, 0, 0, 8],
194
+ ],
195
+ index=["U1", "U2", "U3", "U4", "U5", "U6", "U7"],
196
+ columns=["I1", "I2", "I3", "I4", "I5", "I6"],
197
+ )
198
+
199
+ cf_matrix
200
+
201
+ """### Step 1: Define Pearson similarity on co-rated items only"""
202
+
203
+
204
def pearson_similarity(user_a, user_b):
    """Pearson correlation of two users over the items both of them rated.

    An item counts as rated when its value is neither missing nor zero.
    Returns 0.0 when fewer than two items are co-rated or when either user's
    co-rated ratings have no variance.
    """
    rated_by_a = user_a.notna() & (user_a != 0)
    rated_by_b = user_b.notna() & (user_b != 0)
    co_rated = rated_by_a & rated_by_b

    xs, ys = user_a[co_rated], user_b[co_rated]
    if len(xs) < 2:
        return 0.0

    # Deviations from each user's mean over the co-rated items only.
    dx = xs - xs.mean()
    dy = ys - ys.mean()

    scale = np.sqrt((dx**2).sum()) * np.sqrt((dy**2).sum())
    if scale == 0:
        return 0.0

    return (dx * dy).sum() / scale
221
+
222
+
223
# Pearson similarity between the target user and every other user.
target_user = "U1"
sims = {
    u: pearson_similarity(cf_matrix.loc[target_user], cf_matrix.loc[u])
    for u in cf_matrix.index
    if u != target_user
}

# One row per neighbour, most similar first.
user_similarity_df = pd.DataFrame.from_dict(
    sims, orient="index", columns=["pearson_similarity"]
)
user_similarity_df = user_similarity_df.sort_values(
    "pearson_similarity", ascending=False
)
user_similarity_df
234
+
235
+ """### Step 2: Select top neighbors who rated the target item"""
236
+
237
target_item = "I3"

# Only users who actually rated the target item can vote on it; both NaN and
# 0 are treated as "not rated".
target_item_ratings = cf_matrix[target_item]
neighbors = [
    (u, sim, target_item_ratings[u])
    for u, sim in sims.items()
    if pd.notna(target_item_ratings[u]) and target_item_ratings[u] != 0
]

neighbors_df = pd.DataFrame(
    neighbors, columns=["neighbor", "similarity", "rating_on_I3"]
).sort_values("similarity", ascending=False)
neighbors_df
250
+
251
+ """### Step 3: Predict the missing rating"""
252
+
253
# Similarity-weighted average over the three most similar neighbours with a
# positive similarity (a simple, exam-friendly prediction rule).
top_k = neighbors_df.loc[neighbors_df["similarity"] > 0].head(3).copy()

similarity_total = top_k["similarity"].sum()
if top_k.empty or similarity_total == 0:
    # No usable neighbour: the rating cannot be predicted.
    predicted_rating_user_based = np.nan
else:
    weighted_ratings = (top_k["similarity"] * top_k["rating_on_I3"]).sum()
    predicted_rating_user_based = weighted_ratings / similarity_total

print("Top neighbors used:")
display(top_k)

print("Predicted rating for U1 on I3 =", round(predicted_rating_user_based, 3))
267
+
268
+
269
+ item_cf_small = pd.DataFrame(
270
+ [[4, 5, np.nan, 2], [5, 3, 4, 3], [2, 4, 5, 1]],
271
+ index=["U1", "U2", "U3"],
272
+ columns=["Item1", "Item2", "Item3", "Item4"],
273
+ )
274
+
275
+ item_cf_small
276
+
277
+
278
def cosine_sim(x, y):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        x: Sequence or array of numbers.
        y: Sequence or array of numbers, same length as ``x``.

    Returns:
        ``dot(x, y) / (|x| * |y|)``, or ``0.0`` when either vector has zero
        length (the similarity is undefined there; returning 0.0 avoids a
        NaN result and a divide-by-zero warning).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    scale = np.linalg.norm(x) * np.linalg.norm(y)
    if scale == 0:
        return 0.0
    return np.dot(x, y) / scale
282
+
283
+
284
+ sim_13 = cosine_sim([5, 2], [4, 5]) # expected approx 0.87
285
+ sim_23 = cosine_sim([3, 4], [4, 5]) # expected approx 1.00
286
+ sim_43 = cosine_sim([3, 1], [4, 5]) # expected approx 0.84
287
+
288
+ print("sim(Item1, Item3) =", round(sim_13, 2))
289
+ print("sim(Item2, Item3) =", round(sim_23, 2))
290
+ print("sim(Item4, Item3) =", round(sim_43, 2))
291
+
292
+ """### Step 2: Use U1's existing ratings to predict U1 on Item3"""
293
+
294
u1_ratings = {"Item1": 4, "Item2": 5, "Item4": 2}

# Similarity of each item U1 has rated to the target item (Item3).
item3_similarity = {"Item1": sim_13, "Item2": sim_23, "Item4": sim_43}

# Similarity-weighted average of U1's known ratings.
numerator = sum(item3_similarity[item] * u1_ratings[item] for item in u1_ratings)
denominator = sum(item3_similarity.values())

predicted_u1_item3 = numerator / denominator

print("Numerator =", round(numerator, 2))
print("Denominator =", round(denominator, 2))
print("Predicted rating for U1 on Item3 =", round(predicted_u1_item3, 2))
308
+
309
+
310
# Tiny leave-one-out evaluation for item-based CF on the 3x4 matrix:
# collect every (user, item) cell that holds a known rating.
known_entries = [
    (u, i)
    for u in item_cf_small.index
    for i in item_cf_small.columns
    if pd.notna(item_cf_small.loc[u, i])
]
316
+
317
+
318
def predict_item_based_leave_one_out(df, user, target_item):
    """Predict ``df.loc[user, target_item]`` with that rating hidden.

    The rating is masked, then estimated as the similarity-weighted average of
    the user's other ratings. Item-item similarity is the cosine similarity
    over the rows where both items are rated.

    Args:
        df: User x item rating matrix (numeric, NaN = unrated).
        user: Row label of the user.
        target_item: Column label of the held-out item.

    Returns:
        The predicted rating, or NaN when no other item rated by the user has
        a positive similarity to the target item.

    Raises:
        KeyError: If ``user`` or ``target_item`` is not a label of ``df``.
    """
    if user not in df.index or target_item not in df.columns:
        raise KeyError((user, target_item))

    # Work on a float copy: integer columns cannot hold NaN, and relying on
    # pandas to upcast silently during assignment is deprecated.
    masked = df.astype(float)
    masked.loc[user, target_item] = np.nan

    weighted = []
    for other_item in masked.columns:
        if other_item == target_item:
            continue
        user_rating = masked.loc[user, other_item]
        if pd.isna(user_rating):
            continue  # the user gives no evidence through this item

        # Compare the two items only over users who rated both of them.
        co_rated = masked[[other_item, target_item]].dropna()
        if len(co_rated) < 2:
            continue

        sim = cosine_sim(co_rated[other_item].values, co_rated[target_item].values)
        if sim > 0:
            weighted.append((sim, user_rating))

    if not weighted:
        return np.nan

    num = sum(s * r for s, r in weighted)
    den = sum(s for s, _ in weighted)
    return num / den if den != 0 else np.nan
345
+
346
+
347
# Predict every known rating with that rating held out, then score only the
# cells for which a prediction could be made.
loo_results = [
    (
        item_cf_small.loc[user, item],
        predict_item_based_leave_one_out(item_cf_small, user, item),
    )
    for user, item in known_entries
]
scored = [(actual, pred) for actual, pred in loo_results if not np.isnan(pred)]
y_true = [actual for actual, _ in scored]
y_pred = [pred for _, pred in scored]

print("Predictions used in evaluation:", len(y_true))
print("MAE =", round(mean_absolute_error(y_true, y_pred), 4))
print("RMSE =", round(np.sqrt(mean_squared_error(y_true, y_pred)), 4))
357
+
358
+ """### Visualizations"""
359
+
360
+ # user-based similarity bar chart
361
+ plt.bar(user_similarity_df.index, user_similarity_df["pearson_similarity"])
362
+ plt.title("User Similarity with U1 (Pearson)")
363
+ plt.xlabel("Neighbor user")
364
+ plt.ylabel("Similarity")
365
+ plt.tight_layout()
366
+ plt.show()
367
+
368
+ sim_values = pd.Series(
369
+ {"Item1 vs Item3": sim_13, "Item2 vs Item3": sim_23, "Item4 vs Item3": sim_43}
370
+ )
371
+
372
+ plt.bar(sim_values.index, sim_values.values)
373
+ plt.ylabel("Cosine similarity")
374
+ plt.title("Solved PPT: Similarity with Item3")
375
+ plt.xticks(rotation=30, ha="right")
376
+ plt.tight_layout()
377
+ plt.show()
378
+
379
+
380
+ X = np.array(
381
+ [
382
+ [2.5, 2.4],
383
+ [0.5, 0.7],
384
+ [2.2, 2.9],
385
+ [1.9, 2.2],
386
+ [3.1, 3.0],
387
+ [2.3, 2.7],
388
+ [2.0, 1.6],
389
+ [1.0, 1.1],
390
+ [1.5, 1.6],
391
+ [1.1, 0.9],
392
+ ],
393
+ dtype=float,
394
+ )
395
+
396
+ pca_df = pd.DataFrame(X, columns=["X1", "X2"])
397
+ pca_df
398
+
399
+ """### Step 1: Center the data"""
400
+
401
+ mean_vector = X.mean(axis=0)
402
+ X_centered = X - mean_vector
403
+
404
+ print("Mean vector:", mean_vector)
405
+ pd.DataFrame(X_centered, columns=["X1_centered", "X2_centered"]).head()
406
+
407
+ """### Step 2: Covariance matrix computation"""
408
+
409
+ cov_matrix = np.cov(X_centered, rowvar=False)
410
+ cov_df = pd.DataFrame(cov_matrix, index=["X1", "X2"], columns=["X1", "X2"])
411
+ cov_df
412
+
413
+ """### Step 3: Eigenvalue and eigenvector computation"""
414
+
415
# A covariance matrix is symmetric, so use the symmetric eigensolver: it
# returns real eigenvalues (in ascending order) and orthonormal eigenvectors
# as columns. Eigenvector signs are arbitrary and may differ from what
# np.linalg.eig returns; code needing descending order must sort explicitly.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

print("Eigenvalues:")
print(eigenvalues)

print("\nEigenvectors:")
print(eigenvectors)
422
+
423
+ """### Step 4: Sort eigenvalues in descending order"""
424
+
425
+ sorted_idx = np.argsort(eigenvalues)[::-1]
426
+ eigenvalues_sorted = eigenvalues[sorted_idx]
427
+ eigenvectors_sorted = eigenvectors[:, sorted_idx]
428
+
429
+ print("Sorted eigenvalues:", eigenvalues_sorted)
430
+ print("\nSorted eigenvectors:\n", eigenvectors_sorted)
431
+
432
+ """### Step 5: Select principal components"""
433
+
434
+ # Keep only the first principal component for maximum variance direction
435
+ W = eigenvectors_sorted[:, :1]
436
+ print("Selected principal component vector:\n", W)
437
+
438
+ """### Step 6: Project data onto principal components"""
439
+
440
+ X_pca_manual = X_centered @ W
441
+ pd.DataFrame(X_pca_manual, columns=["PC1"]).head()
442
+
443
+ sk_pca = SklearnPCA(n_components=2)
444
+ X_pca_sklearn = sk_pca.fit_transform(X_centered)
445
+
446
+ print("Explained variance ratio:", sk_pca.explained_variance_ratio_)
447
+ print("Cumulative explained variance:", np.cumsum(sk_pca.explained_variance_ratio_))
448
+
449
+
450
+ explained_variance_ratio = eigenvalues_sorted / eigenvalues_sorted.sum()
451
+ cumulative_explained_variance = np.cumsum(explained_variance_ratio)
452
+
453
+ # reconstruct from first PC
454
+ X_reconstructed = X_pca_manual @ W.T + mean_vector
455
+ reconstruction_error = np.mean((X - X_reconstructed) ** 2)
456
+
457
+ print("Explained variance ratio:", explained_variance_ratio)
458
+ print("Cumulative explained variance:", cumulative_explained_variance)
459
+ print("Reconstruction error using 1 PC:", reconstruction_error)
460
+
461
+ """### Visualizations"""
462
+
463
+ plt.scatter(X[:, 0], X[:, 1])
464
+ plt.xlabel("X1")
465
+ plt.ylabel("X2")
466
+ plt.title("Original Data")
467
+ plt.grid(True)
468
+ plt.show()
469
+
470
+ plt.scatter(X_pca_manual[:, 0], np.zeros(len(X_pca_manual)))
471
+ plt.xlabel("PC1")
472
+ plt.title("Projection of Data onto First Principal Component")
473
+ plt.yticks([])
474
+ plt.grid(True)
475
+ plt.show()
476
+
477
+ plt.bar(["PC1", "PC2"], explained_variance_ratio)
478
+ plt.ylabel("Explained Variance Ratio")
479
+ plt.title("Scree Plot")
480
+ plt.show()
481
+
482
+ plt.plot(["PC1", "PC2"], cumulative_explained_variance, marker="o")
483
+ plt.ylabel("Cumulative Explained Variance")
484
+ plt.title("Cumulative Explained Variance Plot")
485
+ plt.ylim(0, 1.05)
486
+ plt.grid(True)
487
+ plt.show()
@@ -0,0 +1,14 @@
1
+ "IR Test source snippets."
2
+
3
+ import pandas as pd
4
+
5
+ # Dimensionality reduction source snippets
6
+
7
+ df = pd.DataFrame(
8
+ {
9
+ "A": [1, 2, 3],
10
+ "B": [4, 5, 6],
11
+ "C": [7, 8, 9],
12
+ }
13
+ )
14
+ print(df)
@@ -0,0 +1 @@
1
+ """Python-related source snippets."""