itertoolkit 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. bm_preprocessing/__init__.py +14 -0
  2. bm_preprocessing/importer/DM/__init__.py +7 -0
  3. bm_preprocessing/importer/DM/agg.py +6 -0
  4. bm_preprocessing/importer/DM/dbscan.py +6 -0
  5. bm_preprocessing/importer/DM/finals.py +6 -0
  6. bm_preprocessing/importer/DM/gsp.py +6 -0
  7. bm_preprocessing/importer/DM/test.py +6 -0
  8. bm_preprocessing/importer/Finals/__init__.py +7 -0
  9. bm_preprocessing/importer/Finals/kaadhal.py +6 -0
  10. bm_preprocessing/importer/Finals/raaka.py +6 -0
  11. bm_preprocessing/importer/Finals/seedan.py +6 -0
  12. bm_preprocessing/importer/Finals/vikram.py +6 -0
  13. bm_preprocessing/importer/IR/__init__.py +6 -0
  14. bm_preprocessing/importer/IR/finals.py +6 -0
  15. bm_preprocessing/importer/IR/pagerank.py +6 -0
  16. bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
  17. bm_preprocessing/importer/IR/test.py +6 -0
  18. bm_preprocessing/importer/PY/__init__.py +4 -0
  19. bm_preprocessing/importer/PY/lib_doc.py +6 -0
  20. bm_preprocessing/importer/PY/python_doc.py +6 -0
  21. bm_preprocessing/importer/__init__.py +8 -0
  22. bm_preprocessing/importer/_module_printer.py +23 -0
  23. bm_preprocessing/src/DM/__init__.py +1 -0
  24. bm_preprocessing/src/DM/agg.py +267 -0
  25. bm_preprocessing/src/DM/dbscan.py +218 -0
  26. bm_preprocessing/src/DM/finals.py +19 -0
  27. bm_preprocessing/src/DM/gsp.py +378 -0
  28. bm_preprocessing/src/DM/test.py +19 -0
  29. bm_preprocessing/src/Finals/__init__.py +1 -0
  30. bm_preprocessing/src/Finals/kaadhal.py +1453 -0
  31. bm_preprocessing/src/Finals/raaka.py +1338 -0
  32. bm_preprocessing/src/Finals/seedan.py +1173 -0
  33. bm_preprocessing/src/Finals/vikram.py +520 -0
  34. bm_preprocessing/src/IR/__init__.py +1 -0
  35. bm_preprocessing/src/IR/finals.py +14 -0
  36. bm_preprocessing/src/IR/pagerank.py +109 -0
  37. bm_preprocessing/src/IR/recommenders_pca.py +487 -0
  38. bm_preprocessing/src/IR/test.py +14 -0
  39. bm_preprocessing/src/PY/__init__.py +1 -0
  40. bm_preprocessing/src/PY/lib_doc.py +295 -0
  41. bm_preprocessing/src/PY/python_doc.py +177 -0
  42. bm_preprocessing/src/__init__.py +1 -0
  43. itertoolkit-1.5.0.dist-info/METADATA +120 -0
  44. itertoolkit-1.5.0.dist-info/RECORD +45 -0
  45. itertoolkit-1.5.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1453 @@
1
+ #----------------------------------
2
+ #DM--------------------------------
3
+ #----------------------------------
4
+ # ==================== START OF dm.py ====================
5
+
6
+ import pandas as pd
7
+ import numpy as np
8
+ import matplotlib.pyplot as plt
9
+
10
+ # LOAD DATA
11
def load_data(path):
    """Read the CSV file at *path* and return it as a pandas DataFrame."""
    return pd.read_csv(path)
14
+
15
+ # PREPROCESSING
16
def preprocess(df):
    """Prepare the customer table for clustering.

    Drops the ``CustomerID`` column, prints the per-column count of missing
    values, and encodes ``Gender`` as 1 for "Male" and 0 for "Female".

    Returns a new DataFrame; the frame passed in is left untouched because
    ``drop`` returns a copy.
    """
    gender_codes = {"Male": 1, "Female": 0}

    cleaned = df.drop(columns=["CustomerID"])
    print("Missing Values:\n", cleaned.isnull().sum())

    cleaned["Gender"] = cleaned["Gender"].map(gender_codes)
    return cleaned
27
+
28
+ # VISUALIZATION
29
def plot_gender_ratio(df):
    """Draw a pie chart of the ``Gender`` column (0 = Female, 1 = Male).

    ``value_counts`` orders its result by frequency, so a fixed
    ``['Female', 'Male']`` label list would be attached to the wrong wedges
    whenever males outnumber females.  The wedge labels are therefore derived
    from the codes actually present in the counts index.
    """
    names = {0: "Female", 1: "Male"}

    # Sort by code so the wedge order is stable across data sets.
    counts = df["Gender"].value_counts().sort_index()
    labels = [names.get(code, str(code)) for code in counts.index]

    plt.figure()
    plt.pie(counts, labels=labels, autopct='%1.1f%%')
    plt.title("Gender Ratio")
    plt.show()
36
+
37
def plot_bar_graphs(df):
    """Show two bar charts: value counts of ``Age`` and of ``Annual Income (k$)``.

    Each chart is drawn in its own figure and displayed with ``plt.show()``
    before the next one is created.
    """
    # Figure 1: how many rows share each age, ordered by age.
    plt.figure()
    df["Age"].value_counts().sort_index().plot(kind='bar')
    plt.title("Age Distribution")
    plt.xlabel("Age")
    plt.ylabel("Count")
    plt.show()

    # Figure 2: how many rows share each income value, ordered by income.
    plt.figure()
    df["Annual Income (k$)"].value_counts().sort_index().plot(kind='bar')
    plt.title("Annual Income Distribution")
    plt.xlabel("Income (k$)")
    plt.ylabel("Count")
    plt.show()
51
+
52
+ # DISTANCE FUNCTION
53
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between arrays *a* and *b*."""
    delta = a - b
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)
55
+
56
+ # WARD LINKAGE (MANUAL)
57
def ward_distance(cluster1, cluster2, data):
    """Return the Ward merge cost between two clusters.

    *cluster1* and *cluster2* are lists of row indices into *data*.  The cost
    is ``n1*n2/(n1+n2)`` times the squared distance between the two cluster
    centroids.
    """
    size_a, size_b = len(cluster1), len(cluster2)

    centroid_a = np.mean(data[cluster1], axis=0)
    centroid_b = np.mean(data[cluster2], axis=0)

    weight = (size_a * size_b) / (size_a + size_b)
    return weight * np.sum((centroid_a - centroid_b) ** 2)
67
+
68
+ # AGGLOMERATIVE CLUSTERING
69
def agglomerative_clustering(data):
    """Merge singleton clusters bottom-up using Ward's criterion.

    Starts with one cluster per row of *data* and repeatedly merges the pair
    with the smallest ``ward_distance`` until a single cluster remains.

    Returns the merge history as a list of
    ``(left_members, right_members, merge_cost)`` tuples, in merge order.
    """
    active = [[row] for row in range(len(data))]
    merges = []

    while len(active) > 1:
        best_cost = float('inf')
        best_pair = None

        # Exhaustive search over unordered pairs; strict '<' keeps the first
        # pair found when several share the minimum cost.
        for a, left in enumerate(active):
            for b in range(a + 1, len(active)):
                cost = ward_distance(left, active[b], data)
                if cost < best_cost:
                    best_cost, best_pair = cost, (a, b)

        a, b = best_pair
        # Pop the higher index first so the lower index stays valid.
        right = active.pop(b)
        left = active.pop(a)

        merges.append((left, right, best_cost))
        active.append(left + right)

    return merges
95
+
96
+ # DENDROGRAM
97
def plot_dendrogram(history):
    """Draw a simple dendrogram from an agglomerative merge history.

    *history* is a list of ``(cluster1, cluster2, dist)`` tuples where the
    clusters are lists of point indices, as returned by
    ``agglomerative_clustering``.  Each merge is drawn as two vertical
    segments rising to ``dist`` joined by one horizontal segment.

    NOTE(review): leaves are placed at x = point index rather than in
    dendrogram leaf order, so branches may cross for some inputs.
    """
    plt.figure(figsize=(10, 6))

    n = len(history) + 1
    # x position and current branch height of every original point.
    positions = {i: i for i in range(n)}
    heights = {i: 0 for i in range(n)}

    for cluster1, cluster2, dist in history:
        # Horizontal anchor of each branch: mean x of its members.
        x1 = np.mean([positions[i] for i in cluster1])
        x2 = np.mean([positions[i] for i in cluster2])

        # Height at which each branch currently ends.
        h1 = max(heights.get(i, 0) for i in cluster1)
        h2 = max(heights.get(i, 0) for i in cluster2)

        # Vertical risers up to the merge height, then the joining bar.
        plt.plot([x1, x1], [h1, dist])
        plt.plot([x2, x2], [h2, dist])
        plt.plot([x1, x2], [dist, dist])

        # Every member of the merged cluster now sits under the new node.
        midpoint = (x1 + x2) / 2
        for i in cluster1 + cluster2:
            positions[i] = midpoint
            heights[i] = dist

    plt.title("Dendrogram (Manual - Tree Structure)")
    plt.xlabel("Data Points")
    plt.ylabel("Distance")
    plt.show()
136
+
137
+ if __name__ == "__main__":
138
+ path = "Mall_Customers.csv" # update path
139
+
140
+ df = load_data(path)
141
+
142
+ print("Original Dataset:")
143
+ print(df.head())
144
+
145
+ df = preprocess(df)
146
+
147
+ print("Dataset After Preprocessing:")
148
+ print(df.head())
149
+
150
+ # Gender Count
151
+ print("Gender Count (0=Female, 1=Male):")
152
+ print(df["Gender"].value_counts())
153
+
154
+ # Visualizations
155
+ plot_gender_ratio(df)
156
+ plot_bar_graphs(df)
157
+
158
+ # Prepare data for clustering
159
+ data = df.values.astype(float)
160
+
161
+ # Perform clustering
162
+ history = agglomerative_clustering(data)
163
+
164
+ print("Clustering Merge Steps (Cluster1, Cluster2, Distance):")
165
+ for step in history[:10]: # print first 10 steps
166
+ print(step)
167
+
168
+ # Plot dendrogram
169
+ plot_dendrogram(history)
170
+
171
+
172
+ # ==================== START OF dm1.py ====================
173
+
174
+ import numpy as np
175
+ import pandas as pd
176
+ import matplotlib.pyplot as plt
177
+ from sklearn.datasets import make_moons
178
+
179
+ # LOAD DATA
180
def load_data(path):
    """Load the CSV file located at *path* into a pandas DataFrame."""
    return pd.read_csv(path)
183
+
184
+ # TASK 1
185
def preprocess_wholesale(df):
    """Remove the ``Channel`` and ``Region`` columns and preview the result.

    Prints the first rows of the trimmed frame and returns it; the input
    frame is not modified.
    """
    trimmed = df.drop(columns=["Channel", "Region"])

    print("First few records:\n")
    print(trimmed.head())

    return trimmed
193
+
194
+ # NORMALIZATION (Z-SCORE)
195
def normalize(data):
    """Z-score normalise *data* column-wise.

    Each column has its mean subtracted and is divided by its population
    standard deviation.  A constant column has zero deviation; dividing by it
    would produce NaN, so such columns are divided by 1 instead and come out
    as all zeros.
    """
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # Guard against 0/0 on zero-variance features.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std
199
+
200
+ # VISUALIZATION
201
def plot_data(data, title):
    """Scatter-plot column 0 of *data* against column 1 under *title*.

    The axes are labelled "Groceries" (x) and "Milk" (y) regardless of what
    the columns actually hold, so callers must pass the columns in that order.
    """
    plt.figure()
    plt.scatter(data[:,0], data[:,1])
    plt.title(title)
    plt.xlabel("Groceries")
    plt.ylabel("Milk")
    plt.show()
208
+
209
+ # DBSCAN
210
def euclidean(p, q):
    """Return the straight-line distance between points *p* and *q*."""
    offset = p - q
    return np.sqrt(np.sum(offset**2))
212
+
213
def region_query(data, point_idx, eps):
    """Return the indices of all points within *eps* of ``data[point_idx]``.

    The query point itself is always included (its distance is 0).  The
    distance is the Euclidean distance, computed inline.
    """
    centre = data[point_idx]
    return [
        idx
        for idx in range(len(data))
        if np.sqrt(np.sum((centre - data[idx])**2)) <= eps
    ]
219
+
220
def expand_cluster(data, labels, point_idx, cluster_id, eps, min_pts):
    """Try to grow a new DBSCAN cluster from ``data[point_idx]``.

    *labels* is mutated in place: 0 means unvisited, -1 means noise, and any
    positive value is a cluster id.

    Returns False (and marks the point as noise) when the point has fewer
    than *min_pts* neighbours within *eps*; otherwise labels every
    density-reachable point with *cluster_id* and returns True.
    """
    unvisited, noise = 0, -1

    queue = region_query(data, point_idx, eps)
    if len(queue) < min_pts:
        labels[point_idx] = noise
        return False

    # The seed point is a core point: claim its whole neighbourhood.
    for member in queue:
        labels[member] = cluster_id

    # Breadth-first expansion; the queue grows while we walk it.
    cursor = 0
    while cursor < len(queue):
        neighbours = region_query(data, queue[cursor], eps)
        cursor += 1

        if len(neighbours) < min_pts:
            continue  # border point: belongs to the cluster but does not spread it

        for nb in neighbours:
            state = labels[nb]
            if state == unvisited:
                queue.append(nb)
                labels[nb] = cluster_id
            elif state == noise:
                # Former noise becomes a border point; it is not re-expanded.
                labels[nb] = cluster_id

    return True
245
+
246
def dbscan(data, eps, min_pts):
    """Cluster *data* with DBSCAN and return one label per row.

    Labels are returned as a NumPy array: -1 marks noise and clusters are
    numbered from 1 in the order they are discovered.
    """
    labels = [0] * len(data)  # 0 = not yet visited
    clusters_found = 0

    for idx in range(len(data)):
        already_labelled = labels[idx] != 0
        if already_labelled:
            continue
        if expand_cluster(data, labels, idx, clusters_found + 1, eps, min_pts):
            clusters_found += 1

    return np.array(labels)
258
+
259
+ # PLOT CLUSTERS
260
def plot_clusters(data, labels, title):
    """Scatter-plot the first two columns of *data*, one colour per cluster.

    Points labelled -1 (noise) are drawn in black; every other label takes
    the next colour of matplotlib's default cycle.  *labels* may be a list or
    an array; it is converted to an array so that the boolean mask
    ``labels == label`` selects rows correctly.
    """
    labels = np.asarray(labels)

    plt.figure()
    # Sorted so the label -> colour assignment is the same on every run.
    for label in sorted(set(labels.tolist())):
        color = 'k' if label == -1 else None
        points = data[labels == label]
        plt.scatter(points[:, 0], points[:, 1], c=color)

    plt.title(title)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()
277
+
278
+ # TASK 2 - MOONS
279
+ def moons_dbscan():
280
+ # Generate moons dataset
281
+ X, _ = make_moons(n_samples=2000, noise=0.05)
282
+
283
+ # Apply manual DBSCAN
284
+ labels = dbscan(X, eps=0.2, min_pts=5)
285
+ plot_clusters(X, labels, "DBSCAN on Moons")
286
+
287
+ # Add noise manually
288
+ noise = np.random.uniform(low=-1.5, high=2.5, size=(200,2))
289
+ X_noisy = np.vstack((X, noise))
290
+
291
+ # Apply manual DBSCAN again
292
+ labels_noisy = dbscan(X_noisy, eps=0.2, min_pts=5)
293
+ plot_clusters(X_noisy, labels_noisy, "DBSCAN with Noise")
294
+
295
+ if __name__ == "__main__":
296
+ path = "Wholesale customers data.csv" # update path
297
+
298
+ # Load and preprocess
299
+ df = load_data(path)
300
+ print("Original Dataset:")
301
+ print(df.head())
302
+
303
+ df = preprocess_wholesale(df)
304
+
305
+ # Select features
306
+ subset = df[["Grocery", "Milk"]].values
307
+
308
+ print("Selected Features (Groceries, Milk):")
309
+ print(subset[:5])
310
+
311
+ # Normalize
312
+ normalized = normalize(subset)
313
+
314
+ print("Normalized Data (first 5 rows):")
315
+ print(normalized[:5])
316
+
317
+ # Plot normalized data
318
+ plot_data(normalized, "Normalized Data")
319
+
320
+ # Run DBSCAN (manual)
321
+ labels = dbscan(normalized, eps=0.5, min_pts=15)
322
+
323
+ print("DBSCAN Cluster Labels (first 20):")
324
+ print(labels[:20])
325
+
326
+ print("Number of clusters (excluding noise): ", len(set(labels)) - (1 if -1 in labels else 0))
327
+ print("Number of noise points: ", list(labels).count(-1))
328
+
329
+ plot_clusters(normalized, labels, "Manual DBSCAN Clusters (Wholesale)")
330
+
331
+ # Task 2: Moons
332
+ print("--- MOONS DATASET ---")
333
+ X, _ = make_moons(n_samples=2000, noise=0.05)
334
+
335
+ print("Moons Dataset Sample:")
336
+ print(X[:5])
337
+
338
+ labels_moons = dbscan(X, eps=0.2, min_pts=5)
339
+
340
+ print("Moons Cluster Labels (first 20):")
341
+ print(labels_moons[:20])
342
+
343
+ plot_clusters(X, labels_moons, "DBSCAN on Moons")
344
+
345
+ # Add noise
346
+ noise = np.random.uniform(low=-1.5, high=2.5, size=(200,2))
347
+ X_noisy = np.vstack((X, noise))
348
+
349
+ print("Noisy Dataset Sample:")
350
+ print(X_noisy[:5])
351
+
352
+ labels_noisy = dbscan(X_noisy, eps=0.2, min_pts=5)
353
+
354
+ print("Noisy Data Cluster Labels (first 20):")
355
+ print(labels_noisy[:20])
356
+
357
+ print("Number of clusters (noisy): ", len(set(labels_noisy)) - (1 if -1 in labels_noisy else 0))
358
+ print("Noise points (noisy data): ", list(labels_noisy).count(-1))
359
+
360
+ plot_clusters(X_noisy, labels_noisy, "Manual DBSCAN with Noise")
361
+
362
+
363
+ # ==================== START OF dm2.py ====================
364
+
365
+ from collections import defaultdict
366
+ import itertools
367
+
368
+ # READ INPUT FILES
369
def read_data(file):
    """Parse a sequence database file into nested lists of ints.

    Each line is expected to look like ``<{1,2}{3}>``: a sequence of
    brace-delimited itemsets whose items are comma-separated integers.

    Lines that contain no itemset (for example blank lines) are skipped, so
    they do not inflate the sequence count used as the support denominator.
    Empty tokens such as the one left by a trailing comma are ignored.

    Returns a list of sequences; each sequence is a list of itemsets and each
    itemset is a list of ints.
    """
    sequences = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            seq = []
            for chunk in line.strip().split('}'):
                # Keep only what follows the opening brace of this itemset.
                _, brace, body = chunk.partition('{')
                if not brace:
                    continue
                tokens = [tok for tok in body.split(',') if tok.strip()]
                if tokens:
                    seq.append([int(tok) for tok in tokens])
            if seq:
                sequences.append(seq)
    return sequences
382
+
383
def read_parameters(file):
    """Read per-item MIS values and the SDC value from a parameter file.

    Lines containing ``MIS`` are parsed as ``MIS(<item>) = <value>`` and
    lines containing ``SDC`` as ``SDC = <value>``; any other line is ignored.

    Returns ``(MIS, SDC)`` where ``MIS`` maps int item ids to float
    thresholds and ``SDC`` is 0 when the file does not define it.
    """
    mis_by_item = {}
    sdc = 0

    with open(file, 'r') as handle:
        for raw in handle:
            if "MIS" in raw:
                inside_parens = raw.split('(')[1].split(')')[0]
                key = int(inside_parens)
                mis_by_item[key] = float(raw.split('=')[1])
                continue
            if "SDC" in raw:
                sdc = float(raw.split('=')[1])

    return mis_by_item, sdc
397
+
398
+ # SUPPORT COUNT
399
def support_count(sequences):
    """Compute the support of every item in a sequence database.

    An item is counted at most once per sequence, no matter how many of that
    sequence's itemsets contain it.

    Returns ``(support, count)``: ``support`` maps item -> fraction of
    sequences containing it, ``count`` is a ``defaultdict(int)`` mapping
    item -> number of sequences containing it.
    """
    hits = defaultdict(int)
    n_sequences = len(sequences)

    for sequence in sequences:
        distinct = set()
        for itemset in sequence:
            distinct.update(itemset)
        for item in distinct:
            hits[item] += 1

    ratios = {item: seen / n_sequences for item, seen in hits.items()}
    return ratios, hits
412
+
413
+ # INIT PASS (L)
414
def init_pass(MIS, support):
    """Return the items whose support reaches their own MIS, ordered by MIS.

    Items are visited in ascending MIS order (ties keep the dict's insertion
    order) and an item missing from *support* is treated as having support 0.

    NOTE(review): each item is compared against its own MIS, so the result
    coincides with the frequent 1-items — confirm this is the intended
    init-pass definition.
    """
    ascending = sorted(MIS.items(), key=lambda pair: pair[1])
    return [
        item
        for item, threshold in ascending
        if support.get(item, 0) >= threshold
    ]
423
+
424
+ # LEVEL 1 FREQUENT
425
def level1(L, support, MIS):
    """Return the frequent 1-sequences as single-item lists.

    Keeps every item of *L* whose support is at least its own MIS, preserving
    the order of *L*.
    """
    return [[item] for item in L if support[item] >= MIS[item]]
431
+
432
+ # CANDIDATE GENERATION (LEVEL 2)
433
def candidate_gen_L2(L, support, MIS, SDC):
    """Generate length-2 candidate sequences from the seed list *L*.

    For every pair ``(first, second)`` with ``first`` earlier in *L*, the
    pair qualifies when ``support[second] >= MIS[first]`` and the two
    supports differ by at most *SDC*.  Each qualifying pair contributes two
    candidates: ``<{first}{second}>`` and ``<{first, second}>``.
    """
    candidates = []

    for pos, first in enumerate(L):
        for second in L[pos + 1:]:
            qualifies = (
                support[second] >= MIS[first]
                and abs(support[first] - support[second]) <= SDC
            )
            if not qualifies:
                continue
            candidates.append([[first], [second]])
            candidates.append([[first, second]])

    return candidates
443
+
444
+ # SUBSEQUENCE CHECK
445
def is_subsequence(candidate, sequence):
    """Return True when *candidate* is a subsequence of *sequence*.

    Both arguments are lists of itemsets.  The candidate matches when its
    itemsets can be mapped, in order, onto distinct itemsets of *sequence*
    such that each candidate itemset is a subset of the itemset it maps to.

    An empty candidate is a subsequence of every sequence (previously this
    raised ``IndexError`` or returned False depending on *sequence*).
    """
    if not candidate:
        return True

    # A single shared iterator makes each search resume right after the
    # previous match, which enforces both ordering and distinctness.
    remaining = iter(sequence)
    return all(
        any(set(element).issubset(itemset) for itemset in remaining)
        for element in candidate
    )
453
+
454
+ # COUNT SUPPORT FOR SEQUENCE
455
def count_support_seq(candidates, sequences):
    """Count, for each candidate, the sequences that contain it.

    Returns a list of ints parallel to *candidates*; containment is decided
    by ``is_subsequence``.
    """
    return [
        sum(1 for seq in sequences if is_subsequence(cand, seq))
        for cand in candidates
    ]
464
+
465
+ # FILTER FREQUENT
466
def filter_candidates(candidates, counts, MIS, sequences):
    """Keep the candidates whose support reaches their minimum item MIS.

    A candidate is frequent when ``count / len(sequences)`` is at least the
    smallest MIS among *all* of its items.  Using the minimum rather than the
    MIS of the first item gives the same answer when candidates are built
    lowest-MIS-first, and stays correct for any other item order.

    Returns a list of ``(candidate, count)`` tuples in candidate order, or an
    empty list when there are no sequences (avoids dividing by zero).

    Raises:
        KeyError: if a candidate contains an item that has no MIS entry.
    """
    total = len(sequences)
    if total == 0:
        return []

    frequent = []
    for cand, count in zip(candidates, counts):
        threshold = min(MIS[item] for itemset in cand for item in itemset)
        if count / total >= threshold:
            frequent.append((cand, count))

    return frequent
476
+
477
+ # MAIN MS-GSP
478
def format_pattern(pattern):
    """Render a sequence of itemsets as text, e.g. ``<{1,2}{3}>``."""
    rendered = [
        "{" + ",".join(map(str, itemset)) + "}"
        for itemset in pattern
    ]
    return "<" + "".join(rendered) + ">"
484
+
485
+ def MSGSP(data_file, para_file):
486
+ sequences = read_data(data_file)
487
+ MIS, SDC = read_parameters(para_file)
488
+
489
+ print("\nInput Sequences:\n")
490
+ for s in sequences[:5]:
491
+ print(s)
492
+
493
+ print("\nMIS Values:")
494
+ print(MIS)
495
+ print("SDC:", SDC)
496
+
497
+ support, item_counts = support_count(sequences)
498
+
499
+ print("\nItem Supports:")
500
+ for k,v in support.items():
501
+ print(f"Item {k}: {v:.3f}")
502
+
503
+ L = init_pass(MIS, support)
504
+ print("\nL (after init pass):", L)
505
+
506
+ F1 = level1(L, support, MIS)
507
+ print("\nF1 (Frequent 1-sequences):", F1)
508
+
509
+ C2 = candidate_gen_L2(L, support, MIS, SDC)
510
+ print("\nC2 (Candidate sequences):")
511
+ for c in C2[:10]:
512
+ print(c)
513
+
514
+ counts = count_support_seq(C2, sequences)
515
+
516
+ F2 = filter_candidates(C2, counts, MIS, sequences)
517
+
518
+ print("\nFrequent Patterns:")
519
+ for pattern, count in F2:
520
+ print(f"Pattern :{format_pattern(pattern)} count: {count}")
521
+
522
+ # Visualization
523
+ if F2:
524
+ labels = [format_pattern(p) for p,_ in F2[:10]]
525
+ values = [c for _,c in F2[:10]]
526
+
527
+ import matplotlib.pyplot as plt
528
+ plt.figure()
529
+ plt.bar(range(len(values)), values)
530
+ plt.xticks(range(len(values)), labels, rotation=45)
531
+ plt.title("Top Frequent Sequential Patterns")
532
+ plt.xlabel("Patterns")
533
+ plt.ylabel("Count")
534
+ plt.tight_layout()
535
+ plt.show()
536
+
537
+ if __name__ == "__main__":
538
+ MSGSP("data.txt", "para.txt")
539
+
540
+ #------------------------------------------------
541
+ #IR----------------------------------------------
542
+ #------------------------------------------------
543
+ # ==========================================
544
+ # SECTION 1: CONTENT_BASED.PY
545
+ # ==========================================
546
+
547
+ import pandas as pd
548
+ import numpy as np
549
+ import matplotlib.pyplot as plt
550
+ import re
551
+ from sklearn.feature_extraction.text import TfidfVectorizer
552
+ from sklearn.metrics.pairwise import cosine_similarity
553
+ from sklearn.preprocessing import MinMaxScaler
554
+
555
+ # LOAD DATA
556
def load_data():
    """Build the hard-coded demo data set for the content-based recommender.

    Returns ``(movies, user_ratings)``: ``movies`` is a DataFrame with
    ``movie_id``, ``title`` and ``description`` columns (eight rows), and
    ``user_ratings`` is a Series named "User1" indexed by title, where 0
    stands for "not rated".
    """
    titles = [
        'Inception', 'Interstellar', 'Dark Knight', 'Memento',
        'Tenet', 'Avatar', 'Titanic', 'The Matrix',
    ]
    descriptions = [
        'dream subconscious thriller mind',
        'space time black hole science',
        'batman joker crime action',
        'memory loss psychological thriller',
        'time inversion action thriller',
        'alien planet sci fi adventure',
        'romance ship tragedy love',
        'virtual reality ai action',
    ]
    ratings = [5, 4, 0, 0, 3, 0, 0, 5]

    movies = pd.DataFrame({
        'movie_id': [1, 2, 3, 4, 5, 6, 7, 8],
        'title': titles,
        'description': descriptions,
    })
    user_ratings = pd.Series(ratings, index=movies['title'], name="User1")

    return movies, user_ratings
582
+
583
+ # TEXT PREPROCESSING
584
def clean_text(text):
    """Lower-case *text* and strip everything except a-z and whitespace."""
    lowered = text.lower()
    return re.sub(r'[^a-z\s]', '', lowered)
588
+
589
def preprocess_text(movies):
    """Add a ``clean_description`` column built with ``clean_text``.

    The column is added to *movies* in place and the same frame is returned.
    """
    cleaned = movies['description'].apply(clean_text)
    movies['clean_description'] = cleaned
    return movies
592
+
593
+ # NORMALIZE RATINGS
594
def normalize_ratings(user_ratings):
    """Min-max scale a rating Series, treating missing ratings as 0.

    Returns a new Series with the same index as the input.
    """
    filled = user_ratings.fillna(0)

    # MinMaxScaler works on 2-D input, so present the ratings as one column.
    as_column = filled.values.reshape(-1, 1)
    scaled = MinMaxScaler().fit_transform(as_column)

    return pd.Series(scaled.flatten(), index=filled.index)
601
+
602
+ # FEATURE EXTRACTION
603
def build_item_profiles(movies):
    """Vectorise the cleaned descriptions with TF-IDF.

    Returns ``(item_profiles, tfidf)``: a dense DataFrame with one row per
    movie (indexed by title, columns are integer feature positions) and the
    fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    weights = vectorizer.fit_transform(movies['clean_description'])

    profiles = pd.DataFrame(weights.toarray(), index=movies['title'])
    return profiles, vectorizer
613
+
614
+ # USER PROFILE
615
def build_user_profile(user_ratings_norm, item_profiles):
    """Combine item profiles into one user vector, weighted by rating.

    The result is the dot product of the rating vector with the item-profile
    matrix, returned as a Series indexed by the profile columns.
    """
    weights = user_ratings_norm.values
    features = item_profiles.values
    return pd.Series(np.dot(weights, features), index=item_profiles.columns)
618
+
619
+ # RECOMMENDATION
620
def recommend(user_profile, item_profiles, user_ratings, top_n=5, threshold=0.0):
    """Rank unrated movies by cosine similarity to the user profile.

    Movies the user already rated (rating > 0) are excluded, as are movies
    whose score is not strictly above *threshold*.

    Returns a DataFrame with ``movie`` and ``score`` columns holding at most
    *top_n* rows, best score first.
    """
    query = user_profile.values.reshape(1, -1)
    similarity = cosine_similarity(query, item_profiles.values)[0]

    ranking = pd.DataFrame({
        'movie': item_profiles.index,
        'score': similarity
    })

    already_rated = user_ratings[user_ratings > 0].index
    unseen = ranking[~ranking['movie'].isin(already_rated)]
    relevant = unseen[unseen['score'] > threshold]

    return relevant.sort_values(by='score', ascending=False).head(top_n)
642
+
643
+ # VISUALIZATION
644
def plot_item_similarity(item_profiles, movies):
    """Show the item-item cosine-similarity matrix as a heat map.

    Both axes are ticked with the titles from ``movies['title']``, so
    *movies* is expected to list the items in the same order as the rows of
    *item_profiles*.
    """
    similarity_matrix = cosine_similarity(item_profiles)

    plt.figure()
    plt.imshow(similarity_matrix)
    plt.colorbar()
    plt.title("Item-Item Similarity Matrix")
    # Rotate the x labels so long titles do not overlap.
    plt.xticks(range(len(movies)), movies['title'], rotation=90)
    plt.yticks(range(len(movies)), movies['title'])
    plt.tight_layout()
    plt.show()
655
+
656
def plot_user_ratings(user_ratings):
    """Show a bar chart of *user_ratings*: index values on x, ratings on y."""
    plt.figure()
    plt.bar(user_ratings.index, user_ratings.values)
    plt.xticks(rotation=90)
    plt.title("User Ratings")
    plt.ylabel("Rating")
    plt.show()
663
+
664
def plot_recommendations(rec_df):
    """Show a bar chart of recommended movies against their scores.

    *rec_df* needs ``movie`` and ``score`` columns.  When it is empty a
    message is printed and no figure is created.
    """
    if rec_df.empty:
        print("No recommendations to display")
        return

    plt.figure()
    plt.bar(rec_df['movie'], rec_df['score'])
    plt.xticks(rotation=90)
    plt.title("Top Recommended Movies")
    plt.ylabel("Similarity Score")
    plt.show()
675
+
676
def plot_similarity_distribution(item_profiles):
    """Show a 20-bin histogram of all pairwise item cosine similarities.

    The full square matrix is flattened, so the self-similarity diagonal and
    both (i, j) / (j, i) entries of every pair are included in the counts.
    """
    sim_matrix = cosine_similarity(item_profiles)

    plt.figure()
    plt.hist(sim_matrix.flatten(), bins=20)
    plt.title("Similarity Score Distribution")
    plt.xlabel("Similarity")
    plt.ylabel("Frequency")
    plt.show()
685
+
686
+ # EVALUATION
687
def precision_at_k(actual, predicted, k):
    """Return the fraction of the top-*k* predictions that are relevant.

    The denominator is the number of predictions actually kept (which may be
    fewer than *k*); 0 is returned when there are none.  Duplicate
    predictions count once in the numerator.
    """
    top = predicted[:k]
    if len(top) == 0:
        return 0
    hits = set(top).intersection(actual)
    return len(hits) / len(top)
692
+
693
def recall_at_k(actual, predicted, k):
    """Return the fraction of relevant items found in the top-*k* predictions.

    Returns 0 when *actual* is empty.
    """
    if len(actual) == 0:
        return 0
    hits = set(predicted[:k]).intersection(actual)
    return len(hits) / len(actual)
698
+
699
def f1_score(precision, recall):
    """Return the harmonic mean of *precision* and *recall* (0 if both are 0)."""
    total = precision + recall
    return 0 if total == 0 else 2 * (precision * recall) / total
703
+
704
def evaluate_system(recommendations, ground_truth, k):
    """Score a recommendation table against the relevant titles.

    *recommendations* must have a ``movie`` column in ranked order.
    Returns ``(precision@k, recall@k, f1)``.
    """
    ranked_titles = recommendations['movie'].tolist()

    p_at_k = precision_at_k(ground_truth, ranked_titles, k)
    r_at_k = recall_at_k(ground_truth, ranked_titles, k)

    return p_at_k, r_at_k, f1_score(p_at_k, r_at_k)
712
+
713
+ def main_content_based():
714
+ # Load data
715
+ movies, user_ratings = load_data()
716
+
717
+ # Preprocess
718
+ movies = preprocess_text(movies)
719
+
720
+ # Normalize ratings
721
+ user_ratings_norm = normalize_ratings(user_ratings)
722
+
723
+ # Build item profiles
724
+ item_profiles, _ = build_item_profiles(movies)
725
+
726
+ # Build user profile
727
+ user_profile = build_user_profile(user_ratings_norm, item_profiles)
728
+
729
+ # Generate recommendations
730
+ rec_df = recommend(user_profile, item_profiles, user_ratings, top_n=5)
731
+
732
+ # Visualization
733
+ plot_item_similarity(item_profiles, movies)
734
+ plot_user_ratings(user_ratings)
735
+ plot_recommendations(rec_df)
736
+ plot_similarity_distribution(item_profiles)
737
+
738
+ # Evaluation
739
+ ground_truth = ['Memento', 'Avatar']
740
+ precision, recall, f1 = evaluate_system(rec_df, ground_truth, k=3)
741
+
742
+ print("Precision@K:", precision)
743
+ print("Recall@K:", recall)
744
+ print("F1 Score:", f1)
745
+
746
+ # Output
747
+ print("\nRecommended Movies:")
748
+ print(rec_df)
749
+
750
+
751
+ # ==========================================
752
+ # SECTION 2: DR.PY
753
+ # ==========================================
754
+
755
+ # DATA
756
def get_data():
    """Return the hard-coded 4x4 float rating matrix; NaN marks a missing rating."""
    missing = np.nan
    rows = [
        [5, 3, missing, 1],
        [4, missing, missing, 1],
        [1, 1, missing, 5],
        [missing, missing, 5, 4],
    ]
    return np.array(rows, dtype=float)
763
+
764
+ # MISSING VALUE HANDLING
765
def fill_item_mean(R):
    """Return a copy of *R* with each NaN replaced by its column's mean.

    Column means are computed over the non-NaN entries of the original
    matrix; *R* itself is not modified.
    """
    filled = R.copy()
    for col in range(R.shape[1]):
        col_mean = np.nanmean(R[:, col])
        gaps = np.isnan(filled[:, col])
        filled[gaps, col] = col_mean
    return filled
773
+
774
def fill_user_mean(R):
    """Return a copy of *R* with each NaN replaced by its row's mean.

    Row means are computed over the non-NaN entries of the original matrix;
    *R* itself is not modified.
    """
    filled = R.copy()
    for row in range(R.shape[0]):
        row_mean = np.nanmean(R[row])
        gaps = np.isnan(filled[row])
        filled[row, gaps] = row_mean
    return filled
782
+
783
+ # PCA
784
def pca_process(R_filled, k=2):
    """Run PCA step by step on a filled rating matrix, printing each stage.

    Args:
        R_filled: 2-D float array without NaNs (rows are samples).
        k: number of principal components to keep.

    Returns:
        ``(Z, R_recon)``: the projection of the centred data onto the top-*k*
        components and the rank-*k* reconstruction in the original space.

    Raises:
        ValueError: if fewer than two rows are supplied, because the sample
            covariance divides by ``n - 1``.

    The covariance matrix is symmetric, so ``np.linalg.eigh`` is used: it
    guarantees real eigenvalues and orthonormal eigenvectors, whereas the
    general ``eig`` can return complex values caused by rounding noise.
    """
    n = R_filled.shape[0]
    if n < 2:
        raise ValueError("PCA needs at least two samples to estimate covariance")

    # STEP 1: Mean
    mean = np.mean(R_filled, axis=0)
    print("\nSTEP 1: MEAN\n", mean)

    # STEP 2: Centering
    X_centered = R_filled - mean
    print("\nSTEP 2: CENTERED DATA\n", X_centered)

    # STEP 3: Covariance (unbiased, n - 1 denominator)
    cov = np.dot(X_centered.T, X_centered) / (n - 1)
    print("\nSTEP 3: COVARIANCE MATRIX\n", cov)

    # STEP 4 & 5: Eigenvalues & Eigenvectors of the symmetric matrix
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    print("\nSTEP 4: EIGENVALUES\n", eigenvalues)
    print("\nSTEP 5: EIGENVECTORS\n", eigenvectors)

    # STEP 6: Normalize each eigenvector (column) to unit length
    eigenvectors = eigenvectors / np.linalg.norm(eigenvectors, axis=0)
    print("\nSTEP 6: UNIT EIGENVECTORS\n", eigenvectors)

    # STEP 7: Top-k by descending eigenvalue
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    W = eigenvectors[:, :k]
    print("\nSTEP 7: TOP-K EIGENVECTORS\n", W)

    # STEP 8: Projection
    Z = np.dot(X_centered, W)
    print("\nSTEP 8: PCA COMPONENTS\n", Z)

    # Reconstruction back in the original feature space
    R_recon = np.dot(Z, W.T) + mean
    print("\nPCA RECONSTRUCTION\n", R_recon)

    return Z, R_recon
825
+
826
+ # SVD
827
def svd_process(R_filled, k=2):
    """Build a rank-*k* approximation of *R_filled* via a manual SVD.

    The right singular vectors are the eigenvectors of ``R^T R`` and the
    singular values are the square roots of its eigenvalues; the left
    singular vectors follow as ``R v / sigma``.

    Args:
        R_filled: 2-D float array without NaNs.
        k: number of singular triplets to keep.  It is capped at the number
            of singular values above 1e-10, so asking for more components
            than the matrix supports no longer causes a shape mismatch.

    Returns:
        The reconstructed matrix, same shape as *R_filled*.

    ``R^T R`` is symmetric, so ``np.linalg.eigh`` is used to obtain real
    eigenvalues and orthonormal eigenvectors.
    """
    RtR = np.dot(R_filled.T, R_filled)
    eigenvalues, V = np.linalg.eigh(RtR)

    # Largest singular value first.
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    V = V[:, idx]

    # abs() absorbs tiny negative eigenvalues produced by rounding.
    singular_values = np.sqrt(np.abs(eigenvalues))

    # Only directions with a non-negligible singular value have a defined u.
    usable = int(np.sum(singular_values > 1e-10))
    k = min(k, usable)

    V_k = V[:, :k]
    S_k = np.diag(singular_values[:k])
    U_k = np.dot(R_filled, V_k) / singular_values[:k]

    R_recon = np.dot(np.dot(U_k, S_k), V_k.T)

    print("\nSingular Values\n", singular_values)
    print("\nSVD RECONSTRUCTION\n", R_recon)

    return R_recon
856
+
857
+ # RECOMMENDATION + ERROR
858
def recommend_dr(R_original, R_pred, user):
    """Return the column index of the best-predicted item the user has not rated.

    Unrated items are the NaN entries of ``R_original[user]``; the winner is
    the one with the highest value in ``R_pred[user]``.
    """
    (unrated,) = np.where(np.isnan(R_original[user]))
    predicted = R_pred[user][unrated]
    best = np.argmax(predicted)
    return unrated[best]
862
+
863
def compute_error(R_original, R_pred):
    """Return the sum of squared errors over the observed (non-NaN) ratings."""
    observed = ~np.isnan(R_original)
    residual = R_original[observed] - R_pred[observed]
    return np.sum(residual ** 2)
866
+
867
+ # VISUALIZATION
868
def plot_matrix(matrix, title):
    """Show *matrix* as an image with a colour bar, titled *title*."""
    plt.figure()
    plt.imshow(matrix)
    plt.title(title)
    plt.colorbar()
    plt.show()
874
+
875
def plot_latent(Z):
    """Scatter the rows of *Z* using their first two columns.

    Each point is annotated ``U<row index>``.  *Z* must have at least two
    columns, since columns 0 and 1 are indexed directly.
    """
    plt.figure()
    for i in range(Z.shape[0]):
        plt.scatter(Z[i, 0], Z[i, 1])
        plt.text(Z[i, 0], Z[i, 1], f"U{i}")
    plt.title("Latent Space (PCA)")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.show()
884
+
885
def plot_error(errors, labels):
    """Show a bar chart with one bar per entry of *errors*, named by *labels*."""
    plt.figure()
    plt.bar(labels, errors)
    plt.title("Error Comparison")
    plt.show()
890
+
891
+ def main_dr():
892
+ R = get_data()
893
+ print("ORIGINAL MATRIX: ", R)
894
+
895
+ R_filled = fill_item_mean(R)
896
+ print("\nITEM MEAN FILLED MATRIX: ", R_filled)
897
+
898
+ # PCA
899
+ Z_pca, R_pca = pca_process(R_filled)
900
+
901
+ # SVD
902
+ R_svd = svd_process(R_filled)
903
+
904
+ # Recommendation
905
+ user = 0
906
+ print("\nRECOMMENDATIONS")
907
+ print("PCA:", recommend_dr(R, R_pca, user))
908
+ print("SVD:", recommend_dr(R, R_svd, user))
909
+
910
+ # Error
911
+ print("\nERRORS")
912
+ error_pca = compute_error(R, R_pca)
913
+ error_svd = compute_error(R, R_svd)
914
+
915
+ print("PCA Error:", error_pca)
916
+ print("SVD Error:", error_svd)
917
+
918
+ # Visualization
919
+ plot_matrix(R_filled, "Filled Matrix")
920
+ plot_matrix(R_pca, "PCA Reconstruction")
921
+ plot_matrix(R_svd, "SVD Reconstruction")
922
+
923
+ plot_latent(Z_pca)
924
+
925
+ plot_error(
926
+ [error_pca, error_svd],
927
+ ["PCA", "SVD"]
928
+ )
929
+
930
+
931
+ # ==========================================
932
+ # SECTION 3: RECOMMENDERS.PY
933
+ # ==========================================
934
+
935
+ # =========================
936
+ # DATA PREPARATION
937
def load_data_cf():
    """Build the hard-coded 4-user x 4-item rating table for the CF demos.

    The unknown rating is written as ``'?'`` in the literal and converted to
    NaN; the returned DataFrame is all float.
    """
    users = ['User1', 'User2', 'User3', 'User4']
    items = ['Item1', 'Item2', 'Item3', 'Item4']
    raw = [
        [5, 3, 4, '?'],
        [3, 1, 2, 3],
        [4, 3, 4, 5],
        [3, 3, 1, 5],
    ]

    table = pd.DataFrame(raw, index=users, columns=items)

    # Keep every cell that is not the '?' placeholder; the rest become NaN.
    known = table != '?'
    return table.where(known, np.nan).astype(float)
953
+
954
def mean_center(df):
    """Subtract each user's mean rating from that user's row.

    Returns ``(user_mean, df_centered)``; means skip NaN entries and NaN
    cells stay NaN in the centred frame.
    """
    row_means = df.mean(axis=1)
    return row_means, df.sub(row_means, axis=0)
958
+
959
+ # SIMILARITY FUNCTION
960
def cosine_similarity_cf(a, b):
    """Return the cosine similarity of *a* and *b* over co-rated positions.

    Only positions where both arrays are non-NaN are used.  The result is 0
    when there is no overlap, and also when either restricted vector has
    zero length — otherwise 0/0 would yield NaN and poison every weighted
    average built from this similarity.
    """
    mask = ~np.isnan(a) & ~np.isnan(b)

    if not mask.any():
        return 0

    a, b = a[mask], b[mask]
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0
    return np.dot(a, b) / denom
968
+
969
+ # USER-BASED CF
970
def predict_user_based(df, user, item):
    """Predict *user*'s rating of *item* from every other user who rated it.

    The prediction is the similarity-weighted average of the neighbours'
    ratings, with ``cosine_similarity_cf`` as the weight and the sum of
    absolute similarities as the normaliser.  Returns NaN when no neighbour
    contributes any weight.
    """
    target = df.loc[user].values
    weighted_sum, weight_total = 0, 0

    for neighbour in df.index:
        if neighbour == user:
            continue

        rating = df.loc[neighbour, item]
        if np.isnan(rating):
            continue  # this neighbour never rated the item

        sim = cosine_similarity_cf(target, df.loc[neighbour].values)
        weighted_sum += sim * rating
        weight_total += abs(sim)

    if weight_total == 0:
        return np.nan
    return weighted_sum / weight_total
991
+
992
def predict_user_based_mean_centered(df, user, item):
    """Predict a rating as the user's mean plus a weighted neighbour offset.

    Each neighbour who rated *item* contributes its deviation from its own
    mean rating, weighted by ``cosine_similarity_cf`` on the raw rating rows.
    Falls back to the user's mean when no neighbour contributes any weight.
    """
    row_means = df.mean(axis=1)
    target = df.loc[user].values
    offset_sum, weight_total = 0, 0

    for neighbour in df.index:
        if neighbour == user:
            continue

        rating = df.loc[neighbour, item]
        if np.isnan(rating):
            continue  # this neighbour never rated the item

        sim = cosine_similarity_cf(target, df.loc[neighbour].values)
        offset_sum += sim * (rating - row_means[neighbour])
        weight_total += abs(sim)

    baseline = row_means[user]
    if weight_total == 0:
        return baseline
    return baseline + (offset_sum / weight_total)
1021
+
1022
def predict_user_based_topk(df, user, item, k=2):
    """Predict a rating from the *k* most similar users who rated *item*.

    Neighbours are ranked by ``cosine_similarity_cf`` (highest first, ties
    keep index order) and the prediction is their similarity-weighted average
    rating.  Returns NaN when the selected neighbours carry no weight.
    """
    target = df.loc[user].values

    scored = [
        (other, cosine_similarity_cf(target, df.loc[other].values))
        for other in df.index
        if other != user and not np.isnan(df.loc[other, item])
    ]
    nearest = sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]

    weighted_sum, weight_total = 0, 0
    for neighbour, sim in nearest:
        weighted_sum += sim * df.loc[neighbour, item]
        weight_total += abs(sim)

    if weight_total == 0:
        return np.nan
    return weighted_sum / weight_total
1049
+
1050
+ # ITEM-BASED CF
1051
def predict_item_based(df, user, item):
    """Predict *user*'s rating of *item* from the other items the user rated.

    Item columns are compared with ``cosine_similarity_cf`` and the
    prediction is the similarity-weighted average of the user's own ratings.
    Returns NaN when no rated item contributes any weight.
    """
    target_column = df[item].values
    weighted_sum, weight_total = 0, 0

    for other in df.columns:
        if other == item:
            continue

        own_rating = df.loc[user, other]
        if np.isnan(own_rating):
            continue  # the user has not rated this item

        sim = cosine_similarity_cf(target_column, df[other].values)
        weighted_sum += sim * own_rating
        weight_total += abs(sim)

    if weight_total == 0:
        return np.nan
    return weighted_sum / weight_total
1073
+
1074
def predict_item_based_topk(df, user, item, k=2):
    """Predict ``df.loc[user, item]`` from the ``k`` most similar rated items.

    Candidates are all other columns for which ``user`` has a non-NaN
    rating.  They are ranked by ``cosine_similarity_cf`` against the
    target column (highest first) and the first ``k`` are kept.  The
    prediction is the similarity-weighted average of the user's ratings
    for the kept items.

    Returns:
        ``np.nan`` when the absolute similarities of the kept items sum to
        zero (including when there are none), otherwise the weighted
        average.
    """
    target_column = df[item].values

    scored = []
    for candidate in df.columns:
        if candidate != item and not np.isnan(df.loc[user, candidate]):
            score = cosine_similarity_cf(target_column, df[candidate].values)
            scored.append((candidate, score))

    # Highest similarity first, then keep the leading k entries.
    nearest = sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]

    weighted_ratings = sum(weight * df.loc[user, name] for name, weight in nearest)
    weight_total = sum(abs(weight) for _, weight in nearest)

    if weight_total == 0:
        return np.nan
    return weighted_ratings / weight_total
1099
+
1100
+ # EVALUATION
1101
def evaluate(df):
    """Leave-one-out evaluation of ``predict_user_based`` on ``df``.

    Each non-NaN cell is hidden in turn, predicted with
    ``predict_user_based`` from the remaining data, and compared with its
    true value.  Cells whose prediction is NaN are left out of the metrics.

    Args:
        df: Rating matrix (users as rows, items as columns) with NaN for
            missing ratings.

    Returns:
        ``(rmse, mae)`` over all cells that could be predicted, or
        ``(nan, nan)`` when no cell could be predicted.
    """
    # One float working copy is enough: a cell is masked, predicted and
    # then restored, instead of deep-copying the whole frame per cell.
    # Casting to float up front also means writing NaN never has to
    # upcast an integer column.
    work = df.astype(float)

    actuals, preds = [], []
    for u in df.index:
        for i in df.columns:
            actual = df.loc[u, i]
            if np.isnan(actual):
                continue

            work.loc[u, i] = np.nan
            p = predict_user_based(work, u, i)
            work.loc[u, i] = actual  # restore before the next cell

            if not np.isnan(p):
                actuals.append(actual)
                preds.append(p)

    if not preds:
        # Avoid numpy's "Mean of empty slice" warning; the metrics are
        # simply undefined when nothing could be predicted.
        return np.nan, np.nan

    errors = np.array(actuals) - np.array(preds)
    rmse = np.sqrt(np.mean(errors ** 2))
    mae = np.mean(np.abs(errors))
    return rmse, mae
1125
+
1126
+ # VISUALIZATION
1127
def plot_matrix_cf(matrix, title):
    """Show ``matrix`` as an image in a new figure with a title and colorbar."""
    plt.figure()
    image = plt.imshow(matrix, aspect='auto')
    plt.title(title)
    # The image drawn above is the current image, so passing it explicitly
    # attaches the colorbar to the same artist the bare call would pick.
    plt.colorbar(image)
    plt.show()
1133
+
1134
+ # MAIN EXECUTION
1135
def main_recommenders():
    """Run the collaborative-filtering demo end to end.

    Loads the rating matrix with ``load_data_cf``, prints its
    mean-centered form, predicts the rating of ``"User1"`` for ``"Item4"``
    with the user-based and item-based predictors, prints leave-one-out
    RMSE/MAE from ``evaluate``, and finally plots the original, centered
    and predicted matrices.
    """
    df = load_data_cf()

    # Mean Centering (the per-user means themselves are not needed here)
    _, df_centered = mean_center(df)
    print("\nMean Centered Matrix:\n", df_centered)

    # Prediction
    user = "User1"
    item = "Item4"

    user_pred = predict_user_based(df, user, item)
    item_pred = predict_item_based(df, user, item)

    print("\nUser-Based Prediction:\n", user_pred)
    print("\nItem-Based Prediction:\n", item_pred)

    # Evaluation
    rmse, mae = evaluate(df)
    print("\nEvaluation Metrics:")
    print("RMSE =", rmse)
    print("MAE =", mae)

    # Build the predicted matrices: every missing cell is filled with the
    # corresponding model's prediction, so the two "Predicted Matrix" plots
    # actually show what their titles say.
    user_pred_matrix = df.copy()
    item_pred_matrix = df.copy()
    for u in df.index:
        for i in df.columns:
            if np.isnan(df.loc[u, i]):
                user_pred_matrix.loc[u, i] = predict_user_based(df, u, i)
                item_pred_matrix.loc[u, i] = predict_item_based(df, u, i)

    # Cells that could not be predicted (NaN) fall back to the mean of the
    # column means so the image has no holes.
    fallback = df.mean().mean()
    user_pred_matrix = user_pred_matrix.fillna(fallback)
    item_pred_matrix = item_pred_matrix.fillna(fallback)

    # Plots
    plot_matrix_cf(df.fillna(0), "Original Matrix")
    plot_matrix_cf(df_centered.fillna(0), "Mean Centered Matrix")
    plot_matrix_cf(user_pred_matrix, "User-Based Predicted Matrix")
    plot_matrix_cf(item_pred_matrix, "Item-Based Predicted Matrix")
1178
+
1179
+
1180
+ # ==========================================
1181
+ # SECTION 4: PAGERANK.PY
1182
+ # ==========================================
1183
+
1184
+ import math
1185
+
1186
+ # GRAPH CREATION
1187
def create_graph(matrix, nodes):
    """Convert an adjacency matrix into an adjacency-list dict.

    ``matrix[i][j] == 1`` means an edge from ``nodes[i]`` to ``nodes[j]``;
    any other cell value is treated as "no edge".

    Returns:
        Dict mapping each ``nodes[i]`` to the list of its successors, in
        column order.
    """
    graph = {}
    for row_idx, row in enumerate(matrix):
        graph[nodes[row_idx]] = [
            nodes[col_idx] for col_idx, cell in enumerate(row) if cell == 1
        ]
    return graph
1195
+
1196
def edges_to_graph(edges):
    """Build an adjacency-list dict from ``(src, dst)`` pairs.

    Both endpoints of every edge become keys (in first-seen order, source
    before destination), so nodes without outgoing edges map to ``[]``.
    """
    graph = {}
    for src, dst in edges:
        # Register both endpoints before recording the edge itself.
        graph.setdefault(src, [])
        graph.setdefault(dst, [])
        graph[src].append(dst)
    return graph
1205
+
1206
+ # GRAPH ANALYSIS
1207
def count_outgoing_links(graph):
    """Return a dict mapping each page to the number of links it has."""
    return {page: len(links) for page, links in graph.items()}
1209
+
1210
+
1211
def store_incoming_links(graph):
    """Invert an adjacency-list graph.

    Args:
        graph: Dict mapping each page to the list of pages it links to.

    Returns:
        ``(incoming_links, incoming_count)`` where ``incoming_links[p]`` is
        the list of pages linking to ``p`` (in graph iteration order) and
        ``incoming_count[p]`` is its length.  Every key of ``graph`` is
        present; a link target that is not itself a key of ``graph`` is
        added after them instead of raising ``KeyError``.
    """
    incoming_links = {page: [] for page in graph}

    for page, links in graph.items():
        for link in links:
            # setdefault tolerates targets that were never declared as keys.
            incoming_links.setdefault(link, []).append(page)

    incoming_count = {
        page: len(sources) for page, sources in incoming_links.items()
    }
    return incoming_links, incoming_count
1221
+
1222
+ # PAGERANK ALGORITHM
1223
def calculate_pagerank(graph, incoming_links, outgoing_count, d=0.85, iterations=10):
    """Iteratively compute PageRank and print the ranks after every pass.

    Each pass applies
    ``PR(p) = (1 - d) / N + d * sum(PR(q) / out(q) for q linking to p)``
    to all pages at once, starting from the uniform distribution ``1 / N``.
    Pages without outgoing links pass their rank to nobody; that mass is
    not redistributed.

    Args:
        graph: Dict whose keys are the pages to rank.
        incoming_links: Dict mapping each page to the pages linking to it.
        outgoing_count: Dict mapping each page to its number of out-links.
        d: Damping factor (defaults to the previously hard-coded 0.85).
        iterations: Number of update passes (defaults to the previously
            hard-coded 10).

    Returns:
        Dict mapping each page to its rank after the last pass.
    """
    N = len(graph)
    pr = {page: 1 / N for page in graph}

    # Constant share every page receives; guarded so an empty graph still
    # returns {} instead of dividing by zero.
    teleport = (1 - d) / N if N else 0.0

    print("\nPageRank Algorithm:")

    for i in range(iterations):
        # The comprehension reads the previous ``pr`` throughout and only
        # rebinds the name once every page has been updated.
        pr = {
            page: teleport + d * sum(
                pr[source] / outgoing_count[source]
                for source in incoming_links[page]
            )
            for page in graph
        }
        print(f"\nIteration {i+1}: {pr}")

    return pr
1245
+
1246
+ # HITS ALGORITHM
1247
def initialize_scores(graph):
    """Return ``(authority, hub)`` dicts with every node of ``graph`` at 1.0."""
    authority = dict.fromkeys(graph, 1.0)
    hub = dict.fromkeys(graph, 1.0)
    return authority, hub
1251
+
1252
+
1253
def normalize_sum(scores):
    """Scale ``scores`` so the values sum to 1, rounded to 4 decimals.

    Args:
        scores: Dict mapping keys to numeric scores.

    Returns:
        New dict with the same keys.  When the scores sum to zero (for
        example all-zero authority scores on a graph without edges) they
        cannot be normalised, so the values are returned rounded but
        unscaled rather than raising ``ZeroDivisionError``.
    """
    total = sum(scores.values())
    if total == 0:
        return {key: round(float(value), 4) for key, value in scores.items()}
    return {key: round(value / total, 4) for key, value in scores.items()}
1256
+
1257
+
1258
def hits_algorithm(graph, iterations=5):
    """Run HITS on ``graph`` and print the scores after every iteration.

    In each iteration a node's authority becomes the sum of the hub scores
    of the nodes linking to it, and its hub score becomes the sum of the
    freshly computed (not yet normalised) authorities of the nodes it
    links to.  Both score sets are then passed through ``normalize_sum``.

    Returns:
        ``(authority, hub)`` dicts after the final iteration.
    """
    incoming_links, _ = store_incoming_links(graph)
    authority, hub = initialize_scores(graph)

    print("\nHITS Algorithm:")

    for step in range(1, iterations + 1):
        # Authority update: uses the hub scores of the previous iteration.
        raw_authority = {
            node: sum(hub.get(src, 0) for src in incoming_links.get(node, []))
            for node in graph
        }

        # Hub update: uses the raw authorities computed just above.
        raw_hub = {
            node: sum(raw_authority.get(dst, 0) for dst in graph[node])
            for node in graph
        }

        authority = normalize_sum(raw_authority)
        hub = normalize_sum(raw_hub)

        print(f"\nIteration {step}")
        print("Authority:", authority)
        print("Hub:", hub)

    return authority, hub
1288
+
1289
+ # EQUATION METHOD
1290
def build_matrix(graph):
    """Build the link matrix used by the equation method.

    Entry ``A[i][j]`` is ``1 / outdegree(nodes[j])`` when ``nodes[j]``
    links to ``nodes[i]`` and 0 otherwise.

    Returns:
        ``(A, nodes)`` where ``nodes`` is the list of graph keys in
        iteration order and fixes the row/column order of ``A``.
    """
    nodes = list(graph)
    size = len(nodes)
    position = {node: idx for idx, node in enumerate(nodes)}

    A = np.zeros((size, size))

    # Walk the outgoing edges directly: an edge src -> dst fills the cell
    # in dst's row and src's column.
    for src in nodes:
        targets = graph[src]
        column = position[src]
        for dst in targets:
            A[position[dst]][column] = 1 / len(targets)

    return A, nodes
1306
+
1307
def solve_ranks(A, iterations=20):
    """Solve the rank equations by repeated multiplication (power iteration).

    Starting from the uniform vector, each step computes ``r = A @ r`` and
    rescales ``r`` so that its entries sum to 1.

    Args:
        A: Square link matrix (as produced by ``build_matrix``).
        iterations: Number of multiplication steps.

    Returns:
        ``(r, history)`` where ``r`` is the final rank vector and
        ``history`` holds a copy of ``r`` after every step.
    """
    n = len(A)
    r = np.ones(n) / n

    history = []

    for _ in range(iterations):
        r = A @ r
        total = np.sum(r)
        # A zero-sum vector (e.g. an all-zero matrix) cannot be rescaled;
        # leave it as is instead of dividing by zero and producing NaNs.
        if total != 0:
            r = r / total
        history.append(r.copy())

    return r, history
1319
+
1320
def print_equations(graph):
    """Print one rank equation per node of ``graph``.

    The right-hand side of node ``p`` lists ``(r_q/outdegree(q))`` for
    every node ``q`` linking to ``p``, joined with ``+``; a node without
    incoming links is printed as ``r_p = 0``.
    """
    incoming, _ = store_incoming_links(graph)

    print("\nEquations:\n")
    for node in graph:
        rhs = " + ".join(
            f"(r_{src}/{len(graph[src])})" for src in incoming[node]
        )
        # An empty join means no incoming links at all.
        print(f"r_{node} = " + (rhs or "0"))
1330
+
1331
+ # VISUALIZATION
1332
def visualize_graph(graph):
    """Draw ``graph`` with its nodes on a unit circle and arrows for edges.

    Nodes are placed at equal angles in graph iteration order; every edge
    is drawn as an arrow from the source position to the target position.
    """
    names = list(graph.keys())
    count = len(names)

    # Equally spaced points on the unit circle, one per node.
    layout = {
        name: (
            math.cos(2 * math.pi * idx / count),
            math.sin(2 * math.pi * idx / count),
        )
        for idx, name in enumerate(names)
    }

    plt.figure()

    for name, (px, py) in layout.items():
        plt.scatter(px, py)
        plt.text(px, py, name, ha='center', va='center')

    for src, targets in graph.items():
        for dst in targets:
            sx, sy = layout[src]
            tx, ty = layout[dst]
            plt.arrow(sx, sy, tx - sx, ty - sy, head_width=0.05)

    plt.title("Web Graph")
    plt.axis('off')
    plt.show()
1356
+
1357
def visualize_pagerank(pr):
    """Show the PageRank values in ``pr`` as a bar chart, one bar per key."""
    labels = list(pr.keys())
    heights = list(pr.values())

    plt.figure()
    plt.bar(labels, heights)
    plt.title("PageRank Values")
    plt.show()
1362
+
1363
def plot_convergence(history, nodes):
    """Plot each node's rank across the recorded iterations.

    ``history`` is a sequence of rank vectors; position ``i`` of every
    vector belongs to ``nodes[i]``.  One labelled line is drawn per node.
    """
    for column, node in enumerate(nodes):
        series = [snapshot[column] for snapshot in history]
        plt.plot(series, label=node)

    plt.title("Rank Convergence")
    plt.legend()
    plt.show()
1370
+
1371
def plot_scores(authority, hub):
    """Show authority and hub scores as two separate bar charts.

    Both charts use the keys of ``authority`` as bar labels, so ``hub`` is
    expected to list its values in the same order.
    """
    nodes = list(authority.keys())

    charts = (
        (authority, "Authority Scores"),
        (hub, "Hub Scores"),
    )
    for scores, title in charts:
        plt.figure()
        plt.bar(nodes, scores.values())
        plt.title(title)
        plt.show()
1383
+
1384
+ # INFERENCE
1385
def infer_pagerank(pr):
    """Print the PageRank values from highest to lowest.

    After the ranked list, the first and last entries are reported as the
    most and least important pages.  With an empty ``pr`` only the header
    is printed, instead of failing with ``IndexError``.

    Args:
        pr: Dict mapping each page to its rank.
    """
    print("\nFinal PageRank:")
    sorted_pr = sorted(pr.items(), key=lambda x: x[1], reverse=True)
    for node, val in sorted_pr:
        print(f"{node}: {val:.4f}")

    if not sorted_pr:
        # Nothing was ranked, so there is no most/least important page.
        return

    print(f"\nMost important: {sorted_pr[0][0]}")
    print(f"Least important: {sorted_pr[-1][0]}")
1393
+
1394
+
1395
def infer_hits(authority, hub):
    """Print the node with the highest authority and the highest hub score.

    Ties go to the node that comes first in the dict.  When a score dict
    is empty, ``None`` is printed for it instead of raising ``IndexError``.

    Args:
        authority: Dict mapping each node to its authority score.
        hub: Dict mapping each node to its hub score.
    """
    # max() returns the first maximal key, which is the same node a stable
    # descending sort would put first, without sorting everything.
    best_authority = max(authority, key=authority.get, default=None)
    best_hub = max(hub, key=hub.get, default=None)

    print("\nBest Authority:", best_authority)
    print("Best Hub:", best_hub)
1401
+
1402
+
1403
def infer_ranks(ranks, nodes):
    """Print the equation-method ranks from highest to lowest.

    ``ranks[i]`` is taken as the rank of ``nodes[i]``; pairing stops at
    the shorter of the two sequences.
    """
    by_node = {node: rank for node, rank in zip(nodes, ranks)}

    print("\nEquation Method Ranking:")
    for node in sorted(by_node, key=by_node.get, reverse=True):
        print(f"{node}: {by_node[node]:.4f}")
1410
+
1411
def main_pagerank():
    """Run the link-analysis demo on a fixed four-node graph.

    Builds the graph from a hard-coded edge list, prints its link
    statistics, runs PageRank, HITS and the equation method, prints the
    resulting rankings and finally shows all plots.
    """
    graph = edges_to_graph([
        ('A', 'B'),
        ('A', 'C'),
        ('B', 'C'),
        ('C', 'A'),
        ('D', 'C'),
    ])
    print("Graph:", graph)

    # Analysis
    incoming_links, incoming_count = store_incoming_links(graph)
    outgoing_count = count_outgoing_links(graph)

    print("\nCount of Outgoing Links:", outgoing_count)
    print("Count of Incoming Links:", incoming_count)
    print("Incoming Links:", incoming_links)

    # The three methods run in this order because each prints its own trace.
    pr = calculate_pagerank(graph, incoming_links, outgoing_count)
    authority, hub = hits_algorithm(graph)

    print_equations(graph)
    A, nodes = build_matrix(graph)
    ranks, history = solve_ranks(A)

    # Inference
    infer_pagerank(pr)
    infer_hits(authority, hub)
    infer_ranks(ranks, nodes)

    # Visualization
    visualize_graph(graph)
    visualize_pagerank(pr)
    plot_scores(authority, hub)
    plot_convergence(history, nodes)
1453
+