itertoolkit-1.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. bm_preprocessing/__init__.py +14 -0
  2. bm_preprocessing/importer/DM/__init__.py +7 -0
  3. bm_preprocessing/importer/DM/agg.py +6 -0
  4. bm_preprocessing/importer/DM/dbscan.py +6 -0
  5. bm_preprocessing/importer/DM/finals.py +6 -0
  6. bm_preprocessing/importer/DM/gsp.py +6 -0
  7. bm_preprocessing/importer/DM/test.py +6 -0
  8. bm_preprocessing/importer/Finals/__init__.py +7 -0
  9. bm_preprocessing/importer/Finals/kaadhal.py +6 -0
  10. bm_preprocessing/importer/Finals/raaka.py +6 -0
  11. bm_preprocessing/importer/Finals/seedan.py +6 -0
  12. bm_preprocessing/importer/Finals/vikram.py +6 -0
  13. bm_preprocessing/importer/IR/__init__.py +6 -0
  14. bm_preprocessing/importer/IR/finals.py +6 -0
  15. bm_preprocessing/importer/IR/pagerank.py +6 -0
  16. bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
  17. bm_preprocessing/importer/IR/test.py +6 -0
  18. bm_preprocessing/importer/PY/__init__.py +4 -0
  19. bm_preprocessing/importer/PY/lib_doc.py +6 -0
  20. bm_preprocessing/importer/PY/python_doc.py +6 -0
  21. bm_preprocessing/importer/__init__.py +8 -0
  22. bm_preprocessing/importer/_module_printer.py +23 -0
  23. bm_preprocessing/src/DM/__init__.py +1 -0
  24. bm_preprocessing/src/DM/agg.py +267 -0
  25. bm_preprocessing/src/DM/dbscan.py +218 -0
  26. bm_preprocessing/src/DM/finals.py +19 -0
  27. bm_preprocessing/src/DM/gsp.py +378 -0
  28. bm_preprocessing/src/DM/test.py +19 -0
  29. bm_preprocessing/src/Finals/__init__.py +1 -0
  30. bm_preprocessing/src/Finals/kaadhal.py +1453 -0
  31. bm_preprocessing/src/Finals/raaka.py +1338 -0
  32. bm_preprocessing/src/Finals/seedan.py +1173 -0
  33. bm_preprocessing/src/Finals/vikram.py +520 -0
  34. bm_preprocessing/src/IR/__init__.py +1 -0
  35. bm_preprocessing/src/IR/finals.py +14 -0
  36. bm_preprocessing/src/IR/pagerank.py +109 -0
  37. bm_preprocessing/src/IR/recommenders_pca.py +487 -0
  38. bm_preprocessing/src/IR/test.py +14 -0
  39. bm_preprocessing/src/PY/__init__.py +1 -0
  40. bm_preprocessing/src/PY/lib_doc.py +295 -0
  41. bm_preprocessing/src/PY/python_doc.py +177 -0
  42. bm_preprocessing/src/__init__.py +1 -0
  43. itertoolkit-1.5.0.dist-info/METADATA +120 -0
  44. itertoolkit-1.5.0.dist-info/RECORD +45 -0
  45. itertoolkit-1.5.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1173 @@
1
+ #--------------------------------
2
+ #Data Mining
3
+ #--------------------------------
4
+
5
+ #Agglomerative Hierarchical Clustering
def ac():
    """Print a reference script for agglomerative hierarchical clustering.

    Nothing is computed here: the function only writes the script text to
    stdout and returns ``None``. The printed script builds a five-point
    sample array, runs ``scipy.cluster.hierarchy.linkage`` with
    ``method='single'``, draws the dendrogram with matplotlib and cuts the
    tree into two clusters with ``fcluster``.
    """
    # Raw string so the text is emitted exactly as written.
    print(r"""
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt

# Sample dataset
X = np.array([
    [1, 2],
    [2, 3],
    [5, 6],
    [6, 7],
    [8, 9]
])

# Step 1: Perform clustering
Z = linkage(X, method='single') # 'single', 'complete', 'average', 'ward'

# Step 2: Plot dendrogram
plt.figure()
dendrogram(Z)
plt.title("Dendrogram")
plt.xlabel("Data Points")
plt.ylabel("Distance")
plt.show()

# Step 3: Form clusters (k = 2)
clusters = fcluster(Z, 2, criterion='maxclust')

print("Cluster labels:", clusters)""")
36
+
37
+
38
+ ##DBScan clustering
def db():
    """Print a reference script for DBSCAN clustering with scikit-learn.

    Nothing is computed here: the function only writes the script text to
    stdout and returns ``None``. The printed script clusters a six-point
    sample array (one point is marked as noise in a comment) with
    ``DBSCAN(eps=2, min_samples=2)`` and scatter-plots the labels.
    """
    # Raw string so the text is emitted exactly as written.
    print(r"""
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# Sample dataset
X = np.array([
    [1, 2],
    [2, 2],
    [2, 3],
    [8, 7],
    [8, 8],
    [25, 80] # noise
])

# Apply DBSCAN
db = DBSCAN(eps=2, min_samples=2)
labels = db.fit_predict(X)

print("Cluster labels:", labels)

# Plot
plt.scatter(X[:,0], X[:,1], c=labels)
plt.title("DBSCAN Clustering")
plt.show()""")
65
+
66
+ ##Generalized Sequential Pattern (GSP) Algorithm
def gsp():
    """Print a pure-Python reference script for the GSP algorithm.

    Nothing is computed here: the function only writes the script text to
    stdout and returns ``None``. The printed script needs only
    ``collections.defaultdict``; it mines frequent sequential patterns from
    a four-sequence sample database with ``min_sup = 2`` and prints every
    frequent pattern as a tuple.
    """
    # The printed functions are indented so the script parses when pasted;
    # the raw string keeps the text exactly as written.
    print(r"""
from collections import defaultdict

# Sample sequence database
D = [
    ['A', 'B', 'C'],
    ['A', 'C'],
    ['A', 'B', 'C'],
    ['B', 'C']
]

min_sup = 2

# Step 1: Find frequent 1-sequences
def get_frequent_1(D, min_sup):
    count = defaultdict(int)

    for seq in D:
        for item in set(seq):
            count[item] += 1

    return { (item,): count[item] for item in count if count[item] >= min_sup }


# Step 2: Generate candidates
def generate_candidates(prev_freq):
    candidates = []
    keys = list(prev_freq.keys())

    for i in range(len(keys)):
        for j in range(len(keys)):
            if keys[i][1:] == keys[j][:-1]:
                new_seq = keys[i] + (keys[j][-1],)
                candidates.append(new_seq)

    return list(set(candidates))


# Step 3: Count support
def count_support(D, candidates):
    count = defaultdict(int)

    for seq in D:
        for cand in candidates:
            if is_subsequence(cand, seq):
                count[cand] += 1

    return {c: count[c] for c in count}


# Check subsequence
def is_subsequence(subseq, seq):
    i = 0
    for item in seq:
        if i < len(subseq) and subseq[i] == item:
            i += 1
    return i == len(subseq)


# GSP Main
def GSP(D, min_sup):
    freq_patterns = []

    # Step 1
    L1 = get_frequent_1(D, min_sup)
    current = L1
    freq_patterns.extend(L1.keys())

    while current:
        # Step 2
        candidates = generate_candidates(current)

        # Step 3
        counted = count_support(D, candidates)

        # Prune
        current = {c: counted[c] for c in counted if counted[c] >= min_sup}

        freq_patterns.extend(current.keys())

    return freq_patterns


# Run
patterns = GSP(D, min_sup)

print("Frequent Sequential Patterns:")
for p in patterns:
    print(p)
""")
158
+
159
+ ##Problem statement 13
def ps13():
    """Print problem statement 13 followed by its pandas/scipy solution.

    Nothing is computed here: the function only writes text to stdout and
    returns ``None``. The output starts with the task description for the
    mall-customer dataset and continues with a script that reads
    ``Mall_Customers.csv``, drops ``CustomerID``, maps ``Gender`` to 1/0,
    draws the pie and bar charts, runs Ward agglomerative clustering with
    three clusters and plots the dendrogram.
    """
    # The leading problem statement is plain prose, so only the part from
    # "import pandas as pd" onwards is Python. The raw string keeps "\n"
    # inside the printed print() calls as a literal backslash-n.
    print(r"""
Consider Customer mall dataset with following details.
1. CustomerID: An identifier for each customer.
2. Gender: Indicates the gender of the customer (Male or Female).
3. Age: Represents the age of the customer in years.
4. Annual Income (k$): Denotes the annual income of the customer in thousands of dollars.
5. Spending Score (1–100): A score ranging from 1 to 100 that quantifies the customer’s
spending habits and preferences. A higher score indicates a higher tendency to spend.

Do the following tasks
• Remove CustomerID column .
• Check for missing values.
• Convert categorical variable value of gender to numerical (Male-1, Female-0).
• Display male, female ratio as pie chart.
• Display age, annual income as bar graph
• Perform agglomerative clustering with ward algorithm as linkage.
• Display dendrogram.


import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering

# Load dataset (update path if needed)
df = pd.read_csv("Mall_Customers.csv")

# -------------------------------
# 1. Remove CustomerID column
# -------------------------------
df = df.drop("CustomerID", axis=1)

# -------------------------------
# 2. Check for missing values
# -------------------------------
print("Missing values:\n", df.isnull().sum())

# -------------------------------
# 3. Convert Gender to numerical
# Male = 1, Female = 0
# -------------------------------
df["Gender"] = df["Gender"].map({"Male": 1, "Female": 0})

# -------------------------------
# 4. Pie chart (Male vs Female)
# -------------------------------
gender_counts = df["Gender"].value_counts()

labels = ["Male", "Female"]
values = [gender_counts.get(1, 0), gender_counts.get(0, 0)]

plt.figure()
plt.pie(values, labels=labels, autopct='%1.1f%%')
plt.title("Male vs Female Ratio")
plt.show()

# -------------------------------
# 5. Bar graph (Age & Income)
# -------------------------------
plt.figure()

plt.bar(range(len(df)), df["Age"])
plt.title("Age Distribution")
plt.xlabel("Customers")
plt.ylabel("Age")
plt.show()

plt.figure()

plt.bar(range(len(df)), df["Annual Income (k$)"])
plt.title("Annual Income Distribution")
plt.xlabel("Customers")
plt.ylabel("Income (k$)")
plt.show()

# -------------------------------
# 6. Agglomerative Clustering (Ward)
# -------------------------------
# Use relevant features
X = df[["Age", "Annual Income (k$)", "Spending Score (1-100)"]]

model = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels = model.fit_predict(X)

df["Cluster"] = labels
print("Cluster labels:\n", df["Cluster"].head())

# -------------------------------
# 7. Dendrogram
# -------------------------------
plt.figure()
Z = linkage(X, method='ward')
dendrogram(Z)
plt.title("Dendrogram (Ward Linkage)")
plt.xlabel("Customers")
plt.ylabel("Distance")
plt.show()
""")
259
+
260
+ ##Problem Sheet 14
def ps14():
    """Print problem sheet 14 followed by its DBSCAN solution script.

    Nothing is computed here: the function only writes text to stdout and
    returns ``None``. The output starts with the two-part task description
    and continues with a script that (1) loads
    ``Wholesale customers data.csv``, standardises the ``Groceries`` and
    ``Milk`` columns and clusters them with a hand-written DBSCAN
    (``eps=0.5``, ``min_pts=15``), and (2) runs scikit-learn's ``DBSCAN``
    on a 2000-sample ``make_moons`` dataset before and after adding 200
    uniform noise points.
    """
    # The leading problem statement is plain prose, so only the part from
    # "import pandas as pd" onwards is Python. The printed functions are
    # indented so the script parses when pasted.
    print(r"""
1. Consider dataset consisting of annual customer data for a wholesale distributor. The dataset
contains 440 customers and has 8 attributes. Perform the following tasks.
• Drop the columns channel and region and display first few records.
• Consider Groceries and Milk attributes. Normalize these attribute values by scaling it from 0
mean to unit variance. Visualize normalized dataset.
• Write your own implementation of DBSCAN (no library) with Minpts = 15 and EPS=0.5.
• Plot the cluster Results.

2. Use the make_moons function from the sklearn.datasets module to generate a synthetic
dataset that has a moon-shaped pattern. Give 2000 as value to sample parameter.
Run built-in DBSCAN algorithm and visualize the result. Add some noise data and again
visualize the results.


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN

# =====================================
# PART 1: WHOLESALE DATASET
# =====================================

# Load dataset
df = pd.read_csv("Wholesale customers data.csv")

# 1. Drop columns
df = df.drop(["Channel", "Region"], axis=1)
print("First few records:\n", df.head())

# 2. Normalize Groceries & Milk
X = df[["Groceries", "Milk"]].values

mean = X.mean(axis=0)
std = X.std(axis=0)
X_norm = (X - mean) / std

# Plot normalized data
plt.figure()
plt.scatter(X_norm[:, 0], X_norm[:, 1])
plt.title("Normalized Data (Groceries vs Milk)")
plt.xlabel("Groceries")
plt.ylabel("Milk")
plt.show()


# =====================================
# CUSTOM DBSCAN IMPLEMENTATION
# =====================================

def euclidean(p1, p2):
    return np.sqrt(np.sum((p1 - p2) ** 2))

def region_query(X, point_idx, eps):
    neighbors = []
    for i in range(len(X)):
        if euclidean(X[point_idx], X[i]) <= eps:
            neighbors.append(i)
    return neighbors

def expand_cluster(X, labels, point_idx, neighbors, cluster_id, eps, min_pts):
    labels[point_idx] = cluster_id

    i = 0
    while i < len(neighbors):
        n_point = neighbors[i]

        if labels[n_point] == -1:
            labels[n_point] = cluster_id

        if labels[n_point] == 0:
            labels[n_point] = cluster_id
            n_neighbors = region_query(X, n_point, eps)

            if len(n_neighbors) >= min_pts:
                neighbors += n_neighbors

        i += 1

def dbscan(X, eps, min_pts):
    labels = [0] * len(X) # 0 = unvisited
    cluster_id = 0

    for i in range(len(X)):
        if labels[i] != 0:
            continue

        neighbors = region_query(X, i, eps)

        if len(neighbors) < min_pts:
            labels[i] = -1 # noise
        else:
            cluster_id += 1
            expand_cluster(X, labels, i, neighbors, cluster_id, eps, min_pts)

    return np.array(labels)

# 3. Run custom DBSCAN
labels_custom = dbscan(X_norm, eps=0.5, min_pts=15)

# 4. Plot results
plt.figure()
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=labels_custom)
plt.title("Custom DBSCAN Clustering")
plt.xlabel("Groceries")
plt.ylabel("Milk")
plt.show()


# =====================================
# PART 2: MOON DATASET
# =====================================

# Generate moon data
X_moon, _ = make_moons(n_samples=2000, noise=0.05)

plt.figure()
plt.scatter(X_moon[:, 0], X_moon[:, 1])
plt.title("Moon Dataset")
plt.show()

# Built-in DBSCAN
model = DBSCAN(eps=0.2, min_samples=5)
labels_moon = model.fit_predict(X_moon)

plt.figure()
plt.scatter(X_moon[:, 0], X_moon[:, 1], c=labels_moon)
plt.title("DBSCAN on Moon Data")
plt.show()

# Add noise
noise = np.random.uniform(low=-2, high=3, size=(200, 2))
X_noisy = np.vstack((X_moon, noise))

plt.figure()
plt.scatter(X_noisy[:, 0], X_noisy[:, 1])
plt.title("Moon Data with Noise")
plt.show()

# DBSCAN on noisy data
labels_noisy = model.fit_predict(X_noisy)

plt.figure()
plt.scatter(X_noisy[:, 0], X_noisy[:, 1], c=labels_noisy)
plt.title("DBSCAN with Noise")
plt.show()
""")
411
+
412
+ ##Problem sheet 15
def ps15():
    """Print problem sheet 15 followed by its MS-GSP solution script.

    Nothing is computed here: the function only writes text to stdout and
    returns ``None``. The output starts with the task description and
    continues with a script that uses only ``re`` and
    ``collections.defaultdict``. The script reads sequences from
    ``data.txt`` and the per-item MIS values plus the SDC from
    ``para.txt``, mines frequent sequences level by level and prints each
    pattern with its count.
    """
    # The leading problem statement is plain prose, so only the part from
    # "import re" onwards is Python. The raw string keeps the regex
    # backslashes in the printed script intact.
    print(r"""
Implement Mining Sequential Patterns Based on GSP (Generalized Sequential Patterns) MS-GSP
algorithm - Sequential pattern mining using multiple minimum supports with a support difference
constraint.
Input format:
data.txt:Each line represents a Transaction Sequence and each set in a sequence represents a set of
items.
para.txt:Gives the minimum item support for each item as well as the support difference constraint
Output format:
Pattern :<{30,20}{70,80}{20,30,70}> count: 10

import re
from collections import defaultdict

# ==========================================
# 1. READ DATA
# ==========================================

def read_data(file):
    sequences = []
    with open(file, 'r') as f:
        for line in f:
            seq = []
            sets = re.findall(r'\{([^}]*)\}', line)
            for s in sets:
                items = list(map(int, s.split(',')))
                seq.append(items)
            sequences.append(seq)
    return sequences


def read_params(file):
    MIS = {}
    SDC = 0

    with open(file, 'r') as f:
        for line in f:
            if "MIS" in line:
                item = int(re.findall(r'\d+', line)[0])
                value = float(re.findall(r'\d+\.\d+', line)[0])
                MIS[item] = value
            elif "SDC" in line:
                SDC = float(re.findall(r'\d+\.\d+', line)[0])

    return MIS, SDC


# ==========================================
# 2. SUPPORT COUNT
# ==========================================

def get_support(sequences):
    count = defaultdict(int)
    total = len(sequences)

    for seq in sequences:
        unique_items = set()
        for itemset in seq:
            for item in itemset:
                unique_items.add(item)
        for item in unique_items:
            count[item] += 1

    support = {item: count[item]/total for item in count}
    return support, count


# ==========================================
# 3. CHECK SUBSEQUENCE
# ==========================================

def is_subsequence(candidate, sequence):
    i = 0
    for itemset in sequence:
        if all(item in itemset for item in candidate[i]):
            i += 1
            if i == len(candidate):
                return True
    return False


# ==========================================
# 4. COUNT SUPPORT FOR SEQUENCE
# ==========================================

def count_sequence_support(sequences, candidates):
    count = defaultdict(int)

    for seq in sequences:
        for cand in candidates:
            if is_subsequence(cand, seq):
                count[str(cand)] += 1

    return count


# ==========================================
# 5. JOIN STEP (CANDIDATE GENERATION)
# ==========================================

def join(L):
    candidates = []

    for i in range(len(L)):
        for j in range(len(L)):
            s1 = L[i]
            s2 = L[j]

            if s1[1:] == s2[:-1]:
                new_seq = s1 + [s2[-1]]
                if new_seq not in candidates:
                    candidates.append(new_seq)

    return candidates


# ==========================================
# 6. MS-GSP MAIN
# ==========================================

def msgsp(sequences, MIS, SDC):
    support, raw_count = get_support(sequences)
    N = len(sequences)

    # Sort items by MIS
    items = sorted(MIS.keys(), key=lambda x: MIS[x])

    # Init pass
    L = []
    for item in items:
        if support.get(item, 0) >= MIS[item]:
            L.append(item)

    # Frequent 1-sequences
    F = []
    F1 = []
    for item in L:
        if support[item] >= MIS[item]:
            F1.append([[item]])

    F.append(F1)

    k = 2
    while True:
        prev = F[k-2]
        if not prev:
            break

        # Join
        candidates = join(prev)

        # Count support
        counts = count_sequence_support(sequences, candidates)

        freq_k = []

        for cand in candidates:
            key = str(cand)
            sup = counts[key] / N if key in counts else 0

            # MIS check (first item)
            first_item = cand[0][0]

            # SDC check
            valid = True
            flat = [item for subset in cand for item in subset]
            for i in range(len(flat)):
                for j in range(len(flat)):
                    if abs(support.get(flat[i],0) - support.get(flat[j],0)) > SDC:
                        valid = False

            if sup >= MIS[first_item] and valid:
                freq_k.append(cand)

        if not freq_k:
            break

        F.append(freq_k)
        k += 1

    return F, sequences


# ==========================================
# 7. PRINT OUTPUT
# ==========================================

def print_patterns(F, sequences):
    counts = count_sequence_support(sequences,
                                    [cand for level in F for cand in level])

    for level in F:
        for pattern in level:
            key = str(pattern)
            print(f"Pattern :<{pattern}> count: {counts.get(key,0)}")


# ==========================================
# RUN
# ==========================================

sequences = read_data("data.txt")
MIS, SDC = read_params("para.txt")

F, sequences = msgsp(sequences, MIS, SDC)

print_patterns(F, sequences)
""")
622
+
623
+
624
+ #--------------------------------------
625
+ #IR
626
+ #--------------------------------------
627
+ #Collaborative Filtering (User-Based)
def cf():
    """Print a pure-Python reference script for user-based collaborative filtering.

    Nothing is computed here: the function only writes the script text to
    stdout and returns ``None``. The printed script needs only ``math``; it
    defines a three-user rating dictionary (0 means "not rated"), cosine
    similarity between users, neighbour ranking and a similarity-weighted
    rating prediction, then prints the prediction for user ``A`` on
    ``item3``.
    """
    # The printed functions are indented so the script parses when pasted.
    print(r"""
import math
# Sample user-item matrix (0 = not rated)
R = {
    'A': {'item1': 5, 'item2': 3, 'item3': 0},
    'B': {'item1': 4, 'item2': 0, 'item3': 2},
    'C': {'item1': 0, 'item2': 4, 'item3': 5}
}

# Step 1: Cosine similarity
def cosine_similarity(user1, user2):
    dot = 0
    norm1 = 0
    norm2 = 0

    for item in R[user1]:
        r1 = R[user1][item]
        r2 = R[user2][item]

        dot += r1 * r2
        norm1 += r1 ** 2
        norm2 += r2 ** 2

    if norm1 == 0 or norm2 == 0:
        return 0

    return dot / (math.sqrt(norm1) * math.sqrt(norm2))

# Step 2: Get similar users
def get_neighbors(target_user):
    similarities = []

    for user in R:
        if user != target_user:
            sim = cosine_similarity(target_user, user)
            similarities.append((user, sim))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities

# Step 3: Predict rating
def predict_rating(user, item):
    neighbors = get_neighbors(user)

    numerator = 0
    denominator = 0

    for neighbor, sim in neighbors:
        rating = R[neighbor][item]
        if rating != 0:
            numerator += sim * rating
            denominator += abs(sim)

    if denominator == 0:
        return 0

    return numerator / denominator

# Example
print("Predicted rating for A on item3:", predict_rating('A', 'item3'))""")
689
+
690
+ ##Content-Based Recommendation
def cbr():
    """Print a pure-Python reference script for content-based recommendation.

    Nothing is computed here: the function only writes the script text to
    stdout and returns ``None``. The printed script needs only ``math``; it
    averages the feature vectors of the liked items into a user profile,
    scores every item that is not already liked by cosine similarity to
    that profile and prints the ranked list.
    """
    # The text starts on the same line as the opening quotes, so the output
    # has no leading blank line. The printed functions are indented so the
    # script parses when pasted.
    print(r"""import math

# Item feature vectors
items = {
    'item1': [1, 0, 1],
    'item2': [0, 1, 1],
    'item3': [1, 1, 0]
}

# User liked items
user_likes = ['item1', 'item2']

# Step 1: Build user profile
def build_user_profile(likes):
    profile = [0] * len(items['item1'])

    for item in likes:
        for i in range(len(profile)):
            profile[i] += items[item][i]

    # average
    for i in range(len(profile)):
        profile[i] /= len(likes)

    return profile

# Step 2: Cosine similarity
def cosine(v1, v2):
    dot = sum(v1[i] * v2[i] for i in range(len(v1)))
    norm1 = math.sqrt(sum(x*x for x in v1))
    norm2 = math.sqrt(sum(x*x for x in v2))

    if norm1 == 0 or norm2 == 0:
        return 0

    return dot / (norm1 * norm2)

# Step 3: Recommend items
def recommend():
    profile = build_user_profile(user_likes)
    scores = []

    for item in items:
        if item not in user_likes:
            sim = cosine(profile, items[item])
            scores.append((item, sim))

    scores.sort(key=lambda x: x[1], reverse=True)
    return scores

# Example
print("Recommendations:", recommend())""")
744
+
745
+ ##Page rank
746
+
def pr():
    """Print a pure-Python reference script for iterative PageRank.

    Nothing is computed here: the function only writes the script text to
    stdout and returns ``None``. The printed script defines
    ``pagerank(graph, d=0.85, iterations=10)``, which spreads the rank of a
    node without outgoing links equally over all pages, runs it on a
    four-node example graph that contains one such node and prints each
    rank rounded to four decimals.
    """
    # The printed function is indented so the script parses when pasted.

    print(r"""
import math
def pagerank(graph, d=0.85, iterations=10):
    N = len(graph)

    # Step 1: Initialize PageRank
    pr = {}
    for page in graph:
        pr[page] = 1 / N

    # Step 2: Iterations
    for _ in range(iterations):
        new_pr = {}

        for page in graph:
            # Base value (random jump)
            new_pr[page] = (1 - d) / N

            for node in graph:
                # Case 1: Normal link
                if len(graph[node]) > 0:
                    if page in graph[node]: # node -> page
                        new_pr[page] += d * (pr[node] / len(graph[node]))

                # Case 2: Dangling node (no outgoing links)
                else:
                    # Distribute its rank equally to all pages
                    new_pr[page] += d * (pr[node] / N)

        pr = new_pr

    return pr


# Example graph with dangling node
graph = {
    'A': ['B'],
    'B': ['C'],
    'C': ['A'],
    'D': [] # Dangling node
}

# Run PageRank
result = pagerank(graph)

# Print results
for page, rank in result.items():
    print(page, ":", round(rank, 4))""")
797
+
798
+
799
+
800
+ ##Principal Component Analysis (PCA)
def pca():
    """Print a pure-Python reference script for 2-D PCA and variance-based feature selection.

    Nothing is computed here: the function only writes the script text to
    stdout and returns ``None``. The printed script needs only ``math``.
    Its first half mean-centres a five-row 2-D dataset, builds the
    covariance matrix (dividing by ``n``), solves the 2x2 eigenproblem in
    closed form and projects the data onto the eigenvector with the largest
    eigenvalue. Its second half keeps only the columns whose variance is at
    least ``threshold`` (default 0.5).
    """
    # The printed functions are indented so the script parses when pasted.
    # Note that the printed script rebinds X for the second example.
    print(r"""
import math

# Example dataset (2D → reduce to 1D)
X = [
    [2.5, 2.4],
    [0.5, 0.7],
    [2.2, 2.9],
    [1.9, 2.2],
    [3.1, 3.0]
]

# Step 1: Mean Centering
def mean_center(X):
    mean = [sum(col)/len(col) for col in zip(*X)]

    X_centered = []
    for row in X:
        X_centered.append([row[i] - mean[i] for i in range(len(row))])

    return X_centered, mean

# Step 2: Covariance Matrix
def covariance_matrix(X):
    n = len(X)
    m = len(X[0])

    cov = [[0]*m for _ in range(m)]

    for i in range(m):
        for j in range(m):
            for row in X:
                cov[i][j] += row[i] * row[j]
            cov[i][j] /= n

    return cov

# Step 3: Eigenvalues & Eigenvectors (2x2 only for lab simplicity)
def eigen_2x2(matrix):
    a, b = matrix[0]
    c, d = matrix[1]

    # Eigenvalues
    trace = a + d
    det = a*d - b*c

    term = math.sqrt(trace**2 - 4*det)

    lambda1 = (trace + term)/2
    lambda2 = (trace - term)/2

    # Eigenvectors
    vec1 = [b, lambda1 - a] if b != 0 else [1, 0]
    vec2 = [b, lambda2 - a] if b != 0 else [0, 1]

    return [(lambda1, vec1), (lambda2, vec2)]

# Step 4: Projection
def project(X, vector):
    projected = []
    for row in X:
        val = sum(row[i] * vector[i] for i in range(len(vector)))
        projected.append(val)
    return projected


# Run PCA
X_centered, mean = mean_center(X)
cov = covariance_matrix(X_centered)
eigen_pairs = eigen_2x2(cov)

# Sort by eigenvalue
eigen_pairs.sort(key=lambda x: x[0], reverse=True)

# Take top eigenvector
top_vector = eigen_pairs[0][1]

# Project data
result = project(X_centered, top_vector)

print("Reduced Data:", result)

##Feature Selection (Simple Reduction)

def variance(feature):
    mean = sum(feature)/len(feature)
    return sum((x - mean)**2 for x in feature) / len(feature)

def feature_selection(X, threshold=0.5):
    selected = []

    # Transpose dataset
    features = list(zip(*X))

    for i, feature in enumerate(features):
        var = variance(feature)
        if var >= threshold:
            selected.append(i)

    # Build reduced dataset
    reduced = []
    for row in X:
        reduced.append([row[i] for i in selected])

    return reduced


# Example
X = [
    [1, 100, 2],
    [2, 100, 3],
    [3, 100, 4]
]

print("Reduced:", feature_selection(X))""")
917
+
def recommend():
    """Print a numpy/pandas reference script for memory-based recommenders.

    Nothing is computed here: the function only writes the script text to
    stdout and returns ``None``. The printed script builds a 4x4
    user-item rating frame with one unknown entry and defines:

    * ``mean_center`` and a NaN-aware ``cosine_similarity``;
    * user-based prediction (plain, mean-centred and top-k variants);
    * item-based prediction (plain and top-k variants);
    * a leave-one-out ``evaluate`` that reports RMSE and MAE for the plain
      user-based predictor;
    * ``plot_matrix`` and a ``main`` that predicts ``User1``/``Item4``,
      prints the metrics and shows four heat maps.

    In the printed ``main`` the per-cell prediction loop is commented out,
    so the two "Predicted Matrix" plots show the rating frame with its
    missing entry filled by the overall mean rather than by predictions.
    """
    # The printed functions are indented so the script parses when pasted.
    # The raw string keeps "\n" inside the printed print() calls as a
    # literal backslash-n.
    print(r"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# =========================
# DATA PREPARATION
def load_data():
    data = [
        [5, 3, 4, '?'],
        [3, 1, 2, 3],
        [4, 3, 4, 5],
        [3, 3, 1, 5]
    ]

    df = pd.DataFrame(
        data,
        index=['User1', 'User2', 'User3', 'User4'],
        columns=['Item1', 'Item2', 'Item3', 'Item4']
    )

    df = df.mask(df == '?', np.nan).astype(float)
    return df

def mean_center(df):
    user_mean = df.mean(axis=1)
    df_centered = df.sub(user_mean, axis=0)
    return user_mean, df_centered

# SIMILARITY FUNCTION
def cosine_similarity(a, b):
    mask = ~np.isnan(a) & ~np.isnan(b)

    if np.sum(mask) == 0:
        return 0

    a, b = a[mask], b[mask]
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# USER-BASED CF
def predict_user_based(df, user, item):
    target_vector = df.loc[user].values
    similarities = {}

    for other_user in df.index:
        if other_user == user:
            continue

        if not np.isnan(df.loc[other_user, item]):
            sim = cosine_similarity(
                target_vector,
                df.loc[other_user].values
            )
            similarities[other_user] = sim

    num, den = 0, 0
    for u, sim in similarities.items():
        num += sim * df.loc[u, item]
        den += abs(sim)

    return np.nan if den == 0 else num / den

def predict_user_based_mean_centered(df, user, item):
    user_mean = df.mean(axis=1)
    target_vector = df.loc[user].values
    similarities = {}

    for other_user in df.index:
        if other_user == user:
            continue

        if not np.isnan(df.loc[other_user, item]):
            sim = cosine_similarity(
                target_vector,
                df.loc[other_user].values
            )
            similarities[other_user] = sim

    num, den = 0, 0

    for u, sim in similarities.items():
        ru_i = df.loc[u, item]
        ru_mean = user_mean[u]

        num += sim * (ru_i - ru_mean)
        den += abs(sim)

    if den == 0:
        return user_mean[user]

    return user_mean[user] + (num / den)

def predict_user_based_topk(df, user, item, k=2):
    target_vector = df.loc[user].values
    similarities = []

    for other_user in df.index:
        if other_user == user:
            continue

        if not np.isnan(df.loc[other_user, item]):
            sim = cosine_similarity(
                target_vector,
                df.loc[other_user].values
            )
            similarities.append((other_user, sim))

    # Sort by similarity (descending)
    similarities.sort(key=lambda x: x[1], reverse=True)

    # Select Top-K
    top_k = similarities[:k]

    num, den = 0, 0
    for u, sim in top_k:
        num += sim * df.loc[u, item]
        den += abs(sim)

    return np.nan if den == 0 else num / den

# ITEM-BASED CF
def predict_item_based(df, user, item):
    target_vector = df[item].values
    similarities = {}

    for other_item in df.columns:
        if other_item == item:
            continue

        if not np.isnan(df.loc[user, other_item]):
            sim = cosine_similarity(
                target_vector,
                df[other_item].values
            )
            similarities[other_item] = sim

    num, den = 0, 0

    for i, sim in similarities.items():
        num += sim * df.loc[user, i]
        den += abs(sim)

    return np.nan if den == 0 else num / den

def predict_item_based_topk(df, user, item, k=2):
    target_vector = df[item].values
    similarities = []

    for other_item in df.columns:
        if other_item == item:
            continue

        if not np.isnan(df.loc[user, other_item]):
            sim = cosine_similarity(
                target_vector,
                df[other_item].values
            )
            similarities.append((other_item, sim))

    # Sort & select Top-K
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_k = similarities[:k]

    num, den = 0, 0
    for i, sim in top_k:
        num += sim * df.loc[user, i]
        den += abs(sim)

    return np.nan if den == 0 else num / den

# EVALUATION
def evaluate(df):
    actuals, preds = [], []

    for u in df.index:
        for i in df.columns:
            if not np.isnan(df.loc[u, i]):

                temp = df.copy()
                actual = temp.loc[u, i]
                temp.loc[u, i] = np.nan

                p = predict_user_based(temp, u, i)

                if not np.isnan(p):
                    actuals.append(actual)
                    preds.append(p)

    actuals = np.array(actuals)
    preds = np.array(preds)

    rmse = np.sqrt(np.mean((actuals - preds) ** 2))
    mae = np.mean(np.abs(actuals - preds))

    return rmse, mae

# VISUALIZATION
def plot_matrix(matrix, title):
    plt.figure()
    plt.imshow(matrix, aspect='auto')
    plt.title(title)
    plt.colorbar()
    plt.show()

# MAIN EXECUTION
def main():
    df = load_data()

    # Mean Centering
    user_mean, df_centered = mean_center(df)
    print("\nMean Centered Matrix:\n", df_centered)


    #for u in df.index:
    # for i in df.columns:
    # if np.isnan(df.loc[u, i]):
    # user_pred_matrix.loc[u, i] = predict_user_based(df, u, i)
    # item_pred_matrix.loc[u, i] = predict_item_based(df, u, i)

    #print("\nUser-Based Prediction Matrix:\n", user_pred_matrix)
    #print("\nItem-Based Prediction Matrix:\n", item_pred_matrix)


    # Prediction
    user = "User1"
    item = "Item4"

    user_pred = predict_user_based(df, user, item)
    item_pred = predict_item_based(df, user, item)

    print("\nUser-Based Prediction:\n", user_pred)
    print("\nItem-Based Prediction:\n", item_pred)

    # Evaluation
    rmse, mae = evaluate(df)
    print("\nEvaluation Metrics:")
    print("RMSE =", rmse)
    print("MAE =", mae)

    # Fill NaNs for visualization
    user_pred_matrix = df.copy().fillna(df.mean().mean())
    item_pred_matrix = df.copy().fillna(df.mean().mean())

    # Plots
    plot_matrix(df.fillna(0), "Original Matrix")
    plot_matrix(df_centered.fillna(0), "Mean Centered Matrix")
    plot_matrix(user_pred_matrix, "User-Based Predicted Matrix")
    plot_matrix(item_pred_matrix, "Item-Based Predicted Matrix")


if __name__ == "__main__":
    main()"""
    )
# NOTE(review): this call runs every time the module is imported or executed
# and writes the whole recommender script to stdout as a side effect. It
# looks like a leftover debug call; it is kept so import-time behaviour does
# not change. Confirm with the package owner before removing it or moving it
# under an `if __name__ == "__main__":` guard.
recommend()