itertoolkit 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. bm_preprocessing/__init__.py +14 -0
  2. bm_preprocessing/importer/DM/__init__.py +7 -0
  3. bm_preprocessing/importer/DM/agg.py +6 -0
  4. bm_preprocessing/importer/DM/dbscan.py +6 -0
  5. bm_preprocessing/importer/DM/finals.py +6 -0
  6. bm_preprocessing/importer/DM/gsp.py +6 -0
  7. bm_preprocessing/importer/DM/test.py +6 -0
  8. bm_preprocessing/importer/Finals/__init__.py +7 -0
  9. bm_preprocessing/importer/Finals/kaadhal.py +6 -0
  10. bm_preprocessing/importer/Finals/raaka.py +6 -0
  11. bm_preprocessing/importer/Finals/seedan.py +6 -0
  12. bm_preprocessing/importer/Finals/vikram.py +6 -0
  13. bm_preprocessing/importer/IR/__init__.py +6 -0
  14. bm_preprocessing/importer/IR/finals.py +6 -0
  15. bm_preprocessing/importer/IR/pagerank.py +6 -0
  16. bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
  17. bm_preprocessing/importer/IR/test.py +6 -0
  18. bm_preprocessing/importer/PY/__init__.py +4 -0
  19. bm_preprocessing/importer/PY/lib_doc.py +6 -0
  20. bm_preprocessing/importer/PY/python_doc.py +6 -0
  21. bm_preprocessing/importer/__init__.py +8 -0
  22. bm_preprocessing/importer/_module_printer.py +23 -0
  23. bm_preprocessing/src/DM/__init__.py +1 -0
  24. bm_preprocessing/src/DM/agg.py +267 -0
  25. bm_preprocessing/src/DM/dbscan.py +218 -0
  26. bm_preprocessing/src/DM/finals.py +19 -0
  27. bm_preprocessing/src/DM/gsp.py +378 -0
  28. bm_preprocessing/src/DM/test.py +19 -0
  29. bm_preprocessing/src/Finals/__init__.py +1 -0
  30. bm_preprocessing/src/Finals/kaadhal.py +1453 -0
  31. bm_preprocessing/src/Finals/raaka.py +1338 -0
  32. bm_preprocessing/src/Finals/seedan.py +1173 -0
  33. bm_preprocessing/src/Finals/vikram.py +520 -0
  34. bm_preprocessing/src/IR/__init__.py +1 -0
  35. bm_preprocessing/src/IR/finals.py +14 -0
  36. bm_preprocessing/src/IR/pagerank.py +109 -0
  37. bm_preprocessing/src/IR/recommenders_pca.py +487 -0
  38. bm_preprocessing/src/IR/test.py +14 -0
  39. bm_preprocessing/src/PY/__init__.py +1 -0
  40. bm_preprocessing/src/PY/lib_doc.py +295 -0
  41. bm_preprocessing/src/PY/python_doc.py +177 -0
  42. bm_preprocessing/src/__init__.py +1 -0
  43. itertoolkit-1.5.0.dist-info/METADATA +120 -0
  44. itertoolkit-1.5.0.dist-info/RECORD +45 -0
  45. itertoolkit-1.5.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1338 @@
1
+ #dm1
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import re
7
+ from collections import defaultdict
8
+ from sklearn.preprocessing import StandardScaler
9
+ from sklearn.cluster import AgglomerativeClustering, DBSCAN
10
+ from scipy.cluster.hierarchy import dendrogram, linkage
11
+
12
+ # 🔹 1. PREPROCESSING
13
def load_and_clean(filepath, remove_cols=None):
    """Load a CSV file, impute missing values and integer-encode text columns.

    Args:
        filepath: Anything accepted by ``pandas.read_csv``.
        remove_cols: Optional list of column names to drop before cleaning.

    Returns:
        A DataFrame where numeric columns have missing values replaced by the
        column mean, and every non-numeric column has missing values replaced
        by its most frequent value and is then replaced by its categorical
        integer codes.
    """
    data = pd.read_csv(filepath)

    if remove_cols:
        data = data.drop(columns=remove_cols)

    for col in data.columns:
        column = data[col]

        if pd.api.types.is_numeric_dtype(column):
            # Assign the result back: ``data[col].fillna(..., inplace=True)``
            # operates on a temporary selection and does not update ``data``
            # when pandas Copy-on-Write is active.
            data[col] = column.fillna(column.mean())
            continue

        # mode() is empty for a column with no observed values, so only
        # impute when a most-frequent value actually exists.
        modes = column.mode()
        if not modes.empty:
            column = column.fillna(modes.iloc[0])

        # Manual label encoding via category codes.
        data[col] = pd.Categorical(column).codes

    return data
32
+
33
def normalize(matrix):
    """Return *matrix* after fitting and applying a fresh ``StandardScaler``."""
    return StandardScaler().fit_transform(matrix)
36
+
37
+ # 🔹 2. VISUALIZATION
38
def scatter_clusters(features, cluster_labels, chart_title):
    """Scatter the first two feature columns, one colour per cluster label.

    Args:
        features: 2-D array indexed as ``features[mask, column]``.
        cluster_labels: One label per row of *features*.
        chart_title: Title shown above the plot.
    """
    label_array = np.array(cluster_labels)
    distinct = np.unique(cluster_labels)
    palette = plt.cm.tab10(np.linspace(0, 1, len(distinct)))

    for cluster_id, shade in zip(distinct, palette):
        members = label_array == cluster_id
        plt.scatter(
            features[members, 0],
            features[members, 1],
            color=shade,
            label=f'Cluster {cluster_id}',
            s=20,
        )

    plt.title(chart_title)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend(fontsize=6)
    plt.show()
49
+
50
def show_dendrogram(data_matrix):
    """Plot the Ward-linkage dendrogram of *data_matrix* and show the figure."""
    ward_tree = linkage(data_matrix, method='ward')
    dendrogram(ward_tree)
    plt.title("Dendrogram")
    plt.show()
55
+
56
def bar_chart(x_vals, y_vals, x_label, y_label, title):
    """Draw one bar per entry of *y_vals* at positions 0..len(x_vals)-1.

    The *x_vals* are used as tick labels only when there are fewer than 30 of
    them; otherwise no tick labels are passed.
    """
    positions = range(len(x_vals))
    ticks = None if len(x_vals) >= 30 else x_vals
    plt.bar(positions, y_vals, tick_label=ticks)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()
62
+
63
def pie_chart(dataframe, col_name):
    """Show a pie chart of the value frequencies in ``dataframe[col_name]``."""
    counts = dataframe[col_name].value_counts()
    wedge_labels = counts.index.tolist()
    plt.pie(counts.values, labels=wedge_labels, autopct='%1.1f%%')
    plt.title(f"{col_name} Distribution")
    plt.show()
68
+
69
+ # 🔹 3. AGGLOMERATIVE CLUSTERING
70
def run_agglomerative(file):
    """Run the agglomerative-clustering demo on the CSV at *file*.

    Loads and cleans the data (dropping ``CustomerID``), standardises the
    age / income / spending-score columns, shows a gender pie chart and an
    age-vs-income bar chart, then clusters into five Ward-linkage groups and
    shows the cluster scatter plot and the dendrogram.
    """
    data = load_and_clean(file, remove_cols=["CustomerID"])

    selected = data[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]
    X_norm = normalize(selected.values)

    pie_chart(data, "Gender")
    bar_chart(
        data['Age'].values,
        data['Annual Income (k$)'].values,
        "Age",
        "Income",
        "Age vs Income",
    )

    # Ward linkage with a fixed number of five clusters.
    model = AgglomerativeClustering(n_clusters=5, linkage='ward')
    assignments = model.fit_predict(X_norm)

    scatter_clusters(X_norm, assignments, "Agglomerative Clustering")
    show_dendrogram(X_norm)
86
+
87
+ # 🔹 4. DBSCAN (CUSTOM)
88
def compute_distance_matrix(X):
    """Return the matrix of pairwise Euclidean distances between rows of *X*.

    Args:
        X: 2-D array-like of shape (n_points, n_features). Lists are accepted.

    Returns:
        An (n_points, n_points) float array whose entry [i, j] is the
        distance between row i and row j.
    """
    # Work in float so plain lists are accepted and integer inputs (notably
    # unsigned or narrow dtypes) cannot wrap around in the subtraction/square.
    points = np.asarray(X, dtype=float)
    # Broadcasting yields an (n, n, d) array of coordinate differences.
    diff = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    return np.sqrt((diff ** 2).sum(axis=2))
92
+
93
def custom_dbscan(X, eps=0.5, min_pts=5):
    """Cluster the rows of *X* with a hand-written DBSCAN.

    Args:
        X: Points passed to ``compute_distance_matrix``.
        eps: Neighbourhood radius (inclusive).
        min_pts: Minimum neighbourhood size (the point itself included) for a
            point to start or extend a cluster.

    Returns:
        A list with one label per point: -1 for noise, otherwise a cluster
        number starting at 1.
    """
    dist_matrix = compute_distance_matrix(X)
    total = len(X)
    NOISE, UNVISITED = -1, 0
    labels = [UNVISITED] * total
    cluster = 0

    def region(idx):
        # Indices of every point within eps of point idx.
        return np.where(dist_matrix[idx] <= eps)[0]

    for start in range(total):
        if labels[start] != UNVISITED:
            continue

        around = list(region(start))
        if len(around) < min_pts:
            labels[start] = NOISE
            continue

        cluster += 1
        labels[start] = cluster
        pending = set(around) - {start}

        while pending:
            q = pending.pop()
            state = labels[q]

            if state == NOISE:
                # Former noise point becomes a border point of this cluster.
                labels[q] = cluster
                continue
            if state != UNVISITED:
                continue

            labels[q] = cluster
            reach = list(region(q))
            if len(reach) >= min_pts:
                pending.update(reach)

    return labels
132
+
133
def run_dbscan(file):
    """Run the DBSCAN demo on the CSV at *file*.

    Drops ``Channel`` and ``Region``, prints the first rows, standardises the
    ``Grocery`` and ``Milk`` columns and plots them, then plots the clusters
    found by ``custom_dbscan`` and by scikit-learn's ``DBSCAN`` using the
    same ``eps`` and minimum-points settings.
    """
    data = load_and_clean(file, remove_cols=["Channel", "Region"])

    # Display first few records
    print("First 5 records:\n", data.head())

    features_scaled = normalize(data[['Grocery', 'Milk']].values)

    plt.scatter(features_scaled[:, 0], features_scaled[:, 1])
    plt.title("Normalized Data")
    plt.show()

    # Hand-written implementation first.
    scatter_clusters(
        features_scaled,
        custom_dbscan(features_scaled, eps=0.5, min_pts=15),
        "Custom DBSCAN",
    )

    # Library implementation with the same parameters, for comparison.
    reference = DBSCAN(eps=0.5, min_samples=15)
    scatter_clusters(features_scaled, reference.fit_predict(features_scaled), "Sklearn DBSCAN")
153
+
154
+ # 🔹 5. MS-GSP (FULL)
155
def parse_sequences(filepath):
    """Read a sequence database with one ``<{a,b}{c}...>`` sequence per line.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of sequences; each sequence is a list of frozensets of ints.
        Lines that yield no itemset are skipped.
    """
    all_seqs = []
    with open(filepath) as f:
        for line in f:
            seq = []
            for group in re.findall(r'\{(.*?)\}', line.strip()):
                # Ignore blank tokens so "{}" or a trailing comma does not
                # reach int('') and abort the whole parse.
                items = frozenset(
                    int(token) for token in group.split(',') if token.strip()
                )
                if items:
                    seq.append(items)
            if seq:
                all_seqs.append(seq)
    return all_seqs
164
+
165
def parse_params(filepath):
    """Read per-item MIS values and the SDC value from a parameter file.

    A line containing ``MIS`` supplies the item id from the digits in
    parentheses and the value after the last ``=``; otherwise a line
    containing ``SDC`` supplies the SDC value after the last ``=``.

    Returns:
        ``(mis_vals, sdc_val)``: a dict mapping item id to MIS, and the SDC
        as a float (0.0 when the file has no SDC line).
    """
    mis_vals, sdc_val = {}, 0.0
    with open(filepath) as handle:
        for raw in handle:
            entry = raw.strip()
            if 'MIS' in entry:
                item_id = int(re.search(r'\((\d+)\)', entry).group(1))
                mis_vals[item_id] = float(entry.rsplit('=', 1)[-1].strip())
                continue
            if 'SDC' in entry:
                sdc_val = float(entry.rsplit('=', 1)[-1].strip())
    return mis_vals, sdc_val
177
+
178
def item_support(sequences):
    """Return, per item, the fraction of sequences in which it occurs.

    An item counts once per sequence no matter how many itemsets of that
    sequence contain it.
    """
    n_sequences = len(sequences)
    owners = defaultdict(set)
    for sid, seq in enumerate(sequences):
        for item in set().union(*seq):
            owners[item].add(sid)
    return {item: len(sids) / n_sequences for item, sids in owners.items()}
188
+
189
def pattern_contains(pattern, sequence):
    """Return True when *pattern* occurs in *sequence* as an ordered subsequence.

    Each pattern itemset must be a subset of a distinct sequence itemset, in
    order; matching is greedy from left to right.
    """
    # A single shared iterator makes every pattern element resume the scan
    # right after the itemset that matched the previous element.
    remaining = iter(sequence)
    return all(
        any(wanted.issubset(candidate) for candidate in remaining)
        for wanted in pattern
    )
196
+
197
def support_of(pattern, sequences):
    """Count the sequences that contain *pattern* (see ``pattern_contains``)."""
    hits = 0
    for seq in sequences:
        if pattern_contains(pattern, seq):
            hits += 1
    return hits
199
+
200
def all_items_in(pattern):
    """Return the set of every item appearing in any itemset of *pattern*."""
    merged = set()
    for itemset in pattern:
        merged.update(itemset)
    return merged
202
+
203
def sdc_ok(pattern, sup_map, sdc):
    """Check the support-difference constraint for *pattern*.

    Args:
        pattern: List of itemsets.
        sup_map: Mapping from item to its support.
        sdc: Largest allowed difference between the supports of any two
            items of the pattern.

    Returns:
        True when no two items of the pattern differ in support by more than
        *sdc* (always True for patterns with fewer than two distinct items).

    Raises:
        KeyError: If the pattern has two or more items and one of them is
            missing from *sup_map*.
    """
    items = set().union(*pattern)
    if len(items) < 2:
        return True
    supports = [sup_map[item] for item in items]
    # The largest pairwise gap is max - min, so one linear pass replaces the
    # comparison of every pair.
    return max(supports) - min(supports) <= sdc
210
+
211
def MSGSP(file_data, file_para):
    """Mine sequential patterns from *file_data* with the MIS/SDC in *file_para*.

    Reads the sequence database and the parameters, shows a bar chart of the
    per-item supports, prints the frequent patterns level by level
    (L, F1, F2, ...) and finally prints every kept pattern together with its
    raw support count. Nothing is returned.
    """
    sequences = parse_sequences(file_data)
    MIS, SDC = parse_params(file_para)
    sup_map = item_support(sequences)
    total = len(sequences)

    def keep_frequent(pool):
        # A candidate is kept when its support reaches the smallest MIS among
        # its items, it passes the SDC test and it was not kept already.
        survivors = []
        for cand in pool:
            lowest_mis = min(MIS[item] for item in all_items_in(cand))
            ratio = support_of(cand, sequences) / total
            if ratio >= lowest_mis and sdc_ok(cand, sup_map, SDC) and cand not in survivors:
                survivors.append(cand)
        return survivors

    # Plot item support
    plt.bar(list(sup_map.keys()), list(sup_map.values()))
    plt.title("Support Distribution")
    plt.show()

    # --- F1: items in ascending MIS order that meet their own threshold ---
    by_mis = sorted(MIS, key=MIS.get)
    freq_items = [item for item in by_mis if sup_map.get(item, 0) >= MIS[item]]
    print("L:", freq_items)

    level_one = [[frozenset([item])] for item in freq_items]
    print("\nF1:", level_one)

    F = [level_one]

    # --- F2: every SDC-compatible pair as one itemset and as a 2-sequence ---
    pair_candidates = []
    for pos, a in enumerate(freq_items):
        for b in freq_items[pos + 1:]:
            if abs(sup_map[a] - sup_map[b]) <= SDC:
                pair_candidates.append([frozenset([a, b])])
                pair_candidates.append([frozenset([a]), frozenset([b])])

    level_two = keep_frequent(pair_candidates)
    print("\nF2:", level_two)
    F.append(level_two)

    # --- Higher-order levels ---
    level = 3
    while True:
        previous = F[-1]

        # Join p with q when p without its first itemset equals q without
        # its last itemset.
        joined = []
        for head in previous:
            for tail in previous:
                if head[1:] == tail[:-1]:
                    grown = head + [tail[-1]]
                    if grown not in joined:
                        joined.append(grown)

        if not joined:
            break

        frequent = keep_frequent(joined)
        if not frequent:
            break

        print(f"\nF{level}:", frequent)
        F.append(frequent)
        level += 1

    # --- Final output ---
    print("\nFinal Patterns:\n")
    for level_patterns in F:
        for pat in level_patterns:
            cnt = support_of(pat, sequences)
            rendered = "".join("{" + ",".join(map(str, s)) + "}" for s in pat)
            pat_str = "<" + rendered + ">"
            print(f"Pattern: {pat_str} count: {cnt}")
290
+
291
+ # 🔥 RUN EVERYTHING
292
+
293
+ # Part 1
294
+ run_agglomerative("filepath")
295
+
296
+ # Part 2
297
+ run_dbscan("/filepath")
298
+
299
+ # Part 3
300
+ MSGSP("/file1path", "/file2path")
301
+
302
+
303
+ #dm2
304
+ import pandas as pd
305
+ import numpy as np
306
+ import matplotlib.pyplot as plt
307
+ from sklearn.preprocessing import LabelEncoder
308
+ from sklearn.cluster import AgglomerativeClustering
309
+ from scipy.cluster.hierarchy import dendrogram, linkage
310
+
311
+ # Load dataset
312
+ df = pd.read_csv("FinalDM/Dataset/Mall_Customers.csv")
313
+
314
+ # 1. Remove CustomerID
315
+ df = df.drop("CustomerID", axis=1)
316
+
317
+ # 2. Check missing values
318
+ print("Missing values:\n", df.isnull().sum())
319
+
320
+ # 3. Encode Gender (Male=1, Female=0)
321
+ df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
322
+
323
+ # 4. Pie chart (Male vs Female)
324
gender_counts = df['Gender'].value_counts()
# value_counts() orders its result by frequency, so build each wedge label
# from the encoded value it belongs to (0 = Female, 1 = Male) instead of
# assuming that Female is always the most frequent category.
labels = [{0: 'Female', 1: 'Male'}[code] for code in gender_counts.index]
plt.pie(gender_counts, labels=labels, autopct='%1.1f%%')
plt.title("Gender Distribution")
plt.show()
329
+
330
+ # 5. Bar graph (Age & Income)
331
+ plt.bar(df['Age'], df['Annual Income (k$)'])
332
+ plt.xlabel("Age")
333
+ plt.ylabel("Income")
334
+ plt.title("Age vs Income")
335
+ plt.show()
336
+
337
+ # 6. Agglomerative Clustering
338
+ X = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]
339
+
340
+ model = AgglomerativeClustering(n_clusters=5, linkage='ward')
341
+ labels = model.fit_predict(X)
342
+
343
+ # Plot clusters
344
+ plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels)
345
+ plt.xlabel("Age")
346
+ plt.ylabel("Income")
347
+ plt.title("Agglomerative Clustering")
348
+ plt.show()
349
+
350
+ # 7. Dendrogram
351
+ linked = linkage(X, method='ward')
352
+
353
+ plt.figure(figsize=(10, 5))
354
+ dendrogram(linked)
355
+ plt.title("Dendrogram")
356
+ plt.show()
357
+
358
+
359
+ # =========================================================
360
+ # problem 1 - using manual implementation
361
+ # =========================================================
362
+ import pandas as pd
363
+ import numpy as np
364
+ import matplotlib.pyplot as plt
365
+ from sklearn.preprocessing import StandardScaler
366
+
367
+ # Load dataset
368
+ df = pd.read_csv("FinalDM/Dataset/Wholesale customers data.csv")
369
+
370
+ # Drop columns
371
+ df = df.drop(['Channel', 'Region'], axis=1)
372
+
373
+ # 1. Display first few records
374
+ print("First 5 records:\n", df.head())
375
+
376
+ # Select Grocery & Milk
377
+ X = df[['Grocery', 'Milk']]
378
+
379
+ # Normalize
380
+ scaler = StandardScaler()
381
+ X_scaled = scaler.fit_transform(X)
382
+
383
+ # 2. Visualize normalized dataset
384
+ plt.scatter(X_scaled[:, 0], X_scaled[:, 1])
385
+ plt.xlabel("Grocery (Normalized)")
386
+ plt.ylabel("Milk (Normalized)")
387
+ plt.title("Normalized Dataset")
388
+ plt.show()
389
+
390
+ # -------- CUSTOM DBSCAN --------
391
def euclidean(p1, p2):
    """Return the Euclidean distance between *p1* and *p2*."""
    squared = (p1 - p2) ** 2
    return np.sqrt(np.sum(squared))
393
+
394
def get_neighbors(X, point_idx, eps):
    """Return the indices of all points within *eps* of point *point_idx*.

    Args:
        X: Array-like of points, one per row.
        point_idx: Index of the query point in *X*.
        eps: Neighbourhood radius (inclusive).

    Returns:
        A list of ints in ascending order; it includes *point_idx* itself.
        An empty *X* gives an empty list.
    """
    points = np.asarray(X, dtype=float)
    if len(points) == 0:
        return []
    # Treat 1-D input as one coordinate per point so axis 1 always exists.
    points = points.reshape(len(points), -1)
    # One vectorised pass replaces a Python-level distance call per point.
    offsets = points - points[point_idx]
    distances = np.sqrt((offsets ** 2).sum(axis=1))
    return np.flatnonzero(distances <= eps).tolist()
396
+
397
def dbscan(X, eps, min_pts):
    """Cluster *X* with a hand-written DBSCAN built on ``get_neighbors``.

    Returns:
        A list with one label per point: 0 for noise, otherwise a cluster
        number starting at 1 (-1 is used internally for "not yet visited").
    """
    UNVISITED, NOISE = -1, 0
    size = len(X)
    labels = [UNVISITED] * size
    cluster_id = 0

    for idx in range(size):
        if labels[idx] != UNVISITED:
            continue

        frontier = get_neighbors(X, idx, eps)
        if len(frontier) < min_pts:
            labels[idx] = NOISE
            continue

        cluster_id += 1
        labels[idx] = cluster_id

        # frontier grows while it is scanned, so walk it with an explicit
        # cursor rather than a fixed range.
        cursor = 0
        while cursor < len(frontier):
            member = frontier[cursor]
            cursor += 1
            state = labels[member]

            if state == NOISE:
                labels[member] = cluster_id
            elif state == UNVISITED:
                labels[member] = cluster_id
                reach = get_neighbors(X, member, eps)
                if len(reach) >= min_pts:
                    frontier.extend(reach)

    return labels
430
+
431
+ # Apply DBSCAN
432
+ labels = dbscan(X_scaled, eps=0.5, min_pts=15)
433
+
434
+ # 3. Plot cluster results
435
+ plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels)
436
+ plt.xlabel("Grocery")
437
+ plt.ylabel("Milk")
438
+ plt.title("Custom DBSCAN Clustering")
439
+ plt.show()
440
+
441
+ # problem 2 -- Using Built-in
442
+ from sklearn.datasets import make_moons
443
+ from sklearn.cluster import DBSCAN
444
+
445
+ # Generate moons dataset
446
+ X, _ = make_moons(n_samples=2000, noise=0.05)
447
+
448
+ # Apply DBSCAN
449
+ db = DBSCAN(eps=0.2, min_samples=5)
450
+ labels = db.fit_predict(X)
451
+
452
+ # Plot
453
+ plt.scatter(X[:, 0], X[:, 1], c=labels)
454
+ plt.title("DBSCAN on Moons Dataset")
455
+ plt.show()
456
+
457
+ # Add noise
458
+ noise = np.random.uniform(low=-1.5, high=2.5, size=(200, 2))
459
+ X_noisy = np.vstack([X, noise])
460
+
461
+ labels_noisy = db.fit_predict(X_noisy)
462
+
463
+ plt.scatter(X_noisy[:, 0], X_noisy[:, 1], c=labels_noisy)
464
+ plt.title("DBSCAN with Noise")
465
+ plt.show()
466
+
467
+
468
+ # =========================================================
469
+ # FULL MS-GSP
470
+ # =========================================================
471
+ import re
472
+ import matplotlib.pyplot as plt
473
+ from collections import defaultdict
474
+
475
+ # -------------------------------
476
+ # 1. READ DATA
477
+ # -------------------------------
478
def read_data(file):
    """Read a sequence database with one ``<{a,b}{c}...>`` sequence per line.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of sequences; each sequence is a list of sets of ints.
    """
    sequences = []
    with open(file, 'r') as f:
        for line in f:
            seq = []
            for group in re.findall(r'\{(.*?)\}', line):
                # Skip blank tokens so "{}" or a trailing comma cannot reach
                # int('') and raise.
                items = {int(tok) for tok in group.split(',') if tok.strip()}
                if items:
                    seq.append(items)
            # A line without itemsets (e.g. a blank line) is not a sequence;
            # keeping it would inflate the denominator of every support.
            if seq:
                sequences.append(seq)
    return sequences
486
+
487
+ # -------------------------------
488
+ # 2. READ PARAMETERS
489
+ # -------------------------------
490
def read_params(file):
    """Read the MIS table and the SDC value from the parameter file *file*.

    Returns:
        ``(MIS, SDC)``: a dict mapping item id to its MIS value, and the SDC
        (0 when the file contains no SDC line).
    """
    MIS, SDC = {}, 0
    item_id = re.compile(r'\((\d+)\)')

    with open(file, 'r') as handle:
        for entry in handle:
            if "MIS" in entry:
                item = int(item_id.findall(entry)[0])
                MIS[item] = float(entry.split('=')[1])
                continue
            if "SDC" in entry:
                SDC = float(entry.split('=')[1])

    return MIS, SDC
503
+
504
+ # -------------------------------
505
+ # 3. SUPPORT
506
+ # -------------------------------
507
def get_support(sequences):
    """Compute per-item support over *sequences*.

    Returns:
        ``(support, count)``: *count* maps each item to the number of
        sequences containing it, *support* maps it to that number divided by
        the number of sequences.
    """
    count = defaultdict(int)
    for seq in sequences:
        # Merge the itemsets first so an item counts once per sequence.
        for item in set().union(*seq):
            count[item] += 1

    total = len(sequences)
    support = {item: hits / total for item, hits in count.items()}
    return support, count
520
+
521
+ # -------------------------------
522
+ # 4. INIT PASS (L)
523
+ # -------------------------------
524
def init_pass(MIS, support):
    """Return the items whose support reaches their own MIS, in ascending MIS order.

    Items with no entry in *support* are treated as having support 0.
    """
    by_mis = sorted(MIS, key=MIS.get)
    return [item for item in by_mis if support.get(item, 0) >= MIS[item]]
533
+
534
+ # -------------------------------
535
+ # 5. F1
536
+ # -------------------------------
537
def generate_F1(L):
    """Wrap every item of *L* as a one-itemset sequence ``[{item}]``."""
    singles = []
    for item in L:
        singles.append([{item}])
    return singles
539
+
540
+ # -------------------------------
541
+ # 6. SUBSEQUENCE CHECK
542
+ # -------------------------------
543
def is_subsequence(pattern, sequence):
    """Return True when *pattern* is an ordered subsequence of *sequence*.

    Pattern itemsets are matched greedily, each against the earliest later
    sequence itemset that contains it.
    """
    needed = len(pattern)
    matched = 0
    for itemset in sequence:
        if matched == needed:
            break
        if pattern[matched].issubset(itemset):
            matched += 1
    return matched == needed
549
+
550
+ # -------------------------------
551
+ # 7. COUNT SUPPORT
552
+ # -------------------------------
553
def count_support(pattern, sequences):
    """Return how many of *sequences* contain *pattern* (see ``is_subsequence``)."""
    return sum(1 for seq in sequences if is_subsequence(pattern, seq))
555
+
556
+ # -------------------------------
557
+ # 8. GET ITEMS
558
+ # -------------------------------
559
def get_items(pattern):
    """Return the set of all items occurring in the itemsets of *pattern*."""
    return set().union(*pattern)
564
+
565
+ # -------------------------------
566
+ # 9. SDC CHECK
567
+ # -------------------------------
568
def check_SDC(pattern, support, SDC):
    """Return False as soon as two items of *pattern* differ in support by more than *SDC*."""
    members = list(get_items(pattern))
    for pos, first in enumerate(members):
        for second in members[pos + 1:]:
            if abs(support[first] - support[second]) > SDC:
                return False
    return True
575
+
576
+ # -------------------------------
577
+ # 10. C2 GENERATION
578
+ # -------------------------------
579
def generate_C2(L, support, MIS, SDC):
    """Build the level-2 candidates from the ordered item list *L*.

    For every pair (earlier item, later item) of *L* where the later item's
    support reaches the earlier item's MIS and their supports differ by at
    most *SDC*, two candidates are produced: the joint itemset and the
    two-step sequence.
    """
    candidates = []
    for pos, earlier in enumerate(L):
        for later in L[pos + 1:]:
            meets_mis = support[later] >= MIS[earlier]
            if meets_mis and abs(support[later] - support[earlier]) <= SDC:
                candidates.extend((
                    [{earlier, later}],     # same itemset
                    [{earlier}, {later}],   # sequence
                ))
    return candidates
591
+
592
+ # -------------------------------
593
+ # 11. JOIN STEP
594
+ # -------------------------------
595
def join_step(Fk_1):
    """Join the patterns of the previous level into longer candidates.

    ``p`` and ``q`` are joined when ``p`` without its first itemset equals
    ``q`` without its last itemset; the candidate is ``p`` followed by the
    last itemset of ``q``. Each candidate is kept once.
    """
    joined = []
    for head in Fk_1:
        overlap = head[1:]
        for tail in Fk_1:
            if overlap != tail[:-1]:
                continue
            grown = head + [tail[-1]]
            if grown not in joined:
                joined.append(grown)
    return joined
606
+
607
+ # -------------------------------
608
+ # 12. PRUNE STEP
609
+ # -------------------------------
610
def prune(Ck, Fk_1):
    """Keep the candidates of *Ck* all of whose one-shorter sub-patterns are in *Fk_1*.

    A sub-pattern is the candidate with exactly one of its itemsets removed.
    """
    def survives(candidate):
        return all(
            candidate[:drop] + candidate[drop + 1:] in Fk_1
            for drop in range(len(candidate))
        )

    return [candidate for candidate in Ck if survives(candidate)]
626
+
627
+ # -------------------------------
628
+ # 13. MS-GSP MAIN
629
+ # -------------------------------
630
def MSGSP(sequences, MIS, SDC):
    """Run the level-wise MS-GSP search and return the frequent patterns.

    Shows a bar chart of the item supports and prints L, F1, F2 and every
    further non-empty level as it is found.

    Returns:
        A list of levels ``[F1, F2, ...]``; each level is a list of patterns
        and each pattern is a list of item sets.
    """
    support, raw_count = get_support(sequences)

    def frequent_subset(pool):
        # Keep a candidate when its support reaches the smallest MIS among
        # its items and it passes the SDC check; skip repeats.
        kept = []
        for cand in pool:
            hits = count_support(cand, sequences)
            ratio = hits / len(sequences)
            lowest_mis = min(MIS[item] for item in get_items(cand))
            if ratio >= lowest_mis and check_SDC(cand, support, SDC) and cand not in kept:
                kept.append(cand)
        return kept

    # Plot support
    plt.bar(list(support.keys()), list(support.values()))
    plt.title("Support Distribution")
    plt.show()

    L = init_pass(MIS, support)
    print("L:", L)

    # Level 1
    first_level = generate_F1(L)
    print("\nF1:", first_level)
    F = [first_level]

    # Level 2
    second_level = frequent_subset(generate_C2(L, support, MIS, SDC))
    print("\nF2:", second_level)
    F.append(second_level)

    # Levels 3 and up: join, prune, then count.
    k = 3
    while True:
        previous = F[k - 2]
        Ck = prune(join_step(previous), previous)
        if not Ck:
            break

        Fk = frequent_subset(Ck)
        if not Fk:
            break

        print(f"\nF{k}:", Fk)
        F.append(Fk)
        k += 1

    return F
694
+
695
+ # -------------------------------
696
+ # 14. PRINT OUTPUT
697
+ # -------------------------------
698
def print_patterns(F, sequences):
    """Print every pattern of every level in *F* with its support count in *sequences*."""
    print("\nFinal Patterns:\n")

    for level in F:
        for pattern in level:
            count = count_support(pattern, sequences)
            body = "".join("{" + ",".join(map(str, s)) + "}" for s in pattern)
            pattern_str = "<" + body + ">"
            print(f"Pattern: {pattern_str} count: {count}")
711
+
712
+ # -------------------------------
713
+ # RUN
714
+ # -------------------------------
715
+ sequences = read_data("FinalDM/Dataset/data (1).txt")
716
+ MIS, SDC = read_params("FinalDM/Dataset/para.txt")
717
+
718
+ F = MSGSP(sequences, MIS, SDC)
719
+ print_patterns(F, sequences)
720
+
721
+
722
+ # =========================================================
723
+ # GSP
724
+ # =========================================================
725
+ import re
726
+ import matplotlib.pyplot as plt
727
+ from collections import defaultdict
728
+ from itertools import combinations
729
+
730
+ # -------------------------------
731
+ # 1. READ DATA
732
+ # -------------------------------
733
def read_data(file):
    """Parse the sequence file *file* into a list of sequences of item sets.

    Every ``{...}`` group on a line becomes one set of ints.

    Returns:
        A list with one entry per line that contains at least one non-empty
        itemset; each entry is a list of sets of ints.
    """
    sequences = []
    with open(file, 'r') as f:
        for line in f:
            seq = []
            for group in re.findall(r'\{(.*?)\}', line):
                # Blank tokens ("{}", trailing commas) are dropped instead of
                # being passed to int(), which would raise ValueError.
                itemset = {int(tok) for tok in group.split(',') if tok.strip()}
                if itemset:
                    seq.append(itemset)
            # Blank lines must not be stored as empty sequences: they would
            # be counted in the total used to compute supports.
            if seq:
                sequences.append(seq)
    return sequences
741
+
742
+ # -------------------------------
743
+ # 2. READ PARAMETERS (MIS + SDC)
744
+ # -------------------------------
745
def read_params(file):
    """Parse the parameter file *file* into the MIS table and the SDC value.

    Returns:
        ``(MIS, SDC)`` where *MIS* maps item id to its threshold and *SDC* is
        the value of the last SDC line (0 if none is present).
    """
    MIS = {}
    SDC = 0

    with open(file, 'r') as handle:
        for text in handle:
            if "MIS" in text:
                item = int(re.findall(r'\((\d+)\)', text)[0])
                MIS[item] = float(text.split('=')[1])
            elif "SDC" in text:
                SDC = float(text.split('=')[1])

    return MIS, SDC
759
+
760
+ # -------------------------------
761
+ # 3. SUPPORT CALCULATION
762
+ # -------------------------------
763
def get_support(sequences):
    """Return ``(support, count)`` for the items of *sequences*.

    *count* holds, per item, the number of sequences that contain it;
    *support* is that count divided by the number of sequences.
    """
    count = defaultdict(int)
    total = len(sequences)

    for seq in sequences:
        present = set()
        for itemset in seq:
            present.update(itemset)
        for item in present:
            count[item] += 1

    support = {}
    for item, hits in count.items():
        support[item] = hits / total
    return support, count
776
+
777
+ # -------------------------------
778
+ # 4. PLOT SUPPORT GRAPH
779
+ # -------------------------------
780
def plot_support(support):
    """Show a bar chart with one bar per item of the *support* mapping."""
    plt.bar(list(support.keys()), list(support.values()))
    plt.xlabel("Items")
    plt.ylabel("Support")
    plt.title("Support Distribution")
    plt.show()
789
+
790
+ # -------------------------------
791
+ # 5. FREQUENT 1-ITEMSETS
792
+ # -------------------------------
793
def frequent_1_itemsets(support, MIS):
    """Return the set of items whose support reaches their MIS threshold.

    Args:
        support: Mapping from item to its support.
        MIS: Mapping from item to its minimum item support.

    Returns:
        The items of *support* that have an MIS entry and meet it. Items that
        appear in the data but have no MIS entry are left out rather than
        raising ``KeyError``, matching ``init_pass`` which only considers
        items listed in MIS.
    """
    return {
        item
        for item, value in support.items()
        if item in MIS and value >= MIS[item]
    }
795
+
796
+ # -------------------------------
797
+ # 6. CHECK SUBSEQUENCE
798
+ # -------------------------------
799
def is_subsequence(pattern, sequence):
    """Return True when *pattern* is an ordered subsequence of *sequence*.

    Args:
        pattern: List of item sets to look for, in order.
        sequence: List of item sets to search.

    Returns:
        True if each pattern itemset is a subset of a distinct sequence
        itemset, in order. The empty pattern is contained in every sequence.
    """
    # Without this guard an empty pattern indexes pattern[0] (IndexError) or,
    # for an empty sequence, is reported as not contained.
    if not pattern:
        return True

    i = 0
    for s in sequence:
        if pattern[i].issubset(s):
            i += 1
            if i == len(pattern):
                return True
    return False
807
+
808
+ # -------------------------------
809
+ # 7. COUNT PATTERN SUPPORT
810
+ # -------------------------------
811
def count_pattern(pattern, sequences):
    """Return the number of sequences for which ``is_subsequence(pattern, seq)`` holds."""
    return sum(1 for seq in sequences if is_subsequence(pattern, seq))
817
+
818
+ # -------------------------------
819
+ # 8. GENERATE CANDIDATES (GSP STYLE)
820
+ # -------------------------------
821
def generate_candidates(prev_patterns):
    """Join the previous level's patterns into candidates one itemset longer.

    ``p1`` is extended with the last itemset of ``p2`` whenever ``p1``
    without its first itemset equals ``p2`` without its last itemset.
    Candidates are not de-duplicated.
    """
    return [
        left + [right[-1]]
        for left in prev_patterns
        for right in prev_patterns
        if left[1:] == right[:-1]
    ]
831
+
832
+ # -------------------------------
833
+ # 9. FILTER USING MIS + SDC
834
+ # -------------------------------
835
def filter_patterns(candidates, sequences, MIS, SDC):
    """Keep the candidates that satisfy the MIS and SDC conditions.

    Args:
        candidates: Patterns (lists of item sets) to evaluate.
        sequences: The sequence database.
        MIS: Mapping from item to its minimum item support.
        SDC: Largest allowed difference between the supports of any two
            items of a pattern.

    Returns:
        A list of ``(pattern, count)`` tuples, where *count* is the number of
        sequences containing the pattern. An empty database yields an empty
        list.
    """
    total = len(sequences)
    if total == 0:
        # No sequence can support any pattern; also avoids dividing by zero.
        return []

    # Per-item sequence counts, needed for the support-difference check.
    item_counts = defaultdict(int)
    for seq in sequences:
        for item in set().union(*seq):
            item_counts[item] += 1

    valid_patterns = []
    for pattern in candidates:
        count = count_pattern(pattern, sequences)
        support = count / total

        items = set().union(*pattern)

        # MIS condition: the pattern support must reach every item's MIS.
        # NOTE(review): this is stricter than the minimum-MIS rule used by
        # the MS-GSP section of this file; left as written.
        if not all(support >= MIS[item] for item in items):
            continue

        # SDC condition: compare the supports of the pattern's items. The
        # earlier check compared the pattern support with itself, so it
        # could never reject a candidate.
        item_supports = [item_counts.get(item, 0) / total for item in items]
        if item_supports and max(item_supports) - min(item_supports) > SDC:
            continue

        valid_patterns.append((pattern, count))

    return valid_patterns
856
+
857
+ # -------------------------------
858
+ # 10. PRINT PATTERNS
859
+ # -------------------------------
860
def print_patterns(patterns):
    """Print each ``(pattern, count)`` pair as ``Pattern: <{..}{..}> count: n``."""
    for pattern, count in patterns:
        body = "".join("{" + ",".join(map(str, p)) + "}" for p in pattern)
        pattern_str = "<" + body + ">"
        print(f"Pattern: {pattern_str} count: {count}")
868
+
869
+ # -------------------------------
870
+ # MAIN EXECUTION
871
+ # -------------------------------
872
+ sequences = read_data("FinalDM/Dataset/data (1).txt")
873
+ MIS, SDC = read_params("FinalDM/Dataset/para.txt")
874
+
875
+ # Support
876
+ support, raw_count = get_support(sequences)
877
+
878
+ print("SUPPORT VALUES:")
879
+ for k, v in support.items():
880
+ print(f"Item {k}: {v:.2f}")
881
+
882
+ # Plot graph
883
+ plot_support(support)
884
+
885
+ # Frequent 1-itemsets
886
+ F1 = frequent_1_itemsets(support, MIS)
887
+ print("\nFrequent 1-itemsets:", F1)
888
+
889
+ # Convert F1 to sequence format
890
+ patterns = [[{item}] for item in F1]
891
+
892
+ k = 2
893
+ all_patterns = []
894
+
895
+ while patterns:
896
+ print(f"\nGenerating patterns of length {k}...")
897
+
898
+ candidates = generate_candidates(patterns)
899
+ valid_patterns = filter_patterns(candidates, sequences, MIS, SDC)
900
+
901
+ if not valid_patterns:
902
+ break
903
+
904
+ print_patterns(valid_patterns)
905
+ all_patterns.extend(valid_patterns)
906
+
907
+ # Prepare next iteration
908
+ patterns = [pattern for pattern, _ in valid_patterns]
909
+ k += 1
910
+
911
+ # Final Output
912
+ print("\nFINAL FREQUENT SEQUENTIAL PATTERNS:")
913
+ print_patterns(all_patterns)
914
+
915
+
916
+ #IR1
917
+ import pandas as pd
918
+ import numpy as np
919
+ from sklearn.metrics.pairwise import cosine_similarity
920
+ from sklearn.feature_extraction.text import CountVectorizer
921
+ from sklearn.decomposition import PCA
922
+ import networkx as nx
923
+ import matplotlib.pyplot as plt
924
+ import seaborn as sns
925
+
926
+ # -------------------- STOPWORDS --------------------
927
+ stop_words = {
928
+ "is","am","are","was","were","be","been","being",
929
+ "a","an","the","and","or","not","in","on","at","to",
930
+ "for","with","by","of","that","this","it","as","from",
931
+ "but","about","into","over","after","before","between",
932
+ "out","up","down","so","than","too","very","can","will"
933
+ }
934
+
935
def simple_stem(word):
    """Strip the first matching suffix from *word*.

    Suffixes are tried in the order ing, ed, ly, es, s, ment; one is removed
    only when more than two characters remain. Words with no removable
    suffix are returned unchanged.
    """
    for suffix in ("ing", "ed", "ly", "es", "s", "ment"):
        stem_length = len(word) - len(suffix)
        if stem_length > 2 and word.endswith(suffix):
            return word[:stem_length]
    return word
941
+
942
def preprocess(text):
    """Lower-case, tokenise, strip punctuation, drop stop words and stem *text*.

    Returns the surviving stemmed tokens joined by single spaces.
    """
    kept = []
    for raw in text.lower().split():
        token = raw.strip(".,!?:;()[]{}\"'")
        if not token or token in stop_words:
            continue
        kept.append(simple_stem(token))
    return " ".join(kept)
948
+
949
+ # -------------------- LOAD DATASET --------------------
950
+ df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IRdata.csv")
951
+ df["clean_text"] = df["text"].astype(str).apply(preprocess)
952
+ docs = df["clean_text"].tolist()
953
+
954
+ print("\n--- Preprocessed Documents ---")
955
+ print(df[["id","clean_text"]])
956
+
957
+ # ===============================================================
958
+ # 1️⃣ CONTENT-BASED RECOMMENDATION
959
+ # ===============================================================
960
+
961
+ vectorizer = CountVectorizer()
962
+ X = vectorizer.fit_transform(docs)
963
+ similarity_matrix = cosine_similarity(X)
964
+
965
def content_recommend(doc_id, top_n=3):
    """Print the *top_n* documents most similar to document *doc_id*.

    Reads the module-level ``similarity_matrix`` and ``df``.

    Args:
        doc_id: Zero-based row index of the query document.
        top_n: Number of recommendations to print.
    """
    # Exclude the query document by index. Dropping the first sorted entry
    # is wrong when an earlier document ties with it (e.g. an exact
    # duplicate): the stable sort puts that document first and the query
    # document would then recommend itself.
    scores = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[doc_id])
        if idx != doc_id
    ]
    scores = sorted(scores, key=lambda pair: pair[1], reverse=True)[:top_n]

    print(f"\n--- Content-Based Recommendations for Document {doc_id+1} ---")
    for idx, score in scores:
        print(f"Doc {idx+1} (Score={score:.3f}): {df.iloc[idx]['text']}")
972
+
973
+ content_recommend(0)
974
+
975
+ # ---------------- VISUALIZATION: Similarity Heatmap ----------------
976
+ plt.figure(figsize=(8,6))
977
+ sns.heatmap(similarity_matrix, annot=True, cmap="Blues")
978
+ plt.title("Document Similarity Matrix (Content-Based)")
979
+ plt.xlabel("Document ID")
980
+ plt.ylabel("Document ID")
981
+ plt.show()
982
+
983
+ # ===============================================================
984
+ # 2️⃣ COLLABORATIVE FILTERING (USER–ITEM)
985
+ # ===============================================================
986
+ ratings = pd.DataFrame({
987
+ "user": ["u1","u1","u2","u2","u3","u3","u4","u4"],
988
+ "item": [1,2,2,3,3,4,4,5],
989
+ "rating": [5,4,4,5,3,4,5,4]
990
+ })
991
+
992
+ user_item_matrix = ratings.pivot_table(index="user", columns="item", values="rating")
993
+ user_item_matrix = user_item_matrix.fillna(0)
994
+
995
+ user_sim = cosine_similarity(user_item_matrix)
996
+ user_sim_df = pd.DataFrame(user_sim, index=user_item_matrix.index, columns=user_item_matrix.index)
997
+
998
def recommend_item(user, top_n=3):
    """Print the ``top_n`` users most similar to ``user``.

    Reads the module-level ``user_sim_df`` (a square user-by-user
    similarity table) and looks up the column labelled ``user``.

    Args:
        user: Label of the query user in ``user_sim_df``.
        top_n: Maximum number of similar users to print.

    Returns:
        None. Results are written to stdout.
    """
    # Remove the query user by label rather than by position: another user
    # with a tied similarity of 1.0 could sort ahead of the query user, in
    # which case slicing off the first entry would drop a real neighbour
    # and list the user as similar to themselves.
    others = user_sim_df[user].drop(labels=[user])
    similar_users = others.sort_values(ascending=False).index[:top_n]
    print(f"\n--- Collaborative Filtering Recommendation for {user} ---")
    print("Similar Users:", list(similar_users))
1002
+
1003
# Demo: list the users most similar to "u1".
recommend_item(user="u1")
1004
+
1005
+ #rating recommendation - given user and item:
1006
+ # ===============================================================
1007
+ # 🔹 PREDICT MISSING RATING (USER-BASED CF)
1008
+ # ===============================================================
1009
+
1010
def predict_rating(user, item):
    """Predict ``user``'s rating of ``item`` from the users who rated it.

    The prediction is the similarity-weighted average of the ratings that
    other rows of the module-level ``user_item_matrix`` hold for ``item``,
    with weights taken from ``user_sim_df``. A stored value of 0 is treated
    as "not rated".

    Returns:
        The weighted average, or 0 when no rater contributes any weight.
    """
    # (similarity to `user`, rating of `item`) for every user who rated it.
    pairs = [
        (user_sim_df.loc[user, rater], user_item_matrix.loc[rater, item])
        for rater in user_item_matrix.index
        if user_item_matrix.loc[rater, item] > 0
    ]

    denominator = sum(abs(sim) for sim, _ in pairs)
    if denominator == 0:
        return 0  # no similar users found

    numerator = sum(sim * rating for sim, rating in pairs)
    return numerator / denominator


print("\n--- Predicted Rating ---")
print("u1 rating for item 3:", predict_rating("u1", 3))
1029
+
1030
+ # Optional - rating for all missing values:
1031
def predict_rating(user, item):
    """Predict ``user``'s rating of ``item``, with an item-mean fallback.

    Computes the similarity-weighted average of the non-zero ratings for
    ``item`` in the module-level ``user_item_matrix`` (weights come from
    ``user_sim_df``). When the total weight is 0, falls back to the plain
    mean of the non-zero ratings for ``item``, or 0 if there are none.
    """
    weighted_sum = 0
    weight_total = 0

    for neighbour in user_item_matrix.index:
        neighbour_rating = user_item_matrix.loc[neighbour, item]
        # A stored 0 means "not rated": such users contribute nothing.
        if not neighbour_rating > 0:
            continue
        similarity = user_sim_df.loc[user, neighbour]
        weighted_sum += similarity * neighbour_rating
        weight_total += abs(similarity)

    if weight_total != 0:
        return weighted_sum / weight_total

    # fallback: average rating of the item
    item_ratings = user_item_matrix[item]
    observed = item_ratings[item_ratings > 0]
    return observed.mean() if len(observed) != 0 else 0
1053
+
1054
def fill_missing_ratings():
    """Return a copy of ``user_item_matrix`` with every 0 cell predicted.

    Each cell equal to 0 in the module-level ``user_item_matrix`` is
    replaced, in the copy, by ``predict_rating(user, item)``. The original
    matrix is left untouched.
    """
    filled = user_item_matrix.copy()
    for user, row in user_item_matrix.iterrows():
        for item, rating in row.items():
            if rating == 0:
                filled.loc[user, item] = predict_rating(user, item)
    return filled


filled_matrix = fill_missing_ratings()
print("\n--- Filled User-Item Matrix ---")
print(filled_matrix)
1065
+
1066
+ # ---------------- VISUALIZATION: User Similarity Heatmap ----------------
1067
# Annotated heatmap of the user-by-user similarity table.
plt.figure(figsize=(6, 4))
user_heatmap_axes = sns.heatmap(user_sim_df, annot=True, cmap="Greens")
user_heatmap_axes.set_title("User Similarity Matrix (Collaborative Filtering)")
plt.show()
1071
+
1072
+ # ===============================================================
1073
+ # 3️⃣ PAGE RANK ALGORITHM
1074
+ # ===============================================================
1075
+
1076
# Undirected document graph: two documents are linked when their similarity
# exceeds 0.2, and the similarity is stored as the edge weight.
G = nx.Graph()
# Register every document up front. Without this, a document that has no
# neighbour above the threshold is never added to the graph and silently
# disappears from both the PageRank listing and the plot.
G.add_nodes_from(range(len(docs)))
for i in range(len(docs)):
    for j in range(i+1, len(docs)):
        if similarity_matrix[i][j] > 0.2:
            G.add_edge(i, j, weight=similarity_matrix[i][j])

# PageRank score per document node.
pr = nx.pagerank(G)

print("\n--- PageRank Scores for Documents ---")
for i, score in pr.items():
    print(f"Doc {i+1}: Score = {score:.4f}")
1087
+
1088
+ # ---------------- VISUALIZATION: PageRank Graph ----------------
1089
# Draw the document graph; each node's area is proportional to its PageRank.
plt.figure(figsize=(8, 6))
pos = nx.spring_layout(G, seed=7)  # fixed seed keeps the layout reproducible
sizes = [5000 * pr[node] for node in G.nodes()]
draw_options = {
    "with_labels": True,
    "node_size": sizes,
    "node_color": 'skyblue',
    "edge_color": 'gray',
}
nx.draw(G, pos, **draw_options)
plt.title("Document Graph Based on Similarity (PageRank Size = Score)")
plt.show()
1095
+
1096
+ # ===============================================================
1097
+ # 4️⃣ DIMENSION REDUCTION (PCA)
1098
+ # ===============================================================
1099
+
1100
# Project the dense term-count vectors onto their first two principal components.
dense_counts = X.toarray()
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(dense_counts)

print("\n--- PCA Dimensionality Reduction (2D) ---")
for doc_number, coords in enumerate(X_reduced, start=1):
    print(f"Doc {doc_number}: {coords}")
1106
+
1107
+ # ---------------- VISUALIZATION: PCA Scatter Plot ----------------
1108
# Scatter the documents in the 2-D PCA space and label each point.
plt.figure(figsize=(8, 6))
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], s=120, color='purple')

# Nudge each label slightly up and to the right so it does not sit on the marker.
for doc_number, (pc1, pc2) in enumerate(X_reduced, start=1):
    plt.text(pc1 + 0.02, pc2 + 0.02, f"Doc {doc_number}")

plt.title("PCA - 2D Document Vector Visualization")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)
plt.show()
1119
+
1120
+ #IR2
1121
+ import pandas as pd
1122
+ import numpy as np
1123
+ from sklearn.metrics.pairwise import cosine_similarity
1124
+ from sklearn.feature_extraction.text import CountVectorizer
1125
+ from sklearn.decomposition import PCA
1126
+ import networkx as nx
1127
+ import matplotlib.pyplot as plt
1128
+ import seaborn as sns
1129
+
1130
+ # -------------------- STOPWORDS --------------------
1131
# Words dropped during preprocessing, stored as a set for fast membership tests.
stop_words = set(
    """
    is am are was were be been being
    a an the and or not in on at to
    for with by of that this it as from
    but about into over after before between
    out up down so than too very can will
    """.split()
)
1138
+
1139
+ # Different style: uses a loop with early return instead of checking all suffixes
1140
def simple_stem(word):
    """Strip the first matching suffix from ``word``.

    Suffixes are tried in the order ing, ed, ly, es, s, ment. A suffix is
    removed only when more than two characters would remain; if none
    qualifies, ``word`` is returned unchanged.
    """
    suffixes = ("ing", "ed", "ly", "es", "s", "ment")
    stems = (
        word[:-len(suffix)]
        for suffix in suffixes
        if word.endswith(suffix) and len(word) - len(suffix) > 2
    )
    # First qualifying suffix wins; fall back to the word itself.
    return next(stems, word)
1145
+
1146
+ # Different style: uses list comprehension chain instead of step-by-step
1147
def preprocess(text):
    """Normalise ``text`` into a space-joined string of stemmed tokens.

    The text is lower-cased and split on whitespace; surrounding
    punctuation is stripped from each token; empty tokens and tokens in
    ``stop_words`` are discarded; the rest are passed through
    ``simple_stem``.
    """
    punctuation = ".,!?:;()[]{}\"'"
    kept = []
    for token in text.lower().split():
        token = token.strip(punctuation)
        if not token or token in stop_words:
            continue
        kept.append(simple_stem(token))
    return " ".join(kept)
1153
+
1154
+ # -------------------- LOAD DATASET --------------------
1155
# Load the corpus and attach a cleaned copy of every document's text.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IRdata.csv")
df["clean_text"] = df["text"].astype(str).map(preprocess)
docs = list(df["clean_text"])

print("\n--- Preprocessed Documents ---")
print(df.loc[:, ["id", "clean_text"]])
1161
+
1162
+ # ===============================================================
1163
+ # 1️⃣ CONTENT-BASED RECOMMENDATION
1164
+ # ===============================================================
1165
+
1166
+ # Same CountVectorizer — different style: fit and transform separated
1167
# Term-count vectors for the cleaned documents.
cv = CountVectorizer()
X = cv.fit(docs).transform(docs)

# Manual cosine similarity: scale each row to unit length, after which the
# dot product of two rows is dot(A, B) / (||A|| * ||B||).
X_dense = X.toarray().astype(float)
norms = np.linalg.norm(X_dense, axis=1, keepdims=True)
# Substitute a tiny norm for all-zero rows so the division below is defined.
norms = np.where(norms == 0, 1e-10, norms)
X_normed = X_dense / norms
similarity_matrix = np.dot(X_normed, X_normed.T)
1178
+
1179
def content_recommend(doc_id, top_n=3):
    """Print the ``top_n`` documents most similar to document ``doc_id``.

    Reads the module-level ``similarity_matrix``, ``docs`` and ``df``; the
    query document itself is never listed. Output goes to stdout with
    documents numbered from 1.
    """
    row = similarity_matrix[doc_id]
    # Every other document, best match first (ties keep ascending index order).
    candidates = [i for i in range(len(docs)) if i != doc_id]
    candidates.sort(key=lambda i: row[i], reverse=True)

    print(f"\n--- Content-Based Recommendations for Document {doc_id+1} ---")
    for idx in candidates[:top_n]:
        print(f"Doc {idx+1} (Score={row[idx]:.3f}): {df.iloc[idx]['text']}")
1188
+
1189
# Demo: recommendations for the first document.
content_recommend(doc_id=0)

# ---------------- VISUALIZATION: Similarity Heatmap ----------------
# Annotated heatmap of the document-by-document similarity matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(similarity_matrix, annot=True, cmap="Blues").set(
    title="Document Similarity Matrix (Content-Based)",
    xlabel="Document ID",
    ylabel="Document ID",
)
plt.show()
1198
+
1199
+ # ===============================================================
1200
+ # 2️⃣ COLLABORATIVE FILTERING (USER–ITEM)
1201
+ # ===============================================================
1202
+
1203
+ # Different style: build as list of tuples first, then DataFrame
1204
# Toy ratings: one (user, item, rating) tuple per observation.
user_item_data = [
    ("u1", 1, 5),
    ("u1", 2, 4),
    ("u2", 2, 4),
    ("u2", 3, 5),
    ("u3", 3, 3),
    ("u3", 4, 4),
    ("u4", 4, 5),
    ("u4", 5, 4),
]
ratings = pd.DataFrame.from_records(
    user_item_data, columns=["user", "item", "rating"]
)

# Users as rows, items as columns; unrated cells become 0.
user_item_matrix = ratings.pivot_table(
    index="user", columns="item", values="rating"
)
user_item_matrix = user_item_matrix.fillna(0)

# Pairwise cosine similarity between the users' rating rows, labelled by user.
ui_array = user_item_matrix.to_numpy(dtype=float)
raw_sim = cosine_similarity(ui_array)
user_labels = user_item_matrix.index
user_sim_df = pd.DataFrame(raw_sim, index=user_labels, columns=user_labels)
1221
+
1222
def recommend_item(user, top_n=3):
    """Print the ``top_n`` users most similar to ``user``.

    Looks up the column labelled ``user`` in the module-level
    ``user_sim_df``, removes the user's own entry and prints the remaining
    labels in descending order of similarity.
    """
    similarities = user_sim_df[user]
    ranked = similarities.drop(labels=[user]).sort_values(ascending=False)
    similar_users = list(ranked.index[:top_n])
    print(f"\n--- Collaborative Filtering Recommendation for {user} ---")
    print("Similar Users:", similar_users)


# Demo: list the users most similar to "u1".
recommend_item(user="u1")
1230
+
1231
+ # ===============================================================
1232
+ # 🔹 PREDICT MISSING RATING (USER-BASED CF)
1233
+ # ===============================================================
1234
+
1235
def predict_rating(user, item):
    """Predict ``user``'s rating of ``item`` from the users who rated it.

    Returns the similarity-weighted average of the non-zero ratings for
    ``item`` in the module-level ``user_item_matrix`` (weights come from
    ``user_sim_df``), or 0 when the total weight is 0.
    """
    weighted_sum = 0.0
    weight_total = 0.0
    for neighbour in user_item_matrix.index:
        neighbour_rating = user_item_matrix.loc[neighbour, item]
        # A stored 0 means "not rated": such users contribute nothing.
        if not neighbour_rating > 0:
            continue
        similarity = user_sim_df.loc[user, neighbour]
        weighted_sum += similarity * neighbour_rating
        weight_total += abs(similarity)

    if weight_total == 0:
        return 0
    return weighted_sum / weight_total


print("\n--- Predicted Rating ---")
print("u1 rating for item 3:", predict_rating("u1", 3))
1248
+
1249
+ # Redefine with fallback
1250
def predict_rating(user, item):
    """Predict ``user``'s rating of ``item``, with an item-mean fallback.

    Computes the similarity-weighted average of the non-zero ratings for
    ``item`` in the module-level ``user_item_matrix`` (weights come from
    ``user_sim_df``). When the total weight is 0, returns the plain mean
    of the non-zero ratings for ``item``, or 0 if there are none.
    """
    # (similarity to `user`, rating of `item`) for every user who rated it.
    contributions = [
        (user_sim_df.loc[user, other], user_item_matrix.loc[other, item])
        for other in user_item_matrix.index
        if user_item_matrix.loc[other, item] > 0
    ]
    num = sum((s * r for s, r in contributions), 0.0)
    den = sum((abs(s) for s, _ in contributions), 0.0)

    if den != 0:
        return num / den

    # Fallback: average of the ratings actually given to this item.
    col = user_item_matrix[item]
    rated = col[col > 0]
    if len(rated) > 0:
        return rated.mean()
    return 0
1266
+
1267
def fill_missing_ratings():
    """Return a copy of ``user_item_matrix`` with every 0 cell predicted.

    Each cell equal to 0 in the module-level ``user_item_matrix`` is
    replaced, in the copy, by ``predict_rating(user, item)``. The original
    matrix is left untouched.
    """
    # Collect the unrated (user, item) cells first, in row-major order.
    unrated = [
        (u, it)
        for u in user_item_matrix.index
        for it in user_item_matrix.columns
        if user_item_matrix.loc[u, it] == 0
    ]
    filled = user_item_matrix.copy()
    for u, it in unrated:
        filled.loc[u, it] = predict_rating(u, it)
    return filled


filled_matrix = fill_missing_ratings()
print("\n--- Filled User-Item Matrix ---")
print(filled_matrix)
1278
+
1279
+ # ---------------- VISUALIZATION: User Similarity Heatmap ----------------
1280
# Annotated heatmap of the user-by-user similarity table.
plt.figure(figsize=(6, 4))
sns.heatmap(user_sim_df, annot=True, cmap="Greens").set_title(
    "User Similarity Matrix (Collaborative Filtering)"
)
plt.show()
1284
+
1285
+ # ===============================================================
1286
+ # 3️⃣ PAGE RANK ALGORITHM
1287
+ # ===============================================================
1288
+
1289
+ # Different style: builds edge list first, then adds all at once
1290
# Weighted edges between every pair of documents whose similarity exceeds 0.2.
edge_list = [
    (i, j, similarity_matrix[i][j])
    for i in range(len(docs))
    for j in range(i + 1, len(docs))
    if similarity_matrix[i][j] > 0.2
]

G = nx.Graph()
# Register every document up front. Without this, a document that has no
# neighbour above the threshold never enters the graph and silently
# disappears from both the PageRank listing and the plot.
G.add_nodes_from(range(len(docs)))
G.add_weighted_edges_from(edge_list)

# PageRank score per document node.
pr = nx.pagerank(G)

print("\n--- PageRank Scores for Documents ---")
for node, score in pr.items():
    print(f"Doc {node+1}: Score = {score:.4f}")
1305
+
1306
+ # ---------------- VISUALIZATION: PageRank Graph ----------------
1307
# Draw the document graph; each node's area is proportional to its PageRank.
plt.figure(figsize=(8, 6))
pos = nx.spring_layout(G, seed=7)  # fixed seed keeps the layout reproducible
sizes = [pr[node] * 5000 for node in G.nodes()]
nx.draw(
    G,
    pos,
    with_labels=True,
    node_size=sizes,
    node_color='skyblue',
    edge_color='gray',
)
plt.title("Document Graph Based on Similarity (PageRank Size = Score)")
plt.show()
1313
+
1314
+ # ===============================================================
1315
+ # 4️⃣ DIMENSION REDUCTION (PCA)
1316
+ # ===============================================================
1317
+
1318
+ # Different style: PCA fit and transform on separate lines
1319
# Fit a 2-component PCA on the dense term-count vectors, then project them.
reducer = PCA(n_components=2)
X_reduced = reducer.fit(X.toarray()).transform(X.toarray())

print("\n--- PCA Dimensionality Reduction (2D) ---")
for doc_number, coords in enumerate(X_reduced, start=1):
    print(f"Doc {doc_number}: {coords}")
1326
+
1327
+ # ---------------- VISUALIZATION: PCA Scatter Plot ----------------
1328
# Scatter the documents in the 2-D PCA space and label each point.
plt.figure(figsize=(8, 6))
pc1_values = X_reduced[:, 0]
pc2_values = X_reduced[:, 1]
plt.scatter(pc1_values, pc2_values, s=120, color='purple')

# Nudge each label slightly up and to the right so it does not sit on the marker.
for position, (pc1, pc2) in enumerate(zip(pc1_values, pc2_values)):
    plt.text(pc1 + 0.02, pc2 + 0.02, f"Doc {position+1}")

plt.title("PCA - 2D Document Vector Visualization")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)
plt.show()