itertoolkit 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bm_preprocessing/__init__.py +14 -0
- bm_preprocessing/importer/DM/__init__.py +7 -0
- bm_preprocessing/importer/DM/agg.py +6 -0
- bm_preprocessing/importer/DM/dbscan.py +6 -0
- bm_preprocessing/importer/DM/finals.py +6 -0
- bm_preprocessing/importer/DM/gsp.py +6 -0
- bm_preprocessing/importer/DM/test.py +6 -0
- bm_preprocessing/importer/Finals/__init__.py +7 -0
- bm_preprocessing/importer/Finals/kaadhal.py +6 -0
- bm_preprocessing/importer/Finals/raaka.py +6 -0
- bm_preprocessing/importer/Finals/seedan.py +6 -0
- bm_preprocessing/importer/Finals/vikram.py +6 -0
- bm_preprocessing/importer/IR/__init__.py +6 -0
- bm_preprocessing/importer/IR/finals.py +6 -0
- bm_preprocessing/importer/IR/pagerank.py +6 -0
- bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
- bm_preprocessing/importer/IR/test.py +6 -0
- bm_preprocessing/importer/PY/__init__.py +4 -0
- bm_preprocessing/importer/PY/lib_doc.py +6 -0
- bm_preprocessing/importer/PY/python_doc.py +6 -0
- bm_preprocessing/importer/__init__.py +8 -0
- bm_preprocessing/importer/_module_printer.py +23 -0
- bm_preprocessing/src/DM/__init__.py +1 -0
- bm_preprocessing/src/DM/agg.py +267 -0
- bm_preprocessing/src/DM/dbscan.py +218 -0
- bm_preprocessing/src/DM/finals.py +19 -0
- bm_preprocessing/src/DM/gsp.py +378 -0
- bm_preprocessing/src/DM/test.py +19 -0
- bm_preprocessing/src/Finals/__init__.py +1 -0
- bm_preprocessing/src/Finals/kaadhal.py +1453 -0
- bm_preprocessing/src/Finals/raaka.py +1338 -0
- bm_preprocessing/src/Finals/seedan.py +1173 -0
- bm_preprocessing/src/Finals/vikram.py +520 -0
- bm_preprocessing/src/IR/__init__.py +1 -0
- bm_preprocessing/src/IR/finals.py +14 -0
- bm_preprocessing/src/IR/pagerank.py +109 -0
- bm_preprocessing/src/IR/recommenders_pca.py +487 -0
- bm_preprocessing/src/IR/test.py +14 -0
- bm_preprocessing/src/PY/__init__.py +1 -0
- bm_preprocessing/src/PY/lib_doc.py +295 -0
- bm_preprocessing/src/PY/python_doc.py +177 -0
- bm_preprocessing/src/__init__.py +1 -0
- itertoolkit-1.5.0.dist-info/METADATA +120 -0
- itertoolkit-1.5.0.dist-info/RECORD +45 -0
- itertoolkit-1.5.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,1453 @@
|
|
|
1
|
+
#----------------------------------
|
|
2
|
+
#DM--------------------------------
|
|
3
|
+
#----------------------------------
|
|
4
|
+
# ==================== START OF dm.py ====================
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import numpy as np
|
|
8
|
+
import matplotlib.pyplot as plt
|
|
9
|
+
|
|
10
|
+
# LOAD DATA
|
|
11
|
+
def load_data(path):
    """Read the CSV file at ``path`` and return its contents as a DataFrame."""
    return pd.read_csv(path)
|
|
14
|
+
|
|
15
|
+
# PREPROCESSING
|
|
16
|
+
def preprocess(df):
    """Drop the ID column, report missing values and encode ``Gender`` as 0/1.

    Args:
        df: DataFrame with at least ``CustomerID`` and ``Gender`` columns.

    Returns:
        A new DataFrame without ``CustomerID`` in which ``Gender`` is 1 for
        "Male" and 0 for "Female". The input frame is not modified.
    """
    gender_codes = {"Male": 1, "Female": 0}

    trimmed = df.drop(columns=["CustomerID"])

    # Per-column count of missing cells, printed before the encoding step.
    print("Missing Values:\n", trimmed.isnull().sum())

    trimmed["Gender"] = trimmed["Gender"].map(gender_codes)
    return trimmed
|
|
27
|
+
|
|
28
|
+
# VISUALIZATION
|
|
29
|
+
def plot_gender_ratio(df):
    """Show a pie chart of the gender split in ``df["Gender"]``.

    Args:
        df: DataFrame whose ``Gender`` column is encoded as 0 (Female) and
            1 (Male).
    """
    gender_names = {0: 'Female', 1: 'Male'}

    # value_counts() orders rows by frequency, so a fixed ['Female', 'Male']
    # label list is wrong whenever males are the majority. Sort by code and
    # derive every label from the category it actually counts.
    counts = df["Gender"].value_counts().sort_index()
    labels = [gender_names.get(code, str(code)) for code in counts.index]

    plt.figure()
    plt.pie(counts, labels=labels, autopct='%1.1f%%')
    plt.title("Gender Ratio")
    plt.show()
|
|
36
|
+
|
|
37
|
+
def plot_bar_graphs(df):
    """Show one bar chart of value frequencies for age and one for income.

    Args:
        df: DataFrame with ``Age`` and ``Annual Income (k$)`` columns.
    """
    # (column, chart title, x-axis label) for each figure, in display order.
    chart_specs = (
        ("Age", "Age Distribution", "Age"),
        ("Annual Income (k$)", "Annual Income Distribution", "Income (k$)"),
    )
    for column, title, x_label in chart_specs:
        plt.figure()
        frequencies = df[column].value_counts().sort_index()
        frequencies.plot(kind='bar')
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel("Count")
        plt.show()
|
|
51
|
+
|
|
52
|
+
# DISTANCE FUNCTION
|
|
53
|
+
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``."""
    delta = a - b
    return np.sqrt(np.sum(delta * delta))
|
|
55
|
+
|
|
56
|
+
# WARD LINKAGE (MANUAL)
|
|
57
|
+
def ward_distance(cluster1, cluster2, data):
    """Return the Ward merge cost between two clusters.

    The cost is ``(n1 * n2) / (n1 + n2) * ||c1 - c2||^2`` where ``c1``/``c2``
    are the cluster centroids and ``n1``/``n2`` the cluster sizes.

    Args:
        cluster1: Row indices of the first cluster.
        cluster2: Row indices of the second cluster.
        data: Array indexed by those row indices.
    """
    size_a, size_b = len(cluster1), len(cluster2)
    centroid_gap = np.mean(data[cluster1], axis=0) - np.mean(data[cluster2], axis=0)
    return (size_a * size_b) / (size_a + size_b) * np.sum(centroid_gap ** 2)
|
|
67
|
+
|
|
68
|
+
# AGGLOMERATIVE CLUSTERING
|
|
69
|
+
def agglomerative_clustering(data):
    """Greedy bottom-up clustering using Ward linkage.

    Every row starts as its own cluster. The pair with the smallest Ward
    distance is merged repeatedly until a single cluster remains.

    Args:
        data: Array of points, one per row.

    Returns:
        List of ``(cluster_a, cluster_b, distance)`` tuples in merge order;
        each cluster is a list of row indices.
    """
    active = [[row] for row in range(len(data))]
    merges = []

    while len(active) > 1:
        best_cost = float('inf')
        best_pair = None

        # Exhaustive scan over all unordered pairs. The strict '<' keeps the
        # first pair found when several share the minimum cost.
        for a, left in enumerate(active):
            for b in range(a + 1, len(active)):
                cost = ward_distance(left, active[b], data)
                if cost < best_cost:
                    best_cost = cost
                    best_pair = (a, b)

        a, b = best_pair
        left, right = active[a], active[b]
        merges.append((left, right, best_cost))

        # Drop both merged clusters and append their union at the end.
        active = [members for pos, members in enumerate(active) if pos not in (a, b)]
        active.append(left + right)

    return merges
|
|
95
|
+
|
|
96
|
+
# DENDROGRAM
|
|
97
|
+
def plot_dendrogram(history):
    """Draw a simple dendrogram from a merge history.

    Args:
        history: Sequence of ``(cluster1, cluster2, dist)`` tuples in merge
            order, where each cluster is a list of leaf indices. The number
            of leaves is taken to be ``len(history) + 1``.
    """
    plt.figure(figsize=(10,6))

    n_leaves = len(history) + 1
    # Per-leaf x position and the height its current cluster was formed at.
    positions = {leaf: leaf for leaf in range(n_leaves)}
    heights = {leaf: 0 for leaf in range(n_leaves)}

    for cluster1, cluster2, dist in history:
        # Horizontal position of each sub-cluster.
        x1 = np.mean([positions[leaf] for leaf in cluster1])
        x2 = np.mean([positions[leaf] for leaf in cluster2])

        # Height at which each sub-cluster was last merged.
        h1 = max(heights.get(leaf, 0) for leaf in cluster1)
        h2 = max(heights.get(leaf, 0) for leaf in cluster2)

        # Two vertical stems up to the merge height, joined by a bar.
        plt.plot([x1, x1], [h1, dist])
        plt.plot([x2, x2], [h2, dist])
        plt.plot([x1, x2], [dist, dist])

        # All members of the merged cluster now share one position/height.
        midpoint = (x1 + x2) / 2
        for leaf in cluster1 + cluster2:
            positions[leaf] = midpoint
            heights[leaf] = dist

    plt.title("Dendrogram (Manual - Tree Structure)")
    plt.xlabel("Data Points")
    plt.ylabel("Distance")
    plt.show()
|
|
136
|
+
|
|
137
|
+
if __name__ == "__main__":
    # Script entry point: explore the mall-customers CSV, then cluster it.
    path = "Mall_Customers.csv"  # update path
    df = load_data(path)

    print("Original Dataset:")
    print(df.head())

    df = preprocess(df)

    print("Dataset After Preprocessing:")
    print(df.head())

    # Class balance of the encoded gender column.
    print("Gender Count (0=Female, 1=Male):")
    print(df["Gender"].value_counts())

    # Exploratory charts.
    plot_gender_ratio(df)
    plot_bar_graphs(df)

    # Clustering works on a plain float matrix of all remaining columns.
    data = df.values.astype(float)
    history = agglomerative_clustering(data)

    print("Clustering Merge Steps (Cluster1, Cluster2, Distance):")
    for step in history[:10]:  # print first 10 steps
        print(step)

    plot_dendrogram(history)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# ==================== START OF dm1.py ====================
|
|
173
|
+
|
|
174
|
+
import numpy as np
|
|
175
|
+
import pandas as pd
|
|
176
|
+
import matplotlib.pyplot as plt
|
|
177
|
+
from sklearn.datasets import make_moons
|
|
178
|
+
|
|
179
|
+
# LOAD DATA
|
|
180
|
+
def load_data(path):
    """Read the CSV file at ``path`` and return its contents as a DataFrame."""
    return pd.read_csv(path)
|
|
183
|
+
|
|
184
|
+
# TASK 1
|
|
185
|
+
def preprocess_wholesale(df):
    """Drop the ``Channel`` and ``Region`` columns and print a preview.

    Args:
        df: DataFrame containing ``Channel`` and ``Region`` columns.

    Returns:
        A new DataFrame without those two columns.
    """
    spending = df.drop(columns=["Channel", "Region"])

    print("First few records:\n")
    print(spending.head())

    return spending
|
|
193
|
+
|
|
194
|
+
# NORMALIZATION (Z-SCORE)
|
|
195
|
+
def normalize(data):
    """Z-score normalise every column of ``data``.

    Args:
        data: Numeric array; statistics are taken along axis 0.

    Returns:
        ``(data - mean) / std`` per column. A column with zero standard
        deviation maps to all zeros instead of NaN.
    """
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # A constant column has std == 0, so dividing would give 0/0 = NaN.
    # Dividing by 1 keeps its centred values at exactly 0.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std
|
|
199
|
+
|
|
200
|
+
# VISUALIZATION
|
|
201
|
+
def plot_data(data, title, xlabel="Groceries", ylabel="Milk"):
    """Scatter-plot the first two columns of ``data``.

    Args:
        data: 2-D array; column 0 goes on the x axis and column 1 on the y axis.
        title: Figure title.
        xlabel: X-axis label. Defaults to the previously hard-coded
            "Groceries", so existing two-argument calls are unchanged.
        ylabel: Y-axis label. Defaults to the previously hard-coded "Milk".
    """
    plt.figure()
    plt.scatter(data[:, 0], data[:, 1])
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()
|
|
208
|
+
|
|
209
|
+
# DBSCAN
|
|
210
|
+
def euclidean(p, q):
    """Return the Euclidean (L2) distance between points ``p`` and ``q``."""
    delta = p - q
    return np.sqrt(np.sum(delta * delta))
|
|
212
|
+
|
|
213
|
+
def region_query(data, point_idx, eps):
    """Return the indices of all points within ``eps`` of ``data[point_idx]``.

    The query point is part of its own neighbourhood (distance 0). Indices
    are returned in ascending order.
    """
    centre = data[point_idx]
    return [
        idx
        for idx in range(len(data))
        # Euclidean distance, written out inline.
        if np.sqrt(np.sum((centre - data[idx]) ** 2)) <= eps
    ]
|
|
219
|
+
|
|
220
|
+
def expand_cluster(data, labels, point_idx, cluster_id, eps, min_pts):
    """Try to grow cluster ``cluster_id`` from ``point_idx``.

    ``labels`` is updated in place. Inside this function a label of 0 is
    treated as "not yet assigned" and -1 as "noise".

    Args:
        data: Array of points.
        labels: Mutable list of per-point labels.
        point_idx: Index of the candidate core point.
        cluster_id: Label to give every point reached from ``point_idx``.
        eps: Neighbourhood radius.
        min_pts: Minimum neighbourhood size (the point itself included) for
            a point to act as a core point.

    Returns:
        False if ``point_idx`` has fewer than ``min_pts`` neighbours (it is
        then labelled -1), True once the cluster has been fully expanded.
    """
    frontier = region_query(data, point_idx, eps)

    if len(frontier) < min_pts:
        labels[point_idx] = -1  # noise
        return False

    for member in frontier:
        labels[member] = cluster_id

    # The frontier grows while it is walked, so iterate by position.
    cursor = 0
    while cursor < len(frontier):
        reachable = region_query(data, frontier[cursor], eps)
        cursor += 1

        if len(reachable) < min_pts:
            continue  # not a core point: it does not extend the cluster

        for candidate in reachable:
            state = labels[candidate]
            if state == 0:
                # Unassigned: queue it for expansion as well.
                frontier.append(candidate)
            if state in (0, -1):
                # Unassigned or former noise both join the cluster.
                labels[candidate] = cluster_id

    return True
|
|
245
|
+
|
|
246
|
+
def dbscan(data, eps, min_pts):
    """Cluster ``data`` with the hand-written DBSCAN helpers.

    Args:
        data: Array of points.
        eps: Neighbourhood radius.
        min_pts: Minimum neighbourhood size for a core point.

    Returns:
        NumPy array with one label per point. Cluster ids are handed out
        starting from 1; -1 is the label ``expand_cluster`` gives to noise.
    """
    labels = [0] * len(data)
    clusters_found = 0

    for idx in range(len(data)):
        # Only points still labelled 0 can seed a new cluster.
        if labels[idx] == 0 and expand_cluster(
            data, labels, idx, clusters_found + 1, eps, min_pts
        ):
            clusters_found += 1

    return np.array(labels)
|
|
258
|
+
|
|
259
|
+
# PLOT CLUSTERS
|
|
260
|
+
def plot_clusters(data, labels, title):
    """Scatter-plot ``data`` with one colour per cluster label.

    Args:
        data: 2-D array; columns 0 and 1 are plotted.
        labels: NumPy array of per-point labels; -1 marks noise.
        title: Figure title.
    """
    plt.figure()

    for label in set(labels):
        # Noise is drawn in black; None lets matplotlib pick the next colour
        # in its cycle. The colour was previously computed but never passed
        # to scatter().
        color = 'k' if label == -1 else None

        points = data[labels == label]
        plt.scatter(points[:, 0], points[:, 1], c=color)

    plt.title(title)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()
|
|
277
|
+
|
|
278
|
+
# TASK 2 - MOONS
|
|
279
|
+
def moons_dbscan():
    """Run the manual DBSCAN on a two-moons sample, then with outliers added.

    Two figures are shown: the clean 2000-point moons clustering, and the
    same points with 200 uniformly drawn extra points stacked underneath.
    """
    eps, min_pts = 0.2, 5

    # Clean moons data.
    moons, _ = make_moons(n_samples=2000, noise=0.05)
    plot_clusters(moons, dbscan(moons, eps=eps, min_pts=min_pts), "DBSCAN on Moons")

    # Add uniformly scattered points and cluster again.
    outliers = np.random.uniform(low=-1.5, high=2.5, size=(200,2))
    moons_with_outliers = np.vstack((moons, outliers))

    noisy_labels = dbscan(moons_with_outliers, eps=eps, min_pts=min_pts)
    plot_clusters(moons_with_outliers, noisy_labels, "DBSCAN with Noise")
|
|
294
|
+
|
|
295
|
+
if __name__ == "__main__":
    # ---- Task 1: wholesale customers ------------------------------------
    path = "Wholesale customers data.csv"  # update path

    df = load_data(path)
    print("Original Dataset:")
    print(df.head())

    df = preprocess_wholesale(df)

    # Two spending features, Grocery first, as a plain array.
    subset = df[["Grocery", "Milk"]].values
    print("Selected Features (Groceries, Milk):")
    print(subset[:5])

    normalized = normalize(subset)
    print("Normalized Data (first 5 rows):")
    print(normalized[:5])

    plot_data(normalized, "Normalized Data")

    labels = dbscan(normalized, eps=0.5, min_pts=15)
    print("DBSCAN Cluster Labels (first 20):")
    print(labels[:20])

    # Distinct labels other than the noise label -1.
    print("Number of clusters (excluding noise): ", len(set(labels) - {-1}))
    print("Number of noise points: ", int((labels == -1).sum()))

    plot_clusters(normalized, labels, "Manual DBSCAN Clusters (Wholesale)")

    # ---- Task 2: moons ----------------------------------------------------
    print("--- MOONS DATASET ---")
    X, _ = make_moons(n_samples=2000, noise=0.05)

    print("Moons Dataset Sample:")
    print(X[:5])

    labels_moons = dbscan(X, eps=0.2, min_pts=5)
    print("Moons Cluster Labels (first 20):")
    print(labels_moons[:20])

    plot_clusters(X, labels_moons, "DBSCAN on Moons")

    # Stack 200 uniformly drawn points under the moons sample.
    noise = np.random.uniform(low=-1.5, high=2.5, size=(200,2))
    X_noisy = np.vstack((X, noise))

    print("Noisy Dataset Sample:")
    print(X_noisy[:5])

    labels_noisy = dbscan(X_noisy, eps=0.2, min_pts=5)
    print("Noisy Data Cluster Labels (first 20):")
    print(labels_noisy[:20])

    print("Number of clusters (noisy): ", len(set(labels_noisy) - {-1}))
    print("Noise points (noisy data): ", int((labels_noisy == -1).sum()))

    plot_clusters(X_noisy, labels_noisy, "Manual DBSCAN with Noise")
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
# ==================== START OF dm2.py ====================
|
|
364
|
+
|
|
365
|
+
from collections import defaultdict
|
|
366
|
+
import itertools
|
|
367
|
+
|
|
368
|
+
# READ INPUT FILES
|
|
369
|
+
def read_data(file):
    """Parse a sequence database file into nested lists of ints.

    Each non-blank line is one sequence written as brace-delimited itemsets,
    e.g. ``<{1,2}{3}>`` becomes ``[[1, 2], [3]]``.

    Blank lines are skipped. They used to be appended as empty sequences,
    which inflated the sequence total and lowered every support computed
    from it.

    Args:
        file: Path of the text file to read.

    Returns:
        List of sequences; each sequence is a list of itemsets and each
        itemset a list of ints.

    Raises:
        ValueError: If an item is not an integer literal.
    """
    sequences = []
    with open(file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # e.g. a trailing newline at end of file

            seq = []
            # Splitting on '}' leaves chunks such as '<{1,2' or '{3'; the
            # text after '{' is the comma-separated item list.
            for chunk in line.split('}'):
                if '{' in chunk:
                    items = chunk.split('{')[1]
                    if items:
                        seq.append([int(token) for token in items.split(',')])
            sequences.append(seq)
    return sequences
|
|
382
|
+
|
|
383
|
+
def read_parameters(file):
    """Read per-item MIS values and the SDC from a parameter file.

    Lines containing ``MIS`` look like ``MIS(<item>) = <value>``; a line
    containing ``SDC`` looks like ``SDC = <value>``. Other lines are ignored.

    Args:
        file: Path of the parameter file.

    Returns:
        Tuple ``(MIS, SDC)``: a dict mapping int item to float MIS, and the
        SDC as a float (0 if the file has no SDC line).
    """
    MIS, SDC = {}, 0

    with open(file, 'r') as f:
        for line in f:
            if "MIS" in line:
                # Item id sits between the parentheses, value after '='.
                inside_parens = line.split('(')[1].split(')')[0]
                item = int(inside_parens)
                MIS[item] = float(line.split('=')[1])
            elif "SDC" in line:
                SDC = float(line.split('=')[1])

    return MIS, SDC
|
|
397
|
+
|
|
398
|
+
# SUPPORT COUNT
|
|
399
|
+
def support_count(sequences):
    """Count in how many sequences each item occurs.

    An item is counted at most once per sequence, however many itemsets of
    that sequence contain it.

    Args:
        sequences: List of sequences, each a list of itemsets.

    Returns:
        Tuple ``(support, count)``: ``count`` maps item to the number of
        sequences containing it, and ``support`` maps item to that number
        divided by ``len(sequences)``.
    """
    count = defaultdict(int)
    total = len(sequences)

    for seq in sequences:
        distinct_items = {item for itemset in seq for item in itemset}
        for item in distinct_items:
            count[item] += 1

    support = {item: occurrences / total for item, occurrences in count.items()}
    return support, count
|
|
412
|
+
|
|
413
|
+
# INIT PASS (L)
|
|
414
|
+
def init_pass(MIS, support):
    """MS-GSP init-pass: build the seed list ``L`` in ascending MIS order.

    Items are scanned from lowest to highest MIS. The first item whose
    support reaches its own MIS is admitted; every later item is admitted
    when its support reaches the MIS of that *first* item. Such an item need
    not be frequent by its own MIS, but it can still extend a pattern that
    is governed by a lower MIS.

    Filtering each item by its own MIS made ``L`` identical to the frequent
    1-sequences and made the ``support >= MIS[L[i]]`` check in level-2
    candidate generation always true.

    Args:
        MIS: Dict mapping item to its minimum item support.
        support: Dict mapping item to its observed support; missing items
            count as support 0.

    Returns:
        List of admitted items, ordered by ascending MIS.
    """
    L = []
    first_mis = None  # MIS of the first admitted item, once one is found

    for item, mis in sorted(MIS.items(), key=lambda pair: pair[1]):
        item_support = support.get(item, 0)
        if first_mis is None:
            if item_support >= mis:
                first_mis = mis
                L.append(item)
        elif item_support >= first_mis:
            L.append(item)

    return L
|
|
423
|
+
|
|
424
|
+
# LEVEL 1 FREQUENT
|
|
425
|
+
def level1(L, support, MIS):
    """Return the frequent 1-sequences among the items of ``L``.

    An item qualifies when its support is at least its own MIS. Each
    qualifying item is wrapped in a single-element list.
    """
    return [[item] for item in L if support[item] >= MIS[item]]
|
|
431
|
+
|
|
432
|
+
# CANDIDATE GENERATION (LEVEL 2)
|
|
433
|
+
def candidate_gen_L2(L, support, MIS, SDC):
    """Generate level-2 candidate sequences from the seed list ``L``.

    For every ordered pair ``(first, second)`` with ``first`` earlier in
    ``L``, two candidates are produced when ``second``'s support reaches
    ``first``'s MIS and their supports differ by at most ``SDC``:
    ``[[first], [second]]`` (separate itemsets) and ``[[first, second]]``
    (one itemset).

    Returns:
        List of candidate sequences in generation order.
    """
    C2 = []

    for pos, first in enumerate(L):
        for second in L[pos + 1:]:
            meets_mis = support[second] >= MIS[first]
            if meets_mis and abs(support[first] - support[second]) <= SDC:
                C2.extend(([[first], [second]], [[first, second]]))

    return C2
|
|
443
|
+
|
|
444
|
+
# SUBSEQUENCE CHECK
|
|
445
|
+
def is_subsequence(candidate, sequence):
    """Return True if ``candidate`` is contained in ``sequence``.

    Containment means each itemset of ``candidate`` is a subset of a
    distinct itemset of ``sequence``, in the same order. Matching is greedy:
    each candidate itemset takes the earliest remaining sequence itemset
    that contains it.

    An empty candidate is contained in every sequence. It used to raise
    IndexError for a non-empty sequence.

    Args:
        candidate: List of itemsets (the pattern).
        sequence: List of itemsets (the data sequence).
    """
    # One shared iterator: any() consumes it up to and including the match,
    # so the next candidate itemset resumes after that position.
    remaining = iter(sequence)
    for wanted in candidate:
        wanted_items = set(wanted)
        if not any(wanted_items.issubset(itemset) for itemset in remaining):
            return False
    return True
|
|
453
|
+
|
|
454
|
+
# COUNT SUPPORT FOR SEQUENCE
|
|
455
|
+
def count_support_seq(candidates, sequences):
    """Count, for each candidate, how many sequences contain it.

    Returns:
        List of ints aligned with ``candidates``.
    """
    return [
        sum(1 for seq in sequences if is_subsequence(cand, seq))
        for cand in candidates
    ]
|
|
464
|
+
|
|
465
|
+
# FILTER FREQUENT
|
|
466
|
+
def filter_candidates(candidates, counts, MIS, sequences):
    """Keep the candidates whose support reaches the MIS of their first item.

    Args:
        candidates: Candidate sequences.
        counts: Support counts aligned with ``candidates``.
        MIS: Dict mapping item to its minimum item support.
        sequences: The sequence database; only its length is used.

    Returns:
        List of ``(candidate, count)`` tuples for the surviving candidates.
    """
    total = len(sequences)
    return [
        (cand, counts[idx])
        for idx, cand in enumerate(candidates)
        # The threshold is the MIS of the first item of the first itemset.
        if counts[idx] / total >= MIS[cand[0][0]]
    ]
|
|
476
|
+
|
|
477
|
+
# PATTERN FORMATTING + MAIN MS-GSP DRIVER
|
|
478
|
+
def format_pattern(pattern):
    """Render a sequence of itemsets as text, e.g. ``<{1,2}{3}>``."""
    body = "".join(
        "{" + ",".join(map(str, itemset)) + "}" for itemset in pattern
    )
    return "<" + body + ">"
|
|
484
|
+
|
|
485
|
+
def MSGSP(data_file, para_file):
    """Run the level-1 and level-2 passes of MS-GSP and report the results.

    Reads the sequence database and the MIS/SDC parameters, prints every
    intermediate result, and shows a bar chart of up to ten frequent
    2-sequences when any exist.

    Args:
        data_file: Path of the sequence database file.
        para_file: Path of the MIS/SDC parameter file.
    """
    sequences = read_data(data_file)
    MIS, SDC = read_parameters(para_file)

    print("\nInput Sequences:\n")
    for sample in sequences[:5]:
        print(sample)

    print("\nMIS Values:")
    print(MIS)
    print("SDC:", SDC)

    # Only the relative supports are needed below; raw counts are unused.
    support, _item_counts = support_count(sequences)

    print("\nItem Supports:")
    for item, ratio in support.items():
        print(f"Item {item}: {ratio:.3f}")

    L = init_pass(MIS, support)
    print("\nL (after init pass):", L)

    F1 = level1(L, support, MIS)
    print("\nF1 (Frequent 1-sequences):", F1)

    C2 = candidate_gen_L2(L, support, MIS, SDC)
    print("\nC2 (Candidate sequences):")
    for cand in C2[:10]:
        print(cand)

    counts = count_support_seq(C2, sequences)
    F2 = filter_candidates(C2, counts, MIS, sequences)

    print("\nFrequent Patterns:")
    for pattern, count in F2:
        print(f"Pattern :{format_pattern(pattern)} count: {count}")

    # Visualization: nothing to draw without frequent patterns.
    if not F2:
        return

    shown = F2[:10]
    labels = [format_pattern(pattern) for pattern, _ in shown]
    values = [count for _, count in shown]
    ticks = range(len(values))

    import matplotlib.pyplot as plt
    plt.figure()
    plt.bar(ticks, values)
    plt.xticks(ticks, labels, rotation=45)
    plt.title("Top Frequent Sequential Patterns")
    plt.xlabel("Patterns")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()
|
|
536
|
+
|
|
537
|
+
if __name__ == "__main__":
    # Run the miner on the default input files in the working directory.
    MSGSP(data_file="data.txt", para_file="para.txt")
|
|
539
|
+
|
|
540
|
+
#------------------------------------------------
|
|
541
|
+
#IR----------------------------------------------
|
|
542
|
+
#------------------------------------------------
|
|
543
|
+
# ==========================================
|
|
544
|
+
# SECTION 1: CONTENT_BASED.PY
|
|
545
|
+
# ==========================================
|
|
546
|
+
|
|
547
|
+
import pandas as pd
|
|
548
|
+
import numpy as np
|
|
549
|
+
import matplotlib.pyplot as plt
|
|
550
|
+
import re
|
|
551
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
552
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
553
|
+
from sklearn.preprocessing import MinMaxScaler
|
|
554
|
+
|
|
555
|
+
# LOAD DATA
|
|
556
|
+
def load_data():
    """Build the toy movie catalogue and one user's ratings.

    Returns:
        Tuple ``(movies, user_ratings)``: ``movies`` is a DataFrame with
        ``movie_id``, ``title`` and ``description`` columns; ``user_ratings``
        is a Series named "User1", indexed by title, where 0 is the value
        used for movies without a rating.
    """
    # (title, keyword description) in catalogue order; ids are 1-based.
    catalogue = [
        ('Inception', 'dream subconscious thriller mind'),
        ('Interstellar', 'space time black hole science'),
        ('Dark Knight', 'batman joker crime action'),
        ('Memento', 'memory loss psychological thriller'),
        ('Tenet', 'time inversion action thriller'),
        ('Avatar', 'alien planet sci fi adventure'),
        ('Titanic', 'romance ship tragedy love'),
        ('The Matrix', 'virtual reality ai action'),
    ]

    movies = pd.DataFrame({
        'movie_id': list(range(1, len(catalogue) + 1)),
        'title': [title for title, _ in catalogue],
        'description': [description for _, description in catalogue],
    })

    user_ratings = pd.Series(
        [5, 4, 0, 0, 3, 0, 0, 5],
        index=movies['title'],
        name="User1",
    )

    return movies, user_ratings
|
|
582
|
+
|
|
583
|
+
# TEXT PREPROCESSING
|
|
584
|
+
def clean_text(text):
    """Lower-case ``text`` and drop everything except a-z and whitespace."""
    return re.sub(r'[^a-z\s]', '', text.lower())
|
|
588
|
+
|
|
589
|
+
def preprocess_text(movies):
    """Add a ``clean_description`` column holding the cleaned descriptions.

    The column is added to ``movies`` in place and the same frame is returned.
    """
    cleaned = movies['description'].apply(clean_text)
    movies['clean_description'] = cleaned
    return movies
|
|
592
|
+
|
|
593
|
+
# NORMALIZE RATINGS
|
|
594
|
+
def normalize_ratings(user_ratings):
    """Min-max scale one user's ratings, treating missing ratings as 0.

    Args:
        user_ratings: Series of ratings.

    Returns:
        Series of scaled ratings with the same index.
    """
    filled = user_ratings.fillna(0)

    # The scaler expects a 2-D column; flatten the result back to 1-D.
    column = filled.values.reshape(-1,1)
    scaled = MinMaxScaler().fit_transform(column).flatten()

    return pd.Series(scaled, index=filled.index)
|
|
601
|
+
|
|
602
|
+
# FEATURE EXTRACTION
|
|
603
|
+
def build_item_profiles(movies):
    """Vectorise the cleaned descriptions with TF-IDF.

    Args:
        movies: DataFrame with ``clean_description`` and ``title`` columns.

    Returns:
        Tuple ``(item_profiles, tfidf)``: a DataFrame of TF-IDF weights
        indexed by movie title (columns are positional integers), and the
        fitted vectoriser.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    dense_weights = tfidf.fit_transform(movies['clean_description']).toarray()

    item_profiles = pd.DataFrame(dense_weights, index=movies['title'])
    return item_profiles, tfidf
|
|
613
|
+
|
|
614
|
+
# USER PROFILE
|
|
615
|
+
def build_user_profile(user_ratings_norm, item_profiles):
    """Return the rating-weighted sum of the item profile vectors.

    Args:
        user_ratings_norm: Series of per-item weights, in the same item order
            as the rows of ``item_profiles``.
        item_profiles: DataFrame with one row per item.

    Returns:
        Series indexed by the columns of ``item_profiles``.
    """
    weights = user_ratings_norm.values
    weighted_sum = np.dot(weights, item_profiles.values)
    return pd.Series(weighted_sum, index=item_profiles.columns)
|
|
618
|
+
|
|
619
|
+
# RECOMMENDATION
|
|
620
|
+
def recommend(user_profile, item_profiles, user_ratings, top_n=5, threshold=0.0):
    """Rank unrated items by cosine similarity to the user profile.

    Args:
        user_profile: Series holding the user's feature vector.
        item_profiles: DataFrame of item feature vectors, indexed by title.
        user_ratings: Series of ratings; items rated above 0 are excluded.
        top_n: Maximum number of rows to return.
        threshold: Only scores strictly greater than this are kept.

    Returns:
        DataFrame with ``movie`` and ``score`` columns, highest score first.
    """
    query = user_profile.values.reshape(1, -1)
    similarity = cosine_similarity(query, item_profiles.values)[0]

    ranked = pd.DataFrame({
        'movie': item_profiles.index,
        'score': similarity,
    })

    # Drop items already rated, then those at or below the threshold.
    already_rated = user_ratings[user_ratings > 0].index
    keep = ~ranked['movie'].isin(already_rated) & (ranked['score'] > threshold)

    ranked = ranked[keep].sort_values(by='score', ascending=False)
    return ranked.head(top_n)
|
|
642
|
+
|
|
643
|
+
# VISUALIZATION
|
|
644
|
+
def plot_item_similarity(item_profiles, movies):
    """Show the item-item cosine similarity matrix as an image.

    Args:
        item_profiles: Item feature vectors, one row per movie.
        movies: DataFrame whose ``title`` column labels both axes.
    """
    titles = movies['title']
    tick_positions = range(len(movies))

    similarity_matrix = cosine_similarity(item_profiles)

    plt.figure()
    plt.imshow(similarity_matrix)
    plt.colorbar()
    plt.title("Item-Item Similarity Matrix")
    plt.xticks(tick_positions, titles, rotation=90)
    plt.yticks(tick_positions, titles)
    plt.tight_layout()
    plt.show()
|
|
655
|
+
|
|
656
|
+
def plot_user_ratings(user_ratings):
    """Show a bar chart of a ratings Series, one bar per index entry."""
    item_names = user_ratings.index
    rating_values = user_ratings.values

    plt.figure()
    plt.bar(item_names, rating_values)
    plt.xticks(rotation=90)
    plt.title("User Ratings")
    plt.ylabel("Rating")
    plt.show()
|
|
663
|
+
|
|
664
|
+
def plot_recommendations(rec_df):
    """Show a bar chart of recommendation scores, or print a notice if empty.

    Args:
        rec_df: DataFrame with ``movie`` and ``score`` columns.
    """
    if not rec_df.empty:
        plt.figure()
        plt.bar(rec_df['movie'], rec_df['score'])
        plt.xticks(rotation=90)
        plt.title("Top Recommended Movies")
        plt.ylabel("Similarity Score")
        plt.show()
    else:
        print("No recommendations to display")
|
|
675
|
+
|
|
676
|
+
def plot_similarity_distribution(item_profiles):
    """Show a 20-bin histogram of every entry of the item similarity matrix."""
    pairwise_scores = cosine_similarity(item_profiles).flatten()

    plt.figure()
    plt.hist(pairwise_scores, bins=20)
    plt.title("Similarity Score Distribution")
    plt.xlabel("Similarity")
    plt.ylabel("Frequency")
    plt.show()
|
|
685
|
+
|
|
686
|
+
# EVALUATION
|
|
687
|
+
def precision_at_k(actual, predicted, k):
    """Return the fraction of the first ``k`` predictions that are relevant.

    The denominator is the number of predictions actually available after
    truncation to ``k``. Returns 0 when there are none.
    """
    top_k = predicted[:k]
    if len(top_k) == 0:
        return 0
    hits = set(top_k).intersection(actual)
    return len(hits) / len(top_k)
|
|
692
|
+
|
|
693
|
+
def recall_at_k(actual, predicted, k):
    """Return the fraction of relevant items found in the first ``k`` predictions.

    Returns 0 when ``actual`` is empty.
    """
    if len(actual) == 0:
        return 0
    hits = set(predicted[:k]).intersection(actual)
    return len(hits) / len(actual)
|
|
698
|
+
|
|
699
|
+
def f1_score(precision, recall):
    """Return the harmonic mean of precision and recall (0 if both are 0)."""
    denominator = precision + recall
    if denominator == 0:
        return 0
    return 2 * (precision * recall) / denominator
|
|
703
|
+
|
|
704
|
+
def evaluate_system(recommendations, ground_truth, k):
    """Score a recommendation table against the relevant titles.

    Args:
        recommendations: DataFrame with a ``movie`` column in ranked order.
        ground_truth: Collection of relevant titles.
        k: Cut-off rank.

    Returns:
        Tuple ``(precision, recall, f1)`` at ``k``.
    """
    ranked_titles = recommendations['movie'].tolist()

    precision = precision_at_k(ground_truth, ranked_titles, k)
    recall = recall_at_k(ground_truth, ranked_titles, k)

    return precision, recall, f1_score(precision, recall)
|
|
712
|
+
|
|
713
|
+
def main_content_based():
    """Run the content-based recommender demo: build, plot, evaluate, print."""
    # Data and text cleaning.
    movies, user_ratings = load_data()
    movies = preprocess_text(movies)

    # Item vectors, and the user vector from the normalised ratings.
    item_profiles, _ = build_item_profiles(movies)
    user_profile = build_user_profile(normalize_ratings(user_ratings), item_profiles)

    rec_df = recommend(user_profile, item_profiles, user_ratings, top_n=5)

    # Charts.
    plot_item_similarity(item_profiles, movies)
    plot_user_ratings(user_ratings)
    plot_recommendations(rec_df)
    plot_similarity_distribution(item_profiles)

    # Evaluation against a fixed list of relevant titles.
    ground_truth = ['Memento', 'Avatar']
    precision, recall, f1 = evaluate_system(rec_df, ground_truth, k=3)

    print("Precision@K:", precision)
    print("Recall@K:", recall)
    print("F1 Score:", f1)

    print("\nRecommended Movies:")
    print(rec_df)
|
|
749
|
+
|
|
750
|
+
|
|
751
|
+
# ==========================================
|
|
752
|
+
# SECTION 2: DR.PY
|
|
753
|
+
# ==========================================
|
|
754
|
+
|
|
755
|
+
# DATA
|
|
756
|
+
def get_data():
    """Return the 4x4 demo rating matrix as floats; NaN marks a missing rating."""
    missing = np.nan
    rows = [
        [5, 3, missing, 1],
        [4, missing, missing, 1],
        [1, 1, missing, 5],
        [missing, missing, 5, 4],
    ]
    return np.array(rows, dtype=float)
|
|
763
|
+
|
|
764
|
+
# MISSING VALUE HANDLING
|
|
765
|
+
def fill_item_mean(R):
    """Return a copy of ``R`` with each NaN replaced by its column mean.

    The mean of a column ignores that column's NaN entries. ``R`` itself is
    not modified.
    """
    R_filled = R.copy()
    for col in range(R.shape[1]):
        column_mean = np.nanmean(R[:, col])
        gaps = np.isnan(R_filled[:, col])
        R_filled[gaps, col] = column_mean
    return R_filled
|
|
773
|
+
|
|
774
|
+
def fill_user_mean(R):
    """Return a copy of ``R`` with each NaN replaced by its row mean.

    The mean of a row ignores that row's NaN entries. ``R`` itself is not
    modified.
    """
    R_filled = R.copy()
    for row in range(R.shape[0]):
        row_mean = np.nanmean(R[row])
        gaps = np.isnan(R_filled[row])
        R_filled[row, gaps] = row_mean
    return R_filled
|
|
782
|
+
|
|
783
|
+
# PCA
|
|
784
|
+
def pca_process(R_filled, k=2):
    """Run PCA step by step on a filled rating matrix, printing each stage.

    Args:
        R_filled: 2-D float array without NaNs (rows = users, columns = items).
        k: Number of principal components to keep.

    Returns:
        Tuple ``(Z, R_recon)``: the ``k``-dimensional projection of the
        centred rows, and the rank-``k`` reconstruction in the original
        space.
    """
    # STEP 1: Mean
    mean = np.mean(R_filled, axis=0)
    print("\nSTEP 1: MEAN\n", mean)

    # STEP 2: Centering
    X_centered = R_filled - mean
    print("\nSTEP 2: CENTERED DATA\n", X_centered)

    # STEP 3: Covariance (sample covariance, hence n - 1)
    n = R_filled.shape[0]
    cov = np.dot(X_centered.T, X_centered) / (n - 1)
    print("\nSTEP 3: COVARIANCE MATRIX\n", cov)

    # STEP 4 & 5: Eigenvalues & Eigenvectors
    # The covariance matrix is symmetric, so use eigh: it always returns real
    # eigenvalues and orthonormal eigenvectors, whereas the general eig can
    # return complex output for such matrices through round-off.
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    print("\nSTEP 4: EIGENVALUES\n", eigenvalues)
    print("\nSTEP 5: EIGENVECTORS\n", eigenvectors)

    # STEP 6: Normalize (kept as an explicit, printed step)
    for i in range(eigenvectors.shape[1]):
        eigenvectors[:, i] /= np.linalg.norm(eigenvectors[:, i])
    print("\nSTEP 6: UNIT EIGENVECTORS\n", eigenvectors)

    # STEP 7: Top-k, ordered by descending eigenvalue
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    W = eigenvectors[:, :k]
    print("\nSTEP 7: TOP-K EIGENVECTORS\n", W)

    # STEP 8: Projection
    Z = np.dot(X_centered, W)
    print("\nSTEP 8: PCA COMPONENTS\n", Z)

    # Reconstruction: map back to the original space and undo the centring
    R_recon = np.dot(Z, W.T) + mean
    print("\nPCA RECONSTRUCTION\n", R_recon)

    return Z, R_recon
|
|
825
|
+
|
|
826
|
+
# SVD
|
|
827
|
+
def svd_process(R_filled, k=2):
    """Build a rank-``k`` SVD reconstruction from the eigen-decomposition of R^T R.

    Args:
        R_filled: 2-D float array without NaNs.
        k: Number of singular triplets to keep.

    Returns:
        The rank-``k`` approximation ``U_k S_k V_k^T`` of ``R_filled``.
    """
    # R^T R is symmetric, so eigh gives real eigenvalues and orthonormal
    # right singular vectors.
    RtR = np.dot(R_filled.T, R_filled)
    eigenvalues, V = np.linalg.eigh(RtR)

    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    V = V[:, idx]

    # Round-off can push a zero eigenvalue slightly negative; clamp before sqrt.
    singular_values = np.sqrt(np.clip(eigenvalues, 0.0, None))

    # u_i = R v_i / sigma_i. A null singular value keeps a zero column so the
    # columns of U stay aligned with S and V. Dropping those columns made
    # U_k narrower than S_k and crashed on rank-deficient input.
    U = np.zeros((R_filled.shape[0], len(singular_values)))
    for i, sigma in enumerate(singular_values):
        if sigma > 1e-10:
            U[:, i] = np.dot(R_filled, V[:, i]) / sigma

    U_k = U[:, :k]
    S_k = np.diag(singular_values[:k])
    V_k = V[:, :k]

    R_recon = np.dot(np.dot(U_k, S_k), V_k.T)

    print("\nSingular Values\n", singular_values)
    print("\nSVD RECONSTRUCTION\n", R_recon)

    return R_recon
|
|
856
|
+
|
|
857
|
+
# RECOMMENDATION + ERROR
|
|
858
|
+
def recommend_dr(R_original, R_pred, user):
    """Return the index of the unrated item with the highest predicted score.

    Args:
        R_original: Rating matrix in which NaN marks an unrated item.
        R_pred: Predicted ratings of the same shape.
        user: Row index of the user.
    """
    unrated = np.flatnonzero(np.isnan(R_original[user]))
    best = np.argmax(R_pred[user][unrated])
    return unrated[best]
|
|
862
|
+
|
|
863
|
+
def compute_error(R_original, R_pred):
    """Return the sum of squared errors over the observed (non-NaN) ratings."""
    observed = ~np.isnan(R_original)
    residual = R_original[observed] - R_pred[observed]
    return np.sum(residual ** 2)
|
|
866
|
+
|
|
867
|
+
# VISUALIZATION
|
|
868
|
+
def plot_matrix(matrix, title):
    """Show ``matrix`` as a heat map with a colour bar under ``title``."""
    plt.figure()
    heatmap = plt.imshow(matrix)
    plt.title(title)
    # The image just drawn is the current image, i.e. the one the colour
    # bar describes.
    plt.colorbar(heatmap)
    plt.show()
|
|
874
|
+
|
|
875
|
+
def plot_latent(Z):
    """Scatter the first two columns of ``Z``, labelling row ``i`` as ``U<i>``.

    Each row is drawn with its own ``scatter`` call.
    """
    plt.figure()
    for row_idx in range(Z.shape[0]):
        first, second = Z[row_idx, 0], Z[row_idx, 1]
        plt.scatter(first, second)
        plt.text(first, second, f"U{row_idx}")
    plt.title("Latent Space (PCA)")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.show()
|
|
884
|
+
|
|
885
|
+
def plot_error(errors, labels):
    """Show a bar chart with one bar per label, its height being the error."""
    plt.figure()
    plt.bar(x=labels, height=errors)
    plt.title("Error Comparison")
    plt.show()
|
|
890
|
+
|
|
891
|
+
def main_dr():
    """Run the dimensionality-reduction demo.

    Loads the data, fills missing values with item means, reconstructs the
    matrix with PCA and SVD, prints a recommendation for user 0 and the
    reconstruction errors, then shows the plots.
    """
    R = get_data()
    print("ORIGINAL MATRIX: ", R)

    R_filled = fill_item_mean(R)
    print("\nITEM MEAN FILLED MATRIX: ", R_filled)

    # Both reconstructions start from the same filled matrix.
    Z_pca, R_pca = pca_process(R_filled)
    R_svd = svd_process(R_filled)

    # Recommendation for a fixed user
    user = 0
    print("\nRECOMMENDATIONS")
    for label, reconstruction in (("PCA:", R_pca), ("SVD:", R_svd)):
        print(label, recommend_dr(R, reconstruction, user))

    # Errors are measured against the original (unfilled) matrix.
    print("\nERRORS")
    error_pca = compute_error(R, R_pca)
    error_svd = compute_error(R, R_svd)
    print("PCA Error:", error_pca)
    print("SVD Error:", error_svd)

    # Visualization
    heatmaps = (
        (R_filled, "Filled Matrix"),
        (R_pca, "PCA Reconstruction"),
        (R_svd, "SVD Reconstruction"),
    )
    for matrix, title in heatmaps:
        plot_matrix(matrix, title)

    plot_latent(Z_pca)
    plot_error([error_pca, error_svd], ["PCA", "SVD"])
|
|
929
|
+
|
|
930
|
+
|
|
931
|
+
# ==========================================
|
|
932
|
+
# SECTION 3: RECOMMENDERS.PY
|
|
933
|
+
# ==========================================
|
|
934
|
+
|
|
935
|
+
# =========================
|
|
936
|
+
# DATA PREPARATION
|
|
937
|
+
def load_data_cf():
    """Build the 4x4 toy user/item rating table as a float DataFrame.

    The single unknown rating is written as ``'?'`` in the raw data and is
    converted to NaN.
    """
    users = ['User1', 'User2', 'User3', 'User4']
    items = ['Item1', 'Item2', 'Item3', 'Item4']
    raw_rows = [
        [5, 3, 4, '?'],
        [3, 1, 2, 3],
        [4, 3, 4, 5],
        [3, 3, 1, 5],
    ]

    frame = pd.DataFrame(raw_rows, index=users, columns=items)

    # Keep every cell that is not the '?' placeholder; the rest become NaN.
    return frame.where(frame != '?', np.nan).astype(float)
|
|
953
|
+
|
|
954
|
+
def mean_center(df):
    """Subtract each row's mean from that row.

    Returns:
        ``(row_means, centered)``: the per-row mean Series and the DataFrame
        with that mean removed from every row.
    """
    row_means = df.mean(axis="columns")
    return row_means, df.sub(row_means, axis="index")
|
|
958
|
+
|
|
959
|
+
# SIMILARITY FUNCTION
|
|
960
|
+
def cosine_similarity_cf(a, b):
    """Cosine similarity of ``a`` and ``b`` over positions rated in both.

    Args:
        a, b: 1-D float arrays of equal length; NaN marks a missing rating.

    Returns:
        The cosine of the angle between the co-rated parts of the vectors,
        or ``0`` when there is no co-rated position or when either co-rated
        part has zero length (the cosine is undefined there).
    """
    mask = ~np.isnan(a) & ~np.isnan(b)

    if np.sum(mask) == 0:
        return 0

    a, b = a[mask], b[mask]
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # Without this guard 0/0 yields NaN, which then propagates through
        # every weighted average that uses this similarity.
        return 0
    return np.dot(a, b) / denom
|
|
968
|
+
|
|
969
|
+
# USER-BASED CF
|
|
970
|
+
def predict_user_based(df, user, item):
    """Predict ``user``'s rating of ``item`` from all other users.

    Every other user who rated ``item`` contributes their rating, weighted
    by cosine similarity to ``user``; the sum is divided by the sum of the
    absolute similarities.  Returns NaN when that divisor is zero.
    """
    own_ratings = df.loc[user].values
    neighbour_sims = {}

    for candidate in df.index:
        # Skip the target user and anyone who has not rated the item.
        if candidate == user or np.isnan(df.loc[candidate, item]):
            continue
        neighbour_sims[candidate] = cosine_similarity_cf(
            own_ratings, df.loc[candidate].values
        )

    weighted_total = 0
    weight_norm = 0
    for neighbour, weight in neighbour_sims.items():
        weighted_total += weight * df.loc[neighbour, item]
        weight_norm += abs(weight)

    if weight_norm == 0:
        return np.nan
    return weighted_total / weight_norm
|
|
991
|
+
|
|
992
|
+
def predict_user_based_mean_centered(df, user, item):
    """Predict a rating as the user's mean plus a weighted neighbour offset.

    Each other user who rated ``item`` contributes the deviation of that
    rating from their own mean, weighted by cosine similarity of the raw
    rating vectors.  When the weights sum to zero, the user's mean is
    returned unchanged.
    """
    row_means = df.mean(axis=1)
    own_ratings = df.loc[user].values
    neighbour_sims = {}

    for candidate in df.index:
        # Skip the target user and anyone who has not rated the item.
        if candidate == user or np.isnan(df.loc[candidate, item]):
            continue
        neighbour_sims[candidate] = cosine_similarity_cf(
            own_ratings, df.loc[candidate].values
        )

    offset_total = 0
    weight_norm = 0
    for neighbour, weight in neighbour_sims.items():
        deviation = df.loc[neighbour, item] - row_means[neighbour]
        offset_total += weight * deviation
        weight_norm += abs(weight)

    baseline = row_means[user]
    if weight_norm == 0:
        return baseline
    return baseline + (offset_total / weight_norm)
|
|
1021
|
+
|
|
1022
|
+
def predict_user_based_topk(df, user, item, k=2):
    """User-based prediction restricted to the ``k`` most similar raters.

    Works like the plain user-based predictor, but only the ``k`` users
    with the highest cosine similarity (among those who rated ``item``)
    enter the weighted average.  Returns NaN when the weights sum to zero.
    """
    own_ratings = df.loc[user].values
    scored = []

    for candidate in df.index:
        # Skip the target user and anyone who has not rated the item.
        if candidate == user or np.isnan(df.loc[candidate, item]):
            continue
        weight = cosine_similarity_cf(own_ratings, df.loc[candidate].values)
        scored.append((candidate, weight))

    # Highest similarity first, then keep the first k entries.
    nearest = sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]

    weighted_total = 0
    weight_norm = 0
    for neighbour, weight in nearest:
        weighted_total += weight * df.loc[neighbour, item]
        weight_norm += abs(weight)

    if weight_norm == 0:
        return np.nan
    return weighted_total / weight_norm
|
|
1049
|
+
|
|
1050
|
+
# ITEM-BASED CF
|
|
1051
|
+
def predict_item_based(df, user, item):
    """Predict ``user``'s rating of ``item`` from the user's other ratings.

    Every other item the user has rated contributes that rating, weighted
    by the cosine similarity between its column and ``item``'s column.
    Returns NaN when the absolute weights sum to zero.
    """
    item_column = df[item].values
    item_sims = {}

    for candidate in df.columns:
        # Skip the target item and items this user has not rated.
        if candidate == item or np.isnan(df.loc[user, candidate]):
            continue
        item_sims[candidate] = cosine_similarity_cf(
            item_column, df[candidate].values
        )

    weighted_total = 0
    weight_norm = 0
    for rated_item, weight in item_sims.items():
        weighted_total += weight * df.loc[user, rated_item]
        weight_norm += abs(weight)

    if weight_norm == 0:
        return np.nan
    return weighted_total / weight_norm
|
|
1073
|
+
|
|
1074
|
+
def predict_item_based_topk(df, user, item, k=2):
    """Item-based prediction restricted to the ``k`` most similar items.

    Only the ``k`` items with the highest cosine similarity to ``item``
    (among those the user has rated) enter the weighted average.  Returns
    NaN when the absolute weights sum to zero.
    """
    item_column = df[item].values
    scored = []

    for candidate in df.columns:
        # Skip the target item and items this user has not rated.
        if candidate == item or np.isnan(df.loc[user, candidate]):
            continue
        weight = cosine_similarity_cf(item_column, df[candidate].values)
        scored.append((candidate, weight))

    # Highest similarity first, then keep the first k entries.
    nearest = sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]

    weighted_total = 0
    weight_norm = 0
    for rated_item, weight in nearest:
        weighted_total += weight * df.loc[user, rated_item]
        weight_norm += abs(weight)

    if weight_norm == 0:
        return np.nan
    return weighted_total / weight_norm
|
|
1099
|
+
|
|
1100
|
+
# EVALUATION
|
|
1101
|
+
def evaluate(df):
    """Leave-one-out evaluation of the user-based predictor.

    Each known rating is hidden in a copy of ``df`` and predicted again;
    predictions that come back as NaN are left out of the metrics.

    Returns:
        ``(rmse, mae)`` over the held-out ratings that could be predicted.
    """
    truth, guesses = [], []

    for row_label in df.index:
        for col_label in df.columns:
            held_out = df.loc[row_label, col_label]
            if np.isnan(held_out):
                continue

            # Hide this single rating and try to recover it.
            masked = df.copy()
            masked.loc[row_label, col_label] = np.nan
            estimate = predict_user_based(masked, row_label, col_label)

            if np.isnan(estimate):
                continue
            truth.append(held_out)
            guesses.append(estimate)

    residuals = np.array(truth) - np.array(guesses)

    rmse = np.sqrt(np.mean(residuals ** 2))
    mae = np.mean(np.abs(residuals))

    return rmse, mae
|
|
1125
|
+
|
|
1126
|
+
# VISUALIZATION
|
|
1127
|
+
def plot_matrix_cf(matrix, title):
    """Show ``matrix`` as an auto-aspect heat map with a colour bar."""
    plt.figure()
    heatmap = plt.imshow(matrix, aspect='auto')
    plt.title(title)
    # The image just drawn is the current image, i.e. the one the colour
    # bar describes.
    plt.colorbar(heatmap)
    plt.show()
|
|
1133
|
+
|
|
1134
|
+
# MAIN EXECUTION
|
|
1135
|
+
def main_recommenders():
    """Run the collaborative-filtering demo end to end.

    Loads the toy ratings, prints the mean-centred matrix, predicts the
    missing User1/Item4 rating with the user-based and item-based
    predictors, prints leave-one-out RMSE/MAE, and shows four heat maps.
    """
    df = load_data_cf()

    # Mean Centering (the per-user means themselves are not needed here)
    _, df_centered = mean_center(df)
    print("\nMean Centered Matrix:\n", df_centered)

    # Prediction
    user = "User1"
    item = "Item4"

    user_pred = predict_user_based(df, user, item)
    item_pred = predict_item_based(df, user, item)

    print("\nUser-Based Prediction:\n", user_pred)
    print("\nItem-Based Prediction:\n", item_pred)

    # Evaluation
    rmse, mae = evaluate(df)
    print("\nEvaluation Metrics:")
    print("RMSE =", rmse)
    print("MAE =", mae)

    # Build the matrices behind the "Predicted Matrix" plots: observed
    # ratings stay untouched, every missing cell receives its prediction.
    user_pred_matrix = df.copy()
    item_pred_matrix = df.copy()
    for u in df.index:
        for i in df.columns:
            if np.isnan(df.loc[u, i]):
                user_pred_matrix.loc[u, i] = predict_user_based(df, u, i)
                item_pred_matrix.loc[u, i] = predict_item_based(df, u, i)

    # A predictor returns NaN when it has no usable neighbour; fall back to
    # the mean of the item means so the heat maps have no holes.
    fallback = df.mean().mean()
    user_pred_matrix = user_pred_matrix.fillna(fallback)
    item_pred_matrix = item_pred_matrix.fillna(fallback)

    # Plots
    plot_matrix_cf(df.fillna(0), "Original Matrix")
    plot_matrix_cf(df_centered.fillna(0), "Mean Centered Matrix")
    plot_matrix_cf(user_pred_matrix, "User-Based Predicted Matrix")
    plot_matrix_cf(item_pred_matrix, "Item-Based Predicted Matrix")
|
|
1178
|
+
|
|
1179
|
+
|
|
1180
|
+
# ==========================================
|
|
1181
|
+
# SECTION 4: PAGERANK.PY
|
|
1182
|
+
# ==========================================
|
|
1183
|
+
|
|
1184
|
+
import math
|
|
1185
|
+
|
|
1186
|
+
# GRAPH CREATION
|
|
1187
|
+
def create_graph(matrix, nodes):
    """Convert an adjacency matrix into an adjacency-list dict.

    Row ``i`` belongs to ``nodes[i]``; a cell equal to ``1`` in column ``j``
    adds ``nodes[j]`` to that node's list of targets.
    """
    graph = {}
    for row_idx, row in enumerate(matrix):
        graph[nodes[row_idx]] = [
            nodes[col_idx] for col_idx, cell in enumerate(row) if cell == 1
        ]
    return graph
|
|
1195
|
+
|
|
1196
|
+
def edges_to_graph(edges):
    """Build an adjacency-list dict from ``(source, target)`` pairs.

    Every node that appears in an edge gets a key, including nodes that
    only ever occur as a target (they map to an empty list).
    """
    adjacency = {}
    for source, target in edges:
        adjacency.setdefault(source, [])
        adjacency.setdefault(target, [])
        adjacency[source].append(target)
    return adjacency
|
|
1205
|
+
|
|
1206
|
+
# GRAPH ANALYSIS
|
|
1207
|
+
def count_outgoing_links(graph):
    """Map every page to the number of links listed for it in ``graph``."""
    return {page: len(targets) for page, targets in graph.items()}
|
|
1209
|
+
|
|
1210
|
+
|
|
1211
|
+
def store_incoming_links(graph):
    """Invert the adjacency list.

    Returns:
        ``(incoming_links, incoming_count)``: for each page, the list of
        pages linking to it and the length of that list.  Every link target
        must itself be a key of ``graph``.
    """
    incoming_links = {page: [] for page in graph}

    for source, targets in graph.items():
        for target in targets:
            incoming_links[target].append(source)

    # One count per recorded incoming link, so the count is the list length.
    incoming_count = {page: len(sources) for page, sources in incoming_links.items()}

    return incoming_links, incoming_count
|
|
1221
|
+
|
|
1222
|
+
# PAGERANK ALGORITHM
|
|
1223
|
+
def calculate_pagerank(graph, incoming_links, outgoing_count, d=0.85, iterations=10):
    """Iterate the PageRank update a fixed number of times.

    Starts from a uniform rank of ``1 / N`` and applies
    ``PR(p) = (1 - d) / N + d * sum(PR(q) / out(q))`` over the pages ``q``
    that link to ``p``.  The rank dict is printed after every iteration.

    Args:
        graph: Adjacency-list dict; its keys are the pages to rank.
        incoming_links: For each page, the list of pages linking to it.
        outgoing_count: For each page, its number of outgoing links.
        d: Damping factor (defaults to the previously hard-coded 0.85).
        iterations: Number of update rounds (defaults to the previously
            hard-coded 10).

    Returns:
        Dict mapping each page to its rank after the last iteration.
    """
    N = len(graph)

    pr = {page: 1 / N for page in graph}

    print("\nPageRank Algorithm:")

    for i in range(iterations):
        new_pr = {}
        for page in graph:
            rank_sum = 0
            for incoming in incoming_links[page]:
                # A page listed as a source has at least one outgoing link,
                # so this divisor is never zero for consistent inputs.
                rank_sum += pr[incoming] / outgoing_count[incoming]

            new_pr[page] = (1 - d) / N + d * rank_sum

        pr = new_pr
        print(f"\nIteration {i+1}: {pr}")

    return pr
|
|
1245
|
+
|
|
1246
|
+
# HITS ALGORITHM
|
|
1247
|
+
def initialize_scores(graph):
    """Return two separate dicts giving every node in ``graph`` a score of 1.0.

    The first dict holds the authority scores, the second the hub scores.
    """
    authority = dict.fromkeys(graph, 1.0)
    hub = dict.fromkeys(graph, 1.0)
    return authority, hub
|
|
1251
|
+
|
|
1252
|
+
|
|
1253
|
+
def normalize_sum(scores):
    """Scale ``scores`` so the values sum to 1, rounded to 4 decimals.

    When the values sum to zero (for example every score is 0 because the
    graph has no edges) there is nothing to scale by, so every key maps to
    ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total = sum(scores.values())
    if total == 0:
        return {k: 0.0 for k in scores}
    return {k: round(v / total, 4) for k, v in scores.items()}
|
|
1256
|
+
|
|
1257
|
+
|
|
1258
|
+
def hits_algorithm(graph, iterations=5):
    """Run HITS for a fixed number of rounds, printing scores each round.

    In every round a node's authority becomes the sum of the hub scores of
    the nodes linking to it, and its hub score becomes the sum of the new
    authority scores of the nodes it links to.  Both are then sum-normalised.

    Returns:
        ``(authority, hub)`` dicts after the final round.
    """
    incoming_links, _ = store_incoming_links(graph)
    authority, hub = initialize_scores(graph)

    print("\nHITS Algorithm:")

    for round_no in range(1, iterations + 1):
        # Authority update uses the hub scores from the previous round.
        raw_authority = {
            node: sum(hub.get(src, 0) for src in incoming_links.get(node, []))
            for node in graph
        }

        # Hub update uses the freshly computed, not yet normalised authority.
        raw_hub = {
            node: sum(raw_authority.get(dst, 0) for dst in graph[node])
            for node in graph
        }

        authority = normalize_sum(raw_authority)
        hub = normalize_sum(raw_hub)

        print(f"\nIteration {round_no}")
        print("Authority:", authority)
        print("Hub:", hub)

    return authority, hub
|
|
1288
|
+
|
|
1289
|
+
# EQUATION METHOD
|
|
1290
|
+
def build_matrix(graph):
    """Build the link matrix used by the equation method.

    Entry ``A[i][j]`` is ``1 / outdegree(nodes[j])`` when ``nodes[j]`` links
    to ``nodes[i]`` and 0 otherwise.

    Returns:
        ``(A, nodes)``, where ``nodes`` gives the row/column order.
    """
    nodes = list(graph)
    position = {node: idx for idx, node in enumerate(nodes)}

    incoming, _ = store_incoming_links(graph)

    size = len(nodes)
    A = np.zeros((size, size))

    for row, node in enumerate(nodes):
        for source in incoming[node]:
            # The out-degree is the length of the source's adjacency list.
            A[row, position[source]] = 1 / len(graph[source])

    return A, nodes
|
|
1306
|
+
|
|
1307
|
+
def solve_ranks(A, iterations=20):
    """Power-iterate ``r <- A @ r`` from a uniform start, renormalising.

    Args:
        A: Square link matrix (as produced by ``build_matrix``).
        iterations: Number of iterations to run.

    Returns:
        ``(r, history)``: the final rank vector and a list holding a copy
        of the vector after each iteration.
    """
    n = len(A)
    r = np.ones(n) / n

    history = []

    for _ in range(iterations):
        r = A @ r
        total = np.sum(r)
        if total != 0:
            # Rescale so the ranks sum to 1.  When all mass has vanished
            # (e.g. a matrix of zeros) dividing would yield NaN, so the
            # zero vector is kept as is.
            r = r / total
        history.append(r.copy())

    return r, history
|
|
1319
|
+
|
|
1320
|
+
def print_equations(graph):
    """Print one rank equation per node.

    Each equation lists a term ``(r_q/outdeg(q))`` for every node ``q``
    linking to the node, or ``0`` when nothing links to it.
    """
    incoming, _ = store_incoming_links(graph)

    print("\nEquations:\n")
    for node in graph:
        terms = [f"(r_{inc}/{len(graph[inc])})" for inc in incoming[node]]
        right_side = " + ".join(terms) if terms else "0"
        print(f"r_{node} = " + right_side)
|
|
1330
|
+
|
|
1331
|
+
# VISUALIZATION
|
|
1332
|
+
def visualize_graph(graph):
    """Draw the graph with nodes evenly spaced on the unit circle.

    Nodes are drawn and labelled first; afterwards one arrow is drawn per
    edge from its source to its target.
    """
    nodes = list(graph)
    total = len(nodes)

    layout = {}
    for slot, node in enumerate(nodes):
        theta = 2 * math.pi * slot / total
        layout[node] = (math.cos(theta), math.sin(theta))

    plt.figure()

    for label, point in layout.items():
        px, py = point
        plt.scatter(px, py)
        plt.text(px, py, label, ha='center', va='center')

    for src, targets in graph.items():
        for dst in targets:
            start_x, start_y = layout[src]
            end_x, end_y = layout[dst]
            # arrow() takes a start point plus a displacement.
            plt.arrow(start_x, start_y, end_x - start_x, end_y - start_y, head_width=0.05)

    plt.title("Web Graph")
    plt.axis('off')
    plt.show()
|
|
1356
|
+
|
|
1357
|
+
def visualize_pagerank(pr):
    """Show a bar chart of the PageRank value of each node in ``pr``."""
    labels = list(pr.keys())
    heights = list(pr.values())

    plt.figure()
    plt.bar(labels, heights)
    plt.title("PageRank Values")
    plt.show()
|
|
1362
|
+
|
|
1363
|
+
def plot_convergence(history, nodes):
    """Plot each node's rank across the recorded iterations.

    Args:
        history: Sequence of rank vectors, one per iteration.
        nodes: Node labels; ``nodes[i]`` labels component ``i`` of each
            vector.
    """
    # Start a fresh figure, as every other plotting helper in this file
    # does, so the lines never land on a chart that is still current.
    plt.figure()

    for i, node in enumerate(nodes):
        plt.plot([h[i] for h in history], label=node)

    plt.title("Rank Convergence")
    plt.legend()
    plt.show()
|
|
1370
|
+
|
|
1371
|
+
def plot_scores(authority, hub):
    """Show two bar charts: authority scores, then hub scores.

    Both charts use the key order of ``authority`` for the bar labels.
    """
    nodes = list(authority.keys())

    charts = (
        (authority, "Authority Scores"),
        (hub, "Hub Scores"),
    )
    for scores, heading in charts:
        plt.figure()
        plt.bar(nodes, scores.values())
        plt.title(heading)
        plt.show()
|
|
1383
|
+
|
|
1384
|
+
# INFERENCE
|
|
1385
|
+
def infer_pagerank(pr):
    """Print the pages by descending rank, then the top and bottom page."""
    print("\nFinal PageRank:")
    ranking = sorted(pr.items(), key=lambda entry: entry[1], reverse=True)
    for page, score in ranking:
        print(f"{page}: {score:.4f}")

    best_page = ranking[0][0]
    worst_page = ranking[-1][0]
    print(f"\nMost important: {best_page}")
    print(f"Least important: {worst_page}")
|
|
1393
|
+
|
|
1394
|
+
|
|
1395
|
+
def infer_hits(authority, hub):
    """Print the node with the highest authority score and the best hub."""
    def by_score(entry):
        return entry[1]

    authority_ranking = sorted(authority.items(), key=by_score, reverse=True)
    hub_ranking = sorted(hub.items(), key=by_score, reverse=True)

    top_authority = authority_ranking[0][0]
    top_hub = hub_ranking[0][0]

    print("\nBest Authority:", top_authority)
    print("Best Hub:", top_hub)
|
|
1401
|
+
|
|
1402
|
+
|
|
1403
|
+
def infer_ranks(ranks, nodes):
    """Print the equation-method ranks, highest first.

    ``ranks[i]`` is paired with ``nodes[i]``.
    """
    paired = dict(zip(nodes, ranks))
    ordering = sorted(paired.items(), key=lambda entry: entry[1], reverse=True)

    print("\nEquation Method Ranking:")
    for label, score in ordering:
        print(f"{label}: {score:.4f}")
|
|
1410
|
+
|
|
1411
|
+
def main_pagerank():
    """Run the link-analysis demo on a small four-node graph.

    Builds the graph from an edge list, prints link statistics, runs
    PageRank, HITS and the equation method, prints the rankings and shows
    the plots.
    """
    edges = [('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'A'), ('D', 'C')]

    graph = edges_to_graph(edges)
    print("Graph:", graph)

    # Link statistics
    outgoing_count = count_outgoing_links(graph)
    incoming_links, incoming_count = store_incoming_links(graph)

    print("\nCount of Outgoing Links:", outgoing_count)
    print("Count of Incoming Links:", incoming_count)
    print("Incoming Links:", incoming_links)

    # The three ranking methods, in the order their output is printed.
    pagerank_scores = calculate_pagerank(graph, incoming_links, outgoing_count)
    authority, hub = hits_algorithm(graph)

    print_equations(graph)
    link_matrix, node_order = build_matrix(graph)
    ranks, history = solve_ranks(link_matrix)

    # Textual summaries
    infer_pagerank(pagerank_scores)
    infer_hits(authority, hub)
    infer_ranks(ranks, node_order)

    # Plots
    visualize_graph(graph)
    visualize_pagerank(pagerank_scores)
    plot_scores(authority, hub)
    plot_convergence(history, node_order)
|
|
1453
|
+
|