itertoolkit 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bm_preprocessing/__init__.py +14 -0
- bm_preprocessing/importer/DM/__init__.py +7 -0
- bm_preprocessing/importer/DM/agg.py +6 -0
- bm_preprocessing/importer/DM/dbscan.py +6 -0
- bm_preprocessing/importer/DM/finals.py +6 -0
- bm_preprocessing/importer/DM/gsp.py +6 -0
- bm_preprocessing/importer/DM/test.py +6 -0
- bm_preprocessing/importer/Finals/__init__.py +7 -0
- bm_preprocessing/importer/Finals/kaadhal.py +6 -0
- bm_preprocessing/importer/Finals/raaka.py +6 -0
- bm_preprocessing/importer/Finals/seedan.py +6 -0
- bm_preprocessing/importer/Finals/vikram.py +6 -0
- bm_preprocessing/importer/IR/__init__.py +6 -0
- bm_preprocessing/importer/IR/finals.py +6 -0
- bm_preprocessing/importer/IR/pagerank.py +6 -0
- bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
- bm_preprocessing/importer/IR/test.py +6 -0
- bm_preprocessing/importer/PY/__init__.py +4 -0
- bm_preprocessing/importer/PY/lib_doc.py +6 -0
- bm_preprocessing/importer/PY/python_doc.py +6 -0
- bm_preprocessing/importer/__init__.py +8 -0
- bm_preprocessing/importer/_module_printer.py +23 -0
- bm_preprocessing/src/DM/__init__.py +1 -0
- bm_preprocessing/src/DM/agg.py +267 -0
- bm_preprocessing/src/DM/dbscan.py +218 -0
- bm_preprocessing/src/DM/finals.py +19 -0
- bm_preprocessing/src/DM/gsp.py +378 -0
- bm_preprocessing/src/DM/test.py +19 -0
- bm_preprocessing/src/Finals/__init__.py +1 -0
- bm_preprocessing/src/Finals/kaadhal.py +1453 -0
- bm_preprocessing/src/Finals/raaka.py +1338 -0
- bm_preprocessing/src/Finals/seedan.py +1173 -0
- bm_preprocessing/src/Finals/vikram.py +520 -0
- bm_preprocessing/src/IR/__init__.py +1 -0
- bm_preprocessing/src/IR/finals.py +14 -0
- bm_preprocessing/src/IR/pagerank.py +109 -0
- bm_preprocessing/src/IR/recommenders_pca.py +487 -0
- bm_preprocessing/src/IR/test.py +14 -0
- bm_preprocessing/src/PY/__init__.py +1 -0
- bm_preprocessing/src/PY/lib_doc.py +295 -0
- bm_preprocessing/src/PY/python_doc.py +177 -0
- bm_preprocessing/src/__init__.py +1 -0
- itertoolkit-1.5.0.dist-info/METADATA +120 -0
- itertoolkit-1.5.0.dist-info/RECORD +45 -0
- itertoolkit-1.5.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,1173 @@
|
|
|
1
|
+
#--------------------------------
|
|
2
|
+
#Data Mining
|
|
3
|
+
#--------------------------------
|
|
4
|
+
|
|
5
|
+
#Agglomerative Hierarchical Clustering
|
|
6
|
+
def ac():
    """Print a sample script for agglomerative hierarchical clustering.

    The listing builds a five-point 2-D NumPy array, runs SciPy ``linkage``
    with ``method='single'``, plots the dendrogram with matplotlib and cuts
    the tree into two flat clusters with ``fcluster``.  Nothing in the
    listing is executed here; the text is only written to standard output.
    """
    listing = r"""
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt

# Sample dataset
X = np.array([
    [1, 2],
    [2, 3],
    [5, 6],
    [6, 7],
    [8, 9]
])

# Step 1: Perform clustering
Z = linkage(X, method='single') # 'single', 'complete', 'average', 'ward'

# Step 2: Plot dendrogram
plt.figure()
dendrogram(Z)
plt.title("Dendrogram")
plt.xlabel("Data Points")
plt.ylabel("Distance")
plt.show()

# Step 3: Form clusters (k = 2)
clusters = fcluster(Z, 2, criterion='maxclust')

print("Cluster labels:", clusters)"""
    print(listing)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
## DBSCAN clustering
|
|
39
|
+
def db():
    """Print a sample script that clusters a toy dataset with DBSCAN.

    The listing fits scikit-learn's ``DBSCAN`` (``eps=2``, ``min_samples=2``)
    on six 2-D points, one of which is marked as noise in the data comment,
    prints the labels and draws a scatter plot coloured by label.  The text
    is only written to standard output; it is not executed.
    """
    listing = r"""
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# Sample dataset
X = np.array([
    [1, 2],
    [2, 2],
    [2, 3],
    [8, 7],
    [8, 8],
    [25, 80] # noise
])

# Apply DBSCAN
db = DBSCAN(eps=2, min_samples=2)
labels = db.fit_predict(X)

print("Cluster labels:", labels)

# Plot
plt.scatter(X[:,0], X[:,1], c=labels)
plt.title("DBSCAN Clustering")
plt.show()"""
    print(listing)
|
|
65
|
+
|
|
66
|
+
##Generalized Sequential Pattern (GSP) Algorithm
|
|
67
|
+
def gsp():
    """Print a pure-Python sample script for the GSP algorithm.

    The listing mines frequent sequential patterns from a four-sequence
    toy database with ``min_sup = 2``: it counts frequent 1-sequences,
    joins frequent k-sequences into (k+1)-candidates, counts their support
    by subsequence matching and repeats until no candidate survives.

    The listing de-duplicates with ``dict.fromkeys`` rather than ``set`` so
    that the order of the reported patterns is the same on every run
    (iteration order of a set of strings depends on hash randomisation).
    The text is only written to standard output; it is not executed here.
    """
    listing = r"""
from collections import defaultdict

# Sample sequence database
D = [
    ['A', 'B', 'C'],
    ['A', 'C'],
    ['A', 'B', 'C'],
    ['B', 'C']
]

min_sup = 2

# Step 1: Find frequent 1-sequences
def get_frequent_1(D, min_sup):
    count = defaultdict(int)

    for seq in D:
        # dict.fromkeys drops repeats but keeps first-seen order,
        # so the result order is reproducible between runs
        for item in dict.fromkeys(seq):
            count[item] += 1

    return { (item,): count[item] for item in count if count[item] >= min_sup }


# Step 2: Generate candidates
def generate_candidates(prev_freq):
    candidates = []
    keys = list(prev_freq.keys())

    for i in range(len(keys)):
        for j in range(len(keys)):
            if keys[i][1:] == keys[j][:-1]:
                new_seq = keys[i] + (keys[j][-1],)
                candidates.append(new_seq)

    # Order-preserving de-duplication (a set would shuffle the order)
    return list(dict.fromkeys(candidates))


# Step 3: Count support
def count_support(D, candidates):
    count = defaultdict(int)

    for seq in D:
        for cand in candidates:
            if is_subsequence(cand, seq):
                count[cand] += 1

    return {c: count[c] for c in count}


# Check subsequence
def is_subsequence(subseq, seq):
    i = 0
    for item in seq:
        if i < len(subseq) and subseq[i] == item:
            i += 1
    return i == len(subseq)


# GSP Main
def GSP(D, min_sup):
    freq_patterns = []

    # Step 1
    L1 = get_frequent_1(D, min_sup)
    current = L1
    freq_patterns.extend(L1.keys())

    while current:
        # Step 2
        candidates = generate_candidates(current)

        # Step 3
        counted = count_support(D, candidates)

        # Prune
        current = {c: counted[c] for c in counted if counted[c] >= min_sup}

        freq_patterns.extend(current.keys())

    return freq_patterns


# Run
patterns = GSP(D, min_sup)

print("Frequent Sequential Patterns:")
for p in patterns:
    print(p)
"""
    print(listing)
|
|
158
|
+
|
|
159
|
+
## Problem sheet 13
|
|
160
|
+
def ps13():
    """Print problem sheet 13 (mall customers) and a sample solution.

    The output starts with the task statement as plain text, followed by a
    pandas / matplotlib / SciPy / scikit-learn script that drops the
    ``CustomerID`` column, reports missing values, encodes ``Gender``,
    draws a pie chart and two bar charts, runs Ward agglomerative
    clustering with three clusters and plots the dendrogram.  The text is
    only written to standard output; it is not executed here.
    """
    listing = r"""
Consider Customer mall dataset with following details.
1. CustomerID: An identifier for each customer.
2. Gender: Indicates the gender of the customer (Male or Female).
3. Age: Represents the age of the customer in years.
4. Annual Income (k$): Denotes the annual income of the customer in thousands of dollars.
5. Spending Score (1–100): A score ranging from 1 to 100 that quantifies the customer’s
spending habits and preferences. A higher score indicates a higher tendency to spend.

Do the following tasks
• Remove CustomerID column .
• Check for missing values.
• Convert categorical variable value of gender to numerical (Male-1, Female-0).
• Display male, female ratio as pie chart.
• Display age, annual income as bar graph
• Perform agglomerative clustering with ward algorithm as linkage.
• Display dendrogram.


import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering

# Load dataset (update path if needed)
df = pd.read_csv("Mall_Customers.csv")

# -------------------------------
# 1. Remove CustomerID column
# -------------------------------
df = df.drop("CustomerID", axis=1)

# -------------------------------
# 2. Check for missing values
# -------------------------------
print("Missing values:\n", df.isnull().sum())

# -------------------------------
# 3. Convert Gender to numerical
# Male = 1, Female = 0
# -------------------------------
df["Gender"] = df["Gender"].map({"Male": 1, "Female": 0})

# -------------------------------
# 4. Pie chart (Male vs Female)
# -------------------------------
gender_counts = df["Gender"].value_counts()

labels = ["Male", "Female"]
values = [gender_counts.get(1, 0), gender_counts.get(0, 0)]

plt.figure()
plt.pie(values, labels=labels, autopct='%1.1f%%')
plt.title("Male vs Female Ratio")
plt.show()

# -------------------------------
# 5. Bar graph (Age & Income)
# -------------------------------
plt.figure()

plt.bar(range(len(df)), df["Age"])
plt.title("Age Distribution")
plt.xlabel("Customers")
plt.ylabel("Age")
plt.show()

plt.figure()

plt.bar(range(len(df)), df["Annual Income (k$)"])
plt.title("Annual Income Distribution")
plt.xlabel("Customers")
plt.ylabel("Income (k$)")
plt.show()

# -------------------------------
# 6. Agglomerative Clustering (Ward)
# -------------------------------
# Use relevant features
X = df[["Age", "Annual Income (k$)", "Spending Score (1-100)"]]

model = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels = model.fit_predict(X)

df["Cluster"] = labels
print("Cluster labels:\n", df["Cluster"].head())

# -------------------------------
# 7. Dendrogram
# -------------------------------
plt.figure()
Z = linkage(X, method='ward')
dendrogram(Z)
plt.title("Dendrogram (Ward Linkage)")
plt.xlabel("Customers")
plt.ylabel("Distance")
plt.show()
"""
    print(listing)
|
|
259
|
+
|
|
260
|
+
##Problem Sheet 14
|
|
261
|
+
def ps14():
    """Print problem sheet 14 (DBSCAN) and a sample solution.

    The output starts with the two task statements as plain text, followed
    by a script with two parts: a hand-written DBSCAN (``eps=0.5``,
    ``min_pts=15``) applied to the standardised ``Groceries`` / ``Milk``
    columns of a wholesale-customers CSV, and scikit-learn's ``DBSCAN``
    applied to ``make_moons`` data before and after adding uniform noise.
    The text is only written to standard output; it is not executed here.
    """
    # NOTE(review): the public UCI "Wholesale customers" file names the
    # column "Grocery", not "Groceries" — confirm against the CSV in use.
    listing = r"""
1. Consider dataset consisting of annual customer data for a wholesale distributor. The dataset
contains 440 customers and has 8 attributes. Perform the following tasks.
• Drop the columns channel and region and display first few records.
• Consider Groceries and Milk attributes. Normalize these attribute values by scaling it from 0
mean to unit variance. Visualize normalized dataset.
• Write your own implementation of DBSCAN (no library) with Minpts = 15 and EPS=0.5.
• Plot the cluster Results.

2. Use the make_moons function from the sklearn.datasets module to generate a synthetic
dataset that has a moon-shaped pattern. Give 2000 as value to sample parameter.
Run built-in DBSCAN algorithm and visualize the result. Add some noise data and again
visualize the results.


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN

# =====================================
# PART 1: WHOLESALE DATASET
# =====================================

# Load dataset
df = pd.read_csv("Wholesale customers data.csv")

# 1. Drop columns
df = df.drop(["Channel", "Region"], axis=1)
print("First few records:\n", df.head())

# 2. Normalize Groceries & Milk
X = df[["Groceries", "Milk"]].values

mean = X.mean(axis=0)
std = X.std(axis=0)
X_norm = (X - mean) / std

# Plot normalized data
plt.figure()
plt.scatter(X_norm[:, 0], X_norm[:, 1])
plt.title("Normalized Data (Groceries vs Milk)")
plt.xlabel("Groceries")
plt.ylabel("Milk")
plt.show()


# =====================================
# CUSTOM DBSCAN IMPLEMENTATION
# =====================================

def euclidean(p1, p2):
    return np.sqrt(np.sum((p1 - p2) ** 2))

def region_query(X, point_idx, eps):
    neighbors = []
    for i in range(len(X)):
        if euclidean(X[point_idx], X[i]) <= eps:
            neighbors.append(i)
    return neighbors

def expand_cluster(X, labels, point_idx, neighbors, cluster_id, eps, min_pts):
    labels[point_idx] = cluster_id

    i = 0
    while i < len(neighbors):
        n_point = neighbors[i]

        if labels[n_point] == -1:
            labels[n_point] = cluster_id

        if labels[n_point] == 0:
            labels[n_point] = cluster_id
            n_neighbors = region_query(X, n_point, eps)

            if len(n_neighbors) >= min_pts:
                neighbors += n_neighbors

        i += 1

def dbscan(X, eps, min_pts):
    labels = [0] * len(X) # 0 = unvisited
    cluster_id = 0

    for i in range(len(X)):
        if labels[i] != 0:
            continue

        neighbors = region_query(X, i, eps)

        if len(neighbors) < min_pts:
            labels[i] = -1 # noise
        else:
            cluster_id += 1
            expand_cluster(X, labels, i, neighbors, cluster_id, eps, min_pts)

    return np.array(labels)

# 3. Run custom DBSCAN
labels_custom = dbscan(X_norm, eps=0.5, min_pts=15)

# 4. Plot results
plt.figure()
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=labels_custom)
plt.title("Custom DBSCAN Clustering")
plt.xlabel("Groceries")
plt.ylabel("Milk")
plt.show()


# =====================================
# PART 2: MOON DATASET
# =====================================

# Generate moon data
X_moon, _ = make_moons(n_samples=2000, noise=0.05)

plt.figure()
plt.scatter(X_moon[:, 0], X_moon[:, 1])
plt.title("Moon Dataset")
plt.show()

# Built-in DBSCAN
model = DBSCAN(eps=0.2, min_samples=5)
labels_moon = model.fit_predict(X_moon)

plt.figure()
plt.scatter(X_moon[:, 0], X_moon[:, 1], c=labels_moon)
plt.title("DBSCAN on Moon Data")
plt.show()

# Add noise
noise = np.random.uniform(low=-2, high=3, size=(200, 2))
X_noisy = np.vstack((X_moon, noise))

plt.figure()
plt.scatter(X_noisy[:, 0], X_noisy[:, 1])
plt.title("Moon Data with Noise")
plt.show()

# DBSCAN on noisy data
labels_noisy = model.fit_predict(X_noisy)

plt.figure()
plt.scatter(X_noisy[:, 0], X_noisy[:, 1], c=labels_noisy)
plt.title("DBSCAN with Noise")
plt.show()
"""
    print(listing)
|
|
411
|
+
|
|
412
|
+
##Problem sheet 15
|
|
413
|
+
def ps15():
    """Print problem sheet 15 (MS-GSP) and a sample solution.

    The output starts with the task statement as plain text, followed by a
    script that parses ``data.txt`` (brace-delimited itemsets per line) and
    ``para.txt`` (``MIS`` and ``SDC`` lines), computes per-item support,
    joins frequent k-sequences into candidates, filters them by the MIS of
    the candidate's first item and by the support-difference constraint,
    and prints each surviving pattern with its count.  The text is only
    written to standard output; it is not executed here.
    """
    # NOTE(review): the listing's read_params only matches values written
    # with a decimal point (r'\d+\.\d+'); an integer such as "SDC = 0"
    # would raise IndexError — confirm the expected para.txt format.
    # NOTE(review): the MIS check uses the first item of the candidate, not
    # the minimum MIS over its items — verify against the intended spec.
    listing = r"""
Implement Mining Sequential Patterns Based on GSP (Generalized Sequential Patterns) MS-GSP
algorithm - Sequential pattern mining using multiple minimum supports with a support difference
constraint.
Input format:
data.txt:Each line represents a Transaction Sequence and each set in a sequence represents a set of
items.
para.txt:Gives the minimum item support for each item as well as the support difference constraint
Output format:
Pattern :<{30,20}{70,80}{20,30,70}> count: 10

import re
from collections import defaultdict

# ==========================================
# 1. READ DATA
# ==========================================

def read_data(file):
    sequences = []
    with open(file, 'r') as f:
        for line in f:
            seq = []
            sets = re.findall(r'\{([^}]*)\}', line)
            for s in sets:
                items = list(map(int, s.split(',')))
                seq.append(items)
            sequences.append(seq)
    return sequences


def read_params(file):
    MIS = {}
    SDC = 0

    with open(file, 'r') as f:
        for line in f:
            if "MIS" in line:
                item = int(re.findall(r'\d+', line)[0])
                value = float(re.findall(r'\d+\.\d+', line)[0])
                MIS[item] = value
            elif "SDC" in line:
                SDC = float(re.findall(r'\d+\.\d+', line)[0])

    return MIS, SDC


# ==========================================
# 2. SUPPORT COUNT
# ==========================================

def get_support(sequences):
    count = defaultdict(int)
    total = len(sequences)

    for seq in sequences:
        unique_items = set()
        for itemset in seq:
            for item in itemset:
                unique_items.add(item)
        for item in unique_items:
            count[item] += 1

    support = {item: count[item]/total for item in count}
    return support, count


# ==========================================
# 3. CHECK SUBSEQUENCE
# ==========================================

def is_subsequence(candidate, sequence):
    i = 0
    for itemset in sequence:
        if all(item in itemset for item in candidate[i]):
            i += 1
            if i == len(candidate):
                return True
    return False


# ==========================================
# 4. COUNT SUPPORT FOR SEQUENCE
# ==========================================

def count_sequence_support(sequences, candidates):
    count = defaultdict(int)

    for seq in sequences:
        for cand in candidates:
            if is_subsequence(cand, seq):
                count[str(cand)] += 1

    return count


# ==========================================
# 5. JOIN STEP (CANDIDATE GENERATION)
# ==========================================

def join(L):
    candidates = []

    for i in range(len(L)):
        for j in range(len(L)):
            s1 = L[i]
            s2 = L[j]

            if s1[1:] == s2[:-1]:
                new_seq = s1 + [s2[-1]]
                if new_seq not in candidates:
                    candidates.append(new_seq)

    return candidates


# ==========================================
# 6. MS-GSP MAIN
# ==========================================

def msgsp(sequences, MIS, SDC):
    support, raw_count = get_support(sequences)
    N = len(sequences)

    # Sort items by MIS
    items = sorted(MIS.keys(), key=lambda x: MIS[x])

    # Init pass
    L = []
    for item in items:
        if support.get(item, 0) >= MIS[item]:
            L.append(item)

    # Frequent 1-sequences
    F = []
    F1 = []
    for item in L:
        if support[item] >= MIS[item]:
            F1.append([[item]])

    F.append(F1)

    k = 2
    while True:
        prev = F[k-2]
        if not prev:
            break

        # Join
        candidates = join(prev)

        # Count support
        counts = count_sequence_support(sequences, candidates)

        freq_k = []

        for cand in candidates:
            key = str(cand)
            sup = counts[key] / N if key in counts else 0

            # MIS check (first item)
            first_item = cand[0][0]

            # SDC check
            valid = True
            flat = [item for subset in cand for item in subset]
            for i in range(len(flat)):
                for j in range(len(flat)):
                    if abs(support.get(flat[i],0) - support.get(flat[j],0)) > SDC:
                        valid = False

            if sup >= MIS[first_item] and valid:
                freq_k.append(cand)

        if not freq_k:
            break

        F.append(freq_k)
        k += 1

    return F, sequences


# ==========================================
# 7. PRINT OUTPUT
# ==========================================

def print_patterns(F, sequences):
    counts = count_sequence_support(sequences,
                                    [cand for level in F for cand in level])

    for level in F:
        for pattern in level:
            key = str(pattern)
            print(f"Pattern :<{pattern}> count: {counts.get(key,0)}")


# ==========================================
# RUN
# ==========================================

sequences = read_data("data.txt")
MIS, SDC = read_params("para.txt")

F, sequences = msgsp(sequences, MIS, SDC)

print_patterns(F, sequences)
"""
    print(listing)
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
#--------------------------------------
|
|
625
|
+
#IR
|
|
626
|
+
#--------------------------------------
|
|
627
|
+
#Collaborative Filtering (User-Based)
|
|
628
|
+
def cf():
    """Print a sample script for user-based collaborative filtering.

    The listing defines a three-user rating dictionary in which ``0`` means
    "not rated", computes cosine similarity between users, ranks the other
    users by similarity and predicts a rating as the similarity-weighted
    average of the neighbours' non-zero ratings.  The text is only written
    to standard output; it is not executed here.
    """
    listing = r"""
import math
# Sample user-item matrix (0 = not rated)
R = {
    'A': {'item1': 5, 'item2': 3, 'item3': 0},
    'B': {'item1': 4, 'item2': 0, 'item3': 2},
    'C': {'item1': 0, 'item2': 4, 'item3': 5}
}

# Step 1: Cosine similarity
def cosine_similarity(user1, user2):
    dot = 0
    norm1 = 0
    norm2 = 0

    for item in R[user1]:
        r1 = R[user1][item]
        r2 = R[user2][item]

        dot += r1 * r2
        norm1 += r1 ** 2
        norm2 += r2 ** 2

    if norm1 == 0 or norm2 == 0:
        return 0

    return dot / (math.sqrt(norm1) * math.sqrt(norm2))

# Step 2: Get similar users
def get_neighbors(target_user):
    similarities = []

    for user in R:
        if user != target_user:
            sim = cosine_similarity(target_user, user)
            similarities.append((user, sim))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities

# Step 3: Predict rating
def predict_rating(user, item):
    neighbors = get_neighbors(user)

    numerator = 0
    denominator = 0

    for neighbor, sim in neighbors:
        rating = R[neighbor][item]
        if rating != 0:
            numerator += sim * rating
            denominator += abs(sim)

    if denominator == 0:
        return 0

    return numerator / denominator

# Example
print("Predicted rating for A on item3:", predict_rating('A', 'item3'))"""
    print(listing)
|
|
689
|
+
|
|
690
|
+
##Content-Based Recommendation
|
|
691
|
+
def cbr():
    """Print a sample script for content-based recommendation.

    The listing averages the feature vectors of the liked items into a
    user profile, scores every not-yet-liked item by cosine similarity to
    that profile and returns the scores sorted in descending order.  The
    text is only written to standard output; it is not executed here.
    """
    listing = r"""import math

# Item feature vectors
items = {
    'item1': [1, 0, 1],
    'item2': [0, 1, 1],
    'item3': [1, 1, 0]
}

# User liked items
user_likes = ['item1', 'item2']

# Step 1: Build user profile
def build_user_profile(likes):
    profile = [0] * len(items['item1'])

    for item in likes:
        for i in range(len(profile)):
            profile[i] += items[item][i]

    # average
    for i in range(len(profile)):
        profile[i] /= len(likes)

    return profile

# Step 2: Cosine similarity
def cosine(v1, v2):
    dot = sum(v1[i] * v2[i] for i in range(len(v1)))
    norm1 = math.sqrt(sum(x*x for x in v1))
    norm2 = math.sqrt(sum(x*x for x in v2))

    if norm1 == 0 or norm2 == 0:
        return 0

    return dot / (norm1 * norm2)

# Step 3: Recommend items
def recommend():
    profile = build_user_profile(user_likes)
    scores = []

    for item in items:
        if item not in user_likes:
            sim = cosine(profile, items[item])
            scores.append((item, sim))

    scores.sort(key=lambda x: x[1], reverse=True)
    return scores

# Example
print("Recommendations:", recommend())"""
    print(listing)
|
|
744
|
+
|
|
745
|
+
## PageRank
|
|
746
|
+
|
|
747
|
+
def pr():
    """Print a sample script implementing iterative PageRank.

    The listing initialises every page to ``1 / N``, then for a fixed
    number of iterations recomputes each rank from the damping term plus
    contributions from linking pages; a node with no outgoing links
    spreads its rank evenly over all pages.  It is demonstrated on a
    four-node graph that contains one such dangling node.  The text is
    only written to standard output; it is not executed here.
    """
    listing = r"""
import math
def pagerank(graph, d=0.85, iterations=10):
    N = len(graph)

    # Step 1: Initialize PageRank
    pr = {}
    for page in graph:
        pr[page] = 1 / N

    # Step 2: Iterations
    for _ in range(iterations):
        new_pr = {}

        for page in graph:
            # Base value (random jump)
            new_pr[page] = (1 - d) / N

            for node in graph:
                # Case 1: Normal link
                if len(graph[node]) > 0:
                    if page in graph[node]: # node -> page
                        new_pr[page] += d * (pr[node] / len(graph[node]))

                # Case 2: Dangling node (no outgoing links)
                else:
                    # Distribute its rank equally to all pages
                    new_pr[page] += d * (pr[node] / N)

        pr = new_pr

    return pr


# Example graph with dangling node
graph = {
    'A': ['B'],
    'B': ['C'],
    'C': ['A'],
    'D': [] # Dangling node
}

# Run PageRank
result = pagerank(graph)

# Print results
for page, rank in result.items():
    print(page, ":", round(rank, 4))"""
    print(listing)
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
##Principal Component Analysis (PCA)
|
|
801
|
+
def pca():
    """Print a pure-Python sample script for 2-D PCA and feature selection.

    The first part of the listing mean-centres a five-row dataset, builds
    its covariance matrix, solves the 2x2 eigenproblem in closed form and
    projects the data onto the eigenvector with the largest eigenvalue.
    The second part drops columns whose variance is below a threshold.

    Fixes applied to the listing's ``eigen_2x2``:
      * for a diagonal matrix the eigenvector paired with the larger
        eigenvalue is now the matching axis (previously always ``[1, 0]``);
      * eigenvectors are scaled to unit length so the projection is not
        stretched by the vector's magnitude;
      * the discriminant is clamped at zero before ``math.sqrt`` so that
        rounding error cannot raise ``ValueError``.

    The text is only written to standard output; it is not executed here.
    """
    listing = r"""
import math

# Example dataset (2D → reduce to 1D)
X = [
    [2.5, 2.4],
    [0.5, 0.7],
    [2.2, 2.9],
    [1.9, 2.2],
    [3.1, 3.0]
]

# Step 1: Mean Centering
def mean_center(X):
    mean = [sum(col)/len(col) for col in zip(*X)]

    X_centered = []
    for row in X:
        X_centered.append([row[i] - mean[i] for i in range(len(row))])

    return X_centered, mean

# Step 2: Covariance Matrix
def covariance_matrix(X):
    n = len(X)
    m = len(X[0])

    cov = [[0]*m for _ in range(m)]

    for i in range(m):
        for j in range(m):
            for row in X:
                cov[i][j] += row[i] * row[j]
            cov[i][j] /= n

    return cov

# Scale a vector to unit length (left unchanged if it is all zeros)
def unit_vector(v):
    length = math.sqrt(sum(x*x for x in v))
    if length == 0:
        return v
    return [x/length for x in v]

# Step 3: Eigenvalues & Eigenvectors (2x2 only for lab simplicity)
def eigen_2x2(matrix):
    a, b = matrix[0]
    c, d = matrix[1]

    # Eigenvalues
    trace = a + d
    det = a*d - b*c

    # Clamp at 0: rounding can push the discriminant slightly negative
    term = math.sqrt(max(trace**2 - 4*det, 0))

    lambda1 = (trace + term)/2
    lambda2 = (trace - term)/2

    # Eigenvectors
    if b != 0:
        vec1 = [b, lambda1 - a]
        vec2 = [b, lambda2 - a]
    elif a >= d:
        # Diagonal matrix: lambda1 (the larger value) equals a -> x-axis
        vec1 = [1, 0]
        vec2 = [0, 1]
    else:
        # Diagonal matrix: lambda1 (the larger value) equals d -> y-axis
        vec1 = [0, 1]
        vec2 = [1, 0]

    return [(lambda1, unit_vector(vec1)), (lambda2, unit_vector(vec2))]

# Step 4: Projection
def project(X, vector):
    projected = []
    for row in X:
        val = sum(row[i] * vector[i] for i in range(len(vector)))
        projected.append(val)
    return projected


# Run PCA
X_centered, mean = mean_center(X)
cov = covariance_matrix(X_centered)
eigen_pairs = eigen_2x2(cov)

# Sort by eigenvalue
eigen_pairs.sort(key=lambda x: x[0], reverse=True)

# Take top eigenvector
top_vector = eigen_pairs[0][1]

# Project data
result = project(X_centered, top_vector)

print("Reduced Data:", result)

##Feature Selection (Simple Reduction)

def variance(feature):
    mean = sum(feature)/len(feature)
    return sum((x - mean)**2 for x in feature) / len(feature)

def feature_selection(X, threshold=0.5):
    selected = []

    # Transpose dataset
    features = list(zip(*X))

    for i, feature in enumerate(features):
        var = variance(feature)
        if var >= threshold:
            selected.append(i)

    # Build reduced dataset
    reduced = []
    for row in X:
        reduced.append([row[i] for i in selected])

    return reduced


# Example
X = [
    [1, 100, 2],
    [2, 100, 3],
    [3, 100, 4]
]

print("Reduced:", feature_selection(X))"""
    print(listing)
|
|
917
|
+
|
|
918
|
+
def recommend():
|
|
919
|
+
print(r"""
|
|
920
|
+
import numpy as np
|
|
921
|
+
import pandas as pd
|
|
922
|
+
import matplotlib.pyplot as plt
|
|
923
|
+
|
|
924
|
+
|
|
925
|
+
# =========================
|
|
926
|
+
# DATA PREPARATION
|
|
927
|
+
def load_data():
    # Build the demo user x item rating matrix as a float DataFrame.
    # The single unknown rating is written as '?' in the raw table and
    # becomes NaN in the returned frame.
    users = ['User1', 'User2', 'User3', 'User4']
    items = ['Item1', 'Item2', 'Item3', 'Item4']
    raw_ratings = [
        [5, 3, 4, '?'],
        [3, 1, 2, 3],
        [4, 3, 4, 5],
        [3, 3, 1, 5],
    ]

    ratings = pd.DataFrame(raw_ratings, index=users, columns=items)

    # Keep every cell that is not the '?' placeholder, blank out the rest.
    ratings = ratings.where(ratings != '?', np.nan)
    return ratings.astype(float)
|
|
943
|
+
|
|
944
|
+
def mean_center(df):
    # Subtract each row's mean rating from that row.
    # Returns (row_means, centred_frame); NaN cells are skipped by the mean
    # and stay NaN in the centred frame.
    row_means = df.mean(axis=1)
    return row_means, df.subtract(row_means, axis='index')
|
|
948
|
+
|
|
949
|
+
# SIMILARITY FUNCTION
|
|
950
|
+
def cosine_similarity(a, b):
    # Cosine similarity of two rating vectors, using only the positions
    # where BOTH vectors have a rating (i.e. neither value is NaN).
    #
    # a, b : equally long 1-D numeric sequences / arrays, NaN = missing
    # Returns 0 when there is no co-rated position or when either
    # overlapping part has zero length (norm 0); otherwise the cosine.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    both_rated = ~(np.isnan(a) | np.isnan(b))
    if not both_rated.any():
        return 0

    a, b = a[both_rated], b[both_rated]
    denominator = np.linalg.norm(a) * np.linalg.norm(b)

    # An all-zero overlap has no direction; without this guard the division
    # is 0/0, which yields NaN and poisons every weighted sum built on it.
    if denominator == 0:
        return 0

    return np.dot(a, b) / denominator
|
|
958
|
+
|
|
959
|
+
# USER-BASED CF
|
|
960
|
+
def predict_user_based(df, user, item):
    # User-based collaborative filtering: predict `user`'s rating of `item`
    # as the similarity-weighted average of the ratings given to `item` by
    # every other user who has rated it.
    # Returns NaN when no neighbour contributes any weight.
    own_ratings = df.loc[user].values

    weighted_sum = 0
    weight_total = 0
    for neighbour in df.index:
        if neighbour == user:
            continue

        neighbour_rating = df.loc[neighbour, item]
        if np.isnan(neighbour_rating):
            # This neighbour never rated the item, so it cannot vote.
            continue

        sim = cosine_similarity(own_ratings, df.loc[neighbour].values)
        weighted_sum += sim * neighbour_rating
        weight_total += abs(sim)

    if weight_total == 0:
        return np.nan
    return weighted_sum / weight_total
|
|
981
|
+
|
|
982
|
+
def predict_user_based_mean_centered(df, user, item):
    # User-based prediction expressed relative to each user's own average:
    # start from `user`'s mean rating and add the similarity-weighted average
    # of how far the neighbours' ratings of `item` sit from their own means.
    # Falls back to `user`'s mean when no neighbour contributes any weight.
    row_means = df.mean(axis=1)
    own_ratings = df.loc[user].values

    deviation_sum = 0
    weight_total = 0
    for neighbour in df.index:
        if neighbour == user:
            continue

        neighbour_rating = df.loc[neighbour, item]
        if np.isnan(neighbour_rating):
            continue

        sim = cosine_similarity(own_ratings, df.loc[neighbour].values)
        deviation_sum += sim * (neighbour_rating - row_means[neighbour])
        weight_total += abs(sim)

    if weight_total == 0:
        return row_means[user]

    return row_means[user] + (deviation_sum / weight_total)
|
|
1011
|
+
|
|
1012
|
+
def predict_user_based_topk(df, user, item, k=2):
    # User-based prediction restricted to the k most similar neighbours
    # (among the users who rated `item`). Returns NaN when the selected
    # neighbours carry no weight.
    own_ratings = df.loc[user].values

    # (neighbour, similarity) for every other user that rated the item.
    scored = [
        (neighbour, cosine_similarity(own_ratings, df.loc[neighbour].values))
        for neighbour in df.index
        if neighbour != user and not np.isnan(df.loc[neighbour, item])
    ]

    # Highest similarity first; ties keep their original order.
    nearest = sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]

    weighted_sum = sum(sim * df.loc[neighbour, item] for neighbour, sim in nearest)
    weight_total = sum(abs(sim) for _, sim in nearest)

    if weight_total == 0:
        return np.nan
    return weighted_sum / weight_total
|
|
1039
|
+
|
|
1040
|
+
# ITEM-BASED CF
|
|
1041
|
+
def predict_item_based(df, user, item):
    # Item-based collaborative filtering: predict `user`'s rating of `item`
    # as the similarity-weighted average of the ratings `user` gave to the
    # other items, weighting each by its similarity to `item`.
    # Returns NaN when no other item contributes any weight.
    target_column = df[item].values

    weighted_sum = 0
    weight_total = 0
    for candidate in df.columns:
        if candidate == item:
            continue

        own_rating = df.loc[user, candidate]
        if np.isnan(own_rating):
            # The user has not rated this item, so it cannot vote.
            continue

        sim = cosine_similarity(target_column, df[candidate].values)
        weighted_sum += sim * own_rating
        weight_total += abs(sim)

    if weight_total == 0:
        return np.nan
    return weighted_sum / weight_total
|
|
1063
|
+
|
|
1064
|
+
def predict_item_based_topk(df, user, item, k=2):
    # Item-based prediction restricted to the k items most similar to `item`
    # (among the items `user` has rated). Returns NaN when the selected
    # items carry no weight.
    target_column = df[item].values

    # (item, similarity) for every other item the user has rated.
    scored = [
        (candidate, cosine_similarity(target_column, df[candidate].values))
        for candidate in df.columns
        if candidate != item and not np.isnan(df.loc[user, candidate])
    ]

    # Highest similarity first; ties keep their original order.
    nearest = sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]

    weighted_sum = sum(sim * df.loc[user, candidate] for candidate, sim in nearest)
    weight_total = sum(abs(sim) for _, sim in nearest)

    if weight_total == 0:
        return np.nan
    return weighted_sum / weight_total
|
|
1089
|
+
|
|
1090
|
+
# EVALUATION
|
|
1091
|
+
def evaluate(df, predictor=None):
    # Leave-one-out evaluation of a rating predictor.
    #
    # Every known rating is hidden in turn, predicted from the remaining
    # data, and compared with its true value.
    #
    # df        : user x item DataFrame of floats, NaN = missing rating
    # predictor : callable(df, user, item) -> predicted rating or NaN;
    #             defaults to predict_user_based
    # Returns (rmse, mae) over the cells that could be predicted, or
    # (nan, nan) when no cell could be predicted at all.
    if predictor is None:
        predictor = predict_user_based

    # One working copy is enough: each cell is hidden, predicted, then restored.
    work = df.copy()
    actuals, preds = [], []

    for u in df.index:
        for i in df.columns:
            actual = df.loc[u, i]
            if np.isnan(actual):
                continue

            work.loc[u, i] = np.nan
            p = predictor(work, u, i)
            work.loc[u, i] = actual

            if not np.isnan(p):
                actuals.append(actual)
                preds.append(p)

    # Averaging an empty array only produces NaN plus a RuntimeWarning;
    # report the "nothing to score" case explicitly instead.
    if not preds:
        return np.nan, np.nan

    errors = np.array(actuals) - np.array(preds)
    rmse = np.sqrt(np.mean(errors ** 2))
    mae = np.mean(np.abs(errors))

    return rmse, mae
|
|
1115
|
+
|
|
1116
|
+
# VISUALIZATION
|
|
1117
|
+
def plot_matrix(matrix, title):
    # Display `matrix` as a colour-mapped image in a new figure,
    # with `title` above it and a colour bar for the value scale.
    plt.figure()
    image = plt.imshow(matrix, aspect='auto')
    plt.title(title)
    plt.colorbar(image)
    plt.show()
|
|
1123
|
+
|
|
1124
|
+
# MAIN EXECUTION
|
|
1125
|
+
def main():
    # Demo driver: load the rating matrix, show the mean-centred matrix,
    # predict the one missing rating with the user-based and item-based
    # methods, report leave-one-out error metrics and plot the matrices.
    df = load_data()

    # Mean Centering (only the centred matrix is needed here)
    _, df_centered = mean_center(df)
    print("\nMean Centered Matrix:\n", df_centered)

    # Prediction
    user = "User1"
    item = "Item4"

    user_pred = predict_user_based(df, user, item)
    item_pred = predict_item_based(df, user, item)

    print("\nUser-Based Prediction:\n", user_pred)
    print("\nItem-Based Prediction:\n", item_pred)

    # Evaluation
    rmse, mae = evaluate(df)
    print("\nEvaluation Metrics:")
    print("RMSE =", rmse)
    print("MAE =", mae)

    # Build the matrices that are plotted as "predicted": every missing
    # rating is replaced by the corresponding prediction. The overall mean
    # is used only when a prediction is itself unavailable (NaN), so the
    # plots never contain holes.
    fallback = df.mean().mean()
    user_pred_matrix = df.copy()
    item_pred_matrix = df.copy()

    for u in df.index:
        for i in df.columns:
            if np.isnan(df.loc[u, i]):
                by_user = predict_user_based(df, u, i)
                by_item = predict_item_based(df, u, i)
                user_pred_matrix.loc[u, i] = fallback if np.isnan(by_user) else by_user
                item_pred_matrix.loc[u, i] = fallback if np.isnan(by_item) else by_item

    # Plots
    plot_matrix(df.fillna(0), "Original Matrix")
    plot_matrix(df_centered.fillna(0), "Mean Centered Matrix")
    plot_matrix(user_pred_matrix, "User-Based Predicted Matrix")
    plot_matrix(item_pred_matrix, "Item-Based Predicted Matrix")
|
|
1168
|
+
|
|
1169
|
+
|
|
1170
|
+
if __name__ == "__main__":
|
|
1171
|
+
main()"""
|
|
1172
|
+
)
|
|
1173
|
+
recommend()
|