itertoolkit 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bm_preprocessing/__init__.py +14 -0
- bm_preprocessing/importer/DM/__init__.py +7 -0
- bm_preprocessing/importer/DM/agg.py +6 -0
- bm_preprocessing/importer/DM/dbscan.py +6 -0
- bm_preprocessing/importer/DM/finals.py +6 -0
- bm_preprocessing/importer/DM/gsp.py +6 -0
- bm_preprocessing/importer/DM/test.py +6 -0
- bm_preprocessing/importer/Finals/__init__.py +7 -0
- bm_preprocessing/importer/Finals/kaadhal.py +6 -0
- bm_preprocessing/importer/Finals/raaka.py +6 -0
- bm_preprocessing/importer/Finals/seedan.py +6 -0
- bm_preprocessing/importer/Finals/vikram.py +6 -0
- bm_preprocessing/importer/IR/__init__.py +6 -0
- bm_preprocessing/importer/IR/finals.py +6 -0
- bm_preprocessing/importer/IR/pagerank.py +6 -0
- bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
- bm_preprocessing/importer/IR/test.py +6 -0
- bm_preprocessing/importer/PY/__init__.py +4 -0
- bm_preprocessing/importer/PY/lib_doc.py +6 -0
- bm_preprocessing/importer/PY/python_doc.py +6 -0
- bm_preprocessing/importer/__init__.py +8 -0
- bm_preprocessing/importer/_module_printer.py +23 -0
- bm_preprocessing/src/DM/__init__.py +1 -0
- bm_preprocessing/src/DM/agg.py +267 -0
- bm_preprocessing/src/DM/dbscan.py +218 -0
- bm_preprocessing/src/DM/finals.py +19 -0
- bm_preprocessing/src/DM/gsp.py +378 -0
- bm_preprocessing/src/DM/test.py +19 -0
- bm_preprocessing/src/Finals/__init__.py +1 -0
- bm_preprocessing/src/Finals/kaadhal.py +1453 -0
- bm_preprocessing/src/Finals/raaka.py +1338 -0
- bm_preprocessing/src/Finals/seedan.py +1173 -0
- bm_preprocessing/src/Finals/vikram.py +520 -0
- bm_preprocessing/src/IR/__init__.py +1 -0
- bm_preprocessing/src/IR/finals.py +14 -0
- bm_preprocessing/src/IR/pagerank.py +109 -0
- bm_preprocessing/src/IR/recommenders_pca.py +487 -0
- bm_preprocessing/src/IR/test.py +14 -0
- bm_preprocessing/src/PY/__init__.py +1 -0
- bm_preprocessing/src/PY/lib_doc.py +295 -0
- bm_preprocessing/src/PY/python_doc.py +177 -0
- bm_preprocessing/src/__init__.py +1 -0
- itertoolkit-1.5.0.dist-info/METADATA +120 -0
- itertoolkit-1.5.0.dist-info/RECORD +45 -0
- itertoolkit-1.5.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,520 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""ir dm final code sem.ipynb
|
|
3
|
+
|
|
4
|
+
Automatically generated by Colab.
|
|
5
|
+
|
|
6
|
+
Original file is located at
|
|
7
|
+
https://colab.research.google.com/drive/1pShgFSMDWpmjt0iCqFTyic4hYGP5vgFc
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
#-------------------------------------------------------------------------------------------------------#
|
|
11
|
+
#IR CODE#
|
|
12
|
+
#-------------------------------------------------------------------------------------------------------#
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
import matplotlib.pyplot as plt
|
|
16
|
+
import seaborn as sns
|
|
17
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
18
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
19
|
+
from sklearn.decomposition import PCA
|
|
20
|
+
from sklearn.preprocessing import StandardScaler
|
|
21
|
+
import networkx as nx
|
|
22
|
+
import os
|
|
23
|
+
|
|
24
|
+
sns.set()
|
|
25
|
+
|
|
26
|
+
figures = []
|
|
27
|
+
|
|
28
|
+
print("Current Directory:", os.getcwd())
|
|
29
|
+
print("Files in folder:", os.listdir())
|
|
30
|
+
|
|
31
|
+
# ============================================================
|
|
32
|
+
|
|
33
|
+
# 1. CONTENT-BASED RECOMMENDER SYSTEM
|
|
34
|
+
|
|
35
|
+
# ============================================================
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
print("\nCONTENT-BASED RECOMMENDER SYSTEM")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
movies = pd.read_csv("movies.csv")
|
|
42
|
+
movies.columns = movies.columns.str.lower().str.strip()
|
|
43
|
+
|
|
44
|
+
title_col = [c for c in movies.columns if 'title' in c][0]
|
|
45
|
+
genre_col = [c for c in movies.columns if 'genre' in c][0]
|
|
46
|
+
|
|
47
|
+
tfidf = TfidfVectorizer()
|
|
48
|
+
tfidf_matrix = tfidf.fit_transform(movies[genre_col])
|
|
49
|
+
cosine_sim = cosine_similarity(tfidf_matrix)
|
|
50
|
+
|
|
51
|
+
movie_idx = 0
|
|
52
|
+
scores = sorted(list(enumerate(cosine_sim[movie_idx])), key=lambda x: x[1], reverse=True)[1:4]
|
|
53
|
+
|
|
54
|
+
print("Recommendations for:", movies[title_col].iloc[movie_idx])
|
|
55
|
+
for i in scores:
|
|
56
|
+
print(movies[title_col].iloc[i[0]])
|
|
57
|
+
|
|
58
|
+
fig1 = plt.figure()
|
|
59
|
+
sns.heatmap(cosine_sim, cmap="Blues")
|
|
60
|
+
plt.title("Content Similarity")
|
|
61
|
+
figures.append(fig1)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
except Exception as e:
|
|
65
|
+
print("Error in Content-Based:", e)
|
|
66
|
+
|
|
67
|
+
# ============================================================
|
|
68
|
+
|
|
69
|
+
# 2. COLLABORATIVE FILTERING
|
|
70
|
+
|
|
71
|
+
# ============================================================
|
|
72
|
+
|
|
73
|
+
try:
|
|
74
|
+
print("\nCOLLABORATIVE FILTERING")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
ratings = pd.read_csv("data.csv")
|
|
78
|
+
ratings.columns = ratings.columns.str.lower().str.strip()
|
|
79
|
+
|
|
80
|
+
user_col = [c for c in ratings.columns if 'user' in c][0]
|
|
81
|
+
item_col = [c for c in ratings.columns if 'movie' in c or 'item' in c][0]
|
|
82
|
+
rating_col = [c for c in ratings.columns if 'rating' in c][0]
|
|
83
|
+
|
|
84
|
+
user_item = ratings.pivot(index=user_col, columns=item_col, values=rating_col).fillna(0)
|
|
85
|
+
user_sim = cosine_similarity(user_item)
|
|
86
|
+
|
|
87
|
+
print("User Similarity Matrix:\n", user_sim)
|
|
88
|
+
|
|
89
|
+
fig2 = plt.figure()
|
|
90
|
+
sns.heatmap(user_item, cmap="coolwarm")
|
|
91
|
+
plt.title("User-Item Matrix")
|
|
92
|
+
figures.append(fig2)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
except Exception as e:
|
|
96
|
+
print("Error in Collaborative:", e)
|
|
97
|
+
|
|
98
|
+
# ============================================================
|
|
99
|
+
|
|
100
|
+
# 3. PCA
|
|
101
|
+
|
|
102
|
+
# ============================================================
|
|
103
|
+
|
|
104
|
+
try:
|
|
105
|
+
print("\nPCA")
|
|
106
|
+
|
|
107
|
+
data = pd.read_csv("data.csv")
|
|
108
|
+
data.columns = data.columns.str.lower().str.strip()
|
|
109
|
+
|
|
110
|
+
scaler = StandardScaler()
|
|
111
|
+
scaled = scaler.fit_transform(data)
|
|
112
|
+
|
|
113
|
+
pca = PCA(n_components=2)
|
|
114
|
+
result = pca.fit_transform(scaled)
|
|
115
|
+
|
|
116
|
+
print("Reduced Data:\n", result)
|
|
117
|
+
|
|
118
|
+
fig3 = plt.figure()
|
|
119
|
+
plt.scatter(result[:, 0], result[:, 1])
|
|
120
|
+
plt.title("PCA Projection")
|
|
121
|
+
figures.append(fig3)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
except Exception as e:
|
|
125
|
+
print("Error in PCA:", e)
|
|
126
|
+
|
|
127
|
+
# ============================================================
|
|
128
|
+
|
|
129
|
+
# 4. PAGERANK
|
|
130
|
+
|
|
131
|
+
# ============================================================
|
|
132
|
+
|
|
133
|
+
try:
|
|
134
|
+
print("\nPAGERANK")
|
|
135
|
+
|
|
136
|
+
graph = pd.read_csv("graph.csv")
|
|
137
|
+
graph.columns = graph.columns.str.lower().str.strip()
|
|
138
|
+
|
|
139
|
+
source_col = graph.columns[0]
|
|
140
|
+
target_col = graph.columns[1]
|
|
141
|
+
|
|
142
|
+
G = nx.from_pandas_edgelist(graph, source_col, target_col, create_using=nx.DiGraph())
|
|
143
|
+
pr = nx.pagerank(G)
|
|
144
|
+
|
|
145
|
+
print("PageRank Scores:")
|
|
146
|
+
for k, v in pr.items():
|
|
147
|
+
print(k, ":", round(v, 4))
|
|
148
|
+
|
|
149
|
+
fig4 = plt.figure()
|
|
150
|
+
nx.draw(G, with_labels=True, node_color='lightblue')
|
|
151
|
+
plt.title("Web Graph")
|
|
152
|
+
figures.append(fig4)
|
|
153
|
+
|
|
154
|
+
except Exception as e:
|
|
155
|
+
print("Error in PageRank:", e)
|
|
156
|
+
|
|
157
|
+
# ============================================================
|
|
158
|
+
|
|
159
|
+
# 5. INVERTED INDEX
|
|
160
|
+
|
|
161
|
+
# ============================================================
|
|
162
|
+
|
|
163
|
+
try:
    # Build a word -> document-id inverted index from documents.csv and run
    # one hard-coded single-word query against it.
    print("\nINVERTED INDEX")

    docs = pd.read_csv("documents.csv")
    docs.columns = docs.columns.str.lower().str.strip()

    # Column discovery is by substring match on the normalised header names;
    # the first matching column wins.
    id_col = [c for c in docs.columns if 'id' in c][0]
    text_col = [c for c in docs.columns if 'content' in c or 'text' in c][0]

    index = {}

    for _, row in docs.iterrows():
        doc_id = row[id_col]
        words = str(row[text_col]).lower().split()
        # dict.fromkeys() drops repeated words while keeping first-seen order,
        # so a document is posted at most once per word and the search below
        # does not print the same document several times.
        for w in dict.fromkeys(words):
            index.setdefault(w, []).append(doc_id)

    query = "data"
    print("Search:", query)

    if query in index:
        for doc_id in index[query]:
            print(docs[docs[id_col] == doc_id][text_col].values[0])
    else:
        print("No results")

except Exception as e:
    # Deliberate best-effort: a missing file or unexpected schema is reported
    # and the rest of the script keeps running.
    print("Error in IR:", e)
|
|
193
|
+
|
|
194
|
+
# ============================================================
|
|
195
|
+
|
|
196
|
+
# SHOW ALL PLOTS
|
|
197
|
+
|
|
198
|
+
# ============================================================
|
|
199
|
+
print("\nShowing all plots...")
|
|
200
|
+
|
|
201
|
+
# Activate all figures created
|
|
202
|
+
for num in plt.get_fignums():
|
|
203
|
+
plt.figure(num)
|
|
204
|
+
|
|
205
|
+
# Display all together
|
|
206
|
+
plt.show()
|
|
207
|
+
|
|
208
|
+
#-------------------------------------------------------------------------------------------------------#
|
|
209
|
+
#DM CODE#
|
|
210
|
+
#-------------------------------------------------------------------------------------------------------#
|
|
211
|
+
# ============================================================
|
|
212
|
+
# PROBLEM SHEET 13 – AGGLOMERATIVE CLUSTERING (WARD)
|
|
213
|
+
# ============================================================
|
|
214
|
+
|
|
215
|
+
import pandas as pd
|
|
216
|
+
import numpy as np
|
|
217
|
+
import matplotlib.pyplot as plt
|
|
218
|
+
from scipy.cluster.hierarchy import dendrogram, linkage
|
|
219
|
+
|
|
220
|
+
print("\n===== AGGLOMERATIVE CLUSTERING =====")
|
|
221
|
+
|
|
222
|
+
# Load dataset
|
|
223
|
+
df = pd.read_csv("Mall_Customers.csv")
|
|
224
|
+
|
|
225
|
+
# Remove CustomerID
|
|
226
|
+
df = df.drop("CustomerID", axis=1)
|
|
227
|
+
|
|
228
|
+
# Missing values
|
|
229
|
+
print("Missing values:\n", df.isnull().sum())
|
|
230
|
+
|
|
231
|
+
# Convert Gender
|
|
232
|
+
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
|
|
233
|
+
|
|
234
|
+
# Pie chart
|
|
235
|
+
gender_counts = df['Gender'].value_counts()
# value_counts() orders its rows by frequency, not by gender code, so each
# slice label is derived from that slice's own index value (1 = Male,
# 0 = Female, as encoded just above) instead of assuming Male comes first.
gender_names = {1: 'Male', 0: 'Female'}
gender_labels = [gender_names.get(code, str(code)) for code in gender_counts.index]
plt.figure()
plt.pie(gender_counts, labels=gender_labels, autopct='%1.1f%%')
plt.title("Male vs Female Ratio")
plt.show()
|
|
240
|
+
|
|
241
|
+
# Bar graphs
|
|
242
|
+
plt.figure()
|
|
243
|
+
plt.bar(range(len(df)), df['Age'])
|
|
244
|
+
plt.title("Age Distribution")
|
|
245
|
+
plt.show()
|
|
246
|
+
|
|
247
|
+
plt.figure()
|
|
248
|
+
plt.bar(range(len(df)), df['Annual Income (k$)'])
|
|
249
|
+
plt.title("Income Distribution")
|
|
250
|
+
plt.show()
|
|
251
|
+
|
|
252
|
+
# Ward linkage clustering
|
|
253
|
+
data = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].values
|
|
254
|
+
Z = linkage(data, method='ward')
|
|
255
|
+
|
|
256
|
+
plt.figure(figsize=(10,5))
|
|
257
|
+
dendrogram(Z)
|
|
258
|
+
plt.title("Dendrogram (Ward Linkage)")
|
|
259
|
+
plt.xlabel("Data Points")
|
|
260
|
+
plt.ylabel("Distance")
|
|
261
|
+
plt.show()
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
# ============================================================
|
|
265
|
+
# PROBLEM SHEET 14 – DBSCAN
|
|
266
|
+
# ============================================================
|
|
267
|
+
|
|
268
|
+
from sklearn.preprocessing import StandardScaler
|
|
269
|
+
from sklearn.datasets import make_moons
|
|
270
|
+
from sklearn.cluster import DBSCAN
|
|
271
|
+
|
|
272
|
+
print("\n===== DBSCAN CLUSTERING =====")
|
|
273
|
+
|
|
274
|
+
# Load dataset
|
|
275
|
+
df = pd.read_csv("Wholesale customers data.csv")
|
|
276
|
+
df = df.drop(['Channel', 'Region'], axis=1)
|
|
277
|
+
|
|
278
|
+
print("First few records:\n", df.head())
|
|
279
|
+
|
|
280
|
+
# Normalize
|
|
281
|
+
data = df[['Grocery', 'Milk']].values
|
|
282
|
+
scaler = StandardScaler()
|
|
283
|
+
data_scaled = scaler.fit_transform(data)
|
|
284
|
+
|
|
285
|
+
plt.scatter(data_scaled[:,0], data_scaled[:,1])
|
|
286
|
+
plt.title("Normalized Data")
|
|
287
|
+
plt.xlabel("Grocery")
|
|
288
|
+
plt.ylabel("Milk")
|
|
289
|
+
plt.show()
|
|
290
|
+
|
|
291
|
+
# Manual DBSCAN
|
|
292
|
+
EPS = 0.5
|
|
293
|
+
MINPTS = 15
|
|
294
|
+
|
|
295
|
+
UNVISITED = 0
|
|
296
|
+
NOISE = -1
|
|
297
|
+
|
|
298
|
+
labels = [UNVISITED] * len(data_scaled)
|
|
299
|
+
cluster_id = 0
|
|
300
|
+
|
|
301
|
+
def euclidean(p1, p2):
    """Return the Euclidean (L2) distance between ``p1`` and ``p2``.

    Both arguments must support element-wise subtraction and squaring,
    e.g. NumPy arrays of the same shape.
    """
    squared_diff = (p1 - p2) ** 2
    total = np.sum(squared_diff)
    return np.sqrt(total)
|
|
303
|
+
|
|
304
|
+
def region_query(idx):
    """Return the indices of every point lying within ``EPS`` of point ``idx``.

    Distances are measured with ``euclidean`` over the module-level
    ``data_scaled`` array. The result is in ascending index order and
    includes ``idx`` itself whenever ``EPS`` is non-negative.
    """
    center = data_scaled[idx]
    return [
        position
        for position, candidate in enumerate(data_scaled)
        if euclidean(center, candidate) <= EPS
    ]
|
|
310
|
+
|
|
311
|
+
def expand_cluster(idx, neighbors, cluster_id):
    """Grow cluster ``cluster_id`` outward from the seed point ``idx``.

    ``neighbors`` doubles as the work queue: it is scanned front to back and
    extended in place with the neighbourhood of every newly visited point
    that has at least ``MINPTS`` neighbours. Results are written into the
    module-level ``labels`` list; nothing is returned.
    """
    labels[idx] = cluster_id

    cursor = 0
    while cursor < len(neighbors):
        candidate = neighbors[cursor]
        cursor += 1
        state = labels[candidate]

        if state == NOISE:
            # Previously rejected as noise: claim it as a border point, but
            # do not expand from it.
            labels[candidate] = cluster_id
            continue
        if state != UNVISITED:
            # Already assigned to a cluster; leave it alone.
            continue

        labels[candidate] = cluster_id
        reachable = region_query(candidate)
        if len(reachable) >= MINPTS:
            # The candidate is itself dense enough: queue its neighbourhood.
            neighbors.extend(reachable)
|
|
329
|
+
|
|
330
|
+
# Main loop
|
|
331
|
+
for i in range(len(data_scaled)):
|
|
332
|
+
if labels[i] != UNVISITED:
|
|
333
|
+
continue
|
|
334
|
+
|
|
335
|
+
neighbors = region_query(i)
|
|
336
|
+
|
|
337
|
+
if len(neighbors) < MINPTS:
|
|
338
|
+
labels[i] = NOISE
|
|
339
|
+
else:
|
|
340
|
+
cluster_id += 1
|
|
341
|
+
expand_cluster(i, neighbors, cluster_id)
|
|
342
|
+
|
|
343
|
+
# Plot clusters
|
|
344
|
+
plt.scatter(data_scaled[:,0], data_scaled[:,1], c=labels, cmap='rainbow')
|
|
345
|
+
plt.title("Manual DBSCAN Clustering")
|
|
346
|
+
plt.show()
|
|
347
|
+
|
|
348
|
+
# Built-in DBSCAN (moon dataset)
|
|
349
|
+
X, _ = make_moons(n_samples=2000, noise=0.05)
|
|
350
|
+
|
|
351
|
+
model = DBSCAN(eps=0.2, min_samples=5)
|
|
352
|
+
labels_moon = model.fit_predict(X)
|
|
353
|
+
|
|
354
|
+
plt.scatter(X[:,0], X[:,1], c=labels_moon, cmap='rainbow')
|
|
355
|
+
plt.title("DBSCAN on Moon Dataset")
|
|
356
|
+
plt.show()
|
|
357
|
+
|
|
358
|
+
# Add noise
|
|
359
|
+
noise = np.random.uniform(low=-1.5, high=2.5, size=(200,2))
|
|
360
|
+
X_noisy = np.vstack([X, noise])
|
|
361
|
+
|
|
362
|
+
labels_noisy = model.fit_predict(X_noisy)
|
|
363
|
+
|
|
364
|
+
plt.scatter(X_noisy[:,0], X_noisy[:,1], c=labels_noisy, cmap='rainbow')
|
|
365
|
+
plt.title("DBSCAN with Noise")
|
|
366
|
+
plt.show()
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
# ============================================================
|
|
370
|
+
# PROBLEM SHEET 15 – MS-GSP (FULLY CORRECT)
|
|
371
|
+
# ============================================================
|
|
372
|
+
|
|
373
|
+
import re
|
|
374
|
+
|
|
375
|
+
print("\n===== MS-GSP SEQUENTIAL PATTERN MINING =====")
|
|
376
|
+
|
|
377
|
+
# Read parameters
|
|
378
|
+
MIS = {}
|
|
379
|
+
SDC = 0
|
|
380
|
+
|
|
381
|
+
def convert(x):
    """Return ``x`` as an ``int`` when possible, otherwise ``x`` itself.

    String input is stripped of surrounding whitespace first, so a token
    such as ``" b"`` taken from a comma-separated list compares equal to the
    key ``"b"`` read from the parameter file. Values that ``int()`` cannot
    convert are returned unchanged (after stripping, for strings).
    """
    if isinstance(x, str):
        x = x.strip()
    try:
        return int(x)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are swallowed; unrelated errors (e.g.
        # KeyboardInterrupt) are no longer hidden by a bare ``except``.
        return x
|
|
386
|
+
|
|
387
|
+
with open("para.txt") as f:
|
|
388
|
+
for line in f:
|
|
389
|
+
if "MIS" in line:
|
|
390
|
+
item = re.findall(r'\((.*?)\)', line)[0]
|
|
391
|
+
item = convert(item)
|
|
392
|
+
MIS[item] = float(line.split("=")[1])
|
|
393
|
+
elif "SDC" in line:
|
|
394
|
+
SDC = float(line.split("=")[1])
|
|
395
|
+
|
|
396
|
+
# Read data
|
|
397
|
+
def parse_sequence(line):
    """Parse one line of ``{a,b}{c}`` groups into a list of item sets.

    Each ``{...}`` group becomes one ``set``; items are separated by commas
    and passed through ``convert``. Tokens are trimmed and blank tokens are
    skipped, so ``{}`` yields an empty set instead of a set containing the
    phantom item ``''``, and ``{a, b}`` yields ``{'a', 'b'}``.
    """
    sequence = []
    for group in re.findall(r'\{(.*?)\}', line):
        tokens = (token.strip() for token in group.split(','))
        sequence.append({convert(token) for token in tokens if token})
    return sequence
|
|
400
|
+
|
|
401
|
+
sequences = []
|
|
402
|
+
with open("data (1).txt") as f:
|
|
403
|
+
for line in f:
|
|
404
|
+
sequences.append(parse_sequence(line.strip()))
|
|
405
|
+
|
|
406
|
+
N = len(sequences)
|
|
407
|
+
|
|
408
|
+
# Support
|
|
409
|
+
def item_support(item):
    """Return the fraction of data sequences in which ``item`` occurs.

    A sequence counts once if any of its item sets contains ``item``. When
    no sequences were loaded (``N == 0``) the support is reported as ``0.0``
    rather than raising ``ZeroDivisionError``.
    """
    if not N:
        return 0.0
    hits = sum(1 for seq in sequences if any(item in s for s in seq))
    return hits / N
|
|
411
|
+
|
|
412
|
+
support = {i: item_support(i) for i in MIS}
|
|
413
|
+
|
|
414
|
+
# Subsequence check
|
|
415
|
+
def is_subsequence(pattern, sequence):
    """Return True if ``pattern`` is contained in ``sequence``.

    Both arguments are lists of sets. The pattern is contained when its
    elements can be matched, in order, to distinct elements of ``sequence``
    such that each pattern element is a subset of the element it is matched
    to. Matching is greedy (earliest possible element). An empty pattern is
    contained in every sequence.
    """
    if not pattern:
        # Guard: indexing pattern[0] below would raise IndexError.
        return True

    matched = 0
    for itemset in sequence:
        if pattern[matched].issubset(itemset):
            matched += 1
            if matched == len(pattern):
                return True
    return False
|
|
423
|
+
|
|
424
|
+
def sequence_support(pattern):
    """Return how many data sequences contain ``pattern``.

    The result is an absolute count over the module-level ``sequences``
    list, not a ratio.
    """
    containing = [seq for seq in sequences if is_subsequence(pattern, seq)]
    return len(containing)
|
|
426
|
+
|
|
427
|
+
# Init pass
|
|
428
|
+
L = sorted(MIS.keys(), key=lambda x: MIS[x])
|
|
429
|
+
F = []
|
|
430
|
+
|
|
431
|
+
F1 = []
|
|
432
|
+
for item in L:
|
|
433
|
+
if support[item] >= MIS[item]:
|
|
434
|
+
F1.append([{item}])
|
|
435
|
+
|
|
436
|
+
F.append(F1)
|
|
437
|
+
print("F1:", F1)
|
|
438
|
+
|
|
439
|
+
# Join
|
|
440
|
+
def join_step(Fk):
    """Generate candidate sequences by joining the patterns in ``Fk`` pairwise.

    Two joins are attempted for every ordered pair ``(s1, s2)``:

    * extension - when ``s1`` without its first element equals ``s2``
      without its last element, ``s2``'s last element is appended to ``s1``;
    * merge - when both have the same length and agree on everything but
      the last element, their last elements are unioned into one element.

    Every candidate is returned once, in first-generated order. A merge
    whose union equals one of the two inputs' last elements is skipped,
    because it only reproduces an existing pattern rather than a larger one.

    Args:
        Fk: list of patterns, each a non-empty list of sets.

    Returns:
        A new list of candidate patterns; ``Fk`` itself is not modified.
    """
    candidates = []

    def _add(seq):
        # Patterns are lists of sets (unhashable), hence the linear check.
        if seq not in candidates:
            candidates.append(seq)

    for s1 in Fk:
        for s2 in Fk:
            if s1[1:] == s2[:-1]:
                _add(s1 + [s2[-1]])

            if len(s1) == len(s2) and s1[:-1] == s2[:-1]:
                merged = s1[-1] | s2[-1]
                if merged != s1[-1] and merged != s2[-1]:
                    _add(s1[:-1] + [merged])
    return candidates
|
|
452
|
+
|
|
453
|
+
# Prune
|
|
454
|
+
def prune_step(candidates, Fk):
    """Keep only candidates whose one-item-shorter sub-sequences are all in ``Fk``.

    A sub-sequence is formed by deleting a single item from a single
    element of the candidate; an element left empty by the deletion is
    dropped. Deleting whole elements instead would give every candidate
    with a multi-item element (e.g. ``<{1,2}>``) an empty sub-sequence that
    can never be in ``Fk``, so such candidates would always be discarded.
    For candidates made only of single-item elements the two rules coincide.

    Args:
        candidates: list of patterns, each a list of sets.
        Fk: list of frequent patterns from the previous level.

    Returns:
        The surviving candidates, in their original order.
    """
    def canonical(seq):
        # Hashable, order-normalised form so set lookups are O(1).
        return tuple(tuple(sorted(element)) for element in seq)

    def drop_one_item(seq):
        # Yield every sequence obtained by removing exactly one item.
        for pos, element in enumerate(seq):
            for item in element:
                remainder = element - {item}
                middle = [remainder] if remainder else []
                yield seq[:pos] + middle + seq[pos + 1:]

    frequent = {canonical(seq) for seq in Fk}

    return [
        c for c in candidates
        if all(canonical(sub) in frequent for sub in drop_one_item(c))
    ]
|
|
472
|
+
|
|
473
|
+
# Main loop
|
|
474
|
+
# Level-wise mining: F[k-1] holds the frequent patterns of the previous
# level; stop as soon as a level produces nothing.
k = 1
while len(F[k-1]) > 0:
    Ck = prune_step(join_step(F[k-1]), F[k-1])
    Fk = []

    for c in Ck:
        # All distinct items appearing anywhere in the candidate.
        items = set().union(*c)
        min_mis = min(MIS[i] for i in items)

        # Support threshold: the smallest MIS among the candidate's items.
        if not (sequence_support(c) / N) >= min_mis:
            continue

        # Support-difference constraint: every pair of items must have
        # supports no further apart than SDC.
        item_list = list(items)
        within_sdc = all(
            not abs(support[first] - support[second]) > SDC
            for pos, first in enumerate(item_list)
            for second in item_list[pos + 1:]
        )
        if within_sdc:
            Fk.append(c)

    if not Fk:
        break

    print(f"F{k+1}:", Fk)
    F.append(Fk)
    k += 1
|
|
510
|
+
|
|
511
|
+
# Output
|
|
512
|
+
def format_pattern(p):
    """Render a pattern (list of sets) as text, e.g. ``<{1,2}{3}>``.

    Items inside each element are printed in sorted order, separated by
    commas; elements are concatenated without a separator.
    """
    pieces = []
    for element in p:
        members = ",".join(str(item) for item in sorted(element))
        pieces.append("{" + members + "}")
    return "<" + "".join(pieces) + ">"
|
|
516
|
+
|
|
517
|
+
print("\nFINAL PATTERNS:")
|
|
518
|
+
for level in F:
|
|
519
|
+
for p in level:
|
|
520
|
+
print(f"Pattern :{format_pattern(p)} count: {sequence_support(p)}")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""IR source snippets."""
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
import networkx as nx
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
# ===== INPUT =====
|
|
7
|
+
# rows = source page, cols = destination page
|
|
8
|
+
pages = ["A", "B", "C", "D"]
|
|
9
|
+
A = np.array(
|
|
10
|
+
[
|
|
11
|
+
[0, 1, 1, 0], # A -> B,C
|
|
12
|
+
[0, 0, 1, 0], # B -> C
|
|
13
|
+
[1, 0, 0, 0], # C -> A
|
|
14
|
+
[0, 0, 1, 0], # D -> C
|
|
15
|
+
],
|
|
16
|
+
dtype=int,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
d = 0.85
|
|
20
|
+
max_iter = 10
|
|
21
|
+
tol = 1e-8
|
|
22
|
+
|
|
23
|
+
# ===== PREP =====
|
|
24
|
+
n = len(pages)
|
|
25
|
+
base = (1 - d) / n
|
|
26
|
+
adj = pd.DataFrame(A, index=pages, columns=pages)
|
|
27
|
+
|
|
28
|
+
incoming = {p: [src for src in pages if adj.loc[src, p] == 1] for p in pages}
|
|
29
|
+
outgoing = {p: int(adj.loc[p].sum()) for p in pages}
|
|
30
|
+
pr = {p: 1 / n for p in pages}
|
|
31
|
+
|
|
32
|
+
print("Step 1: Web Graph")
|
|
33
|
+
print(adj)
|
|
34
|
+
print("\nTotal pages N =", n)
|
|
35
|
+
print("\nInitial PageRank:")
|
|
36
|
+
print(pd.DataFrame({"Page": pages, "Initial PR": [pr[p] for p in pages]}))
|
|
37
|
+
|
|
38
|
+
print("\nStep 2: Incoming Links")
|
|
39
|
+
print(
|
|
40
|
+
pd.DataFrame(
|
|
41
|
+
{
|
|
42
|
+
"Page": pages,
|
|
43
|
+
"Incoming Links": [
|
|
44
|
+
", ".join(incoming[p]) if incoming[p] else "—" for p in pages
|
|
45
|
+
],
|
|
46
|
+
}
|
|
47
|
+
)
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
print("\nOutgoing Links Count")
|
|
51
|
+
print(pd.DataFrame({"Page": pages, "Outgoing Links": [outgoing[p] for p in pages]}))
|
|
52
|
+
|
|
53
|
+
# ===== ITERATIONS =====
|
|
54
|
+
history = [pr.copy()]
|
|
55
|
+
|
|
56
|
+
for it in range(1, max_iter + 1):
|
|
57
|
+
new_pr = {}
|
|
58
|
+
print(f"\n\n===== Iteration {it} =====")
|
|
59
|
+
for p in pages:
|
|
60
|
+
s = 0
|
|
61
|
+
terms, nums = [], []
|
|
62
|
+
for src in incoming[p]:
|
|
63
|
+
c = pr[src] / outgoing[src] if outgoing[src] > 0 else 0
|
|
64
|
+
s += c
|
|
65
|
+
terms.append(f"PR({src})/{outgoing[src]}")
|
|
66
|
+
nums.append(f"{pr[src]:.4f}/{outgoing[src]}={c:.4f}")
|
|
67
|
+
new_pr[p] = base + d * s
|
|
68
|
+
|
|
69
|
+
print(f"\nPage {p}")
|
|
70
|
+
print("Incoming links:", ", ".join(incoming[p]) if incoming[p] else "—")
|
|
71
|
+
if terms:
|
|
72
|
+
print(f"PR({p}) = {base:.4f} + {d} * (" + " + ".join(terms) + ")")
|
|
73
|
+
print("Substitution:", " + ".join(nums))
|
|
74
|
+
else:
|
|
75
|
+
print(f"PR({p}) = {base:.4f}")
|
|
76
|
+
print(f"PR({p}) = {new_pr[p]:.6f}")
|
|
77
|
+
|
|
78
|
+
history.append(new_pr.copy())
|
|
79
|
+
diff = sum(abs(new_pr[p] - pr[p]) for p in pages)
|
|
80
|
+
print("\nPageRank after this iteration:")
|
|
81
|
+
print(
|
|
82
|
+
pd.DataFrame(
|
|
83
|
+
{"Page": pages, f"PR Iteration {it}": [round(new_pr[p], 6) for p in pages]}
|
|
84
|
+
)
|
|
85
|
+
)
|
|
86
|
+
pr = new_pr
|
|
87
|
+
if diff < tol:
|
|
88
|
+
break
|
|
89
|
+
|
|
90
|
+
# ===== FINAL RESULT =====
|
|
91
|
+
print("\n\nFinal PageRank:")
|
|
92
|
+
final_df = pd.DataFrame(
|
|
93
|
+
{"Page": pages, "Final PR": [pr[p] for p in pages]}
|
|
94
|
+
).sort_values("Final PR", ascending=False)
|
|
95
|
+
print(final_df)
|
|
96
|
+
|
|
97
|
+
# ===== OPTIONAL VISUALIZATION =====
|
|
98
|
+
G = nx.DiGraph()
|
|
99
|
+
for i, src in enumerate(pages):
|
|
100
|
+
for j, dst in enumerate(pages):
|
|
101
|
+
if A[i, j] == 1:
|
|
102
|
+
G.add_edge(src, dst)
|
|
103
|
+
|
|
104
|
+
plt.figure(figsize=(6, 4))
|
|
105
|
+
pos = nx.spring_layout(G, seed=42)
|
|
106
|
+
sizes = [pr[p] * 5000 + 800 for p in G.nodes()]
|
|
107
|
+
nx.draw(G, pos, with_labels=True, node_size=sizes, arrows=True)
|
|
108
|
+
plt.title("Graph Visualization (node size ~ Final PageRank)")
|
|
109
|
+
plt.show()
|