itertoolkit 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bm_preprocessing/__init__.py +14 -0
- bm_preprocessing/importer/DM/__init__.py +7 -0
- bm_preprocessing/importer/DM/agg.py +6 -0
- bm_preprocessing/importer/DM/dbscan.py +6 -0
- bm_preprocessing/importer/DM/finals.py +6 -0
- bm_preprocessing/importer/DM/gsp.py +6 -0
- bm_preprocessing/importer/DM/test.py +6 -0
- bm_preprocessing/importer/Finals/__init__.py +7 -0
- bm_preprocessing/importer/Finals/kaadhal.py +6 -0
- bm_preprocessing/importer/Finals/raaka.py +6 -0
- bm_preprocessing/importer/Finals/seedan.py +6 -0
- bm_preprocessing/importer/Finals/vikram.py +6 -0
- bm_preprocessing/importer/IR/__init__.py +6 -0
- bm_preprocessing/importer/IR/finals.py +6 -0
- bm_preprocessing/importer/IR/pagerank.py +6 -0
- bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
- bm_preprocessing/importer/IR/test.py +6 -0
- bm_preprocessing/importer/PY/__init__.py +4 -0
- bm_preprocessing/importer/PY/lib_doc.py +6 -0
- bm_preprocessing/importer/PY/python_doc.py +6 -0
- bm_preprocessing/importer/__init__.py +8 -0
- bm_preprocessing/importer/_module_printer.py +23 -0
- bm_preprocessing/src/DM/__init__.py +1 -0
- bm_preprocessing/src/DM/agg.py +267 -0
- bm_preprocessing/src/DM/dbscan.py +218 -0
- bm_preprocessing/src/DM/finals.py +19 -0
- bm_preprocessing/src/DM/gsp.py +378 -0
- bm_preprocessing/src/DM/test.py +19 -0
- bm_preprocessing/src/Finals/__init__.py +1 -0
- bm_preprocessing/src/Finals/kaadhal.py +1453 -0
- bm_preprocessing/src/Finals/raaka.py +1338 -0
- bm_preprocessing/src/Finals/seedan.py +1173 -0
- bm_preprocessing/src/Finals/vikram.py +520 -0
- bm_preprocessing/src/IR/__init__.py +1 -0
- bm_preprocessing/src/IR/finals.py +14 -0
- bm_preprocessing/src/IR/pagerank.py +109 -0
- bm_preprocessing/src/IR/recommenders_pca.py +487 -0
- bm_preprocessing/src/IR/test.py +14 -0
- bm_preprocessing/src/PY/__init__.py +1 -0
- bm_preprocessing/src/PY/lib_doc.py +295 -0
- bm_preprocessing/src/PY/python_doc.py +177 -0
- bm_preprocessing/src/__init__.py +1 -0
- itertoolkit-1.5.0.dist-info/METADATA +120 -0
- itertoolkit-1.5.0.dist-info/RECORD +45 -0
- itertoolkit-1.5.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""bm_preprocessing package."""
|
|
2
|
+
|
|
3
|
+
# Keep legacy module paths working, e.g. ``import bm_preprocessing.DM``.
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from .importer import DM, IR, PY, Finals
|
|
7
|
+
|
|
8
|
+
sys.modules[__name__ + ".DM"] = DM
|
|
9
|
+
sys.modules[__name__ + ".PY"] = PY
|
|
10
|
+
sys.modules[__name__ + ".IR"] = IR
|
|
11
|
+
sys.modules[__name__ + ".Finals"] = Finals
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
__all__ = ["DM", "PY", "IR"]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from pathlib import Path


class SourceCodeModule:
    """Wraps a source file and renders its full text when printed.

    Both ``repr()`` and ``str()`` return the file contents, so dropping an
    instance into a notebook cell (or ``print``-ing it) shows the code.
    """

    def __init__(self, name: str, source_path: Path):
        self.name = name                 # display name for the module
        self._source_path = source_path  # file the code is read from
        self._source_code = None         # cache; populated on first access

    @property
    def source_code(self) -> str:
        """Read the file once, on first use, and serve the cached text after."""
        if self._source_code is None:
            self._source_code = self._source_path.read_text(encoding="utf-8")
        return self._source_code

    def __repr__(self) -> str:
        return self.source_code

    def __str__(self) -> str:
        return self.source_code
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Data-mining source snippets."""
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
"""PS13.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/141X72rA2VxgIbA93KoMkwHzHSc1fsbty

# Agglomerative hierarchical clustering

### Setup
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder

# Hard-coded Google Drive path — this script only runs inside Colab with
# Drive mounted.
csv_path = "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS13/mall_customers_dataset.csv"

df = pd.read_csv(csv_path)
# NOTE(review): ``display`` is the IPython/Colab builtin, never imported here;
# the script fails outside a notebook environment.
display(df)

"""### Remove CustomerID column"""

# Drop the row identifier; it carries no clustering signal.
df = df.drop(columns=["CustomerID"])
display(df)

"""### Check for missing values"""

missing_counts = df.isna().sum()
print(missing_counts)

# Recomputed per-column counts, expressed as a percentage of all rows.
missing_counts = df.isna().sum()
missing_percent = (missing_counts / len(df)) * 100
print(missing_percent)

print("Any missing values?", df.isna().values.any())

# Per-column report of missing-value counts.
for col in df.columns:
    if df[col].isna().sum() == 0:
        print(f"Column '{col}' has no missing values.")
    else:
        print(f"Column '{col}' has {df[col].isna().sum()} missing values.")

"""### Convert categorical variable value of gender to numerical (Male-1, Female-0)"""

le = LabelEncoder()
df["gender_encoded"] = le.fit_transform(df["Gender"])

print("Classes:", le.classes_)

display(df)

"""### Display male, female ratio as pie chart"""

gender_counts = df["gender_encoded"].value_counts()
# NOTE(review): value_counts() orders by frequency, so pairing these labels
# positionally assumes the majority class is Female — verify against the data.
labels = ["Female", "Male"]

plt.figure(figsize=(5, 5))
plt.pie(
    gender_counts,
    labels=labels,
    autopct="%1.1f%%",
    startangle=90,
    colors=["#66b3ff", "#ff9999"],
)
plt.title("Gender Distribution")
plt.show()

"""### Display age, annual income as bar graph"""

values = [df["Age"].mean(), df["Annual Income (k$)"].mean()]
labels = ["Age", "Annual Income (k$)"]

plt.bar(labels, values, color=["skyblue", "orange"])
plt.title("Age and Annual Income")
plt.ylabel("Value")
plt.show()

# Bucket ages into half-open bins [18,25), [25,35), ... for a grouped view.
df["Age_group"] = pd.cut(df["Age"], bins=[18, 25, 35, 45, 55, 65], right=False)

plt.figure(figsize=(8, 5))
sns.barplot(x="Age_group", y="Annual Income (k$)", data=df)

plt.title("Average Income by Age Group")
plt.xticks(rotation=45)
plt.show()

sns.barplot(x="Age", y="Annual Income (k$)", data=df, estimator="mean")

import plotly.express as px

# Interactive version of the mean-income-by-age bar chart.
fig = px.bar(
    df.groupby("Age")["Annual Income (k$)"].mean().reset_index(),
    x="Age",
    y="Annual Income (k$)",
    title="Mean Income by Age",
)
fig.show()

"""### Perform agglomerative clustering with ward algorithm as linkage."""

# Reference clustering via scikit-learn on two raw (unscaled) features.
agg_clust = AgglomerativeClustering(n_clusters=2, linkage="ward")
df["Cluster"] = agg_clust.fit_predict(df[["Age", "Annual Income (k$)"]])

print(df)

# SciPy linkage matrix for the dendrogram views below.
linked = linkage(df[["Age", "Annual Income (k$)"]], method="ward")

plt.figure(figsize=(20, 5))
dendrogram(
    linked, labels=df.index.tolist(), distance_sort="descending", show_leaf_counts=True
)
plt.title("Dendrogram")
plt.xlabel("Sample index")
plt.ylabel("Distance")
plt.show()

# Same dendrogram rendered with Plotly's figure factory.
fig = ff.create_dendrogram(
    df[["Age", "Annual Income (k$)"]].values,
    labels=df.index.tolist(),
    linkagefun=lambda x: linkage(x, "ward"),
)
fig.update_layout(width=1200, height=600)
fig.show()
|
|
132
|
+
|
|
133
|
+
"""## Custom
|
|
134
|
+
|
|
135
|
+
### Impl
|
|
136
|
+
"""
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def euclidean(a, b):
|
|
140
|
+
return np.sqrt(np.sum((a - b) ** 2))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def ward_distance(cluster_i, cluster_j, data):
    """Ward merge cost between two clusters given as index lists into ``data``.

    Uses the closed form
        d(i, j) = sqrt(2 * ni * nj / (ni + nj)) * ||mean_i - mean_j||
    """
    size_i = len(cluster_i)
    size_j = len(cluster_j)
    centroid_i = np.mean(data[cluster_i], axis=0)
    centroid_j = np.mean(data[cluster_j], axis=0)
    weight = np.sqrt((2 * size_i * size_j) / (size_i + size_j))
    return weight * euclidean(centroid_i, centroid_j)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def custom_agglomerative(data, n_clusters=2):
    """Bottom-up agglomerative clustering with Ward linkage, from scratch.

    Parameters
    ----------
    data : np.ndarray
        2-D array of shape (n_samples, n_features) — assumed from the
        row-index lists used below; TODO confirm for other callers.
    n_clusters : int
        Number of flat clusters to cut the final tree into.

    Returns
    -------
    labels : np.ndarray
        0-based cluster label per sample.
    Z : np.ndarray
        SciPy-style linkage matrix of shape (n_samples - 1, 4).
    """
    n = len(data)
    # Every point starts as its own cluster; keys are cluster ids, values
    # are lists of row indices into ``data``.
    clusters = {i: [i] for i in range(n)}
    linkage_matrix = []
    # Merged clusters receive fresh ids n, n+1, ... (SciPy convention).
    next_id = n

    while len(clusters) > 1:
        keys = list(clusters.keys())
        min_dist = float("inf")
        merge_a, merge_b = None, None

        # Exhaustive O(k^2) search for the cheapest pair to merge; ties go
        # to the first pair found (strict ``<``).
        for i in range(len(keys)):
            for j in range(i + 1, len(keys)):
                d = ward_distance(clusters[keys[i]], clusters[keys[j]], data)
                if d < min_dist:
                    min_dist = d
                    merge_a, merge_b = keys[i], keys[j]

        new_members = clusters[merge_a] + clusters[merge_b]

        id_a = float(min(merge_a, merge_b))
        id_b = float(max(merge_a, merge_b))
        # Recomputing Ward costs from centroids is not guaranteed to be
        # monotone; clamp each merge height to the previous one so the
        # dendrogram never decreases (required by SciPy's tools).
        prev_dist = linkage_matrix[-1][2] if linkage_matrix else 0.0
        safe_dist = float(max(min_dist, prev_dist))

        linkage_matrix.append([id_a, id_b, safe_dist, float(len(new_members))])

        clusters[next_id] = new_members
        del clusters[merge_a]
        del clusters[merge_b]
        next_id += 1

    Z = np.array(linkage_matrix, dtype=float)

    from scipy.cluster.hierarchy import fcluster

    # fcluster returns 1-based labels; shift to 0-based.
    labels = fcluster(Z, t=n_clusters, criterion="maxclust") - 1

    return labels, Z
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
features = df[["Age", "Annual Income (k$)"]].values
|
|
196
|
+
|
|
197
|
+
labels, Z = custom_agglomerative(features, n_clusters=2)
|
|
198
|
+
df["Cluster"] = labels
|
|
199
|
+
print(df)
|
|
200
|
+
|
|
201
|
+
"""### Visualizations"""
|
|
202
|
+
|
|
203
|
+
plt.figure(figsize=(8, 5))
|
|
204
|
+
for c in np.unique(labels):
|
|
205
|
+
mask = labels == c
|
|
206
|
+
plt.scatter(features[mask, 0], features[mask, 1], label=f"Cluster {c}", s=60)
|
|
207
|
+
plt.xlabel("Age")
|
|
208
|
+
plt.ylabel("Annual Income (k$)")
|
|
209
|
+
plt.title("Custom Agglomerative Clustering (Ward)")
|
|
210
|
+
plt.legend()
|
|
211
|
+
plt.tight_layout()
|
|
212
|
+
plt.show()
|
|
213
|
+
|
|
214
|
+
plt.figure(figsize=(12, 5))
|
|
215
|
+
dendrogram(linked, truncate_mode="lastp", p=20, leaf_rotation=45, leaf_font_size=10)
|
|
216
|
+
plt.title("Dendrogram (Ward Linkage)")
|
|
217
|
+
plt.xlabel("Cluster index / size")
|
|
218
|
+
plt.ylabel("Ward distance")
|
|
219
|
+
plt.tight_layout()
|
|
220
|
+
plt.show()
|
|
221
|
+
|
|
222
|
+
ddata = dendrogram(Z, truncate_mode="lastp", p=20, no_plot=True)
|
|
223
|
+
|
|
224
|
+
fig = go.Figure()
|
|
225
|
+
for x_seg, y_seg in zip(ddata["icoord"], ddata["dcoord"]):
|
|
226
|
+
fig.add_trace(
|
|
227
|
+
go.Scatter(
|
|
228
|
+
x=x_seg,
|
|
229
|
+
y=y_seg,
|
|
230
|
+
mode="lines",
|
|
231
|
+
line=dict(color="royalblue", width=1.5),
|
|
232
|
+
showlegend=False,
|
|
233
|
+
hoverinfo="skip",
|
|
234
|
+
)
|
|
235
|
+
)
|
|
236
|
+
|
|
237
|
+
fig.update_layout(
|
|
238
|
+
title="Dendrogram (Custom Ward Linkage)",
|
|
239
|
+
xaxis=dict(
|
|
240
|
+
title="Cluster index / size",
|
|
241
|
+
tickangle=45,
|
|
242
|
+
tickfont=dict(size=10),
|
|
243
|
+
showgrid=False,
|
|
244
|
+
),
|
|
245
|
+
yaxis=dict(title="Ward Distance", showgrid=True, gridcolor="lightgrey"),
|
|
246
|
+
width=1200,
|
|
247
|
+
height=500,
|
|
248
|
+
plot_bgcolor="white",
|
|
249
|
+
paper_bgcolor="white",
|
|
250
|
+
showlegend=False,
|
|
251
|
+
)
|
|
252
|
+
fig.show()
|
|
253
|
+
|
|
254
|
+
"""## Tests"""
|
|
255
|
+
|
|
256
|
+
test_data = np.array([[1, 2], [1, 4], [5, 8], [5, 9], [8, 1], [8, 2]], dtype=float)
|
|
257
|
+
|
|
258
|
+
# Custom
|
|
259
|
+
_, Z_custom = custom_agglomerative(test_data, n_clusters=2)
|
|
260
|
+
|
|
261
|
+
# Scipy
|
|
262
|
+
from scipy.cluster.hierarchy import linkage
|
|
263
|
+
|
|
264
|
+
Z_scipy = linkage(test_data, method="ward")
|
|
265
|
+
|
|
266
|
+
print("Custom:\n", Z_custom.round(3))
|
|
267
|
+
print("\nScipy:\n", Z_scipy.round(3))
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.metrics import (
    adjusted_rand_score,
    classification_report,
    normalized_mutual_info_score,
)
from sklearn.preprocessing import StandardScaler

# Colab-only: mount Google Drive to reach the dataset below.
drive.mount("/content/drive")

csv_path = "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS14/wholesale_customers_data.csv"
df = pd.read_csv(csv_path)

print(df.head())

"""### DF Report"""

print(df.describe())

"""### Drop 'Channel' and 'Region' columns"""

df = df.drop(columns=["Channel", "Region"])

# NOTE(review): ``display`` is the IPython/Colab builtin, never imported here.
display(df.head())

"""### Consider Groceries and Milk attributes. Normalize these attribute values by scaling it from 0 mean to unit variance. Visualize normalized dataset."""

# NOTE(review): this first scatter is of the RAW values (see axis labels),
# despite the "Normalized" title — the normalized plot comes further down.
plt.scatter(df["Grocery"], df["Milk"], alpha=0.6)
plt.title("Normalized Groceries vs Milk")
plt.xlabel("Grocery (before normalized)")
plt.ylabel("Milk (before normalized)")
plt.show()

# Select Groceries and Milk columns
data_to_normalize = df[["Grocery", "Milk"]]

# Normalize (StandardScaler standardizes to mean=0, std=1)
scaler = StandardScaler()
normalized_data = scaler.fit_transform(data_to_normalize)

# Convert back to DataFrame for convenience
normalized_df = pd.DataFrame(normalized_data, columns=["Grocery", "Milk"])

# Means should be ~0 and stds ~1 after standardization.
print(normalized_df.mean())
print(normalized_df.std())

mean_grocery = normalized_df["Grocery"].mean()
mean_milk = normalized_df["Milk"].mean()

plt.figure(figsize=(7, 5))

plt.scatter(normalized_df["Grocery"], normalized_df["Milk"], alpha=0.6)

# Dashed cross-hairs at the (near-zero) means of each standardized feature.
plt.axhline(mean_milk, linestyle="--", label=f"Milk Mean ({mean_milk:.2e})")
plt.axvline(mean_grocery, linestyle="--", label=f"Grocery Mean ({mean_grocery:.2e})")

plt.title("Normalized Grocery vs Milk")
plt.xlabel("Grocery (Standardized)")
plt.ylabel("Milk (Standardized)")

plt.legend()
plt.grid()
plt.show()

"""### Implementation of DBSCAN"""
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def euclidean_distance(p1, p2):
    """Straight-line (L2) distance between points ``p1`` and ``p2``."""
    return np.sqrt(np.sum(np.square(p1 - p2)))
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def region_query(data, point_idx, eps):
    """Return indices of all points within ``eps`` of ``data[point_idx]``.

    The query point itself is always included (distance 0), matching the
    DBSCAN convention that a point counts toward its own neighbourhood.

    Performance fix: the original looped over every row in Python, calling
    ``euclidean_distance`` once per point, which makes each query O(n)
    Python-level work and the whole DBSCAN run quadratic in interpreter
    overhead.  A single vectorized NumPy expression computes all distances
    at once and returns the identical ascending index list.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
    point_idx : int
        Row index of the query point.
    eps : float
        Neighbourhood radius (inclusive).

    Returns
    -------
    list[int]
        Indices of neighbours in ascending order.
    """
    points = np.asarray(data, dtype=float)
    distances = np.sqrt(np.sum((points - points[point_idx]) ** 2, axis=1))
    return np.nonzero(distances <= eps)[0].tolist()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def expand_cluster(data, labels, point_idx, neighbors, cluster_id, eps, min_pts):
    """Grow cluster ``cluster_id`` outward from a core point, in place.

    ``labels`` is mutated: 0 = unvisited, -1 = noise, >0 = cluster id.
    ``neighbors`` is extended while being iterated, which implements the
    classic DBSCAN seed-set expansion; duplicate indices appended to the
    frontier are harmless because already-labelled points are skipped.
    Returns nothing — all results are written into ``labels``.
    """
    labels[point_idx] = cluster_id
    i = 0
    while i < len(neighbors):
        neighbor_idx = neighbors[i]
        if labels[neighbor_idx] == -1:
            # Previously marked noise: it is a border point — absorb it,
            # but do not expand from it.
            labels[neighbor_idx] = cluster_id
        elif labels[neighbor_idx] == 0:
            # Unvisited: claim it; if it is itself a core point, widen the
            # search frontier with its neighbourhood.
            labels[neighbor_idx] = cluster_id
            new_neighbors = region_query(data, neighbor_idx, eps)
            if len(new_neighbors) >= min_pts:
                neighbors += new_neighbors
        i += 1
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def dbscan(data, eps=0.5, min_pts=15):
    """Run DBSCAN over ``data``; cluster ids start at 1, -1 marks noise.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
    eps : float
        Neighbourhood radius.
    min_pts : int
        Minimum neighbourhood size (query point included) for a core point.

    Returns
    -------
    list[int]
        Per-point labels. A point first marked -1 (noise) can later be
        relabelled as a border point by ``expand_cluster``.
    """
    labels = [0] * len(data)  # 0 means unvisited
    cluster_id = 0

    for point_idx in range(len(data)):
        if labels[point_idx] != 0:
            continue  # already claimed by an earlier cluster or marked noise
        neighbors = region_query(data, point_idx, eps)
        if len(neighbors) < min_pts:
            labels[point_idx] = -1  # Noise
        else:
            # New core point found: start the next cluster and flood-fill it.
            cluster_id += 1
            expand_cluster(data, labels, point_idx, neighbors, cluster_id, eps, min_pts)

    return labels
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# Run on normalized grocery & milk data
data_np = normalized_df.values
labels = dbscan(data_np, eps=0.5, min_pts=15)

# Convert labels to numpy array for plotting
labels = np.array(labels)

plt.figure(figsize=(8, 6))
unique_labels = set(labels)
colors = plt.cm.get_cmap("tab10", len(unique_labels))

for k in unique_labels:
    class_member_mask = labels == k
    xy = data_np[class_member_mask]
    if k == -1:
        # Noise in black
        plt.scatter(xy[:, 0], xy[:, 1], c="k", marker="x", label="Noise")
    else:
        # Custom labels start at 1, so shift to index the colormap from 0.
        # NOTE(review): the sklearn plots below use colors(k) because
        # sklearn's labels start at 0 — the two colour schemes differ.
        plt.scatter(xy[:, 0], xy[:, 1], c=[colors(k - 1)], label=f"Cluster {k}")
plt.title("DBSCAN Clustering Results (Custom Implementation)")
plt.xlabel("Grocery (normalized)")
plt.ylabel("Milk (normalized)")
plt.legend()
plt.show()

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

# Generate moon-shaped data
# NOTE(review): no random_state — results differ run to run.
X, _ = make_moons(n_samples=2000, noise=0.05)

# Run DBSCAN
dbscan_builtin = DBSCAN(eps=0.3, min_samples=15)
labels_builtin = dbscan_builtin.fit_predict(X)

# Plot
plt.figure(figsize=(8, 6))
unique_labels = set(labels_builtin)
colors = plt.cm.get_cmap("tab10", len(unique_labels))

for k in unique_labels:
    class_member_mask = labels_builtin == k
    xy = X[class_member_mask]
    if k == -1:
        plt.scatter(xy[:, 0], xy[:, 1], c="k", marker="x", label="Noise")
    else:
        plt.scatter(xy[:, 0], xy[:, 1], c=[colors(k)], label=f"Cluster {k}")
plt.title("DBSCAN on make_moons")
plt.legend()
plt.show()

# Generate synthetic moon-shaped data
X, true_labels = make_moons(n_samples=2000, noise=0.05)

# Custom DBSCAN on X
custom_labels = dbscan(X, eps=0.3, min_pts=15)
custom_labels = np.array(custom_labels)

# Sklearn DBSCAN on X
dbscan_builtin = DBSCAN(eps=0.3, min_samples=15)
sklearn_labels = dbscan_builtin.fit_predict(X)

# Clustering metrics comparing the two
# (ARI/NMI are permutation-invariant, so differing label numbering between
# the custom (1-based) and sklearn (0-based) runs does not affect them.)
ari = adjusted_rand_score(custom_labels, sklearn_labels)
nmi = normalized_mutual_info_score(custom_labels, sklearn_labels)

print(f"Adjusted Rand Index (ARI) between custom and sklearn: {ari:.4f}")
print(f"Normalized Mutual Information (NMI) between custom and sklearn: {nmi:.4f}")

# Classification report (treating custom as true, sklearn as predicted)
# NOTE(review): unlike ARI/NMI, classification_report IS sensitive to the
# label-numbering mismatch noted above — interpret with care.
print("\nClassification report (custom as true, sklearn as predicted):")
print(classification_report(custom_labels, sklearn_labels))

"""### Add some noise data and again visualize the results."""

# Add random noise points
noise = np.random.uniform(low=-2, high=3, size=(200, 2))

# Combine with original data
X_noisy = np.vstack([X, noise])

# Run DBSCAN again
dbscan_noisy = DBSCAN(eps=0.3, min_samples=15)
labels_noisy = dbscan_noisy.fit_predict(X_noisy)

# Plot
plt.figure(figsize=(8, 6))
unique_labels = set(labels_noisy)
colors = plt.cm.get_cmap("tab10", len(unique_labels))

for k in unique_labels:
    class_member_mask = labels_noisy == k
    xy = X_noisy[class_member_mask]

    if k == -1:
        plt.scatter(xy[:, 0], xy[:, 1], c="k", marker="x", label="Noise")
    else:
        plt.scatter(xy[:, 0], xy[:, 1], c=[colors(k)], label=f"Cluster {k}")

plt.title("DBSCAN with Added Noise")
plt.legend()
plt.show()
|