itertoolkit 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. bm_preprocessing/__init__.py +14 -0
  2. bm_preprocessing/importer/DM/__init__.py +7 -0
  3. bm_preprocessing/importer/DM/agg.py +6 -0
  4. bm_preprocessing/importer/DM/dbscan.py +6 -0
  5. bm_preprocessing/importer/DM/finals.py +6 -0
  6. bm_preprocessing/importer/DM/gsp.py +6 -0
  7. bm_preprocessing/importer/DM/test.py +6 -0
  8. bm_preprocessing/importer/Finals/__init__.py +7 -0
  9. bm_preprocessing/importer/Finals/kaadhal.py +6 -0
  10. bm_preprocessing/importer/Finals/raaka.py +6 -0
  11. bm_preprocessing/importer/Finals/seedan.py +6 -0
  12. bm_preprocessing/importer/Finals/vikram.py +6 -0
  13. bm_preprocessing/importer/IR/__init__.py +6 -0
  14. bm_preprocessing/importer/IR/finals.py +6 -0
  15. bm_preprocessing/importer/IR/pagerank.py +6 -0
  16. bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
  17. bm_preprocessing/importer/IR/test.py +6 -0
  18. bm_preprocessing/importer/PY/__init__.py +4 -0
  19. bm_preprocessing/importer/PY/lib_doc.py +6 -0
  20. bm_preprocessing/importer/PY/python_doc.py +6 -0
  21. bm_preprocessing/importer/__init__.py +8 -0
  22. bm_preprocessing/importer/_module_printer.py +23 -0
  23. bm_preprocessing/src/DM/__init__.py +1 -0
  24. bm_preprocessing/src/DM/agg.py +267 -0
  25. bm_preprocessing/src/DM/dbscan.py +218 -0
  26. bm_preprocessing/src/DM/finals.py +19 -0
  27. bm_preprocessing/src/DM/gsp.py +378 -0
  28. bm_preprocessing/src/DM/test.py +19 -0
  29. bm_preprocessing/src/Finals/__init__.py +1 -0
  30. bm_preprocessing/src/Finals/kaadhal.py +1453 -0
  31. bm_preprocessing/src/Finals/raaka.py +1338 -0
  32. bm_preprocessing/src/Finals/seedan.py +1173 -0
  33. bm_preprocessing/src/Finals/vikram.py +520 -0
  34. bm_preprocessing/src/IR/__init__.py +1 -0
  35. bm_preprocessing/src/IR/finals.py +14 -0
  36. bm_preprocessing/src/IR/pagerank.py +109 -0
  37. bm_preprocessing/src/IR/recommenders_pca.py +487 -0
  38. bm_preprocessing/src/IR/test.py +14 -0
  39. bm_preprocessing/src/PY/__init__.py +1 -0
  40. bm_preprocessing/src/PY/lib_doc.py +295 -0
  41. bm_preprocessing/src/PY/python_doc.py +177 -0
  42. bm_preprocessing/src/__init__.py +1 -0
  43. itertoolkit-1.5.0.dist-info/METADATA +120 -0
  44. itertoolkit-1.5.0.dist-info/RECORD +45 -0
  45. itertoolkit-1.5.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,14 @@
1
+ """bm_preprocessing package."""
2
+
3
+ # Keep legacy module paths working, e.g. ``import bm_preprocessing.DM``.
4
+ import sys
5
+
6
+ from .importer import DM, IR, PY, Finals
7
+
8
+ sys.modules[__name__ + ".DM"] = DM
9
+ sys.modules[__name__ + ".PY"] = PY
10
+ sys.modules[__name__ + ".IR"] = IR
11
+ sys.modules[__name__ + ".Finals"] = Finals
12
+
13
+
14
+ __all__ = ["DM", "PY", "IR"]
@@ -0,0 +1,7 @@
1
"""Printable source wrappers for the data-mining (DM) files."""
from .agg import agg
from .dbscan import dbscan
from .finals import finals
from .gsp import gsp
from .test import test

__all__ = ["finals", "test", "agg", "dbscan", "gsp"]
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``agg`` dumps src/DM/agg.py.
_source_file = Path(__file__).parents[2].joinpath("src", "DM", "agg.py")
agg = SourceCodeModule("bm_preprocessing.DM.agg", _source_file)
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``dbscan`` dumps src/DM/dbscan.py.
_source_file = Path(__file__).parents[2].joinpath("src", "DM", "dbscan.py")
dbscan = SourceCodeModule("bm_preprocessing.DM.dbscan", _source_file)
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``finals`` dumps src/DM/finals.py.
_source_file = Path(__file__).parents[2].joinpath("src", "DM", "finals.py")
finals = SourceCodeModule("bm_preprocessing.DM.finals", _source_file)
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``gsp`` dumps src/DM/gsp.py.
_source_file = Path(__file__).parents[2].joinpath("src", "DM", "gsp.py")
gsp = SourceCodeModule("bm_preprocessing.DM.gsp", _source_file)
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``test`` dumps src/DM/test.py.
_source_file = Path(__file__).parents[2].joinpath("src", "DM", "test.py")
test = SourceCodeModule("bm_preprocessing.DM.test", _source_file)
@@ -0,0 +1,7 @@
1
+ """Finals source snippets."""
2
+ from .kaadhal import kaadhal
3
+ from .raaka import raaka
4
+ from .seedan import seedan
5
+ from .vikram import vikram
6
+
7
+ __all__ = ["kaadhal", "raaka", "seedan", "vikram"]
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``kaadhal`` dumps src/Finals/kaadhal.py.
_source_file = Path(__file__).parents[2].joinpath("src", "Finals", "kaadhal.py")
kaadhal = SourceCodeModule("bm_preprocessing.Finals.kaadhal", _source_file)
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``raaka`` dumps src/Finals/raaka.py.
_source_file = Path(__file__).parents[2].joinpath("src", "Finals", "raaka.py")
raaka = SourceCodeModule("bm_preprocessing.Finals.raaka", _source_file)
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``seedan`` dumps src/Finals/seedan.py.
_source_file = Path(__file__).parents[2].joinpath("src", "Finals", "seedan.py")
seedan = SourceCodeModule("bm_preprocessing.Finals.seedan", _source_file)
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``vikram`` dumps src/Finals/vikram.py.
_source_file = Path(__file__).parents[2].joinpath("src", "Finals", "vikram.py")
vikram = SourceCodeModule("bm_preprocessing.Finals.vikram", _source_file)
@@ -0,0 +1,6 @@
1
"""Printable source wrappers for the information-retrieval (IR) files."""
from .finals import finals
from .pagerank import pagerank
from .recommenders_pca import recommenders_pca
from .test import test

__all__ = ["finals", "test", "pagerank", "recommenders_pca"]
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``finals`` dumps src/IR/finals.py.
_source_file = Path(__file__).parents[2].joinpath("src", "IR", "finals.py")
finals = SourceCodeModule("bm_preprocessing.IR.finals", _source_file)
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``pagerank`` dumps src/IR/pagerank.py.
_source_file = Path(__file__).parents[2].joinpath("src", "IR", "pagerank.py")
pagerank = SourceCodeModule("bm_preprocessing.IR.pagerank", _source_file)
@@ -0,0 +1,8 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() dumps src/IR/recommenders_pca.py.
_source_file = Path(__file__).parents[2].joinpath(
    "src", "IR", "recommenders_pca.py"
)
recommenders_pca = SourceCodeModule("bm_preprocessing.IR.recommenders_pca", _source_file)
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``test`` dumps src/IR/test.py.
_source_file = Path(__file__).parents[2].joinpath("src", "IR", "test.py")
test = SourceCodeModule("bm_preprocessing.IR.test", _source_file)
@@ -0,0 +1,4 @@
1
"""Printable source wrappers for the Python-documentation (PY) files."""
from .lib_doc import lib_doc
from .python_doc import python_doc

__all__ = ["lib_doc", "python_doc"]
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``lib_doc`` dumps src/PY/lib_doc.py.
_source_file = Path(__file__).parents[2].joinpath("src", "PY", "lib_doc.py")
lib_doc = SourceCodeModule("bm_preprocessing.PY.lib_doc", _source_file)
@@ -0,0 +1,6 @@
1
from pathlib import Path

from .._module_printer import SourceCodeModule

# Printable wrapper: repr()/str() of ``python_doc`` dumps src/PY/python_doc.py.
_source_file = Path(__file__).parents[2].joinpath("src", "PY", "python_doc.py")
python_doc = SourceCodeModule("bm_preprocessing.PY.python_doc", _source_file)
@@ -0,0 +1,8 @@
1
+ """Importer layer that exposes printable source wrappers."""
2
+
3
+ from .DM import *
4
+ from .IR import *
5
+ from .PY import *
6
+ from .Finals import *
7
+
8
+ __all__ = ["DM", "PY", "IR", "Finals"]
@@ -0,0 +1,23 @@
1
+ from pathlib import Path
2
+
3
+
4
class SourceCodeModule:
    """Stand-in module whose printed form is the text of a source file.

    Both ``repr()`` and ``str()`` return the contents of ``source_path``;
    the file is read lazily (UTF-8) on first access and cached afterwards.
    """

    def __init__(self, name: str, source_path: Path):
        # Dotted module name this wrapper stands in for.
        self.name = name
        self._path = source_path
        self._text = None  # cache filled on first source_code access

    @property
    def source_code(self) -> str:
        """Return the wrapped file's text, reading it at most once."""
        if self._text is None:
            self._text = self._path.read_text(encoding="utf-8")
        return self._text

    def __repr__(self) -> str:
        return self.source_code

    # str() intentionally shows the same thing as repr().
    __str__ = __repr__
@@ -0,0 +1 @@
1
+ """Data-mining source snippets."""
@@ -0,0 +1,267 @@
1
+ # -*- coding: utf-8 -*-
2
+ """PS13.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/141X72rA2VxgIbA93KoMkwHzHSc1fsbty
8
+
9
+ # Agglomerative hierarchical clustering
10
+
11
+ ### Setup
12
+ """
13
+
14
+ import matplotlib.pyplot as plt
15
+ import numpy as np
16
+ import pandas as pd
17
+ import plotly.figure_factory as ff
18
+ import plotly.graph_objects as go
19
+ import seaborn as sns
20
+ from scipy.cluster.hierarchy import dendrogram, linkage
21
+ from sklearn.cluster import AgglomerativeClustering
22
+ from sklearn.preprocessing import LabelEncoder
23
+
24
+ csv_path = "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS13/mall_customers_dataset.csv"
25
+
26
+ df = pd.read_csv(csv_path)
27
+ display(df)
28
+
29
+ """### Remove CustomerID column"""
30
+
31
+ df = df.drop(columns=["CustomerID"])
32
+ display(df)
33
+
34
+ """### Check for missing values"""
35
+
36
+ missing_counts = df.isna().sum()
37
+ print(missing_counts)
38
+
39
+ missing_counts = df.isna().sum()
40
+ missing_percent = (missing_counts / len(df)) * 100
41
+ print(missing_percent)
42
+
43
+ print("Any missing values?", df.isna().values.any())
44
+
45
+ for col in df.columns:
46
+ if df[col].isna().sum() == 0:
47
+ print(f"Column '{col}' has no missing values.")
48
+ else:
49
+ print(f"Column '{col}' has {df[col].isna().sum()} missing values.")
50
+
51
+ """### Convert categorical variable value of gender to numerical (Male-1, Female-0)"""
52
+
53
+ le = LabelEncoder()
54
+ df["gender_encoded"] = le.fit_transform(df["Gender"])
55
+
56
+ print("Classes:", le.classes_)
57
+
58
+ display(df)
59
+
60
+ """### Display male, female ratio as pie chart"""
61
+
62
+ gender_counts = df["gender_encoded"].value_counts()
63
+ labels = ["Female", "Male"]
64
+
65
+ plt.figure(figsize=(5, 5))
66
+ plt.pie(
67
+ gender_counts,
68
+ labels=labels,
69
+ autopct="%1.1f%%",
70
+ startangle=90,
71
+ colors=["#66b3ff", "#ff9999"],
72
+ )
73
+ plt.title("Gender Distribution")
74
+ plt.show()
75
+
76
+ """### Display age, annual income as bar graph"""
77
+
78
+ values = [df["Age"].mean(), df["Annual Income (k$)"].mean()]
79
+ labels = ["Age", "Annual Income (k$)"]
80
+
81
+ plt.bar(labels, values, color=["skyblue", "orange"])
82
+ plt.title("Age and Annual Income")
83
+ plt.ylabel("Value")
84
+ plt.show()
85
+
86
+ df["Age_group"] = pd.cut(df["Age"], bins=[18, 25, 35, 45, 55, 65], right=False)
87
+
88
+ plt.figure(figsize=(8, 5))
89
+ sns.barplot(x="Age_group", y="Annual Income (k$)", data=df)
90
+
91
+ plt.title("Average Income by Age Group")
92
+ plt.xticks(rotation=45)
93
+ plt.show()
94
+
95
+ sns.barplot(x="Age", y="Annual Income (k$)", data=df, estimator="mean")
96
+
97
+ import plotly.express as px
98
+
99
+ fig = px.bar(
100
+ df.groupby("Age")["Annual Income (k$)"].mean().reset_index(),
101
+ x="Age",
102
+ y="Annual Income (k$)",
103
+ title="Mean Income by Age",
104
+ )
105
+ fig.show()
106
+
107
+ """### Perform agglomerative clustering with ward algorithm as linkage."""
108
+
109
+ agg_clust = AgglomerativeClustering(n_clusters=2, linkage="ward")
110
+ df["Cluster"] = agg_clust.fit_predict(df[["Age", "Annual Income (k$)"]])
111
+
112
+ print(df)
113
+
114
+ linked = linkage(df[["Age", "Annual Income (k$)"]], method="ward")
115
+
116
+ plt.figure(figsize=(20, 5))
117
+ dendrogram(
118
+ linked, labels=df.index.tolist(), distance_sort="descending", show_leaf_counts=True
119
+ )
120
+ plt.title("Dendrogram")
121
+ plt.xlabel("Sample index")
122
+ plt.ylabel("Distance")
123
+ plt.show()
124
+
125
+ fig = ff.create_dendrogram(
126
+ df[["Age", "Annual Income (k$)"]].values,
127
+ labels=df.index.tolist(),
128
+ linkagefun=lambda x: linkage(x, "ward"),
129
+ )
130
+ fig.update_layout(width=1200, height=600)
131
+ fig.show()
132
+
133
+ """## Custom
134
+
135
+ ### Impl
136
+ """
137
+
138
+
139
def euclidean(a, b):
    """Euclidean (L2) distance between two equal-length vectors."""
    return np.sqrt(np.sum(np.square(a - b)))
141
+
142
+
143
def ward_distance(cluster_i, cluster_j, data):
    """Ward linkage distance between two clusters of row indices.

    Implements d(i, j) = sqrt((2 * ni * nj) / (ni + nj)) * ||mean_i - mean_j||
    where ni/nj are cluster sizes and the norm is the Euclidean distance
    between the cluster centroids.
    """
    size_i = len(cluster_i)
    size_j = len(cluster_j)
    centroid_i = np.mean(data[cluster_i], axis=0)
    centroid_j = np.mean(data[cluster_j], axis=0)
    # Centroid gap, inlined Euclidean norm.
    gap = np.sqrt(np.sum((centroid_i - centroid_j) ** 2))
    return np.sqrt((2 * size_i * size_j) / (size_i + size_j)) * gap
152
+
153
+
154
def custom_agglomerative(data, n_clusters=2):
    """Bottom-up Ward clustering over the rows of ``data``.

    Returns ``(labels, Z)`` where ``labels`` assigns each row to one of
    ``n_clusters`` flat clusters (0-based) and ``Z`` is a SciPy-style
    linkage matrix with rows ``[id_a, id_b, distance, size]``.
    """

    def _pair_distance(members_a, members_b):
        # Ward distance: sqrt(2*na*nb/(na+nb)) * ||centroid_a - centroid_b||.
        na, nb = len(members_a), len(members_b)
        centroid_a = np.mean(data[members_a], axis=0)
        centroid_b = np.mean(data[members_b], axis=0)
        gap = np.sqrt(np.sum((centroid_a - centroid_b) ** 2))
        return np.sqrt((2 * na * nb) / (na + nb)) * gap

    total = len(data)
    clusters = {idx: [idx] for idx in range(total)}
    merge_rows = []
    next_id = total

    while len(clusters) > 1:
        active = list(clusters.keys())
        best = float("inf")
        pick_a = pick_b = None

        # Exhaustively scan every active pair for the smallest Ward distance
        # (first-seen pair wins ties, matching the strict '<' comparison).
        for pos, key_a in enumerate(active):
            for key_b in active[pos + 1:]:
                dist = _pair_distance(clusters[key_a], clusters[key_b])
                if dist < best:
                    best = dist
                    pick_a, pick_b = key_a, key_b

        merged = clusters[pick_a] + clusters[pick_b]

        low = float(min(pick_a, pick_b))
        high = float(max(pick_a, pick_b))
        # Clamp to the previous merge height so the dendrogram stays monotone.
        floor = merge_rows[-1][2] if merge_rows else 0.0
        height = float(max(best, floor))

        merge_rows.append([low, high, height, float(len(merged))])

        clusters[next_id] = merged
        del clusters[pick_a]
        del clusters[pick_b]
        next_id += 1

    Z = np.array(merge_rows, dtype=float)

    from scipy.cluster.hierarchy import fcluster

    # Cut the tree into n_clusters flat clusters; shift labels to be 0-based.
    labels = fcluster(Z, t=n_clusters, criterion="maxclust") - 1

    return labels, Z
193
+
194
+
195
+ features = df[["Age", "Annual Income (k$)"]].values
196
+
197
+ labels, Z = custom_agglomerative(features, n_clusters=2)
198
+ df["Cluster"] = labels
199
+ print(df)
200
+
201
+ """### Visualizations"""
202
+
203
+ plt.figure(figsize=(8, 5))
204
+ for c in np.unique(labels):
205
+ mask = labels == c
206
+ plt.scatter(features[mask, 0], features[mask, 1], label=f"Cluster {c}", s=60)
207
+ plt.xlabel("Age")
208
+ plt.ylabel("Annual Income (k$)")
209
+ plt.title("Custom Agglomerative Clustering (Ward)")
210
+ plt.legend()
211
+ plt.tight_layout()
212
+ plt.show()
213
+
214
+ plt.figure(figsize=(12, 5))
215
+ dendrogram(linked, truncate_mode="lastp", p=20, leaf_rotation=45, leaf_font_size=10)
216
+ plt.title("Dendrogram (Ward Linkage)")
217
+ plt.xlabel("Cluster index / size")
218
+ plt.ylabel("Ward distance")
219
+ plt.tight_layout()
220
+ plt.show()
221
+
222
+ ddata = dendrogram(Z, truncate_mode="lastp", p=20, no_plot=True)
223
+
224
+ fig = go.Figure()
225
+ for x_seg, y_seg in zip(ddata["icoord"], ddata["dcoord"]):
226
+ fig.add_trace(
227
+ go.Scatter(
228
+ x=x_seg,
229
+ y=y_seg,
230
+ mode="lines",
231
+ line=dict(color="royalblue", width=1.5),
232
+ showlegend=False,
233
+ hoverinfo="skip",
234
+ )
235
+ )
236
+
237
+ fig.update_layout(
238
+ title="Dendrogram (Custom Ward Linkage)",
239
+ xaxis=dict(
240
+ title="Cluster index / size",
241
+ tickangle=45,
242
+ tickfont=dict(size=10),
243
+ showgrid=False,
244
+ ),
245
+ yaxis=dict(title="Ward Distance", showgrid=True, gridcolor="lightgrey"),
246
+ width=1200,
247
+ height=500,
248
+ plot_bgcolor="white",
249
+ paper_bgcolor="white",
250
+ showlegend=False,
251
+ )
252
+ fig.show()
253
+
254
+ """## Tests"""
255
+
256
+ test_data = np.array([[1, 2], [1, 4], [5, 8], [5, 9], [8, 1], [8, 2]], dtype=float)
257
+
258
+ # Custom
259
+ _, Z_custom = custom_agglomerative(test_data, n_clusters=2)
260
+
261
+ # Scipy
262
+ from scipy.cluster.hierarchy import linkage
263
+
264
+ Z_scipy = linkage(test_data, method="ward")
265
+
266
+ print("Custom:\n", Z_custom.round(3))
267
+ print("\nScipy:\n", Z_scipy.round(3))
@@ -0,0 +1,218 @@
1
+ import os
2
+
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+ from google.colab import drive
7
+ from sklearn.metrics import (
8
+ adjusted_rand_score,
9
+ classification_report,
10
+ normalized_mutual_info_score,
11
+ )
12
+ from sklearn.preprocessing import StandardScaler
13
+
14
+ drive.mount("/content/drive")
15
+
16
+ csv_path = "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS14/wholesale_customers_data.csv"
17
+ df = pd.read_csv(csv_path)
18
+
19
+ print(df.head())
20
+
21
+ """### DF Report"""
22
+
23
+ print(df.describe())
24
+
25
+ """### Drop 'Channel' and 'Region' columns"""
26
+
27
+ df = df.drop(columns=["Channel", "Region"])
28
+
29
+ display(df.head())
30
+
31
+ """### Consider Groceries and Milk attributes. Normalize these attribute values by scaling it from 0 mean to unit variance. Visualize normalized dataset."""
32
+
33
+ plt.scatter(df["Grocery"], df["Milk"], alpha=0.6)
34
+ plt.title("Normalized Groceries vs Milk")
35
+ plt.xlabel("Grocery (before normalized)")
36
+ plt.ylabel("Milk (before normalized)")
37
+ plt.show()
38
+
39
+ # Select Groceries and Milk columns
40
+ data_to_normalize = df[["Grocery", "Milk"]]
41
+
42
+ # Normalize (StandardScaler standardizes to mean=0, std=1)
43
+ scaler = StandardScaler()
44
+ normalized_data = scaler.fit_transform(data_to_normalize)
45
+
46
+ # Convert back to DataFrame for convenience
47
+ normalized_df = pd.DataFrame(normalized_data, columns=["Grocery", "Milk"])
48
+
49
+ print(normalized_df.mean())
50
+ print(normalized_df.std())
51
+
52
+ mean_grocery = normalized_df["Grocery"].mean()
53
+ mean_milk = normalized_df["Milk"].mean()
54
+
55
+ plt.figure(figsize=(7, 5))
56
+
57
+ plt.scatter(normalized_df["Grocery"], normalized_df["Milk"], alpha=0.6)
58
+
59
+ plt.axhline(mean_milk, linestyle="--", label=f"Milk Mean ({mean_milk:.2e})")
60
+ plt.axvline(mean_grocery, linestyle="--", label=f"Grocery Mean ({mean_grocery:.2e})")
61
+
62
+ plt.title("Normalized Grocery vs Milk")
63
+ plt.xlabel("Grocery (Standardized)")
64
+ plt.ylabel("Milk (Standardized)")
65
+
66
+ plt.legend()
67
+ plt.grid()
68
+ plt.show()
69
+
70
+ """### Implementation of DBSCAN"""
71
+
72
+
73
def euclidean_distance(p1, p2):
    """Straight-line (L2) distance between two points."""
    delta = p1 - p2
    return np.sqrt(np.sum(delta * delta))
75
+
76
+
77
def region_query(data, point_idx, eps):
    """Indices of every sample within ``eps`` of sample ``point_idx``.

    The anchor point itself is always included (its distance is 0).
    """
    anchor = data[point_idx]
    return [
        idx
        for idx in range(len(data))
        if np.sqrt(np.sum((anchor - data[idx]) ** 2)) <= eps
    ]
83
+
84
+
85
def expand_cluster(data, labels, point_idx, neighbors, cluster_id, eps, min_pts):
    """Grow cluster ``cluster_id`` outward from seed ``point_idx``, in place.

    ``labels`` convention: 0 = unvisited, -1 = noise, >0 = cluster id.
    The ``neighbors`` work list is extended while it is being scanned; that
    growth is what lets the cluster spread through density-connected points.
    """
    labels[point_idx] = cluster_id
    i = 0
    while i < len(neighbors):
        neighbor_idx = neighbors[i]
        if labels[neighbor_idx] == -1:
            # Border point previously marked noise: absorb it into the
            # cluster but do not expand from it.
            labels[neighbor_idx] = cluster_id
        elif labels[neighbor_idx] == 0:
            labels[neighbor_idx] = cluster_id
            new_neighbors = region_query(data, neighbor_idx, eps)
            if len(new_neighbors) >= min_pts:
                # Core point: its whole neighbourhood joins the work list.
                # Duplicates may be appended; they are skipped on revisit
                # because their label is no longer 0.
                neighbors += new_neighbors
        i += 1
98
+
99
+
100
def dbscan(data, eps=0.5, min_pts=15):
    """Density-based clustering (DBSCAN) over the samples in ``data``.

    Parameters:
        data: indexable collection of points (rows of a 2-D array).
        eps: neighbourhood radius passed to ``region_query``.
        min_pts: minimum neighbourhood size for a point to seed a cluster.

    Returns a list of per-sample labels: cluster ids start at 1 and -1
    marks noise (a noise point may later be absorbed by ``expand_cluster``).
    """
    labels = [0] * len(data)  # 0 means unvisited
    cluster_id = 0

    for point_idx in range(len(data)):
        # Skip points already claimed by a cluster or marked as noise.
        if labels[point_idx] != 0:
            continue
        neighbors = region_query(data, point_idx, eps)
        if len(neighbors) < min_pts:
            labels[point_idx] = -1  # Noise
        else:
            # Dense enough to seed a new cluster; grow it in place.
            cluster_id += 1
            expand_cluster(data, labels, point_idx, neighbors, cluster_id, eps, min_pts)

    return labels
115
+
116
+
117
+ # Run on normalized grocery & milk data
118
+ data_np = normalized_df.values
119
+ labels = dbscan(data_np, eps=0.5, min_pts=15)
120
+
121
+ # Convert labels to numpy array for plotting
122
+ labels = np.array(labels)
123
+
124
+ plt.figure(figsize=(8, 6))
125
+ unique_labels = set(labels)
126
+ colors = plt.cm.get_cmap("tab10", len(unique_labels))
127
+
128
+ for k in unique_labels:
129
+ class_member_mask = labels == k
130
+ xy = data_np[class_member_mask]
131
+ if k == -1:
132
+ # Noise in black
133
+ plt.scatter(xy[:, 0], xy[:, 1], c="k", marker="x", label="Noise")
134
+ else:
135
+ plt.scatter(xy[:, 0], xy[:, 1], c=[colors(k - 1)], label=f"Cluster {k}")
136
+ plt.title("DBSCAN Clustering Results (Custom Implementation)")
137
+ plt.xlabel("Grocery (normalized)")
138
+ plt.ylabel("Milk (normalized)")
139
+ plt.legend()
140
+ plt.show()
141
+
142
+ from sklearn.cluster import DBSCAN
143
+ from sklearn.datasets import make_moons
144
+
145
+ # Generate moon-shaped data
146
+ X, _ = make_moons(n_samples=2000, noise=0.05)
147
+
148
+ # Run DBSCAN
149
+ dbscan_builtin = DBSCAN(eps=0.3, min_samples=15)
150
+ labels_builtin = dbscan_builtin.fit_predict(X)
151
+
152
+ # Plot
153
+ plt.figure(figsize=(8, 6))
154
+ unique_labels = set(labels_builtin)
155
+ colors = plt.cm.get_cmap("tab10", len(unique_labels))
156
+
157
+ for k in unique_labels:
158
+ class_member_mask = labels_builtin == k
159
+ xy = X[class_member_mask]
160
+ if k == -1:
161
+ plt.scatter(xy[:, 0], xy[:, 1], c="k", marker="x", label="Noise")
162
+ else:
163
+ plt.scatter(xy[:, 0], xy[:, 1], c=[colors(k)], label=f"Cluster {k}")
164
+ plt.title("DBSCAN on make_moons")
165
+ plt.legend()
166
+ plt.show()
167
+
168
+ # Generate synthetic moon-shaped data
169
+ X, true_labels = make_moons(n_samples=2000, noise=0.05)
170
+
171
+ # Custom DBSCAN on X
172
+ custom_labels = dbscan(X, eps=0.3, min_pts=15)
173
+ custom_labels = np.array(custom_labels)
174
+
175
+ # Sklearn DBSCAN on X
176
+ dbscan_builtin = DBSCAN(eps=0.3, min_samples=15)
177
+ sklearn_labels = dbscan_builtin.fit_predict(X)
178
+
179
+ # Clustering metrics comparing the two
180
+ ari = adjusted_rand_score(custom_labels, sklearn_labels)
181
+ nmi = normalized_mutual_info_score(custom_labels, sklearn_labels)
182
+
183
+ print(f"Adjusted Rand Index (ARI) between custom and sklearn: {ari:.4f}")
184
+ print(f"Normalized Mutual Information (NMI) between custom and sklearn: {nmi:.4f}")
185
+
186
+ # Classification report (treating custom as true, sklearn as predicted)
187
+ print("\nClassification report (custom as true, sklearn as predicted):")
188
+ print(classification_report(custom_labels, sklearn_labels))
189
+
190
+ """### Add some noise data and again visualize the results."""
191
+
192
+ # Add random noise points
193
+ noise = np.random.uniform(low=-2, high=3, size=(200, 2))
194
+
195
+ # Combine with original data
196
+ X_noisy = np.vstack([X, noise])
197
+
198
+ # Run DBSCAN again
199
+ dbscan_noisy = DBSCAN(eps=0.3, min_samples=15)
200
+ labels_noisy = dbscan_noisy.fit_predict(X_noisy)
201
+
202
+ # Plot
203
+ plt.figure(figsize=(8, 6))
204
+ unique_labels = set(labels_noisy)
205
+ colors = plt.cm.get_cmap("tab10", len(unique_labels))
206
+
207
+ for k in unique_labels:
208
+ class_member_mask = labels_noisy == k
209
+ xy = X_noisy[class_member_mask]
210
+
211
+ if k == -1:
212
+ plt.scatter(xy[:, 0], xy[:, 1], c="k", marker="x", label="Noise")
213
+ else:
214
+ plt.scatter(xy[:, 0], xy[:, 1], c=[colors(k)], label=f"Cluster {k}")
215
+
216
+ plt.title("DBSCAN with Added Noise")
217
+ plt.legend()
218
+ plt.show()