hossam 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hossam/__init__.py +19 -0
- hossam/hs_cluster copy.py +1060 -0
- hossam/hs_cluster.py +369 -128
- hossam/hs_plot.py +244 -13
- hossam/hs_prep.py +241 -56
- hossam/hs_stats.py +39 -2
- hossam/hs_util.py +20 -0
- {hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/METADATA +1 -1
- hossam-0.4.18.dist-info/RECORD +18 -0
- hossam-0.4.16.dist-info/RECORD +0 -17
- {hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/WHEEL +0 -0
- {hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/licenses/LICENSE +0 -0
- {hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/top_level.txt +0 -0
hossam/hs_cluster.py
CHANGED
|
@@ -1,25 +1,29 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
# ===================================================================
|
|
3
|
-
# 패키지 참조
|
|
3
|
+
# 파이썬 기본 패키지 참조
|
|
4
4
|
# ===================================================================
|
|
5
5
|
import numpy as np
|
|
6
6
|
import concurrent.futures as futures
|
|
7
|
-
|
|
8
|
-
from . import hs_plot
|
|
9
|
-
|
|
10
7
|
from tqdm.auto import tqdm
|
|
11
8
|
from itertools import combinations
|
|
12
|
-
|
|
13
9
|
from typing import Literal, Callable
|
|
10
|
+
|
|
11
|
+
# ===================================================================
|
|
12
|
+
# 데이터 분석 패키지 참조
|
|
13
|
+
# ===================================================================
|
|
14
14
|
from kneed import KneeLocator
|
|
15
15
|
from pandas import Series, DataFrame, MultiIndex, concat
|
|
16
16
|
from matplotlib.pyplot import Axes # type: ignore
|
|
17
|
-
|
|
18
|
-
from sklearn.cluster import KMeans, DBSCAN
|
|
17
|
+
from scipy.stats import normaltest
|
|
18
|
+
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
|
|
19
19
|
from sklearn.neighbors import NearestNeighbors
|
|
20
20
|
from sklearn.metrics import silhouette_score, adjusted_rand_score
|
|
21
21
|
|
|
22
|
-
|
|
22
|
+
# ===================================================================
|
|
23
|
+
# hossam 패키지 참조
|
|
24
|
+
# ===================================================================
|
|
25
|
+
from . import hs_plot
|
|
26
|
+
from .hs_util import is_2d
|
|
23
27
|
|
|
24
28
|
RANDOM_STATE = 52
|
|
25
29
|
|
|
@@ -28,16 +32,20 @@ RANDOM_STATE = 52
|
|
|
28
32
|
# K-평균 군집화 모델을 적합하는 함수.
|
|
29
33
|
# ===================================================================
|
|
30
34
|
def kmeans_fit(
    data: DataFrame,
    n_clusters: int | None = None,
    k_range: list | tuple = [2, 11],
    random_state: int = RANDOM_STATE,
    plot: bool = False,
    fields: list[str] | tuple[str] | tuple[tuple[str]] | list[list[str]] | None = None,
    **params,
) -> tuple[KMeans, DataFrame, float]:
    """
    Fit a K-Means clustering model and score it with the silhouette coefficient.

    Args:
        data (DataFrame): Data to cluster. It is not modified; a copy receives
            the resulting labels.
        n_clusters (int | None): Number of clusters. When None, the value is
            chosen by ``kmeans_best_k`` over ``k_range`` and printed.
        k_range (list | tuple, optional): Search range handed to
            ``kmeans_best_k``. Only used when ``n_clusters`` is None.
            Defaults to [2, 11]. The default list is never mutated here.
        random_state (int, optional): Random seed. Defaults to RANDOM_STATE.
        plot (bool, optional): When True, draw one silhouette/cluster figure
            per field pair. Defaults to False.
        fields (list[list[str]] | None, optional): A single ``[x, y]`` pair or
            a list of such pairs to plot. When None, every pairwise
            combination of the numeric columns of ``data`` is plotted.
        **params: Extra keyword arguments forwarded to ``KMeans``.

    Returns:
        KMeans: The fitted estimator.
        DataFrame: Copy of ``data`` with an added ``cluster`` column.
        float: Silhouette score of the resulting labels.
    """
    df = data.copy()

    if n_clusters is None:
        # kmeans_best_k is annotated to return int; coerce so that KMeans
        # always receives a plain Python integer.
        n_clusters = int(
            kmeans_best_k(
                data=df, k_range=k_range, random_state=random_state, plot=False
            )
        )
        print(f"Best k found: {n_clusters}")

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, **params)
    kmeans.fit(data)
    df["cluster"] = kmeans.predict(df)
    score = float(silhouette_score(X=data, labels=df["cluster"]))

    if plot:
        if fields is None:
            # Documented default: plot every pair of numeric columns.
            # Previously None was wrapped into [None] and indexing it raised
            # a TypeError.
            numeric_columns = list(data.select_dtypes(include="number").columns)
            pairs = [list(pair) for pair in combinations(numeric_columns, 2)]
        elif not is_2d(fields):
            # A single [x, y] pair was given; normalise to a list of pairs.
            pairs = [fields]
        else:
            pairs = fields

        for pair in pairs:  # type: ignore
            hs_plot.visualize_silhouette(
                estimator=kmeans,
                data=data,
                xname=pair[0],  # type: ignore
                yname=pair[1],  # type: ignore
                title=f"K-Means Clustering (k={n_clusters})",
                outline=True,
            )

    return kmeans, df, score
|
|
64
92
|
|
|
65
93
|
|
|
66
94
|
# ===================================================================
|
|
@@ -122,7 +150,9 @@ def kmeans_elbow(
|
|
|
122
150
|
r = range(k_range[0], k_range[1])
|
|
123
151
|
|
|
124
152
|
for k in r:
|
|
125
|
-
kmeans, _ = kmeans_fit(
|
|
153
|
+
kmeans, _, score = kmeans_fit(
|
|
154
|
+
data=data, n_clusters=k, random_state=random_state
|
|
155
|
+
)
|
|
126
156
|
inertia_list.append(kmeans.inertia_)
|
|
127
157
|
|
|
128
158
|
best_k, _ = elbow_point(
|
|
@@ -131,13 +161,17 @@ def kmeans_elbow(
|
|
|
131
161
|
dir="left,down",
|
|
132
162
|
S=S,
|
|
133
163
|
plot=plot,
|
|
134
|
-
title=title,
|
|
135
164
|
marker=marker,
|
|
136
165
|
width=width,
|
|
137
166
|
height=height,
|
|
138
167
|
dpi=dpi,
|
|
139
168
|
linewidth=linewidth,
|
|
140
169
|
save_path=save_path,
|
|
170
|
+
title=(
|
|
171
|
+
f"K-Means Elbow Method (k={k_range[0]}-{k_range[1]-1}, silhouette={score:.3f})"
|
|
172
|
+
if title is None
|
|
173
|
+
else title
|
|
174
|
+
),
|
|
141
175
|
ax=ax,
|
|
142
176
|
callback=callback,
|
|
143
177
|
**params,
|
|
@@ -206,68 +240,70 @@ def kmeans_silhouette(
|
|
|
206
240
|
estimators = []
|
|
207
241
|
|
|
208
242
|
def __process_k(k):
|
|
209
|
-
estimator, cdf = kmeans_fit(
|
|
243
|
+
estimator, cdf, score = kmeans_fit(
|
|
210
244
|
data=data, n_clusters=k, random_state=random_state
|
|
211
245
|
)
|
|
212
|
-
|
|
213
|
-
return s_score, estimator
|
|
246
|
+
return score, estimator
|
|
214
247
|
|
|
215
248
|
with futures.ThreadPoolExecutor() as executor:
|
|
249
|
+
executed = []
|
|
216
250
|
for k in klist:
|
|
217
251
|
pbar.set_description(f"K-Means Silhouette: k={k}")
|
|
218
|
-
executed
|
|
219
|
-
|
|
252
|
+
executed.append(executor.submit(__process_k, k))
|
|
253
|
+
|
|
254
|
+
for e in executed:
|
|
255
|
+
s_score, estimator = e.result()
|
|
220
256
|
silhouettes.append(s_score)
|
|
221
257
|
estimators.append(estimator)
|
|
222
258
|
pbar.update(1)
|
|
223
259
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
260
|
+
if plot is not False:
|
|
261
|
+
for estimator in estimators:
|
|
262
|
+
pbar.set_description(f"K-Means Plotting: k={estimator.n_clusters}")
|
|
263
|
+
|
|
264
|
+
if plot == "silhouette":
|
|
265
|
+
hs_plot.silhouette_plot(
|
|
266
|
+
estimator=estimator,
|
|
267
|
+
data=data,
|
|
268
|
+
title=title,
|
|
269
|
+
width=width,
|
|
270
|
+
height=height,
|
|
271
|
+
dpi=dpi,
|
|
272
|
+
linewidth=linewidth,
|
|
273
|
+
save_path=save_path,
|
|
274
|
+
**params,
|
|
275
|
+
)
|
|
276
|
+
elif plot == "cluster":
|
|
277
|
+
hs_plot.cluster_plot(
|
|
278
|
+
estimator=estimator,
|
|
279
|
+
data=data,
|
|
280
|
+
xname=xname,
|
|
281
|
+
yname=yname,
|
|
282
|
+
outline=True,
|
|
283
|
+
palette=None,
|
|
284
|
+
width=width,
|
|
285
|
+
height=height,
|
|
286
|
+
dpi=dpi,
|
|
287
|
+
title=title,
|
|
288
|
+
save_path=save_path,
|
|
289
|
+
)
|
|
290
|
+
elif plot == "both":
|
|
291
|
+
hs_plot.visualize_silhouette(
|
|
292
|
+
estimator=estimator,
|
|
293
|
+
data=data,
|
|
294
|
+
xname=xname,
|
|
295
|
+
yname=yname,
|
|
296
|
+
outline=True,
|
|
297
|
+
palette=None,
|
|
298
|
+
width=width,
|
|
299
|
+
height=height,
|
|
300
|
+
dpi=dpi,
|
|
301
|
+
title=title,
|
|
302
|
+
linewidth=linewidth,
|
|
303
|
+
save_path=save_path,
|
|
304
|
+
)
|
|
305
|
+
|
|
306
|
+
pbar.update(1)
|
|
271
307
|
|
|
272
308
|
silhouette_df = DataFrame({"k": klist, "silhouette_score": silhouettes})
|
|
273
309
|
silhouette_df.sort_values(by="silhouette_score", ascending=False, inplace=True)
|
|
@@ -358,6 +394,7 @@ def elbow_point(
|
|
|
358
394
|
best_y = kn.elbow_y
|
|
359
395
|
|
|
360
396
|
if plot:
|
|
397
|
+
|
|
361
398
|
def hvline(ax):
|
|
362
399
|
ax.axvline(best_x, color="red", linestyle="--", linewidth=0.7)
|
|
363
400
|
ax.axhline(best_y, color="red", linestyle="--", linewidth=0.7)
|
|
@@ -369,7 +406,7 @@ def elbow_point(
|
|
|
369
406
|
ha="center",
|
|
370
407
|
va="bottom",
|
|
371
408
|
color="black",
|
|
372
|
-
fontweight="bold"
|
|
409
|
+
fontweight="bold",
|
|
373
410
|
)
|
|
374
411
|
|
|
375
412
|
if callback is not None:
|
|
@@ -398,7 +435,7 @@ def elbow_point(
|
|
|
398
435
|
# 데이터프레임의 여러 필드 쌍에 대해 군집 산점도를 그리는 함수.
|
|
399
436
|
# ===================================================================
|
|
400
437
|
def cluster_plot(
|
|
401
|
-
estimator: KMeans,
|
|
438
|
+
estimator: KMeans | DBSCAN | AgglomerativeClustering,
|
|
402
439
|
data: DataFrame,
|
|
403
440
|
hue: str | None = None,
|
|
404
441
|
vector: str | None = None,
|
|
@@ -437,7 +474,7 @@ def cluster_plot(
|
|
|
437
474
|
from hossam import *
|
|
438
475
|
|
|
439
476
|
data = hs_util.load_data('iris')
|
|
440
|
-
estimator, cdf = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
|
|
477
|
+
estimator, cdf, score = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
|
|
441
478
|
hs_cluster.cluster_plot(cdf, hue='cluster')
|
|
442
479
|
```
|
|
443
480
|
"""
|
|
@@ -454,7 +491,7 @@ def cluster_plot(
|
|
|
454
491
|
xname, yname = field_pair
|
|
455
492
|
|
|
456
493
|
hs_plot.cluster_plot(
|
|
457
|
-
estimator=estimator,
|
|
494
|
+
estimator=estimator, # type: ignore
|
|
458
495
|
data=data,
|
|
459
496
|
xname=xname,
|
|
460
497
|
yname=yname,
|
|
@@ -479,7 +516,7 @@ def persona(
|
|
|
479
516
|
data: DataFrame,
|
|
480
517
|
cluster: str | Series | np.ndarray | list | dict,
|
|
481
518
|
fields: list[str] | None = None,
|
|
482
|
-
full: bool = False
|
|
519
|
+
full: bool = False,
|
|
483
520
|
) -> DataFrame:
|
|
484
521
|
"""
|
|
485
522
|
군집화된 데이터프레임에서 각 군집의 페르소나(특성 요약)를 생성하는 함수.
|
|
@@ -497,8 +534,8 @@ def persona(
|
|
|
497
534
|
from hossam import *
|
|
498
535
|
|
|
499
536
|
data = hs_util.load_data('iris')
|
|
500
|
-
|
|
501
|
-
persona_df = hs_cluster.persona(
|
|
537
|
+
estimator, df, score = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
|
|
538
|
+
persona_df = hs_cluster.persona(df, hue='cluster')
|
|
502
539
|
print(persona_df)
|
|
503
540
|
```
|
|
504
541
|
"""
|
|
@@ -509,7 +546,9 @@ def persona(
|
|
|
509
546
|
|
|
510
547
|
if isinstance(cluster, str):
|
|
511
548
|
if cluster not in df.columns:
|
|
512
|
-
raise ValueError(
|
|
549
|
+
raise ValueError(
|
|
550
|
+
f"cluster로 지정된 컬럼 '{cluster}'이(가) 데이터프레임에 존재하지 않습니다."
|
|
551
|
+
)
|
|
513
552
|
else:
|
|
514
553
|
df["cluster"] = cluster
|
|
515
554
|
cluster = "cluster"
|
|
@@ -525,6 +564,9 @@ def persona(
|
|
|
525
564
|
persona_dict[("", f"count")] = len(group)
|
|
526
565
|
|
|
527
566
|
for field in fields:
|
|
567
|
+
if field == cluster:
|
|
568
|
+
continue
|
|
569
|
+
|
|
528
570
|
# 명목형일 경우 최빈값 사용
|
|
529
571
|
if df[field].dtype == "object" or df[field].dtype.name == "category":
|
|
530
572
|
persona_dict[(field, "mode")] = group[field].mode()[0]
|
|
@@ -573,7 +615,7 @@ def kmeans_best_k(
|
|
|
573
615
|
k_range: list | tuple = [2, 11],
|
|
574
616
|
S: float = 0.1,
|
|
575
617
|
random_state: int = RANDOM_STATE,
|
|
576
|
-
plot: bool = True
|
|
618
|
+
plot: bool = True,
|
|
577
619
|
) -> int:
|
|
578
620
|
"""
|
|
579
621
|
엘보우 포인트와 실루엣 점수를 통해 최적의 K값을 결정하는 함수.
|
|
@@ -600,17 +642,19 @@ def kmeans_best_k(
|
|
|
600
642
|
k_range=k_range,
|
|
601
643
|
S=S,
|
|
602
644
|
random_state=random_state,
|
|
603
|
-
plot=True if plot else False
|
|
645
|
+
plot=True if plot else False,
|
|
604
646
|
)
|
|
605
647
|
|
|
606
648
|
silhouette_df = kmeans_silhouette(
|
|
607
649
|
data=data,
|
|
608
650
|
k_range=k_range,
|
|
609
651
|
random_state=random_state,
|
|
610
|
-
plot="both" if plot else False
|
|
652
|
+
plot="both" if plot else False,
|
|
611
653
|
)
|
|
612
654
|
|
|
613
|
-
silhouette_k = silhouette_df.sort_values(
|
|
655
|
+
silhouette_k = silhouette_df.sort_values(
|
|
656
|
+
by="silhouette_score", ascending=False
|
|
657
|
+
).iloc[0]["k"]
|
|
614
658
|
|
|
615
659
|
if elbow_k == silhouette_k:
|
|
616
660
|
best_k = elbow_k
|
|
@@ -625,10 +669,7 @@ def kmeans_best_k(
|
|
|
625
669
|
# DBSCAN 군집화 모델을 적합하는 함수.
|
|
626
670
|
# ===================================================================
|
|
627
671
|
def __dbscan_fit(
|
|
628
|
-
data: DataFrame,
|
|
629
|
-
eps: float = 0.5,
|
|
630
|
-
min_samples: int = 5,
|
|
631
|
-
**params
|
|
672
|
+
data: DataFrame, eps: float = 0.5, min_samples: int = 5, **params
|
|
632
673
|
) -> tuple[DBSCAN, DataFrame, DataFrame]:
|
|
633
674
|
"""
|
|
634
675
|
DBSCAN 군집화 모델을 적합하는 함수.
|
|
@@ -664,12 +705,14 @@ def __dbscan_fit(
|
|
|
664
705
|
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
|
|
665
706
|
noise_ratio = np.mean(labels == -1)
|
|
666
707
|
|
|
667
|
-
result_df = DataFrame(
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
708
|
+
result_df = DataFrame(
|
|
709
|
+
{
|
|
710
|
+
"eps": [eps],
|
|
711
|
+
"min_samples": [min_samples],
|
|
712
|
+
"n_clusters": [n_clusters],
|
|
713
|
+
"noise_ratio": [noise_ratio],
|
|
714
|
+
}
|
|
715
|
+
)
|
|
673
716
|
|
|
674
717
|
return estimator, df, result_df
|
|
675
718
|
|
|
@@ -691,7 +734,7 @@ def dbscan_eps(
|
|
|
691
734
|
linewidth: int = hs_plot.config.line_width,
|
|
692
735
|
dpi: int = hs_plot.config.dpi,
|
|
693
736
|
save_path: str | None = None,
|
|
694
|
-
ax: Axes | None = None
|
|
737
|
+
ax: Axes | None = None,
|
|
695
738
|
) -> tuple[float, np.ndarray]:
|
|
696
739
|
"""
|
|
697
740
|
DBSCAN 군집화에서 최적의 eps 값을 탐지하는 함수.
|
|
@@ -759,15 +802,37 @@ def dbscan_eps(
|
|
|
759
802
|
|
|
760
803
|
return best_eps, eps_grid
|
|
761
804
|
|
|
805
|
+
# ===================================================================
|
|
806
|
+
# DBSCAN 군집화 모델을 적합하고 최적의 eps 값을 탐지하는 함수.
|
|
807
|
+
# ===================================================================
|
|
762
808
|
def dbscan_fit(
|
|
763
809
|
data: DataFrame,
|
|
764
810
|
eps: float | list | np.ndarray | None = None,
|
|
765
811
|
min_samples: int = 5,
|
|
766
812
|
ari_threshold: float = 0.9,
|
|
767
813
|
noise_diff_threshold: float = 0.05,
|
|
768
|
-
plot
|
|
769
|
-
**params
|
|
814
|
+
plot: bool = True,
|
|
815
|
+
**params,
|
|
770
816
|
) -> tuple[DBSCAN, DataFrame, DataFrame]:
|
|
817
|
+
"""
|
|
818
|
+
DBSCAN 군집화 모델을 적합하고 최적의 eps 값을 탐지하는 함수.
|
|
819
|
+
|
|
820
|
+
Args:
|
|
821
|
+
data (DataFrame): 군집화할 데이터프레임.
|
|
822
|
+
eps (float | list | np.ndarray | None, optional): eps 값 또는 리스트.
|
|
823
|
+
None이면 최적의 eps 값을 탐지함. 기본값 None.
|
|
824
|
+
min_samples (int, optional): 핵심점이 되기 위한 최소 샘플수. 기본값 5.
|
|
825
|
+
ari_threshold (float, optional): 안정 구간 탐지를 위한 ARI 임계값. 기본값 0.9.
|
|
826
|
+
noise_diff_threshold (float, optional): 안정 구간 탐지를 위한 노이즈 비율 변화 임계값. 기본값 0.05.
|
|
827
|
+
plot (bool, optional): True면 결과를 시각화함. 기본값 True.
|
|
828
|
+
**params: DBSCAN에 전달할 추가 파라미터.
|
|
829
|
+
|
|
830
|
+
Returns:
|
|
831
|
+
tuple: (estimator, cluster_df, result_df)
|
|
832
|
+
- estimator: 적합된 DBSCAN 모델 또는 모델 리스트(최적 eps가 여러 개인 경우).
|
|
833
|
+
- cluster_df: 클러스터 및 벡터 유형이 포함된 데이터 프레임 또는 데이터 프레임 리스트(최적 eps가 여러 개인 경우).
|
|
834
|
+
- result_df: eps 값에 따른 군집화 요약 통계 데이터 프레임.
|
|
835
|
+
"""
|
|
771
836
|
|
|
772
837
|
# eps 값이 지정되지 않은 경우 최적의 eps 탐지
|
|
773
838
|
if eps is None:
|
|
@@ -782,51 +847,62 @@ def dbscan_fit(
|
|
|
782
847
|
cluster_dfs = []
|
|
783
848
|
result_dfs: DataFrame | None = None
|
|
784
849
|
|
|
785
|
-
with tqdm(total=len(eps)+2) as pbar:
|
|
850
|
+
with tqdm(total=len(eps) + 2) as pbar:
|
|
851
|
+
pbar.set_description(f"DBSCAN Clustering")
|
|
852
|
+
|
|
786
853
|
with futures.ThreadPoolExecutor() as executor:
|
|
854
|
+
executers = []
|
|
787
855
|
for i, e in enumerate(eps):
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
856
|
+
executers.append(
|
|
857
|
+
executor.submit(
|
|
858
|
+
__dbscan_fit,
|
|
859
|
+
data=data,
|
|
860
|
+
eps=e,
|
|
861
|
+
min_samples=min_samples,
|
|
862
|
+
**params,
|
|
863
|
+
)
|
|
864
|
+
)
|
|
865
|
+
|
|
866
|
+
for i, e in enumerate(executers):
|
|
867
|
+
estimator, cluster_df, result_df = e.result()
|
|
791
868
|
estimators.append(estimator)
|
|
792
869
|
cluster_dfs.append(cluster_df)
|
|
793
870
|
|
|
794
871
|
if result_dfs is None:
|
|
795
|
-
result_df[
|
|
872
|
+
result_df["ARI"] = np.nan
|
|
796
873
|
result_dfs = result_df
|
|
797
874
|
else:
|
|
798
|
-
result_df[
|
|
875
|
+
result_df["ARI"] = adjusted_rand_score(cluster_dfs[i - 1]["cluster"], cluster_df["cluster"]) # type: ignore
|
|
799
876
|
result_dfs = concat([result_dfs, result_df], ignore_index=True)
|
|
800
877
|
|
|
801
878
|
pbar.update(1)
|
|
802
879
|
|
|
803
|
-
|
|
804
|
-
result_dfs[
|
|
805
|
-
result_dfs[
|
|
806
|
-
|
|
807
|
-
(result_dfs[
|
|
808
|
-
(result_dfs[
|
|
809
|
-
(result_dfs['noise_ratio_diff'] <= noise_diff_threshold) # type: ignore
|
|
880
|
+
result_dfs["cluster_diff"] = result_dfs["n_clusters"].diff().abs() # type: ignore
|
|
881
|
+
result_dfs["noise_ratio_diff"] = result_dfs["noise_ratio"].diff().abs() # type: ignore
|
|
882
|
+
result_dfs["stable"] = ( # type: ignore
|
|
883
|
+
(result_dfs["ARI"] >= ari_threshold) # type: ignore
|
|
884
|
+
& (result_dfs["cluster_diff"] <= 0) # type: ignore
|
|
885
|
+
& (result_dfs["noise_ratio_diff"] <= noise_diff_threshold) # type: ignore
|
|
810
886
|
)
|
|
811
887
|
|
|
812
888
|
# 첫 행은 비교 불가
|
|
813
|
-
result_dfs.loc[0,
|
|
889
|
+
result_dfs.loc[0, "stable"] = False # type: ignore
|
|
814
890
|
pbar.update(1)
|
|
815
891
|
|
|
816
892
|
if len(eps) == 1:
|
|
817
|
-
result_dfs[
|
|
818
|
-
result_dfs[
|
|
893
|
+
result_dfs["group_id"] = 1 # type: ignore
|
|
894
|
+
result_dfs["recommand"] = "unknown" # type: ignore
|
|
819
895
|
else:
|
|
820
896
|
# 안정구간 도출하기
|
|
821
897
|
# stable 여부를 0/1로 변환
|
|
822
|
-
stable_flag = result_dfs[
|
|
898
|
+
stable_flag = result_dfs["stable"].astype(int).values # type: ignore
|
|
823
899
|
|
|
824
900
|
# 연속 구간 구분용 그룹 id 생성
|
|
825
|
-
group_id = (stable_flag != np.roll(stable_flag, 1)).cumsum()
|
|
826
|
-
result_dfs[
|
|
901
|
+
group_id = (stable_flag != np.roll(stable_flag, 1)).cumsum() # type: ignore
|
|
902
|
+
result_dfs["group_id"] = group_id # type: ignore
|
|
827
903
|
|
|
828
904
|
# 안정구간 중 가장 긴 구간 선택
|
|
829
|
-
stable_groups = result_dfs[result_dfs[
|
|
905
|
+
stable_groups = result_dfs[result_dfs["stable"]].groupby("group_id") # type: ignore
|
|
830
906
|
|
|
831
907
|
# 각 구간의 길이 계산
|
|
832
908
|
group_sizes = stable_groups.size()
|
|
@@ -834,23 +910,188 @@ def dbscan_fit(
|
|
|
834
910
|
# 가장 긴 안정 구간 선택
|
|
835
911
|
best_group_id = group_sizes.idxmax()
|
|
836
912
|
|
|
837
|
-
result_dfs[
|
|
913
|
+
result_dfs["recommand"] = "bad" # type: ignore
|
|
838
914
|
|
|
839
915
|
# 가장 긴 안정 구간에 해당하는 recommand 컬럼을 `best`로 변경
|
|
840
|
-
result_dfs.loc[result_dfs["group_id"] == best_group_id,
|
|
916
|
+
result_dfs.loc[result_dfs["group_id"] == best_group_id, "recommand"] = "best" # type: ignore
|
|
841
917
|
|
|
842
918
|
# result_dfs에서 recommand가 best에 해당하는 인덱스와 같은 위치의 추정기만 추출
|
|
843
|
-
best_indexes = list(result_dfs[result_dfs[
|
|
919
|
+
best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index) # type: ignore
|
|
844
920
|
|
|
845
|
-
for i in range(len(estimators)-1, -1, -1):
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
921
|
+
# for i in range(len(estimators) - 1, -1, -1):
|
|
922
|
+
# if i not in best_indexes:
|
|
923
|
+
# del estimators[i]
|
|
924
|
+
# del cluster_dfs[i]
|
|
849
925
|
|
|
850
926
|
pbar.update(1)
|
|
851
927
|
|
|
928
|
+
# best 모델 선정: recommand=='best'인 인덱스의 estimator/cluster_df만 반환
|
|
929
|
+
if len(estimators) == 1:
|
|
930
|
+
|
|
931
|
+
if plot:
|
|
932
|
+
hs_plot.scatterplot(
|
|
933
|
+
df=cluster_dfs[0],
|
|
934
|
+
xname=cluster_dfs[0].columns[0],
|
|
935
|
+
yname=cluster_dfs[0].columns[1],
|
|
936
|
+
hue="cluster",
|
|
937
|
+
vector="vector",
|
|
938
|
+
title=f"DBSCAN Clustering (eps={estimators[0].eps}, min_samples={estimators[0].min_samples})",
|
|
939
|
+
outline=True
|
|
940
|
+
)
|
|
941
|
+
|
|
942
|
+
return estimators[0], cluster_dfs[0], result_dfs # type: ignore
|
|
943
|
+
|
|
944
|
+
# recommand=='best'인 인덱스 추출 (여러 개면 첫 번째)
|
|
945
|
+
best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index) # type: ignore
|
|
946
|
+
if not best_indexes:
|
|
947
|
+
# fallback: 첫 번째
|
|
948
|
+
best_index = 0
|
|
949
|
+
else:
|
|
950
|
+
best_index = best_indexes[0]
|
|
951
|
+
|
|
952
|
+
best_estimator = estimators[best_index]
|
|
953
|
+
best_cluster_df = cluster_dfs[best_index]
|
|
954
|
+
|
|
955
|
+
if plot:
|
|
956
|
+
hs_plot.scatterplot(
|
|
957
|
+
df=best_cluster_df,
|
|
958
|
+
xname=best_cluster_df.columns[0],
|
|
959
|
+
yname=best_cluster_df.columns[1],
|
|
960
|
+
hue="cluster",
|
|
961
|
+
vector="vector",
|
|
962
|
+
title=f"DBSCAN Clustering (eps={best_estimator.eps}, min_samples={best_estimator.min_samples})",
|
|
963
|
+
outline=True
|
|
964
|
+
)
|
|
965
|
+
|
|
966
|
+
return best_estimator, best_cluster_df, result_dfs # type: ignore
|
|
967
|
+
|
|
968
|
+
|
|
969
|
+
# ===================================================================
|
|
970
|
+
# 단일 계층적 군집화 모델을 적합하는 함수.
|
|
971
|
+
# ===================================================================
|
|
972
|
+
def __agg_fit(
    data: DataFrame,
    n_clusters: int = 3,
    linkage: Literal["ward", "complete", "average", "single"] = "ward",
    plot: bool = False,
    compute_distances: bool = True,
    **params,
) -> tuple[AgglomerativeClustering, DataFrame, float]:
    """
    Fit a single agglomerative (hierarchical) clustering model.

    Args:
        data (DataFrame): Data to cluster. It is left untouched; the labels
            are written to a copy.
        n_clusters (int, optional): Number of clusters. Defaults to 3.
        linkage (str, optional): Linkage criterion. Defaults to "ward".
        plot (bool, optional): When True, draw the silhouette visualisation
            for the fitted model. Defaults to False.
        compute_distances (bool, optional): Forwarded to
            ``AgglomerativeClustering(compute_distances=...)``. Defaults to True.
        **params: Extra keyword arguments forwarded to
            ``AgglomerativeClustering``.

    Returns:
        tuple: (estimator, df, score)
            - estimator: The fitted AgglomerativeClustering model.
            - df: Copy of ``data`` with an added ``cluster`` column.
            - score: Silhouette score of the resulting labels.
    """
    model = AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage=linkage,
        compute_distances=compute_distances,
        **params,
    )
    model.fit(data)

    # Attach the labels to a copy so the caller's frame is not altered.
    labeled = data.copy()
    labeled["cluster"] = model.labels_
    silhouette = float(silhouette_score(X=data, labels=labeled["cluster"]))

    if plot:
        hs_plot.visualize_silhouette(estimator=model, data=data)

    return model, labeled, silhouette
|
|
1010
|
+
|
|
1011
|
+
|
|
1012
|
+
def agg_fit(
    data: DataFrame,
    n_clusters: int | list[int] | np.ndarray = 3,
    linkage: Literal["ward", "complete", "average", "single"] = "ward",
    plot: bool = False,
    **params,
) -> tuple[AgglomerativeClustering | list[AgglomerativeClustering], DataFrame | list[DataFrame], DataFrame]:
    """
    Fit agglomerative (hierarchical) clustering for one or several cluster counts.

    Args:
        data (DataFrame): Data to cluster.
        n_clusters (int | list[int] | np.ndarray, optional): A single cluster
            count, or a sequence of counts. Every value in the sequence is
            fitted exactly as given. Defaults to 3.
        linkage (str, optional): Linkage criterion. Defaults to "ward".
        plot (bool, optional): When True, draw the silhouette visualisation
            for every fitted model. Defaults to False.
        **params: Extra keyword arguments forwarded to
            ``AgglomerativeClustering``.

    Returns:
        tuple: (estimator(s), df(s), score_df)
            - estimator(s): The fitted model, or a list of models when more
              than one cluster count was fitted.
            - df(s): The labelled data frame, or a list of data frames when
              more than one cluster count was fitted.
            - score_df: Silhouette score per cluster count, sorted from best
              to worst.

    Raises:
        ValueError: If ``n_clusters`` is an empty sequence.

    Examples:
        ```python
        from hossam import *

        data = hs_util.load_data('iris')
        estimators, cluster_dfs, score_df = hs_cluster.agg_fit(data.iloc[:, :-1], n_clusters=[2,3,4])
        ```
    """
    # Accept NumPy integer scalars as well as Python ints; previously they fell
    # into the sequence branch and failed on indexing.
    scalar_input = isinstance(n_clusters, (int, np.integer))

    if scalar_input:
        k_values = [int(n_clusters)]  # type: ignore
    else:
        # Use the requested counts verbatim. The earlier
        # range(first, last) rewrite silently dropped the last value
        # (e.g. [2, 3, 4] -> [2, 3]) and emptied single-element lists.
        k_values = [int(k) for k in n_clusters]  # type: ignore

    if not k_values:
        raise ValueError("n_clusters에는 최소 한 개 이상의 군집 개수가 지정되어야 합니다.")

    # As before, distances are only computed when one scalar count is given.
    compute_distances = scalar_input

    estimators = []
    cluster_dfs = []
    scores = []

    # Two progress ticks per cluster count: one on submit, one on collection.
    with tqdm(total=len(k_values) * 2) as pbar:
        pbar.set_description("Agglomerative Clustering")

        with futures.ThreadPoolExecutor() as executor:
            pending = []
            for k in k_values:
                pending.append(
                    executor.submit(
                        __agg_fit,
                        data=data,
                        n_clusters=k,
                        linkage=linkage,
                        plot=False,
                        compute_distances=compute_distances,
                        **params,
                    )
                )
                pbar.update(1)

            # Collect in submission order so the outputs line up with k_values.
            for future in pending:
                estimator, cluster_df, score = future.result()
                estimators.append(estimator)
                cluster_dfs.append(cluster_df)
                scores.append({"k": estimator.n_clusters, "silhouette_score": score})

                if plot:
                    hs_plot.visualize_silhouette(
                        estimator=estimator,
                        data=data,
                        outline=True,
                        title=f"Agglomerative Clustering Silhouette (k={estimator.n_clusters})",
                    )

                pbar.update(1)

    score_df = DataFrame(scores)
    score_df.sort_values(by="silhouette_score", ascending=False, inplace=True)

    return (
        estimators[0] if len(estimators) == 1 else estimators,  # type: ignore
        cluster_dfs[0] if len(cluster_dfs) == 1 else cluster_dfs,
        score_df,  # type: ignore
    )
|