hossam 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hossam/hs_cluster.py +283 -120
- hossam/hs_plot.py +241 -14
- hossam/hs_prep.py +241 -56
- hossam/hs_util.py +20 -0
- {hossam-0.4.15.dist-info → hossam-0.4.17.dist-info}/METADATA +1 -1
- {hossam-0.4.15.dist-info → hossam-0.4.17.dist-info}/RECORD +9 -9
- {hossam-0.4.15.dist-info → hossam-0.4.17.dist-info}/WHEEL +0 -0
- {hossam-0.4.15.dist-info → hossam-0.4.17.dist-info}/licenses/LICENSE +0 -0
- {hossam-0.4.15.dist-info → hossam-0.4.17.dist-info}/top_level.txt +0 -0
hossam/hs_cluster.py CHANGED
@@ -1,25 +1,28 @@
 # -*- coding: utf-8 -*-
 # ===================================================================
-# Package imports
+# Python standard library imports
 # ===================================================================
 import numpy as np
 import concurrent.futures as futures
-
-from . import hs_plot
-
 from tqdm.auto import tqdm
 from itertools import combinations
-
 from typing import Literal, Callable
+
+# ===================================================================
+# Data analysis package imports
+# ===================================================================
 from kneed import KneeLocator
 from pandas import Series, DataFrame, MultiIndex, concat
 from matplotlib.pyplot import Axes  # type: ignore
-
-from sklearn.cluster import KMeans, DBSCAN
+from scipy.stats import normaltest
+from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
 from sklearn.neighbors import NearestNeighbors
 from sklearn.metrics import silhouette_score, adjusted_rand_score
 
-
+# ===================================================================
+# hossam package imports
+# ===================================================================
+from . import hs_plot
 
 RANDOM_STATE = 52
 
@@ -28,10 +31,13 @@ RANDOM_STATE = 52
 # Function that fits a K-means clustering model.
 # ===================================================================
 def kmeans_fit(
-    data: DataFrame,
+    data: DataFrame,
+    n_clusters: int,
+    random_state: int = RANDOM_STATE,
+    plot: bool = False,
     fields: list[list[str]] | None = None,
-    **params
-) -> tuple[KMeans, DataFrame]:
+    **params,
+) -> tuple[KMeans, DataFrame, float]:
     """
     Fit a K-means clustering model.
 
@@ -46,21 +52,23 @@ def kmeans_fit(
     Returns:
         KMeans: The fitted KMeans model.
         DataFrame: Data frame containing the cluster assignments
+        float: Silhouette score
     """
     df = data.copy()
     kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, **params)
     kmeans.fit(data)
     df["cluster"] = kmeans.predict(df)
+    score = float(silhouette_score(X=data, labels=df["cluster"]))
 
     if plot:
         cluster_plot(
             estimator=kmeans,
             data=data,
             fields=fields,
-            title=f"K-Means Clustering (k={n_clusters})"
+            title=f"K-Means Clustering (k={n_clusters})",
         )
 
-    return kmeans, df
+    return kmeans, df, score
 
 
 # ===================================================================
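The hunk above is API-breaking: callers that unpacked two values from `kmeans_fit` now receive a third, the silhouette score. A minimal sketch of the new calling convention; the sklearn iris loader stands in for the package's own `hs_util.load_data('iris')` helper, so treat the data-loading line as an assumption:

```python
# Sketch of the 0.4.17 kmeans_fit contract: (model, labelled df, silhouette score).
from sklearn.datasets import load_iris
from hossam import hs_cluster

data = load_iris(as_frame=True).data  # numeric feature columns only

# 0.4.15 returned (kmeans, df); 0.4.17 appends the silhouette score.
kmeans, df, score = hs_cluster.kmeans_fit(data, n_clusters=3)

print(f"silhouette={score:.3f}")     # float(silhouette_score(...)) per the diff
print(df["cluster"].value_counts())  # assignments land in a "cluster" column
```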
@@ -122,7 +130,9 @@ def kmeans_elbow(
     r = range(k_range[0], k_range[1])
 
     for k in r:
-        kmeans, _ = kmeans_fit(
+        kmeans, _, score = kmeans_fit(
+            data=data, n_clusters=k, random_state=random_state
+        )
         inertia_list.append(kmeans.inertia_)
 
     best_k, _ = elbow_point(
@@ -131,13 +141,17 @@
         dir="left,down",
         S=S,
         plot=plot,
-        title=title,
         marker=marker,
         width=width,
         height=height,
         dpi=dpi,
         linewidth=linewidth,
         save_path=save_path,
+        title=(
+            f"K-Means Elbow Method (k={k_range[0]}-{k_range[1]-1}, silhouette={score:.3f})"
+            if title is None
+            else title
+        ),
         ax=ax,
         callback=callback,
         **params,
@@ -206,68 +220,70 @@ def kmeans_silhouette(
     estimators = []
 
     def __process_k(k):
-        estimator, cdf = kmeans_fit(
+        estimator, cdf, score = kmeans_fit(
             data=data, n_clusters=k, random_state=random_state
         )
-
-        return s_score, estimator
+        return score, estimator
 
     with futures.ThreadPoolExecutor() as executor:
+        executed = []
         for k in klist:
             pbar.set_description(f"K-Means Silhouette: k={k}")
-            executed
-
+            executed.append(executor.submit(__process_k, k))
+
+        for e in executed:
+            s_score, estimator = e.result()
             silhouettes.append(s_score)
             estimators.append(estimator)
             pbar.update(1)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if plot is not False:
+        for estimator in estimators:
+            pbar.set_description(f"K-Means Plotting: k={estimator.n_clusters}")
+
+            if plot == "silhouette":
+                hs_plot.silhouette_plot(
+                    estimator=estimator,
+                    data=data,
+                    title=title,
+                    width=width,
+                    height=height,
+                    dpi=dpi,
+                    linewidth=linewidth,
+                    save_path=save_path,
+                    **params,
+                )
+            elif plot == "cluster":
+                hs_plot.cluster_plot(
+                    estimator=estimator,
+                    data=data,
+                    xname=xname,
+                    yname=yname,
+                    outline=True,
+                    palette=None,
+                    width=width,
+                    height=height,
+                    dpi=dpi,
+                    title=title,
+                    save_path=save_path,
+                )
+            elif plot == "both":
+                hs_plot.visualize_silhouette(
+                    estimator=estimator,
+                    data=data,
+                    xname=xname,
+                    yname=yname,
+                    outline=True,
+                    palette=None,
+                    width=width,
+                    height=height,
+                    dpi=dpi,
+                    title=title,
+                    linewidth=linewidth,
+                    save_path=save_path,
+                )
+
+            pbar.update(1)
 
     silhouette_df = DataFrame({"k": klist, "silhouette_score": silhouettes})
     silhouette_df.sort_values(by="silhouette_score", ascending=False, inplace=True)
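As the hunk above shows, `kmeans_silhouette` now submits each per-k fit to a thread pool and then renders one of three plot styles per estimator. A hedged sketch of the plot modes, assuming the default k_range and the same iris stand-in as before:

```python
from sklearn.datasets import load_iris
from hossam import hs_cluster

data = load_iris(as_frame=True).data

# plot dispatches to hs_plot.silhouette_plot ("silhouette"),
# hs_plot.cluster_plot ("cluster"), or hs_plot.visualize_silhouette ("both");
# False skips plotting entirely.
silhouette_df = hs_cluster.kmeans_silhouette(data=data, plot="both")

# The returned frame is sorted by silhouette_score descending,
# so the first row carries the best k.
print(silhouette_df.iloc[0])
```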
@@ -358,6 +374,7 @@ def elbow_point(
     best_y = kn.elbow_y
 
     if plot:
+
         def hvline(ax):
             ax.axvline(best_x, color="red", linestyle="--", linewidth=0.7)
             ax.axhline(best_y, color="red", linestyle="--", linewidth=0.7)
@@ -369,7 +386,7 @@ def elbow_point(
             ha="center",
             va="bottom",
             color="black",
-            fontweight="bold"
+            fontweight="bold",
         )
 
     if callback is not None:
@@ -398,7 +415,7 @@
 # Function that draws cluster scatter plots for several field pairs of a dataframe.
 # ===================================================================
 def cluster_plot(
-    estimator: KMeans,
+    estimator: KMeans | DBSCAN | AgglomerativeClustering,
     data: DataFrame,
     hue: str | None = None,
     vector: str | None = None,
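Widening the estimator annotation means the pairwise scatter helper is no longer K-means-only. A sketch that feeds it a fitted DBSCAN model; whether further arguments are needed depends on defaults not shown in this diff, so read this as an assumption rather than documented usage:

```python
from sklearn.datasets import load_iris
from sklearn.cluster import DBSCAN
from hossam import hs_cluster

data = load_iris(as_frame=True).data

# Any of KMeans | DBSCAN | AgglomerativeClustering is now accepted.
estimator = DBSCAN(eps=0.5, min_samples=5).fit(data)
hs_cluster.cluster_plot(estimator=estimator, data=data)
```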
@@ -437,7 +454,7 @@ def cluster_plot(
     from hossam import *
 
     data = hs_util.load_data('iris')
-    estimator, cdf = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
+    estimator, cdf, score = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
     hs_cluster.cluster_plot(cdf, hue='cluster')
     ```
     """
@@ -454,7 +471,7 @@ def cluster_plot(
             xname, yname = field_pair
 
             hs_plot.cluster_plot(
-                estimator=estimator,
+                estimator=estimator,  # type: ignore
                 data=data,
                 xname=xname,
                 yname=yname,
@@ -479,7 +496,7 @@ def persona(
     data: DataFrame,
     cluster: str | Series | np.ndarray | list | dict,
     fields: list[str] | None = None,
-    full: bool = False
+    full: bool = False,
 ) -> DataFrame:
     """
     Build a persona (feature summary) for each cluster in a clustered dataframe.
@@ -497,8 +514,8 @@
     from hossam import *
 
     data = hs_util.load_data('iris')
-
-    persona_df = hs_cluster.persona(
+    estimator, df, score = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
+    persona_df = hs_cluster.persona(df, hue='cluster')
     print(persona_df)
     ```
@@ -509,7 +526,9 @@
 
     if isinstance(cluster, str):
         if cluster not in df.columns:
-            raise ValueError(
+            raise ValueError(
+                f"The column '{cluster}' given as cluster does not exist in the dataframe."
+            )
     else:
         df["cluster"] = cluster
         cluster = "cluster"
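One wrinkle in the updated docstring example: it calls `persona(df, hue='cluster')`, but the signature names the parameter `cluster` (a column name, or the labels themselves). A sketch that follows the signature; the iris stand-in is again an assumption:

```python
from sklearn.datasets import load_iris
from hossam import hs_cluster

data = load_iris(as_frame=True).data
estimator, df, score = hs_cluster.kmeans_fit(data, n_clusters=3)

# cluster may name a column already present in df...
persona_df = hs_cluster.persona(df, cluster="cluster")

# ...or carry the raw labels, which get attached as a new "cluster" column.
persona_df = hs_cluster.persona(data, cluster=estimator.labels_)
print(persona_df)
```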
@@ -573,7 +592,7 @@ def kmeans_best_k(
     k_range: list | tuple = [2, 11],
     S: float = 0.1,
     random_state: int = RANDOM_STATE,
-    plot: bool = True
+    plot: bool = True,
 ) -> int:
     """
     Determine the optimal K from the elbow point and the silhouette score.
@@ -600,17 +619,19 @@
         k_range=k_range,
         S=S,
         random_state=random_state,
-        plot=True if plot else False
+        plot=True if plot else False,
     )
 
     silhouette_df = kmeans_silhouette(
         data=data,
         k_range=k_range,
         random_state=random_state,
-        plot="both" if plot else False
+        plot="both" if plot else False,
    )
 
-    silhouette_k = silhouette_df.sort_values(
+    silhouette_k = silhouette_df.sort_values(
+        by="silhouette_score", ascending=False
+    ).iloc[0]["k"]
 
     if elbow_k == silhouette_k:
         best_k = elbow_k
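`kmeans_best_k` reconciles two heuristics: the elbow k from `kmeans_elbow` and the top-silhouette k picked by the sort shown above. A minimal sketch under the same iris assumption:

```python
from sklearn.datasets import load_iris
from hossam import hs_cluster

data = load_iris(as_frame=True).data

# Fits k = 2..10 twice internally (elbow and silhouette) and reconciles the winners.
best_k = hs_cluster.kmeans_best_k(data=data, k_range=[2, 11], plot=False)
print(best_k)
```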
@@ -625,10 +646,7 @@
 # Function that fits a DBSCAN clustering model.
 # ===================================================================
 def __dbscan_fit(
-    data: DataFrame,
-    eps: float = 0.5,
-    min_samples: int = 5,
-    **params
+    data: DataFrame, eps: float = 0.5, min_samples: int = 5, **params
 ) -> tuple[DBSCAN, DataFrame, DataFrame]:
     """
     Fit a DBSCAN clustering model.
@@ -664,12 +682,14 @@ def __dbscan_fit(
     n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
     noise_ratio = np.mean(labels == -1)
 
-    result_df = DataFrame(
-
-
-
-
-
+    result_df = DataFrame(
+        {
+            "eps": [eps],
+            "min_samples": [min_samples],
+            "n_clusters": [n_clusters],
+            "noise_ratio": [noise_ratio],
+        }
+    )
 
     return estimator, df, result_df
@@ -691,7 +711,7 @@ def dbscan_eps(
     linewidth: int = hs_plot.config.line_width,
     dpi: int = hs_plot.config.dpi,
     save_path: str | None = None,
-    ax: Axes | None = None
+    ax: Axes | None = None,
 ) -> tuple[float, np.ndarray]:
     """
     Detect the optimal eps value for DBSCAN clustering.
@@ -759,14 +779,15 @@
 
     return best_eps, eps_grid
 
+
 def dbscan_fit(
     data: DataFrame,
     eps: float | list | np.ndarray | None = None,
     min_samples: int = 5,
     ari_threshold: float = 0.9,
     noise_diff_threshold: float = 0.05,
-    plot
-    **params
+    plot: bool = True,
+    **params,
 ) -> tuple[DBSCAN, DataFrame, DataFrame]:
 
     # Detect the optimal eps when no value is given
@@ -782,51 +803,62 @@ def dbscan_fit(
     cluster_dfs = []
     result_dfs: DataFrame | None = None
 
-    with tqdm(total=len(eps)+2) as pbar:
+    with tqdm(total=len(eps) + 2) as pbar:
+        pbar.set_description(f"DBSCAN Clustering")
+
         with futures.ThreadPoolExecutor() as executor:
+            executers = []
             for i, e in enumerate(eps):
-
-
-
+                executers.append(
+                    executor.submit(
+                        __dbscan_fit,
+                        data=data,
+                        eps=e,
+                        min_samples=min_samples,
+                        **params,
+                    )
+                )
+
+            for i, e in enumerate(executers):
+                estimator, cluster_df, result_df = e.result()
                 estimators.append(estimator)
                 cluster_dfs.append(cluster_df)
 
                 if result_dfs is None:
-                    result_df[
+                    result_df["ARI"] = np.nan
                     result_dfs = result_df
                 else:
-                    result_df[
+                    result_df["ARI"] = adjusted_rand_score(cluster_dfs[i - 1]["cluster"], cluster_df["cluster"])  # type: ignore
                     result_dfs = concat([result_dfs, result_df], ignore_index=True)
 
                 pbar.update(1)
 
-
-        result_dfs[
-        result_dfs[
-
-            (result_dfs[
-            (result_dfs[
-            (result_dfs['noise_ratio_diff'] <= noise_diff_threshold)  # type: ignore
+        result_dfs["cluster_diff"] = result_dfs["n_clusters"].diff().abs()  # type: ignore
+        result_dfs["noise_ratio_diff"] = result_dfs["noise_ratio"].diff().abs()  # type: ignore
+        result_dfs["stable"] = (  # type: ignore
+            (result_dfs["ARI"] >= ari_threshold)  # type: ignore
+            & (result_dfs["cluster_diff"] <= 0)  # type: ignore
+            & (result_dfs["noise_ratio_diff"] <= noise_diff_threshold)  # type: ignore
         )
 
         # The first row has nothing to compare against
-        result_dfs.loc[0,
+        result_dfs.loc[0, "stable"] = False  # type: ignore
         pbar.update(1)
 
         if len(eps) == 1:
-            result_dfs[
-            result_dfs[
+            result_dfs["group_id"] = 1  # type: ignore
+            result_dfs["recommand"] = "unknown"  # type: ignore
         else:
             # Derive the stable intervals
             # Convert the stable flag to 0/1
-            stable_flag = result_dfs[
+            stable_flag = result_dfs["stable"].astype(int).values  # type: ignore
 
             # Create group ids that separate consecutive runs
-            group_id = (stable_flag != np.roll(stable_flag, 1)).cumsum()
-            result_dfs[
+            group_id = (stable_flag != np.roll(stable_flag, 1)).cumsum()  # type: ignore
+            result_dfs["group_id"] = group_id  # type: ignore
 
             # Select the longest of the stable runs
-            stable_groups = result_dfs[result_dfs[
+            stable_groups = result_dfs[result_dfs["stable"]].groupby("group_id")  # type: ignore
 
             # Compute the length of each run
             group_sizes = stable_groups.size()
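Taken together, the `dbscan_fit` hunks sweep a grid of eps values in parallel and score neighbouring fits for stability: `ARI` compares each labelling with the previous one, while `cluster_diff` and `noise_ratio_diff` track how the cluster count and noise ratio move between fits; the longest run passing all three thresholds is tagged "best" in the `recommand` column (spelling as in the source). A sketch of such a sweep, with the eps grid and the iris stand-in as assumptions:

```python
import numpy as np
from sklearn.datasets import load_iris
from hossam import hs_cluster

data = load_iris(as_frame=True).data

# Each eps candidate is fitted on a thread pool; eps=None would instead
# derive a grid via dbscan_eps.
estimators, cluster_dfs, result_dfs = hs_cluster.dbscan_fit(
    data, eps=np.linspace(0.3, 1.0, 8), min_samples=5
)

# One summary row per eps; only the fits from the longest stable run
# survive in estimators/cluster_dfs.
print(result_dfs[["eps", "n_clusters", "noise_ratio", "stable", "recommand"]])
```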
@@ -834,23 +866,154 @@ def dbscan_fit(
             # Select the longest stable run
             best_group_id = group_sizes.idxmax()
 
-            result_dfs[
+            result_dfs["recommand"] = "bad"  # type: ignore
 
             # Change the recommand column to `best` for the longest stable run
-            result_dfs.loc[result_dfs["group_id"] == best_group_id,
+            result_dfs.loc[result_dfs["group_id"] == best_group_id, "recommand"] = "best"  # type: ignore
 
         # Keep only the estimators at the positions where recommand is "best" in result_dfs
-        best_indexes = list(result_dfs[result_dfs[
+        best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index)  # type: ignore
 
-        for i in range(len(estimators)-1, -1, -1):
+        for i in range(len(estimators) - 1, -1, -1):
             if i not in best_indexes:
-                del
-                del
+                del estimators[i]
+                del cluster_dfs[i]
 
         pbar.update(1)
 
     return (
         estimators[0] if len(estimators) == 1 else estimators,  # type: ignore
         cluster_dfs[0] if len(cluster_dfs) == 1 else cluster_dfs,
-        result_dfs  # type: ignore
-    )
+        result_dfs,  # type: ignore
+    )
+
+
+# ===================================================================
+# Function that fits a single hierarchical (agglomerative) clustering model.
+# ===================================================================
+def __agg_fit(
+    data: DataFrame,
+    n_clusters: int = 3,
+    linkage: Literal["ward", "complete", "average", "single"] = "ward",
+    plot: bool = False,
+    compute_distances: bool = True,
+    **params,
+) -> tuple[AgglomerativeClustering, DataFrame, float]:
+    """
+    Fit a single agglomerative clustering model.
+
+    Args:
+        data (DataFrame): Dataframe to cluster.
+        n_clusters (int, optional): Number of clusters. Defaults to 3.
+        linkage (str, optional): Linkage criterion. Defaults to "ward".
+        compute_distances (bool, optional): Whether to compute distances. Defaults to True.
+        plot (bool, optional): If True, visualize the result. Defaults to False.
+        **params: Extra parameters passed to AgglomerativeClustering.
+
+    Returns:
+        tuple: (estimator, df, score)
+            - estimator: Fitted AgglomerativeClustering model.
+            - df: Data frame containing the cluster assignments.
+            - score: Silhouette score.
+
+    """
+    df = data.copy()
+    estimator = AgglomerativeClustering(
+        n_clusters=n_clusters, compute_distances=compute_distances, linkage=linkage, **params
+    )
+    estimator.fit(data)
+    df["cluster"] = estimator.labels_
+    score = float(silhouette_score(X=data, labels=df["cluster"]))
+
+    if plot:
+        hs_plot.visualize_silhouette(estimator=estimator, data=data)
+
+    return estimator, df, score
+
+
+def agg_fit(
+    data: DataFrame,
+    n_clusters: int | list[int] | np.ndarray = 3,
+    linkage: Literal["ward", "complete", "average", "single"] = "ward",
+    plot: bool = False,
+    **params,
+) -> tuple[AgglomerativeClustering | list[AgglomerativeClustering], DataFrame | list[DataFrame], DataFrame]:
+    """
+    Fit hierarchical (agglomerative) clustering models.
+
+    Args:
+        data (DataFrame): Dataframe to cluster.
+        n_clusters (int | list[int] | np.ndarray, optional): Number of clusters, or a list of counts. Defaults to 3.
+        linkage (str, optional): Linkage criterion. Defaults to "ward".
+        plot (bool, optional): If True, visualize the results. Defaults to False.
+        **params: Extra parameters passed to AgglomerativeClustering.
+
+    Returns:
+        tuple: (estimator(s), df(s), score_df)
+            - estimator(s): Fitted AgglomerativeClustering model, or a list of models.
+            - df(s): Data frame(s) containing the cluster assignments.
+            - score_df: Dataframe of silhouette scores for each cluster count.
+
+    Examples:
+        ```python
+        from hossam import *
+
+        data = hs_util.load_data('iris')
+        estimators, cluster_dfs, score_df = hs_cluster.agg_fit(data.iloc[:, :-1], n_clusters=[2,3,4])
+        ```
+    """
+    compute_distances = False
+
+    if isinstance(n_clusters, int):
+        n_clusters = [n_clusters]
+        compute_distances = True
+    else:
+        n_clusters = list(range(n_clusters[0], n_clusters[-1]))
+
+    estimators = []
+    cluster_dfs = []
+    scores = []
+
+    with tqdm(total=len(n_clusters)*2) as pbar:
+        pbar.set_description(f"Agglomerative Clustering")
+
+        with futures.ThreadPoolExecutor() as executor:
+            executers = []
+            for k in n_clusters:
+                executers.append(
+                    executor.submit(
+                        __agg_fit,
+                        data=data,
+                        n_clusters=k,
+                        linkage=linkage,
+                        plot=False,
+                        compute_distances=compute_distances,
+                        **params,
+                    )
+                )
+                pbar.update(1)
+
+            for e in executers:
+                estimator, cluster_df, score = e.result()
+                estimators.append(estimator)
+                cluster_dfs.append(cluster_df)
+                scores.append({"k": estimator.n_clusters, "silhouette_score": score})
+
+                if plot:
+                    hs_plot.visualize_silhouette(
+                        estimator=estimator,
+                        data=data,
+                        outline=True,
+                        title=f"Agglomerative Clustering Silhouette (k={estimator.n_clusters})",
+                    )
+
+                pbar.update(1)
+
+    score_df = DataFrame(scores)
+    score_df.sort_values(by="silhouette_score", ascending=False, inplace=True)
+
+    return (
+        estimators[0] if len(estimators) == 1 else estimators,  # type: ignore
+        cluster_dfs[0] if len(cluster_dfs) == 1 else cluster_dfs,
+        score_df,  # type: ignore
+    )
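The new `agg_fit` accepts either a single k or a collection, fanning list inputs out over a thread pool and returning a silhouette table sorted best-first. A usage sketch mirroring the docstring's own example, with the iris stand-in again an assumption; note that a list is expanded with `range(n_clusters[0], n_clusters[-1])`, so the upper bound is exclusive:

```python
from sklearn.datasets import load_iris
from hossam import hs_cluster

data = load_iris(as_frame=True).data

# A single k returns one estimator and one labelled dataframe.
estimator, df, score_df = hs_cluster.agg_fit(data, n_clusters=3, linkage="ward")

# A list expands via range(n_clusters[0], n_clusters[-1]), upper bound
# exclusive, and lists of estimators/dataframes come back instead.
estimators, cluster_dfs, score_df = hs_cluster.agg_fit(data, n_clusters=[2, 5])
print(score_df)  # columns k and silhouette_score, sorted best-first
```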