hossam 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hossam/hs_cluster.py CHANGED
@@ -1,25 +1,28 @@
  # -*- coding: utf-8 -*-
  # ===================================================================
- # Package imports
+ # Python standard package imports
  # ===================================================================
  import numpy as np
  import concurrent.futures as futures
-
- from . import hs_plot
-
  from tqdm.auto import tqdm
  from itertools import combinations
-
  from typing import Literal, Callable
+
+ # ===================================================================
+ # Data analysis package imports
+ # ===================================================================
  from kneed import KneeLocator
  from pandas import Series, DataFrame, MultiIndex, concat
  from matplotlib.pyplot import Axes  # type: ignore
-
- from sklearn.cluster import KMeans, DBSCAN
+ from scipy.stats import normaltest
+ from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
  from sklearn.neighbors import NearestNeighbors
  from sklearn.metrics import silhouette_score, adjusted_rand_score

- from scipy.stats import normaltest
+ # ===================================================================
+ # hossam package imports
+ # ===================================================================
+ from . import hs_plot

  RANDOM_STATE = 52

@@ -28,10 +31,13 @@ RANDOM_STATE = 52
  # Function that fits a K-means clustering model.
  # ===================================================================
  def kmeans_fit(
-     data: DataFrame, n_clusters: int, random_state: int = RANDOM_STATE, plot: bool = False,
+     data: DataFrame,
+     n_clusters: int,
+     random_state: int = RANDOM_STATE,
+     plot: bool = False,
      fields: list[list[str]] | None = None,
-     **params
- ) -> tuple[KMeans, DataFrame]:
+     **params,
+ ) -> tuple[KMeans, DataFrame, float]:
      """
      Fits a K-means clustering model.

@@ -46,21 +52,23 @@ def kmeans_fit(
      Returns:
          KMeans: The fitted KMeans model.
          DataFrame: Data frame containing the cluster assignments
+         float: Silhouette score
      """
      df = data.copy()
      kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, **params)
      kmeans.fit(data)
      df["cluster"] = kmeans.predict(df)
+     score = float(silhouette_score(X=data, labels=df["cluster"]))

      if plot:
          cluster_plot(
              estimator=kmeans,
              data=data,
              fields=fields,
-             title=f"K-Means Clustering (k={n_clusters})"
+             title=f"K-Means Clustering (k={n_clusters})",
          )

-     return kmeans, df
+     return kmeans, df, score


  # ===================================================================
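With 0.4.17, `kmeans_fit` returns the silhouette score as a third element instead of leaving callers to compute it. A minimal usage sketch, assuming the `hs_util.load_data('iris')` helper shown in the package's own docstring examples:

```python
# Sketch of the widened kmeans_fit return in 0.4.17; hs_util.load_data('iris')
# is taken from the package's docstring examples, not verified here.
from hossam import *

data = hs_util.load_data('iris')
kmeans, cdf, score = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
print(f"silhouette={score:.3f}")  # no separate silhouette_score call needed
```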
@@ -122,7 +130,9 @@ def kmeans_elbow(
      r = range(k_range[0], k_range[1])

      for k in r:
-         kmeans, _ = kmeans_fit(data=data, n_clusters=k, random_state=random_state)
+         kmeans, _, score = kmeans_fit(
+             data=data, n_clusters=k, random_state=random_state
+         )
          inertia_list.append(kmeans.inertia_)

      best_k, _ = elbow_point(
@@ -131,13 +141,17 @@ def kmeans_elbow(
          dir="left,down",
          S=S,
          plot=plot,
-         title=title,
          marker=marker,
          width=width,
          height=height,
          dpi=dpi,
          linewidth=linewidth,
          save_path=save_path,
+         title=(
+             f"K-Means Elbow Method (k={k_range[0]}-{k_range[1]-1}, silhouette={score:.3f})"
+             if title is None
+             else title
+         ),
          ax=ax,
          callback=callback,
          **params,
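`kmeans_elbow` hands the inertia curve to kneed's `KneeLocator` via `elbow_point` (shown further below). A self-contained sketch of that detection with illustrative inertia values; the `curve`/`direction` arguments are the standard kneed setup for a decreasing convex inertia curve, which is an assumption about what `dir="left,down"` maps to internally:

```python
# Standalone sketch of elbow detection with kneed; the inertia values are
# made up for illustration, not taken from a real fit.
from kneed import KneeLocator

ks = list(range(2, 11))
inertias = [900, 500, 300, 210, 170, 150, 140, 135, 132]

# Convex + decreasing is the usual KneeLocator configuration for a K-means
# inertia curve; assumed to be what dir="left,down" translates to.
kn = KneeLocator(ks, inertias, curve="convex", direction="decreasing", S=0.1)
print(kn.elbow, kn.elbow_y)  # detected best k and the inertia at that k
```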
@@ -206,68 +220,70 @@ def kmeans_silhouette(
      estimators = []

      def __process_k(k):
-         estimator, cdf = kmeans_fit(
+         estimator, cdf, score = kmeans_fit(
              data=data, n_clusters=k, random_state=random_state
          )
-         s_score = silhouette_score(X=data, labels=cdf["cluster"])
-         return s_score, estimator
+         return score, estimator

      with futures.ThreadPoolExecutor() as executor:
+         executed = []
          for k in klist:
              pbar.set_description(f"K-Means Silhouette: k={k}")
-             executed = executor.submit(__process_k, k)
-             s_score, estimator = executed.result()
+             executed.append(executor.submit(__process_k, k))
+
+         for e in executed:
+             s_score, estimator = e.result()
              silhouettes.append(s_score)
              estimators.append(estimator)
              pbar.update(1)

-     if plot is not False:
-         for estimator in estimators:
-             pbar.set_description(f"K-Means Plotting: k={estimator.n_clusters}")
-
-             if plot == "silhouette":
-                 hs_plot.silhouette_plot(
-                     estimator=estimator,
-                     data=data,
-                     title=title,
-                     width=width,
-                     height=height,
-                     dpi=dpi,
-                     linewidth=linewidth,
-                     save_path=save_path,
-                     **params,
-                 )
-             elif plot == "cluster":
-                 hs_plot.cluster_plot(
-                     estimator=estimator,
-                     data=data,
-                     xname=xname,
-                     yname=yname,
-                     outline=True,
-                     palette=None,
-                     width=width,
-                     height=height,
-                     dpi=dpi,
-                     title=title,
-                     save_path=save_path,
-                 )
-             elif plot == "both":
-                 hs_plot.visualize_silhouette(
-                     estimator=estimator,
-                     data=data,
-                     xname=xname,
-                     yname=yname,
-                     outline=True,
-                     palette=None,
-                     width=width,
-                     height=height,
-                     dpi=dpi,
-                     title=title,
-                     linewidth=linewidth,
-                     save_path=save_path,
-                 )
-
-             pbar.update(1)
+         if plot is not False:
+             for estimator in estimators:
+                 pbar.set_description(f"K-Means Plotting: k={estimator.n_clusters}")
+
+                 if plot == "silhouette":
+                     hs_plot.silhouette_plot(
+                         estimator=estimator,
+                         data=data,
+                         title=title,
+                         width=width,
+                         height=height,
+                         dpi=dpi,
+                         linewidth=linewidth,
+                         save_path=save_path,
+                         **params,
+                     )
+                 elif plot == "cluster":
+                     hs_plot.cluster_plot(
+                         estimator=estimator,
+                         data=data,
+                         xname=xname,
+                         yname=yname,
+                         outline=True,
+                         palette=None,
+                         width=width,
+                         height=height,
+                         dpi=dpi,
+                         title=title,
+                         save_path=save_path,
+                     )
+                 elif plot == "both":
+                     hs_plot.visualize_silhouette(
+                         estimator=estimator,
+                         data=data,
+                         xname=xname,
+                         yname=yname,
+                         outline=True,
+                         palette=None,
+                         width=width,
+                         height=height,
+                         dpi=dpi,
+                         title=title,
+                         linewidth=linewidth,
+                         save_path=save_path,
+                     )
+
+                 pbar.update(1)

      silhouette_df = DataFrame({"k": klist, "silhouette_score": silhouettes})
      silhouette_df.sort_values(by="silhouette_score", ascending=False, inplace=True)
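The rewritten loop above submits every `__process_k` job before reading any result, so the thread pool can actually run fits concurrently; the old code called `.result()` immediately after each `submit`, which serialized the work. The pattern in isolation, reduced to standard-library pieces with a stand-in worker:

```python
# Submit-all-then-collect pattern adopted in kmeans_silhouette; work() is a
# placeholder for a kmeans_fit call.
import concurrent.futures as futures

def work(k: int) -> int:
    return k * k  # stand-in for fitting a model with k clusters

with futures.ThreadPoolExecutor() as executor:
    pending = [executor.submit(work, k) for k in range(2, 6)]  # queue everything first
    results = [f.result() for f in pending]                    # then collect, in submission order

print(results)  # [4, 9, 16, 25]
```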
@@ -358,6 +374,7 @@ def elbow_point(
      best_y = kn.elbow_y

      if plot:
+
          def hvline(ax):
              ax.axvline(best_x, color="red", linestyle="--", linewidth=0.7)
              ax.axhline(best_y, color="red", linestyle="--", linewidth=0.7)
@@ -369,7 +386,7 @@ def elbow_point(
                  ha="center",
                  va="bottom",
                  color="black",
-                 fontweight="bold"
+                 fontweight="bold",
              )

          if callback is not None:
@@ -398,7 +415,7 @@ def elbow_point(
  # Function that draws cluster scatter plots for multiple field pairs of a dataframe.
  # ===================================================================
  def cluster_plot(
-     estimator: KMeans,
+     estimator: KMeans | DBSCAN | AgglomerativeClustering,
      data: DataFrame,
      hue: str | None = None,
      vector: str | None = None,
@@ -437,7 +454,7 @@ def cluster_plot(
          from hossam import *

          data = hs_util.load_data('iris')
-         estimator, cdf = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
+         estimator, cdf, score = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
          hs_cluster.cluster_plot(cdf, hue='cluster')
          ```
      """
@@ -454,7 +471,7 @@ def cluster_plot(
          xname, yname = field_pair

          hs_plot.cluster_plot(
-             estimator=estimator,
+             estimator=estimator,  # type: ignore
              data=data,
              xname=xname,
              yname=yname,
@@ -479,7 +496,7 @@ def persona(
      data: DataFrame,
      cluster: str | Series | np.ndarray | list | dict,
      fields: list[str] | None = None,
-     full: bool = False
+     full: bool = False,
  ) -> DataFrame:
      """
      Generates a persona (feature summary) for each cluster from a clustered dataframe.
@@ -497,8 +514,8 @@ def persona(
          from hossam import *

          data = hs_util.load_data('iris')
-         data['cluster'] = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)[1]
-         persona_df = hs_cluster.persona(data, hue='cluster')
+         estimator, df, score = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
+         persona_df = hs_cluster.persona(df, hue='cluster')
          print(persona_df)
          ```
      """
@@ -509,7 +526,9 @@ def persona(

      if isinstance(cluster, str):
          if cluster not in df.columns:
-             raise ValueError(f"The column '{cluster}' given as cluster does not exist in the dataframe.")
+             raise ValueError(
+                 f"The column '{cluster}' given as cluster does not exist in the dataframe."
+             )
      else:
          df["cluster"] = cluster
          cluster = "cluster"
@@ -573,7 +592,7 @@ def kmeans_best_k(
      k_range: list | tuple = [2, 11],
      S: float = 0.1,
      random_state: int = RANDOM_STATE,
-     plot: bool = True
+     plot: bool = True,
  ) -> int:
      """
      Determines the optimal K value from the elbow point and silhouette scores.
@@ -600,17 +619,19 @@ def kmeans_best_k(
          k_range=k_range,
          S=S,
          random_state=random_state,
-         plot=True if plot else False
+         plot=True if plot else False,
      )

      silhouette_df = kmeans_silhouette(
          data=data,
          k_range=k_range,
          random_state=random_state,
-         plot="both" if plot else False
+         plot="both" if plot else False,
      )

-     silhouette_k = silhouette_df.sort_values(by="silhouette_score", ascending=False).iloc[0]["k"]
+     silhouette_k = silhouette_df.sort_values(
+         by="silhouette_score", ascending=False
+     ).iloc[0]["k"]

      if elbow_k == silhouette_k:
          best_k = elbow_k
@@ -625,10 +646,7 @@ def kmeans_best_k(
  # Function that fits a DBSCAN clustering model.
  # ===================================================================
  def __dbscan_fit(
-     data: DataFrame,
-     eps: float = 0.5,
-     min_samples: int = 5,
-     **params
+     data: DataFrame, eps: float = 0.5, min_samples: int = 5, **params
  ) -> tuple[DBSCAN, DataFrame, DataFrame]:
      """
      Fits a DBSCAN clustering model.
@@ -664,12 +682,14 @@ def __dbscan_fit(
      n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
      noise_ratio = np.mean(labels == -1)

-     result_df = DataFrame({
-         "eps": [eps],
-         "min_samples": [min_samples],
-         "n_clusters": [n_clusters],
-         "noise_ratio": [noise_ratio]
-     })
+     result_df = DataFrame(
+         {
+             "eps": [eps],
+             "min_samples": [min_samples],
+             "n_clusters": [n_clusters],
+             "noise_ratio": [noise_ratio],
+         }
+     )

      return estimator, df, result_df

@@ -691,7 +711,7 @@ def dbscan_eps(
      linewidth: int = hs_plot.config.line_width,
      dpi: int = hs_plot.config.dpi,
      save_path: str | None = None,
-     ax: Axes | None = None
+     ax: Axes | None = None,
  ) -> tuple[float, np.ndarray]:
      """
      Detects the optimal eps value for DBSCAN clustering.
@@ -759,14 +779,15 @@ def dbscan_eps(

      return best_eps, eps_grid

+
  def dbscan_fit(
      data: DataFrame,
      eps: float | list | np.ndarray | None = None,
      min_samples: int = 5,
      ari_threshold: float = 0.9,
      noise_diff_threshold: float = 0.05,
-     plot : bool = True,
-     **params
+     plot: bool = True,
+     **params,
  ) -> tuple[DBSCAN, DataFrame, DataFrame]:

      # Detect the optimal eps when no value is specified
@@ -782,51 +803,62 @@ def dbscan_fit(
      cluster_dfs = []
      result_dfs: DataFrame | None = None

-     with tqdm(total=len(eps)+2) as pbar:
+     with tqdm(total=len(eps) + 2) as pbar:
+         pbar.set_description(f"DBSCAN Clustering")
+
          with futures.ThreadPoolExecutor() as executor:
+             executers = []
              for i, e in enumerate(eps):
-                 pbar.set_description(f"DBSCAN Fit: eps={e:.4f}")
-                 executed = executor.submit(__dbscan_fit, data=data, eps=e, min_samples=min_samples, **params)
-                 estimator, cluster_df, result_df = executed.result()
+                 executers.append(
+                     executor.submit(
+                         __dbscan_fit,
+                         data=data,
+                         eps=e,
+                         min_samples=min_samples,
+                         **params,
+                     )
+                 )
+
+             for i, e in enumerate(executers):
+                 estimator, cluster_df, result_df = e.result()
                  estimators.append(estimator)
                  cluster_dfs.append(cluster_df)

                  if result_dfs is None:
-                     result_df['ARI'] = np.nan
+                     result_df["ARI"] = np.nan
                      result_dfs = result_df
                  else:
-                     result_df['ARI'] = adjusted_rand_score(cluster_dfs[i-1]['cluster'], cluster_df['cluster'])  # type: ignore
+                     result_df["ARI"] = adjusted_rand_score(cluster_dfs[i - 1]["cluster"], cluster_df["cluster"])  # type: ignore
                      result_dfs = concat([result_dfs, result_df], ignore_index=True)

                  pbar.update(1)

-         pbar.set_description(f"DBSCAN Stability Analysis")
-         result_dfs['cluster_diff'] = result_dfs['n_clusters'].diff().abs()  # type: ignore
-         result_dfs['noise_ratio_diff'] = result_dfs['noise_ratio'].diff().abs()  # type: ignore
-         result_dfs['stable'] = (  # type: ignore
-             (result_dfs['ARI'] >= ari_threshold) &  # type: ignore
-             (result_dfs['cluster_diff'] <= 0) &  # type: ignore
-             (result_dfs['noise_ratio_diff'] <= noise_diff_threshold)  # type: ignore
+         result_dfs["cluster_diff"] = result_dfs["n_clusters"].diff().abs()  # type: ignore
+         result_dfs["noise_ratio_diff"] = result_dfs["noise_ratio"].diff().abs()  # type: ignore
+         result_dfs["stable"] = (  # type: ignore
+             (result_dfs["ARI"] >= ari_threshold)  # type: ignore
+             & (result_dfs["cluster_diff"] <= 0)  # type: ignore
+             & (result_dfs["noise_ratio_diff"] <= noise_diff_threshold)  # type: ignore
          )

          # The first row cannot be compared with anything
-         result_dfs.loc[0, 'stable'] = False  # type: ignore
+         result_dfs.loc[0, "stable"] = False  # type: ignore
          pbar.update(1)

          if len(eps) == 1:
-             result_dfs['group_id'] = 1  # type: ignore
-             result_dfs['recommand'] = 'unknown'  # type: ignore
+             result_dfs["group_id"] = 1  # type: ignore
+             result_dfs["recommand"] = "unknown"  # type: ignore
          else:
              # Derive the stable intervals
              # Convert the stable flag to 0/1
-             stable_flag = result_dfs['stable'].astype(int).values  # type: ignore
+             stable_flag = result_dfs["stable"].astype(int).values  # type: ignore

              # Create group ids that separate consecutive runs
-             group_id = (stable_flag != np.roll(stable_flag, 1)).cumsum()  # type: ignore
-             result_dfs['group_id'] = group_id  # type: ignore
+             group_id = (stable_flag != np.roll(stable_flag, 1)).cumsum()  # type: ignore
+             result_dfs["group_id"] = group_id  # type: ignore

              # Select the longest of the stable intervals
-             stable_groups = result_dfs[result_dfs['stable']].groupby('group_id')  # type: ignore
+             stable_groups = result_dfs[result_dfs["stable"]].groupby("group_id")  # type: ignore

              # Compute the length of each interval
              group_sizes = stable_groups.size()
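The stability screen above marks an eps value stable when its labeling agrees with the previous one (ARI at or above `ari_threshold`), the cluster count does not grow, and the noise ratio moves by no more than `noise_diff_threshold`. A reduced sketch of the ARI comparison on hand-made labelings:

```python
# Isolated sketch of the consecutive-eps ARI check; labels_prev/labels_curr
# stand in for the 'cluster' columns of two adjacent __dbscan_fit results.
import numpy as np
from sklearn.metrics import adjusted_rand_score

labels_prev = np.array([0, 0, 1, 1, -1])  # clustering at eps[i-1]; -1 is DBSCAN noise
labels_curr = np.array([0, 0, 1, 1, 1])   # clustering at eps[i]
ari = adjusted_rand_score(labels_prev, labels_curr)
print(f"ARI={ari:.3f}")  # near 1.0 means the two labelings largely agree
```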
@@ -834,23 +866,154 @@ def dbscan_fit(
              # Choose the longest stable interval
              best_group_id = group_sizes.idxmax()

-             result_dfs['recommand'] = 'bad'  # type: ignore
+             result_dfs["recommand"] = "bad"  # type: ignore

              # Set the recommand column to `best` for the longest stable interval
-             result_dfs.loc[result_dfs["group_id"] == best_group_id, 'recommand'] = 'best'  # type: ignore
+             result_dfs.loc[result_dfs["group_id"] == best_group_id, "recommand"] = "best"  # type: ignore

              # Keep only the estimators at the positions whose indexes have recommand == best in result_dfs
-             best_indexes = list(result_dfs[result_dfs['recommand'] == 'best'].index)  # type: ignore
+             best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index)  # type: ignore

-             for i in range(len(estimators)-1, -1, -1):
+             for i in range(len(estimators) - 1, -1, -1):
                  if i not in best_indexes:
-                     del(estimators[i])
-                     del(cluster_dfs[i])
+                     del estimators[i]
+                     del cluster_dfs[i]

          pbar.update(1)

      return (
          estimators[0] if len(estimators) == 1 else estimators,  # type: ignore
          cluster_dfs[0] if len(cluster_dfs) == 1 else cluster_dfs,
-         result_dfs  # type: ignore
-     )
+         result_dfs,  # type: ignore
+     )
+
+
+ # ===================================================================
+ # Function that fits a single agglomerative (hierarchical) clustering model.
+ # ===================================================================
+ def __agg_fit(
+     data: DataFrame,
+     n_clusters: int = 3,
+     linkage: Literal["ward", "complete", "average", "single"] = "ward",
+     plot: bool = False,
+     compute_distances: bool = True,
+     **params,
+ ) -> tuple[AgglomerativeClustering, DataFrame, float]:
+     """
+     Fits a single agglomerative (hierarchical) clustering model.
+
+     Args:
+         data (DataFrame): Dataframe to cluster.
+         n_clusters (int, optional): Number of clusters. Defaults to 3.
+         linkage (str, optional): Merge criterion. Defaults to "ward".
+         compute_distances (bool, optional): Whether to compute distances. Defaults to True.
+         plot (bool, optional): If True, visualizes the result. Defaults to False.
+         **params: Additional parameters passed to AgglomerativeClustering.
+
+     Returns:
+         tuple: (estimator, df, score)
+             - estimator: The fitted AgglomerativeClustering model.
+             - df: Data frame containing the cluster assignments.
+             - score: Silhouette score.
+
+     """
+     df = data.copy()
+     estimator = AgglomerativeClustering(
+         n_clusters=n_clusters, compute_distances=compute_distances, linkage=linkage, **params
+     )
+     estimator.fit(data)
+     df["cluster"] = estimator.labels_
+     score = float(silhouette_score(X=data, labels=df["cluster"]))
+
+     if plot:
+         hs_plot.visualize_silhouette(estimator=estimator, data=data)
+
+     return estimator, df, score
+
+
+ def agg_fit(
+     data: DataFrame,
+     n_clusters: int | list[int] | np.ndarray = 3,
+     linkage: Literal["ward", "complete", "average", "single"] = "ward",
+     plot: bool = False,
+     **params,
+ ) -> tuple[AgglomerativeClustering | list[AgglomerativeClustering], DataFrame | list[DataFrame], DataFrame]:
+     """
+     Fits agglomerative (hierarchical) clustering models.
+
+     Args:
+         data (DataFrame): Dataframe to cluster.
+         n_clusters (int | list[int] | np.ndarray, optional): Number of clusters or a list of counts. Defaults to 3.
+         linkage (str, optional): Merge criterion. Defaults to "ward".
+         plot (bool, optional): If True, visualizes the results. Defaults to False.
+         **params: Additional parameters passed to AgglomerativeClustering.
+
+     Returns:
+         tuple: (estimator(s), df(s), score_df)
+             - estimator(s): The fitted AgglomerativeClustering model or list of models.
+             - df(s): Data frame(s) containing the cluster assignments.
+             - score_df: Dataframe of silhouette scores for each cluster count.
+
+     Examples:
+         ```python
+         from hossam import *
+
+         data = hs_util.load_data('iris')
+         estimators, cluster_dfs, score_df = hs_cluster.agg_fit(data.iloc[:, :-1], n_clusters=[2,3,4])
+         ```
+     """
+     compute_distances = False
+
+     if isinstance(n_clusters, int):
+         n_clusters = [n_clusters]
+         compute_distances = True
+     else:
+         n_clusters = list(range(n_clusters[0], n_clusters[-1]))
+
+     estimators = []
+     cluster_dfs = []
+     scores = []
+
+     with tqdm(total=len(n_clusters)*2) as pbar:
+         pbar.set_description(f"Agglomerative Clustering")
+
+         with futures.ThreadPoolExecutor() as executor:
+             executers = []
+             for k in n_clusters:
+                 executers.append(
+                     executor.submit(
+                         __agg_fit,
+                         data=data,
+                         n_clusters=k,
+                         linkage=linkage,
+                         plot=False,
+                         compute_distances=compute_distances,
+                         **params,
+                     )
+                 )
+                 pbar.update(1)
+
+             for e in executers:
+                 estimator, cluster_df, score = e.result()
+                 estimators.append(estimator)
+                 cluster_dfs.append(cluster_df)
+                 scores.append({"k": estimator.n_clusters, "silhouette_score": score})
+
+                 if plot:
+                     hs_plot.visualize_silhouette(
+                         estimator=estimator,
+                         data=data,
+                         outline=True,
+                         title=f"Agglomerative Clustering Silhouette (k={estimator.n_clusters})",
+                     )
+
+                 pbar.update(1)
+
+     score_df = DataFrame(scores)
+     score_df.sort_values(by="silhouette_score", ascending=False, inplace=True)
+
+     return (
+         estimators[0] if len(estimators) == 1 else estimators,  # type: ignore
+         cluster_dfs[0] if len(cluster_dfs) == 1 else cluster_dfs,
+         score_df,  # type: ignore
+     )
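A usage sketch for the new `agg_fit`, following the docstring example above. Note that a list argument is converted with `range(n_clusters[0], n_clusters[-1])`, so the last value in the list is itself excluded from the sweep:

```python
# Usage sketch for agg_fit (new in 0.4.17), mirroring its docstring example.
from hossam import *

data = hs_util.load_data('iris')
estimators, cluster_dfs, score_df = hs_cluster.agg_fit(
    data.iloc[:, :-1], n_clusters=[2, 3, 4]  # fits k=2 and k=3; 4 is excluded by range()
)
print(score_df.head(1))  # score_df is sorted by silhouette score, descending
```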