hossam 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hossam/hs_cluster.py CHANGED
@@ -1,25 +1,29 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # ===================================================================
3
- # 패키지 참조
3
+ # 파이썬 기본 패키지 참조
4
4
  # ===================================================================
5
5
  import numpy as np
6
6
  import concurrent.futures as futures
7
-
8
- from . import hs_plot
9
-
10
7
  from tqdm.auto import tqdm
11
8
  from itertools import combinations
12
-
13
9
  from typing import Literal, Callable
10
+
11
+ # ===================================================================
12
+ # 데이터 분석 패키지 참조
13
+ # ===================================================================
14
14
  from kneed import KneeLocator
15
15
  from pandas import Series, DataFrame, MultiIndex, concat
16
16
  from matplotlib.pyplot import Axes # type: ignore
17
-
18
- from sklearn.cluster import KMeans, DBSCAN
17
+ from scipy.stats import normaltest
18
+ from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
19
19
  from sklearn.neighbors import NearestNeighbors
20
20
  from sklearn.metrics import silhouette_score, adjusted_rand_score
21
21
 
22
- from scipy.stats import normaltest
22
+ # ===================================================================
23
+ # hossam 패키지 참조
24
+ # ===================================================================
25
+ from . import hs_plot
26
+ from .hs_util import is_2d
23
27
 
24
28
  RANDOM_STATE = 52
25
29
 
@@ -28,16 +32,20 @@ RANDOM_STATE = 52
28
32
  # K-평균 군집화 모델을 적합하는 함수.
29
33
  # ===================================================================
30
34
  def kmeans_fit(
31
- data: DataFrame, n_clusters: int, random_state: int = RANDOM_STATE, plot: bool = False,
32
- fields: list[list[str]] | None = None,
33
- **params
34
- ) -> tuple[KMeans, DataFrame]:
35
+ data: DataFrame,
36
+ n_clusters: int | None = None,
37
+ k_range: list | tuple = [2, 11],
38
+ random_state: int = RANDOM_STATE,
39
+ plot: bool = False,
40
+ fields: list[str] | tuple[str] | tuple[tuple[str]] | list[list[str]] | None = None,
41
+ **params,
42
+ ) -> tuple[KMeans, DataFrame, float]:
35
43
  """
36
44
  K-평균 군집화 모델을 적합하는 함수.
37
45
 
38
46
  Args:
39
47
  data (DataFrame): 군집화할 데이터프레임.
40
- n_clusters (int): 군집 개수.
48
+ n_clusters (int | None): 군집 개수.
41
49
  random_state (int, optional): 랜덤 시드. 기본값은 RANDOM_STATE.
42
50
  plot (bool, optional): True면 결과를 시각화함. 기본값 False.
43
51
  fields (list[list[str]] | None, optional): 시각화할 필드 쌍 리스트. 기본값 None이면 수치형 컬럼의 모든 조합 사용.
@@ -46,21 +54,41 @@ def kmeans_fit(
46
54
  Returns:
47
55
  KMeans: 적합된 KMeans 모델.
48
56
  DataFrame: 클러스터 결과가 포함된 데이터 프레임
57
+ float: 실루엣 점수
49
58
  """
50
59
  df = data.copy()
60
+
61
+ if n_clusters is None:
62
+ n_clusters = kmeans_best_k(data=df, k_range=k_range, random_state=random_state, plot=False)
63
+ print(f"Best k found: {n_clusters}")
64
+
51
65
  kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, **params)
52
66
  kmeans.fit(data)
53
67
  df["cluster"] = kmeans.predict(df)
68
+ score = float(silhouette_score(X=data, labels=df["cluster"]))
54
69
 
55
70
  if plot:
56
- cluster_plot(
57
- estimator=kmeans,
58
- data=data,
59
- fields=fields,
60
- title=f"K-Means Clustering (k={n_clusters})"
61
- )
62
71
 
63
- return kmeans, df
72
+ if not is_2d(fields):
73
+ fields = [fields] # type: ignore
74
+
75
+ # cluster_plot(
76
+ # estimator=kmeans,
77
+ # data=data,
78
+ # fields=fields,
79
+ # title=f"K-Means Clustering (k={n_clusters})",
80
+ # )
81
+ for f in fields: # type: ignore
82
+ hs_plot.visualize_silhouette(
83
+ estimator=kmeans,
84
+ data=data,
85
+ xname=f[0], # type: ignore
86
+ yname=f[1], # type: ignore
87
+ title=f"K-Means Clustering (k={n_clusters})",
88
+ outline=True,
89
+ )
90
+
91
+ return kmeans, df, score
64
92
 
65
93
 
66
94
  # ===================================================================
@@ -122,7 +150,9 @@ def kmeans_elbow(
122
150
  r = range(k_range[0], k_range[1])
123
151
 
124
152
  for k in r:
125
- kmeans, _ = kmeans_fit(data=data, n_clusters=k, random_state=random_state)
153
+ kmeans, _, score = kmeans_fit(
154
+ data=data, n_clusters=k, random_state=random_state
155
+ )
126
156
  inertia_list.append(kmeans.inertia_)
127
157
 
128
158
  best_k, _ = elbow_point(
@@ -131,13 +161,17 @@ def kmeans_elbow(
131
161
  dir="left,down",
132
162
  S=S,
133
163
  plot=plot,
134
- title=title,
135
164
  marker=marker,
136
165
  width=width,
137
166
  height=height,
138
167
  dpi=dpi,
139
168
  linewidth=linewidth,
140
169
  save_path=save_path,
170
+ title=(
171
+ f"K-Means Elbow Method (k={k_range[0]}-{k_range[1]-1}, silhouette={score:.3f})"
172
+ if title is None
173
+ else title
174
+ ),
141
175
  ax=ax,
142
176
  callback=callback,
143
177
  **params,
@@ -206,68 +240,70 @@ def kmeans_silhouette(
206
240
  estimators = []
207
241
 
208
242
  def __process_k(k):
209
- estimator, cdf = kmeans_fit(
243
+ estimator, cdf, score = kmeans_fit(
210
244
  data=data, n_clusters=k, random_state=random_state
211
245
  )
212
- s_score = silhouette_score(X=data, labels=cdf["cluster"])
213
- return s_score, estimator
246
+ return score, estimator
214
247
 
215
248
  with futures.ThreadPoolExecutor() as executor:
249
+ executed = []
216
250
  for k in klist:
217
251
  pbar.set_description(f"K-Means Silhouette: k={k}")
218
- executed = executor.submit(__process_k, k)
219
- s_score, estimator = executed.result()
252
+ executed.append(executor.submit(__process_k, k))
253
+
254
+ for e in executed:
255
+ s_score, estimator = e.result()
220
256
  silhouettes.append(s_score)
221
257
  estimators.append(estimator)
222
258
  pbar.update(1)
223
259
 
224
- if plot is not False:
225
- for estimator in estimators:
226
- pbar.set_description(f"K-Means Plotting: k={estimator.n_clusters}")
227
-
228
- if plot == "silhouette":
229
- hs_plot.silhouette_plot(
230
- estimator=estimator,
231
- data=data,
232
- title=title,
233
- width=width,
234
- height=height,
235
- dpi=dpi,
236
- linewidth=linewidth,
237
- save_path=save_path,
238
- **params,
239
- )
240
- elif plot == "cluster":
241
- hs_plot.cluster_plot(
242
- estimator=estimator,
243
- data=data,
244
- xname=xname,
245
- yname=yname,
246
- outline=True,
247
- palette=None,
248
- width=width,
249
- height=height,
250
- dpi=dpi,
251
- title=title,
252
- save_path=save_path,
253
- )
254
- elif plot == "both":
255
- hs_plot.visualize_silhouette(
256
- estimator=estimator,
257
- data=data,
258
- xname=xname,
259
- yname=yname,
260
- outline=True,
261
- palette=None,
262
- width=width,
263
- height=height,
264
- dpi=dpi,
265
- title=title,
266
- linewidth=linewidth,
267
- save_path=save_path,
268
- )
269
-
270
- pbar.update(1)
260
+ if plot is not False:
261
+ for estimator in estimators:
262
+ pbar.set_description(f"K-Means Plotting: k={estimator.n_clusters}")
263
+
264
+ if plot == "silhouette":
265
+ hs_plot.silhouette_plot(
266
+ estimator=estimator,
267
+ data=data,
268
+ title=title,
269
+ width=width,
270
+ height=height,
271
+ dpi=dpi,
272
+ linewidth=linewidth,
273
+ save_path=save_path,
274
+ **params,
275
+ )
276
+ elif plot == "cluster":
277
+ hs_plot.cluster_plot(
278
+ estimator=estimator,
279
+ data=data,
280
+ xname=xname,
281
+ yname=yname,
282
+ outline=True,
283
+ palette=None,
284
+ width=width,
285
+ height=height,
286
+ dpi=dpi,
287
+ title=title,
288
+ save_path=save_path,
289
+ )
290
+ elif plot == "both":
291
+ hs_plot.visualize_silhouette(
292
+ estimator=estimator,
293
+ data=data,
294
+ xname=xname,
295
+ yname=yname,
296
+ outline=True,
297
+ palette=None,
298
+ width=width,
299
+ height=height,
300
+ dpi=dpi,
301
+ title=title,
302
+ linewidth=linewidth,
303
+ save_path=save_path,
304
+ )
305
+
306
+ pbar.update(1)
271
307
 
272
308
  silhouette_df = DataFrame({"k": klist, "silhouette_score": silhouettes})
273
309
  silhouette_df.sort_values(by="silhouette_score", ascending=False, inplace=True)
@@ -358,6 +394,7 @@ def elbow_point(
358
394
  best_y = kn.elbow_y
359
395
 
360
396
  if plot:
397
+
361
398
  def hvline(ax):
362
399
  ax.axvline(best_x, color="red", linestyle="--", linewidth=0.7)
363
400
  ax.axhline(best_y, color="red", linestyle="--", linewidth=0.7)
@@ -369,7 +406,7 @@ def elbow_point(
369
406
  ha="center",
370
407
  va="bottom",
371
408
  color="black",
372
- fontweight="bold"
409
+ fontweight="bold",
373
410
  )
374
411
 
375
412
  if callback is not None:
@@ -398,7 +435,7 @@ def elbow_point(
398
435
  # 데이터프레임의 여러 필드 쌍에 대해 군집 산점도를 그리는 함수.
399
436
  # ===================================================================
400
437
  def cluster_plot(
401
- estimator: KMeans,
438
+ estimator: KMeans | DBSCAN | AgglomerativeClustering,
402
439
  data: DataFrame,
403
440
  hue: str | None = None,
404
441
  vector: str | None = None,
@@ -437,7 +474,7 @@ def cluster_plot(
437
474
  from hossam import *
438
475
 
439
476
  data = hs_util.load_data('iris')
440
- estimator, cdf = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
477
+ estimator, cdf, score = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
441
478
  hs_cluster.cluster_plot(cdf, hue='cluster')
442
479
  ```
443
480
  """
@@ -454,7 +491,7 @@ def cluster_plot(
454
491
  xname, yname = field_pair
455
492
 
456
493
  hs_plot.cluster_plot(
457
- estimator=estimator,
494
+ estimator=estimator, # type: ignore
458
495
  data=data,
459
496
  xname=xname,
460
497
  yname=yname,
@@ -479,7 +516,7 @@ def persona(
479
516
  data: DataFrame,
480
517
  cluster: str | Series | np.ndarray | list | dict,
481
518
  fields: list[str] | None = None,
482
- full: bool = False
519
+ full: bool = False,
483
520
  ) -> DataFrame:
484
521
  """
485
522
  군집화된 데이터프레임에서 각 군집의 페르소나(특성 요약)를 생성하는 함수.
@@ -497,8 +534,8 @@ def persona(
497
534
  from hossam import *
498
535
 
499
536
  data = hs_util.load_data('iris')
500
- data['cluster'] = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)[1]
501
- persona_df = hs_cluster.persona(data, hue='cluster')
537
+ estimator, df, score = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
538
+ persona_df = hs_cluster.persona(df, hue='cluster')
502
539
  print(persona_df)
503
540
  ```
504
541
  """
@@ -509,7 +546,9 @@ def persona(
509
546
 
510
547
  if isinstance(cluster, str):
511
548
  if cluster not in df.columns:
512
- raise ValueError(f"cluster로 지정된 컬럼 '{cluster}'이(가) 데이터프레임에 존재하지 않습니다.")
549
+ raise ValueError(
550
+ f"cluster로 지정된 컬럼 '{cluster}'이(가) 데이터프레임에 존재하지 않습니다."
551
+ )
513
552
  else:
514
553
  df["cluster"] = cluster
515
554
  cluster = "cluster"
@@ -525,6 +564,9 @@ def persona(
525
564
  persona_dict[("", f"count")] = len(group)
526
565
 
527
566
  for field in fields:
567
+ if field == cluster:
568
+ continue
569
+
528
570
  # 명목형일 경우 최빈값 사용
529
571
  if df[field].dtype == "object" or df[field].dtype.name == "category":
530
572
  persona_dict[(field, "mode")] = group[field].mode()[0]
@@ -573,7 +615,7 @@ def kmeans_best_k(
573
615
  k_range: list | tuple = [2, 11],
574
616
  S: float = 0.1,
575
617
  random_state: int = RANDOM_STATE,
576
- plot: bool = True
618
+ plot: bool = True,
577
619
  ) -> int:
578
620
  """
579
621
  엘보우 포인트와 실루엣 점수를 통해 최적의 K값을 결정하는 함수.
@@ -600,17 +642,19 @@ def kmeans_best_k(
600
642
  k_range=k_range,
601
643
  S=S,
602
644
  random_state=random_state,
603
- plot=True if plot else False
645
+ plot=True if plot else False,
604
646
  )
605
647
 
606
648
  silhouette_df = kmeans_silhouette(
607
649
  data=data,
608
650
  k_range=k_range,
609
651
  random_state=random_state,
610
- plot="both" if plot else False
652
+ plot="both" if plot else False,
611
653
  )
612
654
 
613
- silhouette_k = silhouette_df.sort_values(by="silhouette_score", ascending=False).iloc[0]["k"]
655
+ silhouette_k = silhouette_df.sort_values(
656
+ by="silhouette_score", ascending=False
657
+ ).iloc[0]["k"]
614
658
 
615
659
  if elbow_k == silhouette_k:
616
660
  best_k = elbow_k
@@ -625,10 +669,7 @@ def kmeans_best_k(
625
669
  # DBSCAN 군집화 모델을 적합하는 함수.
626
670
  # ===================================================================
627
671
  def __dbscan_fit(
628
- data: DataFrame,
629
- eps: float = 0.5,
630
- min_samples: int = 5,
631
- **params
672
+ data: DataFrame, eps: float = 0.5, min_samples: int = 5, **params
632
673
  ) -> tuple[DBSCAN, DataFrame, DataFrame]:
633
674
  """
634
675
  DBSCAN 군집화 모델을 적합하는 함수.
@@ -664,12 +705,14 @@ def __dbscan_fit(
664
705
  n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
665
706
  noise_ratio = np.mean(labels == -1)
666
707
 
667
- result_df = DataFrame({
668
- "eps": [eps],
669
- "min_samples": [min_samples],
670
- "n_clusters": [n_clusters],
671
- "noise_ratio": [noise_ratio]
672
- })
708
+ result_df = DataFrame(
709
+ {
710
+ "eps": [eps],
711
+ "min_samples": [min_samples],
712
+ "n_clusters": [n_clusters],
713
+ "noise_ratio": [noise_ratio],
714
+ }
715
+ )
673
716
 
674
717
  return estimator, df, result_df
675
718
 
@@ -691,7 +734,7 @@ def dbscan_eps(
691
734
  linewidth: int = hs_plot.config.line_width,
692
735
  dpi: int = hs_plot.config.dpi,
693
736
  save_path: str | None = None,
694
- ax: Axes | None = None
737
+ ax: Axes | None = None,
695
738
  ) -> tuple[float, np.ndarray]:
696
739
  """
697
740
  DBSCAN 군집화에서 최적의 eps 값을 탐지하는 함수.
@@ -759,15 +802,37 @@ def dbscan_eps(
759
802
 
760
803
  return best_eps, eps_grid
761
804
 
805
+ # ===================================================================
806
+ # DBSCAN 군집화 모델을 적합하고 최적의 eps 값을 탐지하는 함수.
807
+ # ===================================================================
762
808
  def dbscan_fit(
763
809
  data: DataFrame,
764
810
  eps: float | list | np.ndarray | None = None,
765
811
  min_samples: int = 5,
766
812
  ari_threshold: float = 0.9,
767
813
  noise_diff_threshold: float = 0.05,
768
- plot : bool = True,
769
- **params
814
+ plot: bool = True,
815
+ **params,
770
816
  ) -> tuple[DBSCAN, DataFrame, DataFrame]:
817
+ """
818
+ DBSCAN 군집화 모델을 적합하고 최적의 eps 값을 탐지하는 함수.
819
+
820
+ Args:
821
+ data (DataFrame): 군집화할 데이터프레임.
822
+ eps (float | list | np.ndarray | None, optional): eps 값 또는 리스트.
823
+ None이면 최적의 eps 값을 탐지함. 기본값 None.
824
+ min_samples (int, optional): 핵심점이 되기 위한 최소 샘플수. 기본값 5.
825
+ ari_threshold (float, optional): 안정 구간 탐지를 위한 ARI 임계값. 기본값 0.9.
826
+ noise_diff_threshold (float, optional): 안정 구간 탐지를 위한 노이즈 비율 변화 임계값. 기본값 0.05.
827
+ plot (bool, optional): True면 결과를 시각화함. 기본값 True.
828
+ **params: DBSCAN에 전달할 추가 파라미터.
829
+
830
+ Returns:
831
+ tuple: (estimator, cluster_df, result_df)
832
+ - estimator: 적합된 DBSCAN 모델 또는 모델 리스트(최적 eps가 여러 개인 경우).
833
+ - cluster_df: 클러스터 및 벡터 유형이 포함된 데이터 프레임 또는 데이터 프레임 리스트(최적 eps가 여러 개인 경우).
834
+ - result_df: eps 값에 따른 군집화 요약 통계 데이터 프레임.
835
+ """
771
836
 
772
837
  # eps 값이 지정되지 않은 경우 최적의 eps 탐지
773
838
  if eps is None:
@@ -782,51 +847,62 @@ def dbscan_fit(
782
847
  cluster_dfs = []
783
848
  result_dfs: DataFrame | None = None
784
849
 
785
- with tqdm(total=len(eps)+2) as pbar:
850
+ with tqdm(total=len(eps) + 2) as pbar:
851
+ pbar.set_description(f"DBSCAN Clustering")
852
+
786
853
  with futures.ThreadPoolExecutor() as executor:
854
+ executers = []
787
855
  for i, e in enumerate(eps):
788
- pbar.set_description(f"DBSCAN Fit: eps={e:.4f}")
789
- executed = executor.submit(__dbscan_fit, data=data, eps=e, min_samples=min_samples, **params)
790
- estimator, cluster_df, result_df = executed.result()
856
+ executers.append(
857
+ executor.submit(
858
+ __dbscan_fit,
859
+ data=data,
860
+ eps=e,
861
+ min_samples=min_samples,
862
+ **params,
863
+ )
864
+ )
865
+
866
+ for i, e in enumerate(executers):
867
+ estimator, cluster_df, result_df = e.result()
791
868
  estimators.append(estimator)
792
869
  cluster_dfs.append(cluster_df)
793
870
 
794
871
  if result_dfs is None:
795
- result_df['ARI'] = np.nan
872
+ result_df["ARI"] = np.nan
796
873
  result_dfs = result_df
797
874
  else:
798
- result_df['ARI'] = adjusted_rand_score(cluster_dfs[i-1]['cluster'], cluster_df['cluster']) # type: ignore
875
+ result_df["ARI"] = adjusted_rand_score(cluster_dfs[i - 1]["cluster"], cluster_df["cluster"]) # type: ignore
799
876
  result_dfs = concat([result_dfs, result_df], ignore_index=True)
800
877
 
801
878
  pbar.update(1)
802
879
 
803
- pbar.set_description(f"DBSCAN Stability Analysis")
804
- result_dfs['cluster_diff'] = result_dfs['n_clusters'].diff().abs() # type: ignore
805
- result_dfs['noise_ratio_diff'] = result_dfs['noise_ratio'].diff().abs() # type: ignore
806
- result_dfs['stable'] = ( # type: ignore
807
- (result_dfs['ARI'] >= ari_threshold) & # type: ignore
808
- (result_dfs['cluster_diff'] <= 0) & # type: ignore
809
- (result_dfs['noise_ratio_diff'] <= noise_diff_threshold) # type: ignore
880
+ result_dfs["cluster_diff"] = result_dfs["n_clusters"].diff().abs() # type: ignore
881
+ result_dfs["noise_ratio_diff"] = result_dfs["noise_ratio"].diff().abs() # type: ignore
882
+ result_dfs["stable"] = ( # type: ignore
883
+ (result_dfs["ARI"] >= ari_threshold) # type: ignore
884
+ & (result_dfs["cluster_diff"] <= 0) # type: ignore
885
+ & (result_dfs["noise_ratio_diff"] <= noise_diff_threshold) # type: ignore
810
886
  )
811
887
 
812
888
  # 첫 행은 비교 불가
813
- result_dfs.loc[0, 'stable'] = False # type: ignore
889
+ result_dfs.loc[0, "stable"] = False # type: ignore
814
890
  pbar.update(1)
815
891
 
816
892
  if len(eps) == 1:
817
- result_dfs['group_id'] = 1 # type: ignore
818
- result_dfs['recommand'] = 'unknown' # type: ignore
893
+ result_dfs["group_id"] = 1 # type: ignore
894
+ result_dfs["recommand"] = "unknown" # type: ignore
819
895
  else:
820
896
  # 안정구간 도출하기
821
897
  # stable 여부를 0/1로 변환
822
- stable_flag = result_dfs['stable'].astype(int).values # type: ignore
898
+ stable_flag = result_dfs["stable"].astype(int).values # type: ignore
823
899
 
824
900
  # 연속 구간 구분용 그룹 id 생성
825
- group_id = (stable_flag != np.roll(stable_flag, 1)).cumsum() # type: ignore
826
- result_dfs['group_id'] = group_id # type: ignore
901
+ group_id = (stable_flag != np.roll(stable_flag, 1)).cumsum() # type: ignore
902
+ result_dfs["group_id"] = group_id # type: ignore
827
903
 
828
904
  # 안정구간 중 가장 긴 구간 선택
829
- stable_groups = result_dfs[result_dfs['stable']].groupby('group_id') # type: ignore
905
+ stable_groups = result_dfs[result_dfs["stable"]].groupby("group_id") # type: ignore
830
906
 
831
907
  # 각 구간의 길이 계산
832
908
  group_sizes = stable_groups.size()
@@ -834,23 +910,188 @@ def dbscan_fit(
834
910
  # 가장 긴 안정 구간 선택
835
911
  best_group_id = group_sizes.idxmax()
836
912
 
837
- result_dfs['recommand'] = 'bad' # type: ignore
913
+ result_dfs["recommand"] = "bad" # type: ignore
838
914
 
839
915
  # 가장 긴 안정 구간에 해당하는 recommand 컬럼을 `best`로 변경
840
- result_dfs.loc[result_dfs["group_id"] == best_group_id, 'recommand'] = 'best' # type: ignore
916
+ result_dfs.loc[result_dfs["group_id"] == best_group_id, "recommand"] = "best" # type: ignore
841
917
 
842
918
  # result_dfs에서 recommand가 best에 해당하는 인덱스와 같은 위치의 추정기만 추출
843
- best_indexes = list(result_dfs[result_dfs['recommand'] == 'best'].index) # type: ignore
919
+ best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index) # type: ignore
844
920
 
845
- for i in range(len(estimators)-1, -1, -1):
846
- if i not in best_indexes:
847
- del(estimators[i])
848
- del(cluster_dfs[i])
921
+ # for i in range(len(estimators) - 1, -1, -1):
922
+ # if i not in best_indexes:
923
+ # del estimators[i]
924
+ # del cluster_dfs[i]
849
925
 
850
926
  pbar.update(1)
851
927
 
928
+ # best 모델 선정: recommand=='best'인 인덱스의 estimator/cluster_df만 반환
929
+ if len(estimators) == 1:
930
+
931
+ if plot:
932
+ hs_plot.scatterplot(
933
+ df=cluster_dfs[0],
934
+ xname=cluster_dfs[0].columns[0],
935
+ yname=cluster_dfs[0].columns[1],
936
+ hue="cluster",
937
+ vector="vector",
938
+ title=f"DBSCAN Clustering (eps={estimators[0].eps}, min_samples={estimators[0].min_samples})",
939
+ outline=True
940
+ )
941
+
942
+ return estimators[0], cluster_dfs[0], result_dfs # type: ignore
943
+
944
+ # recommand=='best'인 인덱스 추출 (여러 개면 첫 번째)
945
+ best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index) # type: ignore
946
+ if not best_indexes:
947
+ # fallback: 첫 번째
948
+ best_index = 0
949
+ else:
950
+ best_index = best_indexes[0]
951
+
952
+ best_estimator = estimators[best_index]
953
+ best_cluster_df = cluster_dfs[best_index]
954
+
955
+ if plot:
956
+ hs_plot.scatterplot(
957
+ df=best_cluster_df,
958
+ xname=best_cluster_df.columns[0],
959
+ yname=best_cluster_df.columns[1],
960
+ hue="cluster",
961
+ vector="vector",
962
+ title=f"DBSCAN Clustering (eps={best_estimator.eps}, min_samples={best_estimator.min_samples})",
963
+ outline=True
964
+ )
965
+
966
+ return best_estimator, best_cluster_df, result_dfs # type: ignore
967
+
968
+
969
+ # ===================================================================
970
+ # 단일 계층적 군집화 모델을 적합하는 함수.
971
+ # ===================================================================
972
+ def __agg_fit(
973
+ data: DataFrame,
974
+ n_clusters: int = 3,
975
+ linkage: Literal["ward", "complete", "average", "single"] = "ward",
976
+ plot: bool = False,
977
+ compute_distances: bool = True,
978
+ **params,
979
+ ) -> tuple[AgglomerativeClustering, DataFrame, float]:
980
+ """
981
+ 단일 계층적 군집화 모델을 적합하는 함수.
982
+
983
+ Args:
984
+ data (DataFrame): 군집화할 데이터프레임.
985
+ n_clusters (int, optional): 군집 개수. 기본값 3.
986
+ linkage (str, optional): 병합 기준. 기본값 "ward".
987
+ compute_distances (bool, optional): 거리 행렬 계산 여부. 기본값 True.
988
+ plot (bool, optional): True면 결과를 시각화함. 기본값 False.
989
+ **params: AgglomerativeClustering에 전달할 추가 파라미터.
990
+
991
+ Returns:
992
+ tuple: (estimator, df, score)
993
+ - estimator: 적합된 AgglomerativeClustering 모델.
994
+ - df: 클러스터 결과가 포함된 데이터 프레임.
995
+ - score: 실루엣 점수.
996
+
997
+ """
998
+ df = data.copy()
999
+ estimator = AgglomerativeClustering(
1000
+ n_clusters=n_clusters, compute_distances=compute_distances, linkage=linkage, **params
1001
+ )
1002
+ estimator.fit(data)
1003
+ df["cluster"] = estimator.labels_
1004
+ score = float(silhouette_score(X=data, labels=df["cluster"]))
1005
+
1006
+ if plot:
1007
+ hs_plot.visualize_silhouette(estimator=estimator, data=data)
1008
+
1009
+ return estimator, df, score
1010
+
1011
+
1012
+ def agg_fit(
1013
+ data: DataFrame,
1014
+ n_clusters: int | list[int] | np.ndarray = 3,
1015
+ linkage: Literal["ward", "complete", "average", "single"] = "ward",
1016
+ plot: bool = False,
1017
+ **params,
1018
+ ) -> tuple[AgglomerativeClustering | list[AgglomerativeClustering], DataFrame | list[DataFrame], DataFrame]:
1019
+ """
1020
+ 계층적 군집화 모델을 적합하는 함수.
1021
+
1022
+ Args:
1023
+ data (DataFrame): 군집화할 데이터프레임.
1024
+ n_clusters (int | list[int] | np.ndarray, optional): 군집 개수 또는 개수 리스트. 기본값 3.
1025
+ linkage (str, optional): 병합 기준. 기본값 "ward".
1026
+ plot (bool, optional): True면 결과를 시각화함. 기본값 False.
1027
+ **params: AgglomerativeClustering에 전달할 추가 파라미터.
1028
+
1029
+ Returns:
1030
+ tuple: (estimator(s), df(s), score_df)
1031
+ - estimator(s): 적합된 AgglomerativeClustering 모델 또는 모델 리스트 (n_clusters가 리스트일 때 리턴도 리스트로 처리됨).
1032
+ - df(s): 클러스터 결과가 포함된 데이터 프레임 또는 데이터 프레임 리스트(n_cluseters가 리스트일 때 리턴되 리스트로 처리됨).
1033
+ - score_df: 각 군집 개수에 대한 실루엣 점수 데이터프레임.
1034
+
1035
+ Examples:
1036
+ ```python
1037
+ from hossam import *
1038
+
1039
+ data = hs_util.load_data('iris')
1040
+ estimators, cluster_dfs, score_df = hs_cluster.agg_fit(data.iloc[:, :-1], n_clusters=[2,3,4])
1041
+ ```
1042
+ """
1043
+ compute_distances = False
1044
+
1045
+ if isinstance(n_clusters, int):
1046
+ n_clusters = [n_clusters]
1047
+ compute_distances = True
1048
+ else:
1049
+ n_clusters = list(range(n_clusters[0], n_clusters[-1]))
1050
+
1051
+ estimators = []
1052
+ cluster_dfs = []
1053
+ scores = []
1054
+
1055
+ with tqdm(total=len(n_clusters)*2) as pbar:
1056
+ pbar.set_description(f"Agglomerative Clustering")
1057
+
1058
+ with futures.ThreadPoolExecutor() as executor:
1059
+ executers = []
1060
+ for k in n_clusters:
1061
+ executers.append(
1062
+ executor.submit(
1063
+ __agg_fit,
1064
+ data=data,
1065
+ n_clusters=k,
1066
+ linkage=linkage,
1067
+ plot=False,
1068
+ compute_distances=compute_distances,
1069
+ **params,
1070
+ )
1071
+ )
1072
+ pbar.update(1)
1073
+
1074
+ for e in executers:
1075
+ estimator, cluster_df, score = e.result()
1076
+ estimators.append(estimator)
1077
+ cluster_dfs.append(cluster_df)
1078
+ scores.append({"k": estimator.n_clusters, "silhouette_score": score})
1079
+
1080
+ if plot:
1081
+ hs_plot.visualize_silhouette(
1082
+ estimator=estimator,
1083
+ data=data,
1084
+ outline=True,
1085
+ title=f"Agglomerative Clustering Silhouette (k={estimator.n_clusters})",
1086
+ )
1087
+
1088
+ pbar.update(1)
1089
+
1090
+ score_df = DataFrame(scores)
1091
+ score_df.sort_values(by="silhouette_score", ascending=False, inplace=True)
1092
+
852
1093
  return (
853
1094
  estimators[0] if len(estimators) == 1 else estimators, # type: ignore
854
1095
  cluster_dfs[0] if len(cluster_dfs) == 1 else cluster_dfs,
855
- result_dfs # type: ignore
856
- )
1096
+ score_df, # type: ignore
1097
+ )