PyPI - hossam - Versions diffs - 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl - Mend

hossam 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

hossam/__init__.py +19 -0
hossam/hs_cluster copy.py +1060 -0
hossam/hs_cluster.py +369 -128
hossam/hs_plot.py +244 -13
hossam/hs_prep.py +241 -56
hossam/hs_stats.py +39 -2
hossam/hs_util.py +20 -0
{hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/METADATA +1 -1
hossam-0.4.18.dist-info/RECORD +18 -0
hossam-0.4.16.dist-info/RECORD +0 -17
{hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/WHEEL +0 -0
{hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/licenses/LICENSE +0 -0
{hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/top_level.txt +0 -0

hossam/hs_cluster.py CHANGED Viewed

@@ -1,25 +1,29 @@
 # -*- coding: utf-8 -*-
 # ===================================================================
-# 패키지 참조
+# 파이썬 기본 패키지 참조
 # ===================================================================
 import numpy as np
 import concurrent.futures as futures
-from . import hs_plot
 from tqdm.auto import tqdm
 from itertools import combinations
 from typing import Literal, Callable
+# ===================================================================
+# 데이터 분석 패키지 참조
+# ===================================================================
 from kneed import KneeLocator
 from pandas import Series, DataFrame, MultiIndex, concat
 from matplotlib.pyplot import Axes  # type: ignore
-from sklearn.cluster import KMeans, DBSCAN
+from scipy.stats import normaltest
+from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
 from sklearn.neighbors import NearestNeighbors
 from sklearn.metrics import silhouette_score, adjusted_rand_score
-from scipy.stats import normaltest
+# ===================================================================
+# hossam 패키지 참조
+# ===================================================================
+from . import hs_plot
+from .hs_util import is_2d
 RANDOM_STATE = 52
@@ -28,16 +32,20 @@ RANDOM_STATE = 52
 # K-평균 군집화 모델을 적합하는 함수.
 # ===================================================================
 def kmeans_fit(
-    data: DataFrame, n_clusters: int, random_state: int = RANDOM_STATE, plot: bool = False,
-    fields: list[list[str]] | None = None,
-    **params
-) -> tuple[KMeans, DataFrame]:
+    data: DataFrame,
+    n_clusters: int | None = None,
+    k_range: list | tuple = [2, 11],
+    random_state: int = RANDOM_STATE,
+    plot: bool = False,
+    fields: list[str] | tuple[str] | tuple[tuple[str]] | list[list[str]] | None = None,
+    **params,
+) -> tuple[KMeans, DataFrame, float]:
     """
     K-평균 군집화 모델을 적합하는 함수.
     Args:
         data (DataFrame): 군집화할 데이터프레임.
-        n_clusters (int): 군집 개수.
+        n_clusters (int | None): 군집 개수.
         random_state (int, optional): 랜덤 시드. 기본값은 RANDOM_STATE.
         plot (bool, optional): True면 결과를 시각화함. 기본값 False.
         fields (list[list[str]] | None, optional): 시각화할 필드 쌍 리스트. 기본값 None이면 수치형 컬럼의 모든 조합 사용.
@@ -46,21 +54,41 @@ def kmeans_fit(
     Returns:
         KMeans: 적합된 KMeans 모델.
         DataFrame: 클러스터 결과가 포함된 데이터 프레임
+        float: 실루엣 점수
     """
     df = data.copy()
+    if n_clusters is None:
+        n_clusters = kmeans_best_k(data=df, k_range=k_range, random_state=random_state, plot=False)
+        print(f"Best k found: {n_clusters}")
     kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, **params)
     kmeans.fit(data)
     df["cluster"] = kmeans.predict(df)
+    score = float(silhouette_score(X=data, labels=df["cluster"]))
     if plot:
-        cluster_plot(
-            estimator=kmeans,
-            data=data,
-            fields=fields,
-            title=f"K-Means Clustering (k={n_clusters})"
-        )
-    return kmeans, df
+        if not is_2d(fields):
+            fields = [fields]   # type: ignore
+        # cluster_plot(
+        #     estimator=kmeans,
+        #     data=data,
+        #     fields=fields,
+        #     title=f"K-Means Clustering (k={n_clusters})",
+        # )
+        for f in fields:  # type: ignore
+            hs_plot.visualize_silhouette(
+                estimator=kmeans,
+                data=data,
+                xname=f[0],     # type: ignore
+                yname=f[1],     # type: ignore
+                title=f"K-Means Clustering (k={n_clusters})",
+                outline=True,
+            )
+    return kmeans, df, score
 # ===================================================================
@@ -122,7 +150,9 @@ def kmeans_elbow(
     r = range(k_range[0], k_range[1])
     for k in r:
-        kmeans, _ = kmeans_fit(data=data, n_clusters=k, random_state=random_state)
+        kmeans, _, score = kmeans_fit(
+            data=data, n_clusters=k, random_state=random_state
+        )
         inertia_list.append(kmeans.inertia_)
     best_k, _ = elbow_point(
@@ -131,13 +161,17 @@ def kmeans_elbow(
         dir="left,down",
         S=S,
         plot=plot,
-        title=title,
         marker=marker,
         width=width,
         height=height,
         dpi=dpi,
         linewidth=linewidth,
         save_path=save_path,
+        title=(
+            f"K-Means Elbow Method (k={k_range[0]}-{k_range[1]-1}, silhouette={score:.3f})"
+            if title is None
+            else title
+        ),
         ax=ax,
         callback=callback,
         **params,
@@ -206,68 +240,70 @@ def kmeans_silhouette(
         estimators = []
         def __process_k(k):
-            estimator, cdf = kmeans_fit(
+            estimator, cdf, score = kmeans_fit(
                 data=data, n_clusters=k, random_state=random_state
             )
-            s_score = silhouette_score(X=data, labels=cdf["cluster"])
-            return s_score, estimator
+            return score, estimator
         with futures.ThreadPoolExecutor() as executor:
+            executed = []
             for k in klist:
                 pbar.set_description(f"K-Means Silhouette: k={k}")
-                executed = executor.submit(__process_k, k)
-                s_score, estimator = executed.result()
+                executed.append(executor.submit(__process_k, k))
+            for e in executed:
+                s_score, estimator = e.result()
                 silhouettes.append(s_score)
                 estimators.append(estimator)
                 pbar.update(1)
-            if plot is not False:
-                for estimator in estimators:
-                    pbar.set_description(f"K-Means Plotting: k={estimator.n_clusters}")
-                    if plot == "silhouette":
-                        hs_plot.silhouette_plot(
-                            estimator=estimator,
-                            data=data,
-                            title=title,
-                            width=width,
-                            height=height,
-                            dpi=dpi,
-                            linewidth=linewidth,
-                            save_path=save_path,
-                            **params,
-                        )
-                    elif plot == "cluster":
-                        hs_plot.cluster_plot(
-                            estimator=estimator,
-                            data=data,
-                            xname=xname,
-                            yname=yname,
-                            outline=True,
-                            palette=None,
-                            width=width,
-                            height=height,
-                            dpi=dpi,
-                            title=title,
-                            save_path=save_path,
-                        )
-                    elif plot == "both":
-                        hs_plot.visualize_silhouette(
-                            estimator=estimator,
-                            data=data,
-                            xname=xname,
-                            yname=yname,
-                            outline=True,
-                            palette=None,
-                            width=width,
-                            height=height,
-                            dpi=dpi,
-                            title=title,
-                            linewidth=linewidth,
-                            save_path=save_path,
-                        )
-                    pbar.update(1)
+        if plot is not False:
+            for estimator in estimators:
+                pbar.set_description(f"K-Means Plotting: k={estimator.n_clusters}")
+                if plot == "silhouette":
+                    hs_plot.silhouette_plot(
+                        estimator=estimator,
+                        data=data,
+                        title=title,
+                        width=width,
+                        height=height,
+                        dpi=dpi,
+                        linewidth=linewidth,
+                        save_path=save_path,
+                        **params,
+                    )
+                elif plot == "cluster":
+                    hs_plot.cluster_plot(
+                        estimator=estimator,
+                        data=data,
+                        xname=xname,
+                        yname=yname,
+                        outline=True,
+                        palette=None,
+                        width=width,
+                        height=height,
+                        dpi=dpi,
+                        title=title,
+                        save_path=save_path,
+                    )
+                elif plot == "both":
+                    hs_plot.visualize_silhouette(
+                        estimator=estimator,
+                        data=data,
+                        xname=xname,
+                        yname=yname,
+                        outline=True,
+                        palette=None,
+                        width=width,
+                        height=height,
+                        dpi=dpi,
+                        title=title,
+                        linewidth=linewidth,
+                        save_path=save_path,
+                    )
+                pbar.update(1)
     silhouette_df = DataFrame({"k": klist, "silhouette_score": silhouettes})
     silhouette_df.sort_values(by="silhouette_score", ascending=False, inplace=True)
@@ -358,6 +394,7 @@ def elbow_point(
     best_y = kn.elbow_y
     if plot:
         def hvline(ax):
             ax.axvline(best_x, color="red", linestyle="--", linewidth=0.7)
             ax.axhline(best_y, color="red", linestyle="--", linewidth=0.7)
@@ -369,7 +406,7 @@ def elbow_point(
                 ha="center",
                 va="bottom",
                 color="black",
-                fontweight="bold"
+                fontweight="bold",
             )
             if callback is not None:
@@ -398,7 +435,7 @@ def elbow_point(
 # 데이터프레임의 여러 필드 쌍에 대해 군집 산점도를 그리는 함수.
 # ===================================================================
 def cluster_plot(
-    estimator: KMeans,
+    estimator: KMeans | DBSCAN | AgglomerativeClustering,
     data: DataFrame,
     hue: str | None = None,
     vector: str | None = None,
@@ -437,7 +474,7 @@ def cluster_plot(
         from hossam import *
         data = hs_util.load_data('iris')
-        estimator, cdf = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
+        estimator, cdf, score = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
         hs_cluster.cluster_plot(cdf, hue='cluster')
         ```
     """
@@ -454,7 +491,7 @@ def cluster_plot(
         xname, yname = field_pair
         hs_plot.cluster_plot(
-            estimator=estimator,
+            estimator=estimator,    # type: ignore
             data=data,
             xname=xname,
             yname=yname,
@@ -479,7 +516,7 @@ def persona(
     data: DataFrame,
     cluster: str | Series | np.ndarray | list | dict,
     fields: list[str] | None = None,
-    full: bool = False
+    full: bool = False,
 ) -> DataFrame:
     """
     군집화된 데이터프레임에서 각 군집의 페르소나(특성 요약)를 생성하는 함수.
@@ -497,8 +534,8 @@ def persona(
         from hossam import *
         data = hs_util.load_data('iris')
-        data['cluster'] = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)[1]
-        persona_df = hs_cluster.persona(data, hue='cluster')
+        estimator, df, score = hs_cluster.kmeans_fit(data.iloc[:, :-1], n_clusters=3)
+        persona_df = hs_cluster.persona(df, hue='cluster')
         print(persona_df)
         ```
     """
@@ -509,7 +546,9 @@ def persona(
     if isinstance(cluster, str):
         if cluster not in df.columns:
-            raise ValueError(f"cluster로 지정된 컬럼 '{cluster}'이(가) 데이터프레임에 존재하지 않습니다.")
+            raise ValueError(
+                f"cluster로 지정된 컬럼 '{cluster}'이(가) 데이터프레임에 존재하지 않습니다."
+            )
     else:
         df["cluster"] = cluster
         cluster = "cluster"
@@ -525,6 +564,9 @@ def persona(
         persona_dict[("", f"count")] = len(group)
         for field in fields:
+            if field == cluster:
+                continue
             # 명목형일 경우 최빈값 사용
             if df[field].dtype == "object" or df[field].dtype.name == "category":
                 persona_dict[(field, "mode")] = group[field].mode()[0]
@@ -573,7 +615,7 @@ def kmeans_best_k(
     k_range: list | tuple = [2, 11],
     S: float = 0.1,
     random_state: int = RANDOM_STATE,
-    plot: bool = True
+    plot: bool = True,
 ) -> int:
     """
     엘보우 포인트와 실루엣 점수를 통해 최적의 K값을 결정하는 함수.
@@ -600,17 +642,19 @@ def kmeans_best_k(
         k_range=k_range,
         S=S,
         random_state=random_state,
-        plot=True if plot else False
+        plot=True if plot else False,
     )
     silhouette_df = kmeans_silhouette(
         data=data,
         k_range=k_range,
         random_state=random_state,
-        plot="both" if plot else False
+        plot="both" if plot else False,
     )
-    silhouette_k = silhouette_df.sort_values(by="silhouette_score", ascending=False).iloc[0]["k"]
+    silhouette_k = silhouette_df.sort_values(
+        by="silhouette_score", ascending=False
+    ).iloc[0]["k"]
     if elbow_k == silhouette_k:
         best_k = elbow_k
@@ -625,10 +669,7 @@ def kmeans_best_k(
 # DBSCAN 군집화 모델을 적합하는 함수.
 # ===================================================================
 def __dbscan_fit(
-    data: DataFrame,
-    eps: float = 0.5,
-    min_samples: int = 5,
-    **params
+    data: DataFrame, eps: float = 0.5, min_samples: int = 5, **params
 ) -> tuple[DBSCAN, DataFrame, DataFrame]:
     """
     DBSCAN 군집화 모델을 적합하는 함수.
@@ -664,12 +705,14 @@ def __dbscan_fit(
     n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
     noise_ratio = np.mean(labels == -1)
-    result_df = DataFrame({
-        "eps": [eps],
-        "min_samples": [min_samples],
-        "n_clusters": [n_clusters],
-        "noise_ratio": [noise_ratio]
-    })
+    result_df = DataFrame(
+        {
+            "eps": [eps],
+            "min_samples": [min_samples],
+            "n_clusters": [n_clusters],
+            "noise_ratio": [noise_ratio],
+        }
+    )
     return estimator, df, result_df
@@ -691,7 +734,7 @@ def dbscan_eps(
     linewidth: int = hs_plot.config.line_width,
     dpi: int = hs_plot.config.dpi,
     save_path: str | None = None,
-    ax: Axes | None = None
+    ax: Axes | None = None,
 ) -> tuple[float, np.ndarray]:
     """
     DBSCAN 군집화에서 최적의 eps 값을 탐지하는 함수.
@@ -759,15 +802,37 @@ def dbscan_eps(
     return best_eps, eps_grid
+# ===================================================================
+# DBSCAN 군집화 모델을 적합하고 최적의 eps 값을 탐지하는 함수.
+# ===================================================================
 def dbscan_fit(
     data: DataFrame,
     eps: float | list | np.ndarray | None = None,
     min_samples: int = 5,
     ari_threshold: float = 0.9,
     noise_diff_threshold: float = 0.05,
-    plot : bool = True,
-    **params
+    plot: bool = True,
+    **params,
 ) -> tuple[DBSCAN, DataFrame, DataFrame]:
+    """
+    DBSCAN 군집화 모델을 적합하고 최적의 eps 값을 탐지하는 함수.
+    Args:
+        data (DataFrame): 군집화할 데이터프레임.
+        eps (float | list | np.ndarray | None, optional): eps 값 또는 리스트.
+            None이면 최적의 eps 값을 탐지함. 기본값 None.
+        min_samples (int, optional): 핵심점이 되기 위한 최소 샘플수. 기본값 5.
+        ari_threshold (float, optional): 안정 구간 탐지를 위한 ARI 임계값. 기본값 0.9.
+        noise_diff_threshold (float, optional): 안정 구간 탐지를 위한 노이즈 비율 변화 임계값. 기본값 0.05.
+        plot (bool, optional): True면 결과를 시각화함. 기본값 True.
+        **params: DBSCAN에 전달할 추가 파라미터.
+    Returns:
+        tuple: (estimator, cluster_df, result_df)
+            - estimator: 적합된 DBSCAN 모델 또는 모델 리스트(최적 eps가 여러 개인 경우).
+            - cluster_df: 클러스터 및 벡터 유형이 포함된 데이터 프레임 또는 데이터 프레임 리스트(최적 eps가 여러 개인 경우).
+            - result_df: eps 값에 따른 군집화 요약 통계 데이터 프레임.
+    """
     # eps 값이 지정되지 않은 경우 최적의 eps 탐지
     if eps is None:
@@ -782,51 +847,62 @@ def dbscan_fit(
     cluster_dfs = []
     result_dfs: DataFrame | None = None
-    with tqdm(total=len(eps)+2) as pbar:
+    with tqdm(total=len(eps) + 2) as pbar:
+        pbar.set_description(f"DBSCAN Clustering")
         with futures.ThreadPoolExecutor() as executor:
+            executers = []
             for i, e in enumerate(eps):
-                pbar.set_description(f"DBSCAN Fit: eps={e:.4f}")
-                executed = executor.submit(__dbscan_fit, data=data, eps=e, min_samples=min_samples, **params)
-                estimator, cluster_df, result_df = executed.result()
+                executers.append(
+                    executor.submit(
+                        __dbscan_fit,
+                        data=data,
+                        eps=e,
+                        min_samples=min_samples,
+                        **params,
+                    )
+                )
+            for i, e in enumerate(executers):
+                estimator, cluster_df, result_df = e.result()
                 estimators.append(estimator)
                 cluster_dfs.append(cluster_df)
                 if result_dfs is None:
-                    result_df['ARI'] = np.nan
+                    result_df["ARI"] = np.nan
                     result_dfs = result_df
                 else:
-                    result_df['ARI'] = adjusted_rand_score(cluster_dfs[i-1]['cluster'], cluster_df['cluster']) # type: ignore
+                    result_df["ARI"] = adjusted_rand_score(cluster_dfs[i - 1]["cluster"], cluster_df["cluster"])  # type: ignore
                     result_dfs = concat([result_dfs, result_df], ignore_index=True)
                 pbar.update(1)
-            pbar.set_description(f"DBSCAN Stability Analysis")
-            result_dfs['cluster_diff'] = result_dfs['n_clusters'].diff().abs()      # type: ignore
-            result_dfs['noise_ratio_diff'] = result_dfs['noise_ratio'].diff().abs()    # type: ignore
-            result_dfs['stable'] = ( # type: ignore
-                (result_dfs['ARI'] >= ari_threshold) & # type: ignore
-                (result_dfs['cluster_diff'] <= 0) & # type: ignore
-                (result_dfs['noise_ratio_diff'] <= noise_diff_threshold) # type: ignore
+            result_dfs["cluster_diff"] = result_dfs["n_clusters"].diff().abs()  # type: ignore
+            result_dfs["noise_ratio_diff"] = result_dfs["noise_ratio"].diff().abs()  # type: ignore
+            result_dfs["stable"] = (  # type: ignore
+                (result_dfs["ARI"] >= ari_threshold)  # type: ignore
+                & (result_dfs["cluster_diff"] <= 0)  # type: ignore
+                & (result_dfs["noise_ratio_diff"] <= noise_diff_threshold)  # type: ignore
             )
             # 첫 행은 비교 불가
-            result_dfs.loc[0, 'stable'] = False # type: ignore
+            result_dfs.loc[0, "stable"] = False  # type: ignore
             pbar.update(1)
             if len(eps) == 1:
-                result_dfs['group_id'] = 1  # type: ignore
-                result_dfs['recommand'] = 'unknown' # type: ignore
+                result_dfs["group_id"] = 1  # type: ignore
+                result_dfs["recommand"] = "unknown"  # type: ignore
             else:
                 # 안정구간 도출하기
                 # stable 여부를 0/1로 변환
-                stable_flag = result_dfs['stable'].astype(int).values  # type: ignore
+                stable_flag = result_dfs["stable"].astype(int).values  # type: ignore
                 # 연속 구간 구분용 그룹 id 생성
-                group_id = (stable_flag != np.roll(stable_flag, 1)).cumsum()    # type: ignore
-                result_dfs['group_id'] = group_id  # type: ignore
+                group_id = (stable_flag != np.roll(stable_flag, 1)).cumsum()  # type: ignore
+                result_dfs["group_id"] = group_id  # type: ignore
                 # 안정구간 중 가장 긴 구간 선택
-                stable_groups = result_dfs[result_dfs['stable']].groupby('group_id')  # type: ignore
+                stable_groups = result_dfs[result_dfs["stable"]].groupby("group_id")  # type: ignore
                 # 각 구간의 길이 계산
                 group_sizes = stable_groups.size()
@@ -834,23 +910,188 @@ def dbscan_fit(
                 # 가장 긴 안정 구간 선택
                 best_group_id = group_sizes.idxmax()
-                result_dfs['recommand'] = 'bad' # type: ignore
+                result_dfs["recommand"] = "bad"  # type: ignore
                 # 가장 긴 안정 구간에 해당하는 recommand 컬럼을 `best`로 변경
-                result_dfs.loc[result_dfs["group_id"] == best_group_id, 'recommand'] = 'best' # type: ignore
+                result_dfs.loc[result_dfs["group_id"] == best_group_id, "recommand"] = "best"  # type: ignore
                 # result_dfs에서 recommand가 best에 해당하는 인덱스와 같은 위치의 추정기만 추출
-                best_indexes = list(result_dfs[result_dfs['recommand'] == 'best'].index) # type: ignore
+                best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index)  # type: ignore
-                for i in range(len(estimators)-1, -1, -1):
-                    if i not in best_indexes:
-                        del(estimators[i])
-                        del(cluster_dfs[i])
+                # for i in range(len(estimators) - 1, -1, -1):
+                #     if i not in best_indexes:
+                #         del estimators[i]
+                #         del cluster_dfs[i]
             pbar.update(1)
+    # best 모델 선정: recommand=='best'인 인덱스의 estimator/cluster_df만 반환
+    if len(estimators) == 1:
+        if plot:
+            hs_plot.scatterplot(
+                df=cluster_dfs[0],
+                xname=cluster_dfs[0].columns[0],
+                yname=cluster_dfs[0].columns[1],
+                hue="cluster",
+                vector="vector",
+                title=f"DBSCAN Clustering (eps={estimators[0].eps}, min_samples={estimators[0].min_samples})",
+                outline=True
+            )
+        return estimators[0], cluster_dfs[0], result_dfs # type: ignore
+    # recommand=='best'인 인덱스 추출 (여러 개면 첫 번째)
+    best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index) # type: ignore
+    if not best_indexes:
+        # fallback: 첫 번째
+        best_index = 0
+    else:
+        best_index = best_indexes[0]
+    best_estimator = estimators[best_index]
+    best_cluster_df = cluster_dfs[best_index]
+    if plot:
+        hs_plot.scatterplot(
+            df=best_cluster_df,
+            xname=best_cluster_df.columns[0],
+            yname=best_cluster_df.columns[1],
+            hue="cluster",
+            vector="vector",
+            title=f"DBSCAN Clustering (eps={best_estimator.eps}, min_samples={best_estimator.min_samples})",
+            outline=True
+        )
+    return best_estimator, best_cluster_df, result_dfs # type: ignore
+# ===================================================================
+# 단일 계층적 군집화 모델을 적합하는 함수.
+# ===================================================================
+def __agg_fit(
+    data: DataFrame,
+    n_clusters: int = 3,
+    linkage: Literal["ward", "complete", "average", "single"] = "ward",
+    plot: bool = False,
+    compute_distances: bool = True,
+    **params,
+) -> tuple[AgglomerativeClustering, DataFrame, float]:
+    """
+    단일 계층적 군집화 모델을 적합하는 함수.
+    Args:
+        data (DataFrame): 군집화할 데이터프레임.
+        n_clusters (int, optional): 군집 개수. 기본값 3.
+        linkage (str, optional): 병합 기준. 기본값 "ward".
+        compute_distances (bool, optional): 거리 행렬 계산 여부. 기본값 True.
+        plot (bool, optional): True면 결과를 시각화함. 기본값 False.
+        **params: AgglomerativeClustering에 전달할 추가 파라미터.
+    Returns:
+        tuple: (estimator, df, score)
+            - estimator: 적합된 AgglomerativeClustering 모델.
+            - df: 클러스터 결과가 포함된 데이터 프레임.
+            - score: 실루엣 점수.
+    """
+    df = data.copy()
+    estimator = AgglomerativeClustering(
+        n_clusters=n_clusters, compute_distances=compute_distances, linkage=linkage, **params
+    )
+    estimator.fit(data)
+    df["cluster"] = estimator.labels_
+    score = float(silhouette_score(X=data, labels=df["cluster"]))
+    if plot:
+        hs_plot.visualize_silhouette(estimator=estimator, data=data)
+    return estimator, df, score
+def agg_fit(
+    data: DataFrame,
+    n_clusters: int | list[int] | np.ndarray = 3,
+    linkage: Literal["ward", "complete", "average", "single"] = "ward",
+    plot: bool = False,
+    **params,
+) -> tuple[AgglomerativeClustering | list[AgglomerativeClustering], DataFrame | list[DataFrame], DataFrame]:
+    """
+    계층적 군집화 모델을 적합하는 함수.
+    Args:
+        data (DataFrame): 군집화할 데이터프레임.
+        n_clusters (int | list[int] | np.ndarray, optional): 군집 개수 또는 개수 리스트. 기본값 3.
+        linkage (str, optional): 병합 기준. 기본값 "ward".
+        plot (bool, optional): True면 결과를 시각화함. 기본값 False.
+        **params: AgglomerativeClustering에 전달할 추가 파라미터.
+    Returns:
+        tuple: (estimator(s), df(s), score_df)
+            - estimator(s): 적합된 AgglomerativeClustering 모델 또는 모델 리스트 (n_clusters가 리스트일 때 리턴도 리스트로 처리됨).
+            - df(s): 클러스터 결과가 포함된 데이터 프레임 또는 데이터 프레임 리스트(n_cluseters가 리스트일 때 리턴되 리스트로 처리됨).
+            - score_df: 각 군집 개수에 대한 실루엣 점수 데이터프레임.
+    Examples:
+        ```python
+        from hossam import *
+        data = hs_util.load_data('iris')
+        estimators, cluster_dfs, score_df = hs_cluster.agg_fit(data.iloc[:, :-1], n_clusters=[2,3,4])
+        ```
+    """
+    compute_distances = False
+    if isinstance(n_clusters, int):
+        n_clusters = [n_clusters]
+        compute_distances = True
+    else:
+        n_clusters = list(range(n_clusters[0], n_clusters[-1]))
+    estimators = []
+    cluster_dfs = []
+    scores = []
+    with tqdm(total=len(n_clusters)*2) as pbar:
+        pbar.set_description(f"Agglomerative Clustering")
+        with futures.ThreadPoolExecutor() as executor:
+            executers = []
+            for k in n_clusters:
+                executers.append(
+                    executor.submit(
+                        __agg_fit,
+                        data=data,
+                        n_clusters=k,
+                        linkage=linkage,
+                        plot=False,
+                        compute_distances=compute_distances,
+                        **params,
+                    )
+                )
+                pbar.update(1)
+            for e in executers:
+                estimator, cluster_df, score = e.result()
+                estimators.append(estimator)
+                cluster_dfs.append(cluster_df)
+                scores.append({"k": estimator.n_clusters, "silhouette_score": score})
+                if plot:
+                    hs_plot.visualize_silhouette(
+                        estimator=estimator,
+                        data=data,
+                        outline=True,
+                        title=f"Agglomerative Clustering Silhouette (k={estimator.n_clusters})",
+                    )
+                pbar.update(1)
+    score_df = DataFrame(scores)
+    score_df.sort_values(by="silhouette_score", ascending=False, inplace=True)
     return (
         estimators[0] if len(estimators) == 1 else estimators,  # type: ignore
         cluster_dfs[0] if len(cluster_dfs) == 1 else cluster_dfs,
-        result_dfs  # type: ignore
-    )
+        score_df,  # type: ignore
+    )

hossam 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl

hossam 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl