PyPI - hossam - Versions diffs - 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl - Mend

hossam 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

hossam/__init__.py +19 -0
hossam/hs_cluster copy.py +1060 -0
hossam/hs_cluster.py +99 -21
hossam/hs_plot.py +23 -11
hossam/hs_stats.py +39 -2
{hossam-0.4.17.dist-info → hossam-0.4.18.dist-info}/METADATA +1 -1
hossam-0.4.18.dist-info/RECORD +18 -0
hossam-0.4.17.dist-info/RECORD +0 -17
{hossam-0.4.17.dist-info → hossam-0.4.18.dist-info}/WHEEL +0 -0
{hossam-0.4.17.dist-info → hossam-0.4.18.dist-info}/licenses/LICENSE +0 -0
{hossam-0.4.17.dist-info → hossam-0.4.18.dist-info}/top_level.txt +0 -0

hossam/hs_cluster.py CHANGED Viewed

@@ -23,6 +23,7 @@ from sklearn.metrics import silhouette_score, adjusted_rand_score
 # hossam 패키지 참조
 # ===================================================================
 from . import hs_plot
+from .hs_util import is_2d
 RANDOM_STATE = 52
@@ -32,10 +33,11 @@ RANDOM_STATE = 52
 # ===================================================================
 def kmeans_fit(
     data: DataFrame,
-    n_clusters: int,
+    n_clusters: int | None = None,
+    k_range: list | tuple = [2, 11],
     random_state: int = RANDOM_STATE,
     plot: bool = False,
-    fields: list[list[str]] | None = None,
+    fields: list[str] | tuple[str] | tuple[tuple[str]] | list[list[str]] | None = None,
     **params,
 ) -> tuple[KMeans, DataFrame, float]:
     """
@@ -43,7 +45,7 @@ def kmeans_fit(
     Args:
         data (DataFrame): 군집화할 데이터프레임.
-        n_clusters (int): 군집 개수.
+        n_clusters (int | None): 군집 개수.
         random_state (int, optional): 랜덤 시드. 기본값은 RANDOM_STATE.
         plot (bool, optional): True면 결과를 시각화함. 기본값 False.
         fields (list[list[str]] | None, optional): 시각화할 필드 쌍 리스트. 기본값 None이면 수치형 컬럼의 모든 조합 사용.
@@ -55,18 +57,36 @@ def kmeans_fit(
         float: 실루엣 점수
     """
     df = data.copy()
+    if n_clusters is None:
+        n_clusters = kmeans_best_k(data=df, k_range=k_range, random_state=random_state, plot=False)
+        print(f"Best k found: {n_clusters}")
     kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, **params)
     kmeans.fit(data)
     df["cluster"] = kmeans.predict(df)
     score = float(silhouette_score(X=data, labels=df["cluster"]))
     if plot:
-        cluster_plot(
-            estimator=kmeans,
-            data=data,
-            fields=fields,
-            title=f"K-Means Clustering (k={n_clusters})",
-        )
+        if not is_2d(fields):
+            fields = [fields]   # type: ignore
+        # cluster_plot(
+        #     estimator=kmeans,
+        #     data=data,
+        #     fields=fields,
+        #     title=f"K-Means Clustering (k={n_clusters})",
+        # )
+        for f in fields:  # type: ignore
+            hs_plot.visualize_silhouette(
+                estimator=kmeans,
+                data=data,
+                xname=f[0],     # type: ignore
+                yname=f[1],     # type: ignore
+                title=f"K-Means Clustering (k={n_clusters})",
+                outline=True,
+            )
     return kmeans, df, score
@@ -544,6 +564,9 @@ def persona(
         persona_dict[("", f"count")] = len(group)
         for field in fields:
+            if field == cluster:
+                continue
             # 명목형일 경우 최빈값 사용
             if df[field].dtype == "object" or df[field].dtype.name == "category":
                 persona_dict[(field, "mode")] = group[field].mode()[0]
@@ -779,7 +802,9 @@ def dbscan_eps(
     return best_eps, eps_grid
+# ===================================================================
+# DBSCAN 군집화 모델을 적합하고 최적의 eps 값을 탐지하는 함수.
+# ===================================================================
 def dbscan_fit(
     data: DataFrame,
     eps: float | list | np.ndarray | None = None,
@@ -789,6 +814,25 @@ def dbscan_fit(
     plot: bool = True,
     **params,
 ) -> tuple[DBSCAN, DataFrame, DataFrame]:
+    """
+    DBSCAN 군집화 모델을 적합하고 최적의 eps 값을 탐지하는 함수.
+    Args:
+        data (DataFrame): 군집화할 데이터프레임.
+        eps (float | list | np.ndarray | None, optional): eps 값 또는 리스트.
+            None이면 최적의 eps 값을 탐지함. 기본값 None.
+        min_samples (int, optional): 핵심점이 되기 위한 최소 샘플수. 기본값 5.
+        ari_threshold (float, optional): 안정 구간 탐지를 위한 ARI 임계값. 기본값 0.9.
+        noise_diff_threshold (float, optional): 안정 구간 탐지를 위한 노이즈 비율 변화 임계값. 기본값 0.05.
+        plot (bool, optional): True면 결과를 시각화함. 기본값 True.
+        **params: DBSCAN에 전달할 추가 파라미터.
+    Returns:
+        tuple: (estimator, cluster_df, result_df)
+            - estimator: 적합된 DBSCAN 모델 또는 모델 리스트(최적 eps가 여러 개인 경우).
+            - cluster_df: 클러스터 및 벡터 유형이 포함된 데이터 프레임 또는 데이터 프레임 리스트(최적 eps가 여러 개인 경우).
+            - result_df: eps 값에 따른 군집화 요약 통계 데이터 프레임.
+    """
     # eps 값이 지정되지 않은 경우 최적의 eps 탐지
     if eps is None:
@@ -874,18 +918,52 @@ def dbscan_fit(
                 # result_dfs에서 recommand가 best에 해당하는 인덱스와 같은 위치의 추정기만 추출
                 best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index)  # type: ignore
-                for i in range(len(estimators) - 1, -1, -1):
-                    if i not in best_indexes:
-                        del estimators[i]
-                        del cluster_dfs[i]
+                # for i in range(len(estimators) - 1, -1, -1):
+                #     if i not in best_indexes:
+                #         del estimators[i]
+                #         del cluster_dfs[i]
             pbar.update(1)
-    return (
-        estimators[0] if len(estimators) == 1 else estimators,  # type: ignore
-        cluster_dfs[0] if len(cluster_dfs) == 1 else cluster_dfs,
-        result_dfs,  # type: ignore
-    )
+    # best 모델 선정: recommand=='best'인 인덱스의 estimator/cluster_df만 반환
+    if len(estimators) == 1:
+        if plot:
+            hs_plot.scatterplot(
+                df=cluster_dfs[0],
+                xname=cluster_dfs[0].columns[0],
+                yname=cluster_dfs[0].columns[1],
+                hue="cluster",
+                vector="vector",
+                title=f"DBSCAN Clustering (eps={estimators[0].eps}, min_samples={estimators[0].min_samples})",
+                outline=True
+            )
+        return estimators[0], cluster_dfs[0], result_dfs # type: ignore
+    # recommand=='best'인 인덱스 추출 (여러 개면 첫 번째)
+    best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index) # type: ignore
+    if not best_indexes:
+        # fallback: 첫 번째
+        best_index = 0
+    else:
+        best_index = best_indexes[0]
+    best_estimator = estimators[best_index]
+    best_cluster_df = cluster_dfs[best_index]
+    if plot:
+        hs_plot.scatterplot(
+            df=best_cluster_df,
+            xname=best_cluster_df.columns[0],
+            yname=best_cluster_df.columns[1],
+            hue="cluster",
+            vector="vector",
+            title=f"DBSCAN Clustering (eps={best_estimator.eps}, min_samples={best_estimator.min_samples})",
+            outline=True
+        )
+    return best_estimator, best_cluster_df, result_dfs # type: ignore
 # ===================================================================
@@ -950,8 +1028,8 @@ def agg_fit(
     Returns:
         tuple: (estimator(s), df(s), score_df)
-            - estimator(s): 적합된 AgglomerativeClustering 모델 또는 모델 리스트.
-            - df(s): 클러스터 결과가 포함된 데이터 프레임 또는 데이터 프레임 리스트.
+            - estimator(s): 적합된 AgglomerativeClustering 모델 또는 모델 리스트 (n_clusters가 리스트일 때 리턴도 리스트로 처리됨).
+            - df(s): 클러스터 결과가 포함된 데이터 프레임 또는 데이터 프레임 리스트(n_cluseters가 리스트일 때 리턴되 리스트로 처리됨).
             - score_df: 각 군집 개수에 대한 실루엣 점수 데이터프레임.
     Examples:

hossam/hs_plot.py CHANGED Viewed

@@ -309,7 +309,7 @@ def lineplot(
 # 상자그림(boxplot)을 그린다
 # ===================================================================
 def boxplot(
-    df: DataFrame,
+    df: DataFrame | None = None,
     xname: str | None = None,
     yname: str | None = None,
     title: str | None = None,
@@ -331,7 +331,7 @@ def boxplot(
     """상자그림(boxplot)을 그린다.
     Args:
-        df (DataFrame): 시각화할 데이터.
+        df (DataFrame|None): 시각화할 데이터.
         xname (str|None): x축 범주 컬럼명.
         yname (str|None): y축 값 컬럼명.
         title (str|None): 그래프 제목.
@@ -359,13 +359,20 @@ def boxplot(
         fig, ax = get_default_ax(width, height, 1, 1, dpi)  # type: ignore
         outparams = True
-    if xname is not None and yname is not None:
+    if xname is not None or yname is not None:
+        if xname is not None and yname is None:
+            orient = "h"
+        elif xname is None and yname is not None:
+            orient = "v"
         boxplot_kwargs = {
             "data": df,
             "x": xname,
             "y": yname,
             "orient": orient,
             "ax": ax,
+            "linewidth": linewidth,
         }
         # hue 파라미터 확인 (params에 있을 수 있음)
@@ -377,12 +384,12 @@ def boxplot(
             boxplot_kwargs["color"] = sb.color_palette(palette)[0]
         boxplot_kwargs.update(params)
-        sb.boxplot(**boxplot_kwargs, linewidth=linewidth)
+        sb.boxplot(**boxplot_kwargs)
         # 통계 검정 추가
         if stat_test is not None:
             if stat_pairs is None:
-                stat_pairs = [df[xname].dropna().unique().tolist()]
+                stat_pairs = [df[xname].dropna().unique().tolist()] # type: ignore
             annotator = Annotator(
                 ax, data=df, x=xname, y=yname, pairs=stat_pairs, orient=orient
@@ -847,15 +854,15 @@ def scatterplot(
     else:
         # 핵심벡터
         scatterplot_kwargs["edgecolor"] = "#ffffff"
-        sb.scatterplot(data=df[df[vector] == "core"], **scatterplot_kwargs)
+        sb.scatterplot(data=df[df[vector] == "core"], **scatterplot_kwargs) # type: ignore
         # 외곽백터
         scatterplot_kwargs["edgecolor"] = "#000000"
         scatterplot_kwargs["s"] = 25
         scatterplot_kwargs["marker"] = "^"
         scatterplot_kwargs["linewidth"] = 0.8
-        sb.scatterplot(data=df[df[vector] == "border"], **scatterplot_kwargs)
+        sb.scatterplot(data=df[df[vector] == "border"], **scatterplot_kwargs) # type: ignore
         # 노이즈벡터
         scatterplot_kwargs["edgecolor"] = None
         scatterplot_kwargs["s"] = 25
@@ -3022,10 +3029,15 @@ def pca_plot(
         if field_group is not None:
             title += " - " + ", ".join(field_group)
+        tdf = DataFrame({
+            field_group[0]: xs * scalex,
+            field_group[1]: ys * scaley,
+        })
         scatterplot(
-            df=None,
-            xname=xs * scalex,
-            yname=ys * scaley,
+            df=tdf,
+            xname=field_group[0],
+            yname=field_group[1],
             hue=pca_df[hue] if hue is not None else None,
             outline=False,
             palette=palette,

hossam/hs_stats.py CHANGED Viewed

@@ -244,6 +244,41 @@ def outlier_table(data: DataFrame, *fields: str):
         outlier_count = ((data[f] < down) | (data[f] > up)).sum()
         outlier_rate = (outlier_count / len(data)) * 100
+        # 왜도
+        skew = data[f].skew()
+        # 이상치 개수 및 비율
+        outlier_count = ((data[f] < down) | (data[f] > up)).sum()
+        outlier_rate = (outlier_count / len(data)) * 100
+        # 분포 특성 판정 (왜도 기준)
+        abs_skew = abs(skew)    # type: ignore
+        if abs_skew < 0.5:      # type: ignore
+            dist = "거의 대칭"
+        elif abs_skew < 1.0:    # type: ignore
+            if skew > 0:        # type: ignore
+                dist = "약한 우측 꼬리"
+            else:
+                dist = "약한 좌측 꼬리"
+        elif abs_skew < 2.0:    # type: ignore
+            if skew > 0:        # type: ignore
+                dist = "중간 우측 꼬리"
+            else:
+                dist = "중간 좌측 꼬리"
+        else:
+            if skew > 0:        # type: ignore
+                dist = "극단 우측 꼬리"
+            else:
+                dist = "극단 좌측 꼬리"
+        # 로그변환 필요성 판정
+        if abs_skew < 0.5:      # type: ignore
+            log_need = "낮음"
+        elif abs_skew < 1.0:    # type: ignore
+            log_need = "중간"
+        else:
+            log_need = "높음"
         iq = {
             "field": f,
             "q1": q1,
@@ -254,9 +289,11 @@ def outlier_table(data: DataFrame, *fields: str):
             "down": down,
             "min": min_value,
             "max": max_value,
-            "skew": skew,
             "outlier_count": outlier_count,
-            "outlier_rate": outlier_rate
+            "outlier_rate": outlier_rate,
+            "skew": skew,
+            "dist": dist,
+            "log_need": log_need
         }
         result.append(iq)

{hossam-0.4.17.dist-info → hossam-0.4.18.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hossam
-Version: 0.4.17
+Version: 0.4.18
 Summary: Hossam Data Helper
 Author-email: Lee Kwang-Ho <leekh4232@gmail.com>
 License-Expression: MIT

hossam-0.4.18.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,18 @@
+hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
+hossam/__init__.py,sha256=_v_Gj01j6nNtDfcxS9B2MB8ZPQ_PVnY9jTkvgRJdVxc,4492
+hossam/hs_classroom.py,sha256=Sb1thy49LKn2zU90aiOVwHWhyWSMHLZbZX7eXmQlquc,27523
+hossam/hs_cluster copy.py,sha256=1pylX0lV_FR1ZhcYN1sosMcAxj5WGLjwgHHjkkCPsCY,39761
+hossam/hs_cluster.py,sha256=wrDPsF14tHwRuwBEUz765j_I87JH5HFtZZoUuGf-W-4,41190
+hossam/hs_gis.py,sha256=DVmndBK-_7GMK3J1_on3ieEQk1S0MfUZ8_wlX-cDdZQ,11581
+hossam/hs_plot.py,sha256=KW9NtD-ecFvas5h58ayEPzV0Eu2Dtt6twhj4z4LhPzA,103127
+hossam/hs_prep.py,sha256=lsK_BxuOXNdTTNED-nkw3EY2vt5nQGMvkWvVLqNkwM0,43255
+hossam/hs_stats.py,sha256=1h3RPOVJ_5EGO5_1yHTE8zpuJyy85xOLPqWmR-xqqmo,121161
+hossam/hs_study.py,sha256=ZzL76_V0IHnk_YUTbWncIIBruOj2Sz3xs91snS6cpu0,2776
+hossam/hs_timeserise.py,sha256=NzGV4bJmVQr3qUFySOP25qENItmloYjgh3VgwSbSmXc,43163
+hossam/hs_util.py,sha256=LJvoTFSLX_uADJrC1ONGM93DX2HS2jcNBnPariLPZko,16704
+hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
+hossam-0.4.18.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
+hossam-0.4.18.dist-info/METADATA,sha256=FUrD5dnNDWgA8dChQq8gmFg1lQ47YKiIdqhItMl6Vi8,3803
+hossam-0.4.18.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+hossam-0.4.18.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
+hossam-0.4.18.dist-info/RECORD,,

hossam-0.4.17.dist-info/RECORD DELETED Viewed

@@ -1,17 +0,0 @@
-hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
-hossam/__init__.py,sha256=lJ-_g2HAmFnixOmKjCv7_cMSdiYwbM6SNlHEtptUlUI,4045
-hossam/hs_classroom.py,sha256=Sb1thy49LKn2zU90aiOVwHWhyWSMHLZbZX7eXmQlquc,27523
-hossam/hs_cluster.py,sha256=yFlEaLz-cEueurw5nvKuUogdwepOEU3jN7woaYeN-cM,37581
-hossam/hs_gis.py,sha256=DVmndBK-_7GMK3J1_on3ieEQk1S0MfUZ8_wlX-cDdZQ,11581
-hossam/hs_plot.py,sha256=nuN8DoQbMRLMxFRO1VD1zhg2yA1NHqFGrjuvS4FLahY,102773
-hossam/hs_prep.py,sha256=lsK_BxuOXNdTTNED-nkw3EY2vt5nQGMvkWvVLqNkwM0,43255
-hossam/hs_stats.py,sha256=MDS3rvaXDP8aYwcE36JTetWiZgE4fkXnNo0vwlXu-pA,119890
-hossam/hs_study.py,sha256=ZzL76_V0IHnk_YUTbWncIIBruOj2Sz3xs91snS6cpu0,2776
-hossam/hs_timeserise.py,sha256=NzGV4bJmVQr3qUFySOP25qENItmloYjgh3VgwSbSmXc,43163
-hossam/hs_util.py,sha256=LJvoTFSLX_uADJrC1ONGM93DX2HS2jcNBnPariLPZko,16704
-hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
-hossam-0.4.17.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
-hossam-0.4.17.dist-info/METADATA,sha256=camh3iJFBxxmzX1tHg71MWwNcqcY3a_GSKbNhf8iR7E,3803
-hossam-0.4.17.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-hossam-0.4.17.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
-hossam-0.4.17.dist-info/RECORD,,

{hossam-0.4.17.dist-info → hossam-0.4.18.dist-info}/WHEEL RENAMED Viewed

File without changes

{hossam-0.4.17.dist-info → hossam-0.4.18.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{hossam-0.4.17.dist-info → hossam-0.4.18.dist-info}/top_level.txt RENAMED Viewed

File without changes

hossam 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl

hossam 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl