PyPI - hossam - Versions diffs - 0.4.7__tar.gz → 0.4.9__tar.gz - Mend

hossam 0.4.7.tar.gz → 0.4.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

{hossam-0.4.7/hossam.egg-info → hossam-0.4.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hossam
-Version: 0.4.7
+Version: 0.4.9
 Summary: Hossam Data Helper
 Author-email: Lee Kwang-Ho <leekh4232@gmail.com>
 License-Expression: MIT
@@ -73,7 +73,7 @@ title: 🎓 Hossam Data Helper
 pip install hossam
 ```
-**요구사항**: Python 3.8 이상
+**요구사항**: Python 3.13.9 이상
 ## 📚 전체 문서
@@ -83,9 +83,9 @@ pip install hossam
 - **hs_plot**: 25+ 시각화 함수 (선 그래프, 산점도, 히스토그램, 박스플롯, 히트맵 등)
 - **hs_stats**: 회귀/분류 분석, 교차검증, 정규성 검정, 상관분석 등
-- **hs_prep**: 결측치 처리, 이상치 탐지, 스케일링, 인코딩
+- **hs_prep**: 결측치 처리, 이상치 탐지, 스케일링, 인코딩 등의 데이터 전처리 기능
+- **hs_timeserise**: 시계열 분석 기능 지원
 - **hs_gis**: GIS 데이터 로드 및 시각화 (대한민국 지도 지원)
-- **hs_classroom**: 학습용 이진분류, 다중분류, 회귀 데이터 생성
 - **hs_util**: 예쁜 테이블 출력, 그리드 서치 등
 자세한 사용법은 [API 문서](https://py.hossam.kr/api/hossam/)를 참고하세요.

{hossam-0.4.7 → hossam-0.4.9}/README.md RENAMED Viewed

@@ -29,7 +29,7 @@ title: 🎓 Hossam Data Helper
 pip install hossam
 ```
-**요구사항**: Python 3.8 이상
+**요구사항**: Python 3.13.9 이상
 ## 📚 전체 문서
@@ -39,9 +39,9 @@ pip install hossam
 - **hs_plot**: 25+ 시각화 함수 (선 그래프, 산점도, 히스토그램, 박스플롯, 히트맵 등)
 - **hs_stats**: 회귀/분류 분석, 교차검증, 정규성 검정, 상관분석 등
-- **hs_prep**: 결측치 처리, 이상치 탐지, 스케일링, 인코딩
+- **hs_prep**: 결측치 처리, 이상치 탐지, 스케일링, 인코딩 등의 데이터 전처리 기능
+- **hs_timeserise**: 시계열 분석 기능 지원
 - **hs_gis**: GIS 데이터 로드 및 시각화 (대한민국 지도 지원)
-- **hs_classroom**: 학습용 이진분류, 다중분류, 회귀 데이터 생성
 - **hs_util**: 예쁜 테이블 출력, 그리드 서치 등
 자세한 사용법은 [API 문서](https://py.hossam.kr/api/hossam/)를 참고하세요.

{hossam-0.4.7 → hossam-0.4.9}/hossam/__init__.py RENAMED Viewed

@@ -8,6 +8,7 @@ from . import hs_timeserise
 from . import hs_util
 from .hs_util import load_info
 from .hs_util import _load_data_remote as load_data
+from .hs_plot import visualize_silhouette
 # py-modules
 import sys
@@ -24,7 +25,7 @@ except Exception:
 my_dpi = hs_plot.config.dpi
-__all__ = ["my_dpi", "load_data", "load_info", "hs_classroom", "hs_gis", "hs_plot", "hs_prep", "hs_stats", "hs_timeserise", "hs_util"]
+__all__ = ["my_dpi", "load_data", "load_info", "hs_classroom", "hs_gis", "hs_plot", "hs_prep", "hs_stats", "hs_timeserise", "hs_util", "visualize_silhouette"]
 # 내부 모듈에서 hs_fig를 사용할 때는 아래와 같이 import 하세요.
 # from hossam import hs_fig

{hossam-0.4.7 → hossam-0.4.9}/hossam/hs_plot.py RENAMED Viewed

@@ -22,12 +22,16 @@ from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess
 from statannotations.Annotator import Annotator
 # ===================================================================
+from sklearn.cluster._kmeans import KMeans
 from sklearn.metrics import (
     mean_squared_error,
     ConfusionMatrixDisplay,
     roc_curve,
     auc,
-    confusion_matrix
+    confusion_matrix,
+    silhouette_score,
+    silhouette_samples
 )
 # ===================================================================
@@ -708,6 +712,8 @@ def scatterplot(
     xname: str,
     yname: str,
     hue=None,
+    vector: str | None = None,
+    outline: bool = False,
     title: str | None = None,
     palette: str | None = None,
     width: int = config.width,
@@ -726,6 +732,8 @@ def scatterplot(
         xname (str): x축 컬럼.
         yname (str): y축 컬럼.
         hue (str|None): 범주 컬럼.
+        vector (str|None): 벡터 종류 컬럼.
+        outline (bool): 점 외곽선 표시 여부.
         title (str|None): 그래프 제목.
         palette (str|None): 팔레트 이름.
         width (int): 캔버스 가로 픽셀.
@@ -745,9 +753,32 @@ def scatterplot(
         fig, ax = get_default_ax(width, height, 1, 1, dpi) # type: ignore
         outparams = True
+    if outline and hue is not None:
+        # 군집별 값의 종류별로 반복 수행
+        for c in df[hue].unique():
+            if c == -1:
+                continue
+            # 한 종류만 필터링한 결과에서 두 변수만 선택
+            df_c = df.loc[df[hue] == c, [xname, yname]]
+            try:
+                # 외각선 좌표 계산
+                hull = ConvexHull(df_c)
+                # 마지막 좌표 이후에 첫 번째 좌표를 연결
+                points = np.append(hull.vertices, hull.vertices[0])
+                ax.plot(    # type: ignore
+                    df_c.iloc[points, 0], df_c.iloc[points, 1], linewidth=linewidth, linestyle=":"
+                )
+                ax.fill(df_c.iloc[points, 0], df_c.iloc[points, 1], alpha=0.1)  # type: ignore
+            except:
+                pass
     # hue가 있을 때만 palette 사용, 없으면 color 사용
     scatterplot_kwargs = {
-        "data": df,
         "x": xname,
         "y": yname,
         "hue": hue,
@@ -762,7 +793,30 @@ def scatterplot(
     scatterplot_kwargs.update(params)
-    sb.scatterplot(**scatterplot_kwargs)
+    # 백터 종류 구분 필드가 전달되지 않은 경우에는 원본 데이터를 그대로 사용
+    if vector is None:
+        sb.scatterplot(data=df, **scatterplot_kwargs)
+    else:
+        # 핵심벡터
+        scatterplot_kwargs['edgecolor'] = '#ffffff'
+        sb.scatterplot(data=df[df[vector] == "core"], **scatterplot_kwargs)
+        # 외곽백터
+        scatterplot_kwargs['edgecolor'] = '#000000'
+        scatterplot_kwargs['s'] = 25
+        scatterplot_kwargs['marker'] = '^'
+        scatterplot_kwargs['linewidth'] = 0.8
+        sb.scatterplot(data=df[df[vector] == "border"], **scatterplot_kwargs)
+        # 노이즈벡터
+        scatterplot_kwargs['edgecolor'] = None
+        scatterplot_kwargs['s'] = 25
+        scatterplot_kwargs['marker'] = 'x'
+        scatterplot_kwargs['linewidth'] = 2
+        scatterplot_kwargs['color'] = '#ff0000'
+        scatterplot_kwargs['hue'] = None
+        sb.scatterplot(data=df[df[vector] == "noise"], **scatterplot_kwargs)
     finalize_plot(ax, callback, outparams, save_path, True, title) # type: ignore
@@ -1479,79 +1533,6 @@ def heatmap(
     finalize_plot(ax, callback, outparams, save_path, True, title)  # type: ignore
-# ===================================================================
-# 클러스터별 볼록 경계막(convex hull)을 그린다
-# ===================================================================
-def convex_hull(
-    data: DataFrame,
-    xname: str,
-    yname: str,
-    hue: str | None = None,
-    title: str | None = None,
-    palette: str | None = None,
-    width: int = config.width,
-    height: int = config.height,
-    linewidth: float = config.line_width,
-    dpi: int = config.dpi,
-    save_path: str | None = None,
-    callback: Callable | None = None,
-    ax: Axes | None = None,
-    **params,
-):
-    """클러스터별 볼록 껍질(convex hull)과 산점도를 그린다.
-    Args:
-        data (DataFrame): 시각화할 데이터.
-        xname (str): x축 컬럼.
-        yname (str): y축 컬럼.
-        hue (str): 클러스터/범주 컬럼.
-        title (str|None): 그래프 제목.
-        palette (str|None): 팔레트 이름.
-        width (int): 캔버스 가로 픽셀.
-        height (int): 캔버스 세로 픽셀.
-        linewidth (float): 선 굵기.
-        dpi (int): 그림 크기 및 해상도.
-        callback (Callable|None): Axes 후처리 콜백.
-        ax (Axes|None): 외부에서 전달한 Axes.
-        **params: seaborn scatterplot 추가 인자.
-    Returns:
-        None
-    """
-    outparams = False
-    if ax is None:
-        fig, ax = get_default_ax(width, height, 1, 1, dpi)  # type: ignore
-        outparams = True
-    # 군집별 값의 종류별로 반복 수행
-    for c in data[hue].unique():
-        if c == -1:
-            continue
-        # 한 종류만 필터링한 결과에서 두 변수만 선택
-        df_c = data.loc[data[hue] == c, [xname, yname]]
-        try:
-            # 외각선 좌표 계산
-            hull = ConvexHull(df_c)
-            # 마지막 좌표 이후에 첫 번째 좌표를 연결
-            points = np.append(hull.vertices, hull.vertices[0])
-            ax.plot(    # type: ignore
-                df_c.iloc[points, 0], df_c.iloc[points, 1], linewidth=linewidth, linestyle=":"
-            )
-            ax.fill(df_c.iloc[points, 0], df_c.iloc[points, 1], alpha=0.1)  # type: ignore
-        except:
-            pass
-    # convex_hull은 hue가 필수이므로 palette를 그대로 사용
-    sb.scatterplot(
-        data=data, x=xname, y=yname, hue=hue, palette=palette, ax=ax, **params
-    )
-    finalize_plot(ax, callback, outparams, save_path, True, title)  # type: ignore
 # ===================================================================
 # KDE와 신뢰구간을 나타낸 그래프를 그린다
@@ -2045,16 +2026,8 @@ def scatter_by_class(
                 processed.append([item, yname])
         group = processed
-    if outline:
-        for v in group:
-            convex_hull(data=data, xname=v[0], yname=v[1], hue=hue, palette=palette,
-                        width=width, height=height, linewidth=linewidth, dpi=dpi, callback=callback,
-                        save_path=save_path)
-    else:
-        for v in group:
-            scatterplot(data=data, xname=v[0], yname=v[1], hue=hue, palette=palette,
-                        width=width, height=height, linewidth=linewidth, dpi=dpi, callback=callback,
-                        save_path=save_path)    # type: ignore
+    for v in group:
+        scatterplot(data=data, xname=v[0], yname=v[1], outline=outline, hue=hue, palette=palette, width=width, height=height, linewidth=linewidth, dpi=dpi, callback=callback, save_path=save_path)    # type: ignore
 # ===================================================================
@@ -2519,3 +2492,218 @@ def distribution_plot(
                 plt.close()
             else:
                 plt.show()
+def silhouette_plot(
+        estimator: KMeans,
+        data: DataFrame,
+        title: str | None = None,
+        width: int = config.width,
+        height: int = config.height,
+        linewidth: float = config.line_width,
+        dpi: int = config.dpi,
+        save_path: str | None = None,
+        callback: Callable | None = None,
+        ax: Axes | None = None,
+    ) -> None:
+    """
+    Draw a silhouette plot for a fitted clustering estimator.
+    Args:
+        estimator (KMeans): Fitted estimator; its `labels_` and `n_clusters`
+            attributes are read.
+        data (DataFrame): Samples passed as `X` to `silhouette_score` and
+            `silhouette_samples`, together with `estimator.labels_`.
+        title (str, optional): Plot title. If None, a title is built from
+            `n_clusters` and the mean silhouette score rounded to 3 decimals.
+        width (int, optional): Figure width, forwarded to `get_default_ax`
+            when `ax` is None.
+        height (int, optional): Figure height, forwarded to `get_default_ax`
+            when `ax` is None.
+        linewidth (float, optional): Line width of the mean-score reference line.
+        dpi (int, optional): Forwarded to `get_default_ax` when `ax` is None.
+        save_path (str, optional): Forwarded to `finalize_plot`.
+        callback (Callable, optional): Forwarded to `finalize_plot`.
+        ax (Axes, optional): Axes to draw on. If None, a new one is created
+            with `get_default_ax`.
+    Returns:
+        None
+    Note:
+        - Each cluster is drawn as a horizontal band of its sorted per-sample
+          silhouette coefficients.
+        - The red dashed vertical line marks the mean silhouette score.
+        - The x-axis is fixed to [-0.1, 1].
+        - Clusters are looked up as the labels 0 .. n_clusters - 1.
+        - NOTE(review): the original docstring gave the width/height unit as
+          inches, while the scatterplot docstring in this module says pixels;
+          confirm against get_default_ax.
+    """
+    # outparams records whether this function created the Axes itself;
+    # it is handed to finalize_plot below.
+    outparams = False
+    if ax is None:
+        fig, ax = get_default_ax(width, height, 1, 1, dpi) # type: ignore
+        outparams = True
+    sil_avg = silhouette_score(X=data, labels=estimator.labels_)
+    sil_values = silhouette_samples(X=data, labels=estimator.labels_)
+    # Bands start at y=10 and are separated by a 10-unit gap.
+    y_lower = 10
+    # Draw one fill_betweenx band per cluster.
+    for i in range(estimator.n_clusters):   # type: ignore
+        ith_cluster_sil_values = sil_values[estimator.labels_ == i] # type: ignore
+        # Sort in place so the band has a monotonic profile.
+        ith_cluster_sil_values.sort()   # type: ignore
+        size_cluster_i = ith_cluster_sil_values.shape[0]    # type: ignore
+        y_upper = y_lower + size_cluster_i
+        ax.fill_betweenx(   # type: ignore
+            np.arange(y_lower, y_upper),
+            0,
+            ith_cluster_sil_values,
+            alpha=0.7,
+        )
+        # Write the cluster index at the vertical middle of its band.
+        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))  # type: ignore
+        y_lower = y_upper + 10
+    ax.axvline(x=sil_avg, color="red", linestyle="--", linewidth=linewidth)  # type: ignore
+    ax.set_xlabel("The silhouette coefficient values")  # type: ignore
+    ax.set_ylabel("Cluster label")  # type: ignore
+    ax.set_xlim([-0.1, 1])  # type: ignore
+    # Upper limit: one unit per sample plus a 10-unit gap around every band.
+    ax.set_ylim([0, len(data) + (estimator.n_clusters + 1) * 10])  # type: ignore
+    ax.set_yticks([])  # type: ignore
+    ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1]) # type: ignore
+    if title is None:
+        title = "Number of Cluster : " + str(estimator.n_clusters) + ", Silhouette Score :" + str(round(sil_avg, 3))    # type: ignore
+    finalize_plot(ax, callback, outparams, save_path, True, title) # type: ignore
+def cluster_plot(
+    estimator: KMeans,
+    data: DataFrame,
+    xname: str | None = None,
+    yname: str | None = None,
+    hue: str | None = None,
+    title: str | None = None,
+    palette: str | None = None,
+    outline: bool = False,
+    width: int = config.width,
+    height: int = config.height,
+    linewidth: float = config.line_width,
+    dpi: int = config.dpi,
+    save_path: str | None = None,
+    ax: Axes | None = None,
+) -> None:
+    """
+    Draw a 2-D scatter plot of a clustering result with the cluster centers overlaid.
+    Args:
+        estimator (KMeans): Fitted estimator; its `labels_` and
+            `cluster_centers_` attributes are read.
+        data (DataFrame): Input data. It is copied, so the caller's frame is
+            not modified.
+        xname (str, optional): Column for the x-axis. If None, the first
+            column of the working copy is used.
+        yname (str, optional): Column for the y-axis. If None, the second
+            column of the working copy is used.
+        hue (str, optional): Column that identifies the cluster of each row.
+            If empty or None, a 'cluster' column is added to the copy from
+            `estimator.labels_` and used instead.
+        title (str, optional): Plot title. If None, a fixed default is used.
+        palette (str, optional): Forwarded to `scatterplot`.
+        outline (bool, optional): Forwarded to `scatterplot`.
+        width (int, optional): Figure width; used to create the Axes when
+            `ax` is None and forwarded to `scatterplot`.
+        height (int, optional): Figure height; used to create the Axes when
+            `ax` is None and forwarded to `scatterplot`.
+        linewidth (float, optional): Edge width of the center markers; also
+            forwarded to `scatterplot`.
+        dpi (int, optional): Used to create the Axes when `ax` is None and
+            forwarded to `scatterplot`.
+        save_path (str, optional): Forwarded to `scatterplot`.
+        ax (Axes, optional): Axes to draw on. If None, a new one is created
+            with `get_default_ax`.
+    Returns:
+        None
+    Example:
+        ```python
+        cluster_plot(estimator, data, xname='Sepal.Length', yname='Sepal.Width')
+        ```
+    Note:
+        - Centers are drawn as white circles with a red edge, with the
+          cluster index drawn on top of each one.
+        - Centers are indexed by the column position of xname/yname in the
+          working copy. NOTE(review): this assumes the column order of `data`
+          matches the feature order the estimator was fit on; confirm.
+        - NOTE(review): the original docstring gave the width/height unit as
+          inches, while the scatterplot docstring in this module says pixels;
+          confirm against get_default_ax.
+    """
+    # Set only when this function creates the Axes; it is not used further here.
+    outparams = False
+    if ax is None:
+        fig, ax = get_default_ax(width, height, 1, 1, dpi) # type: ignore
+        outparams = True
+    df = data.copy()
+    if not hue:
+        df['cluster'] = estimator.labels_   # type: ignore
+        hue = 'cluster'
+    if xname is None:
+        xname = df.columns[0]   # type: ignore
+    if yname is None:
+        yname = df.columns[1]   # type: ignore
+    # Column positions are used below to pick coordinates out of cluster_centers_.
+    xindex = df.columns.get_loc(xname)   # type: ignore
+    yindex = df.columns.get_loc(yname)   # type: ignore
+    def callback(ax: Axes) -> None:
+        # Mark the cluster centers.
+        centers = estimator.cluster_centers_   # type: ignore
+        ax.scatter(    # type: ignore
+            centers[:, xindex],
+            centers[:, yindex],
+            marker="o",
+            color="white",
+            alpha=1,
+            s=200,
+            edgecolor="r",
+            linewidth=linewidth
+        )
+        # Draw each center's index as a math-text marker on top of its circle.
+        for i, c in enumerate(centers):
+            ax.scatter(c[xindex], c[yindex], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")
+        ax.set_xlabel("Feature space for the " + xname)
+        ax.set_ylabel("Feature space for the " + yname)
+    # The points themselves are drawn by scatterplot; the centers are added
+    # through the callback defined above.
+    scatterplot(
+        df=df,
+        xname=xname,
+        yname=yname,
+        hue=hue,
+        title="The visualization of the clustered data." if title is None else title,
+        outline=outline,
+        palette=palette,
+        width=width,
+        height=height,
+        linewidth=linewidth,
+        dpi=dpi,
+        save_path=save_path,
+        callback=callback,
+        ax=ax
+    )
+def visualize_silhouette(estimator: KMeans, data: DataFrame) -> None:
+    """
+    Draw the silhouette plot and the cluster scatter plot together in one figure.
+    Args:
+        estimator (KMeans): Fitted estimator, passed unchanged to
+            `silhouette_plot` and `cluster_plot`.
+        data (DataFrame): Input data, passed unchanged to both plots.
+    Returns:
+        None
+    Note:
+        - The silhouette plot goes on the first Axes (ax[0]) and the cluster
+          scatter plot on the second (ax[1]) of a 1-row, 2-column layout;
+          presumably left and right, depending on get_default_ax.
+        - Useful for checking cluster quality and cluster distribution side by side.
+    """
+    fig, ax = get_default_ax(rows=1, cols=2)
+    silhouette_plot(
+        estimator=estimator,
+        data=data,
+        ax=ax[0],
+    )
+    cluster_plot(
+        estimator=estimator,
+        data=data,
+        ax=ax[1],
+    )
+    # The whole Axes collection is handed to finalize_plot.
+    finalize_plot(ax)

{hossam-0.4.7 → hossam-0.4.9}/hossam/hs_timeserise.py RENAMED Viewed

@@ -123,7 +123,7 @@ def diff(
             ardict["기각값(Critical Values) %s" % key] = value
         stationarity = ar[1] <= 0.05
-        ardict["데이터 정상성 여부"] = "정상" if stationarity else "비정상"
+        ardict["데이터 정상성 여부"] = "정상" if stationarity else "비정상" # type: ignore
         ardf = DataFrame(ardict, index=["ADF Test"]).T
         pretty_table(ardf)

{hossam-0.4.7 → hossam-0.4.9}/hossam/hs_util.py RENAMED Viewed

@@ -2,6 +2,10 @@
 # -------------------------------------------------------------
 import requests
 import json
+import tempfile
+import zipfile
+import shutil
+from pathlib import Path
 from typing import TYPE_CHECKING
 from importlib.metadata import distributions
 import pandas as pd
@@ -21,6 +25,42 @@ def __get_df(path: str, index_col=None) -> DataFrame:
     p = path.rfind(".")
     exec = path[p+1:].lower()
+    # 파일 확장자가 압축파일인 경우 로컬에 파일을 다운로드 후 압축 해제
+    if exec == "zip":
+        tmp_dir = Path(tempfile.mkdtemp())
+        zip_path = tmp_dir / "data.zip"
+        # 원격 URL인 경우 파일 다운로드
+        if path.lower().startswith(('http://', 'https://')):
+            path = path.replace("\\", "/")
+            with requests.Session() as session:
+                r = session.get(path)
+                if r.status_code != 200:
+                    raise Exception(f"HTTP {r.status_code} Error - {r.reason} > {path}")
+                with open(zip_path, "wb") as f:
+                    f.write(r.content)
+        else:
+            zip_path = Path(path)
+        # 압축 해제
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(tmp_dir)
+        # 압축 해제된 파일 중 첫 번째 파일을 데이터로 로드
+        extracted_files = list(tmp_dir.glob('*'))
+        if not extracted_files:
+            raise FileNotFoundError("압축 파일 내에 데이터 파일이 없습니다.")
+        path = str(extracted_files[0])
+        p = path.rfind(".")
+        exec = path[p+1:].lower()
+        # 생성된 임시 디렉토리 삭제
+        shutil.rmtree(tmp_dir)
     if exec == 'xlsx':
         # If path is a remote URL, fetch the file once and reuse the bytes
         if path.lower().startswith(('http://', 'https://')):

{hossam-0.4.7 → hossam-0.4.9/hossam.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hossam
-Version: 0.4.7
+Version: 0.4.9
 Summary: Hossam Data Helper
 Author-email: Lee Kwang-Ho <leekh4232@gmail.com>
 License-Expression: MIT
@@ -73,7 +73,7 @@ title: 🎓 Hossam Data Helper
 pip install hossam
 ```
-**요구사항**: Python 3.8 이상
+**요구사항**: Python 3.13.9 이상
 ## 📚 전체 문서
@@ -83,9 +83,9 @@ pip install hossam
 - **hs_plot**: 25+ 시각화 함수 (선 그래프, 산점도, 히스토그램, 박스플롯, 히트맵 등)
 - **hs_stats**: 회귀/분류 분석, 교차검증, 정규성 검정, 상관분석 등
-- **hs_prep**: 결측치 처리, 이상치 탐지, 스케일링, 인코딩
+- **hs_prep**: 결측치 처리, 이상치 탐지, 스케일링, 인코딩 등의 데이터 전처리 기능
+- **hs_timeserise**: 시계열 분석 기능 지원
 - **hs_gis**: GIS 데이터 로드 및 시각화 (대한민국 지도 지원)
-- **hs_classroom**: 학습용 이진분류, 다중분류, 회귀 데이터 생성
 - **hs_util**: 예쁜 테이블 출력, 그리드 서치 등
 자세한 사용법은 [API 문서](https://py.hossam.kr/api/hossam/)를 참고하세요.

{hossam-0.4.7 → hossam-0.4.9}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "hossam"
-version = "0.4.7"
+version = "0.4.9"
 description = "Hossam Data Helper"
 readme = "README.md"
 requires-python = ">=3.13.9"