hossam 0.4.18__tar.gz → 0.4.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {hossam-0.4.18/hossam.egg-info → hossam-0.4.19}/PKG-INFO +2 -1
  2. {hossam-0.4.18 → hossam-0.4.19}/hossam/__init__.py +68 -27
  3. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_classroom.py +27 -4
  4. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_plot.py +13 -29
  5. hossam-0.4.19/hossam/hs_reg.py +313 -0
  6. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_stats.py +211 -219
  7. {hossam-0.4.18 → hossam-0.4.19/hossam.egg-info}/PKG-INFO +2 -1
  8. {hossam-0.4.18 → hossam-0.4.19}/hossam.egg-info/SOURCES.txt +1 -1
  9. {hossam-0.4.18 → hossam-0.4.19}/hossam.egg-info/requires.txt +1 -0
  10. {hossam-0.4.18 → hossam-0.4.19}/pyproject.toml +3 -2
  11. hossam-0.4.18/hossam/hs_cluster copy.py +0 -1060
  12. {hossam-0.4.18 → hossam-0.4.19}/LICENSE +0 -0
  13. {hossam-0.4.18 → hossam-0.4.19}/MANIFEST.in +0 -0
  14. {hossam-0.4.18 → hossam-0.4.19}/README.md +0 -0
  15. {hossam-0.4.18 → hossam-0.4.19}/hossam/NotoSansKR-Regular.ttf +0 -0
  16. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_cluster.py +0 -0
  17. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_gis.py +0 -0
  18. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_prep.py +0 -0
  19. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_study.py +0 -0
  20. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_timeserise.py +0 -0
  21. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_util.py +0 -0
  22. {hossam-0.4.18 → hossam-0.4.19}/hossam/leekh.png +0 -0
  23. {hossam-0.4.18 → hossam-0.4.19}/hossam.egg-info/dependency_links.txt +0 -0
  24. {hossam-0.4.18 → hossam-0.4.19}/hossam.egg-info/top_level.txt +0 -0
  25. {hossam-0.4.18 → hossam-0.4.19}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hossam
3
- Version: 0.4.18
3
+ Version: 0.4.19
4
4
  Summary: Hossam Data Helper
5
5
  Author-email: Lee Kwang-Ho <leekh4232@gmail.com>
6
6
  License-Expression: MIT
@@ -40,6 +40,7 @@ Requires-Dist: xgboost
40
40
  Requires-Dist: lightgbm
41
41
  Requires-Dist: catboost
42
42
  Requires-Dist: kneed
43
+ Requires-Dist: shap
43
44
  Dynamic: license-file
44
45
 
45
46
  ---
@@ -9,11 +9,19 @@ from . import hs_prep
9
9
  from . import hs_stats
10
10
  from . import hs_timeserise
11
11
  from . import hs_util
12
+ from . import hs_reg
12
13
  from . import hs_cluster
13
14
  from . import hs_study
14
15
  from .hs_util import load_info
15
16
  from .hs_util import _load_data_remote as load_data
16
17
  from .hs_plot import visualize_silhouette
18
+ from .hs_stats import ttest_ind as hs_ttest_ind
19
+ from .hs_stats import outlier_table as hs_outlier_table
20
+ from .hs_stats import oneway_anova as hs_oneway_anova
21
+ from .hs_reg import learning_cv as hs_learning_cv
22
+ from .hs_reg import get_scores as hs_get_scores
23
+ from .hs_reg import get_score_cv as hs_get_score_cv
24
+ from .hs_reg import VIFSelector
17
25
 
18
26
  # py-modules
19
27
  import sys
@@ -31,7 +39,29 @@ except Exception:
31
39
 
32
40
  my_dpi = hs_plot.config.dpi
33
41
 
34
- __all__ = ["my_dpi", "load_data", "load_info", "hs_classroom", "hs_gis", "hs_plot", "hs_prep", "hs_stats", "hs_timeserise", "hs_util", "hs_cluster", "hs_study", "visualize_silhouette"]
42
+ __all__ = [
43
+ "my_dpi",
44
+ "load_data",
45
+ "load_info",
46
+ "hs_classroom",
47
+ "hs_gis",
48
+ "hs_plot",
49
+ "hs_prep",
50
+ "hs_stats",
51
+ "hs_timeserise",
52
+ "hs_util",
53
+ "hs_cluster",
54
+ "hs_reg",
55
+ "hs_study",
56
+ "visualize_silhouette",
57
+ "hs_ttest_ind",
58
+ "hs_outlier_table",
59
+ "hs_oneway_anova",
60
+ "hs_learning_cv",
61
+ "hs_get_scores",
62
+ "hs_get_score_cv",
63
+ "VIFSelector",
64
+ ]
35
65
 
36
66
 
37
67
  def check_pypi_latest(package_name: str):
@@ -51,7 +81,7 @@ def check_pypi_latest(package_name: str):
51
81
  "package": package_name,
52
82
  "installed": installed,
53
83
  "latest": latest,
54
- "outdated": installed != latest
84
+ "outdated": installed != latest,
55
85
  }
56
86
 
57
87
 
@@ -67,21 +97,23 @@ def _init_korean_font():
67
97
  fprop = fm.FontProperties(fname=str(font_path))
68
98
  fname = fprop.get_name()
69
99
 
70
- plt.rcParams.update({
71
- "font.family": fname,
72
- "font.size": hs_plot.config.font_size,
73
- "font.weight": hs_plot.config.font_weight,
74
- "axes.unicode_minus": False,
75
- "text.antialiased": True,
76
- "lines.antialiased": True,
77
- "patch.antialiased": True,
78
- "figure.dpi": hs_plot.config.dpi,
79
- "savefig.dpi": hs_plot.config.dpi * 2,
80
- "text.hinting": "auto",
81
- "text.hinting_factor": 8,
82
- "pdf.fonttype": 42,
83
- "ps.fonttype": 42,
84
- })
100
+ plt.rcParams.update(
101
+ {
102
+ "font.family": fname,
103
+ "font.size": hs_plot.config.font_size,
104
+ "font.weight": hs_plot.config.font_weight,
105
+ "axes.unicode_minus": False,
106
+ "text.antialiased": True,
107
+ "lines.antialiased": True,
108
+ "patch.antialiased": True,
109
+ "figure.dpi": hs_plot.config.dpi,
110
+ "savefig.dpi": hs_plot.config.dpi * 2,
111
+ "text.hinting": "auto",
112
+ "text.hinting_factor": 8,
113
+ "pdf.fonttype": 42,
114
+ "ps.fonttype": 42,
115
+ }
116
+ )
85
117
 
86
118
  print(
87
119
  "\n✅ 시각화를 위한 한글 글꼴(NotoSansKR-Regular)이 자동 적용되었습니다."
@@ -103,6 +135,8 @@ def _init():
103
135
  f"🔖 Version: {__version__}",
104
136
  ]
105
137
 
138
+
139
+
106
140
  for msg in messages:
107
141
  print(f"{msg}")
108
142
 
@@ -119,29 +153,36 @@ def _init():
119
153
 
120
154
  _init_korean_font()
121
155
 
156
+ # 각 열의 넓이 제한 없음
122
157
  pd.set_option("display.max_colwidth", None)
158
+ # 출력 너비 제한 없음 (가로 스크롤될 수 있음)
123
159
  pd.set_option("display.width", None)
124
160
  # 컬럼 생략 금지
125
161
  pd.set_option("display.max_columns", None)
126
162
  # 행 최대 출력 수 100개로 수정
127
163
  pd.set_option("display.max_rows", 100)
164
+ # 소수점 자리수 3자리로 설정
165
+ pd.options.display.float_format = "{:.3f}".format
128
166
 
129
167
  from IPython.display import display, HTML
130
168
 
131
- display(HTML("""
132
- <style>
133
- .dataframe td, .dataframe th {
134
- white-space: nowrap;
135
- font-size: 0.85em;
136
- padding: 2px 3px;
137
- }
138
-
169
+ display(
170
+ HTML(
171
+ """
172
+ <style>
139
173
  .dataframe tr:hover {
140
174
  background-color: #ffff99 !important;
141
175
  border: 1px solid #ffcc00;
142
176
  }
143
177
  </style>
144
- """))
178
+ """
179
+ )
180
+ )
181
+
182
+ import multiprocessing as mp
145
183
 
184
+ def is_parallel_worker():
185
+ return mp.current_process().name != "MainProcess"
146
186
 
147
- _init()
187
+ if not is_parallel_worker():
188
+ _init()
@@ -6,6 +6,7 @@ import math
6
6
  from pandas import DataFrame, qcut, concat, to_numeric
7
7
  from kmodes.kmodes import KModes
8
8
  from matplotlib import pyplot as plt
9
+ from prompt_toolkit.formatted_text.ansi import i
9
10
  import seaborn as sns
10
11
  from .hs_util import load_data, pretty_table
11
12
  from .hs_plot import config
@@ -19,6 +20,7 @@ def cluster_students(
19
20
  n_groups: int,
20
21
  score_cols: list | None = None,
21
22
  interest_col: str | None = None,
23
+ interest_ignore: str | None = None,
22
24
  max_iter: int = 200,
23
25
  score_metric: str = 'total'
24
26
  ) -> DataFrame:
@@ -39,6 +41,8 @@ def cluster_students(
39
41
  None일 경우 점수 기반 균형 조정을 하지 않습니다. 기본값: None
40
42
  interest_col: 관심사 정보가 있는 컬럼명.
41
43
  None일 경우 관심사 기반 군집화를 하지 않습니다. 기본값: None
44
+ interest_ignore: 관심사 군집화에서 제외할 값.
45
+ 지정된 값은 별도 군집에서 제외됩니다. 기본값: None
42
46
  max_iter: 균형 조정 최대 반복 횟수. 기본값: 200
43
47
  score_metric: 점수 기준 선택 ('total' 또는 'average').
44
48
  'total'이면 총점, 'average'이면 평균점수 기준. 기본값: 'total'
@@ -151,8 +155,18 @@ def cluster_students(
151
155
  if actual_n_groups < 2:
152
156
  actual_n_groups = 2
153
157
 
158
+ df_ignore = None
159
+
154
160
  # ===== 3단계: 관심사 기반 1차 군집 =====
155
161
  if interest_col is not None:
162
+ df_main[interest_col] = df_main[interest_col].fillna('미정')
163
+
164
+ if interest_ignore is not None:
165
+ df_ignore = df_main[df_main[interest_col] == interest_ignore].copy()
166
+ df_main = df_main[df_main[interest_col] != interest_ignore].copy()
167
+
168
+ print(df_ignore)
169
+
156
170
  X_interest = df_main[[interest_col]].to_numpy()
157
171
 
158
172
  kmodes_interest = KModes(
@@ -184,12 +198,18 @@ def cluster_students(
184
198
  df_main = _balance_group_sizes_only(df_main, actual_n_groups, min_size, max_size)
185
199
 
186
200
  # ===== 5단계: 극단값 포함 병합 =====
187
- if df_outlier is not None and len(df_outlier) > 0:
201
+ result = df_main
202
+
203
+ if (df_outlier is not None and len(df_outlier) > 0):
188
204
  # '조'는 숫자형 유지: 극단값은 0으로 표시
189
205
  df_outlier['조'] = 0
190
- result = concat([df_main, df_outlier], ignore_index=True)
191
- else:
192
- result = df_main
206
+ result = concat([result, df_outlier], ignore_index=True)
207
+
208
+ if (df_ignore is not None and len(df_ignore) > 0):
209
+ # '조'는 숫자형 유지: 제외된 학생은 -1로 표시
210
+ df_ignore['조'] = -1
211
+ result = concat([result, df_ignore], ignore_index=True)
212
+
193
213
 
194
214
  # 평균점수는 이미 계산됨 (score_cols 있을 때)
195
215
 
@@ -694,6 +714,7 @@ def analyze_classroom(
694
714
  n_groups: int,
695
715
  score_cols: list | None = None,
696
716
  interest_col: str | None = None,
717
+ interest_ignore: str | None = None,
697
718
  max_iter: int = 200,
698
719
  score_metric: str = 'average',
699
720
  name_col: str = '학생이름',
@@ -713,6 +734,7 @@ def analyze_classroom(
713
734
  n_groups: 목표 조의 개수.
714
735
  score_cols: 성적 계산에 사용할 점수 컬럼명 리스트. 기본값: None
715
736
  interest_col: 관심사 정보가 있는 컬럼명. 기본값: None
737
+ interest_ignore: 관심사 군집화에서 제외할 값. 기본값: None
716
738
  max_iter: 균형 조정 최대 반복 횟수. 기본값: 200
717
739
  score_metric: 점수 기준 선택 ('total' 또는 'average'). 기본값: 'average'
718
740
  name_col: 학생 이름 컬럼명. 기본값: '학생이름'
@@ -740,6 +762,7 @@ def analyze_classroom(
740
762
  n_groups=n_groups,
741
763
  score_cols=score_cols,
742
764
  interest_col=interest_col,
765
+ interest_ignore=interest_ignore,
743
766
  max_iter=max_iter,
744
767
  score_metric=score_metric
745
768
  )
@@ -8,6 +8,7 @@ from itertools import combinations
8
8
  import numpy as np
9
9
  import seaborn as sb
10
10
  import matplotlib.pyplot as plt
11
+ from matplotlib.figure import Figure # type: ignore
11
12
  from matplotlib.pyplot import Axes # type: ignore
12
13
  from pandas import Series, DataFrame
13
14
  from math import sqrt
@@ -132,7 +133,7 @@ def create_figure(
132
133
  ws: int | None = None,
133
134
  hs: int | None = None,
134
135
  title: str | None = None,
135
- ):
136
+ ) -> tuple[Figure, Axes]:
136
137
  """기본 크기의 Figure와 Axes를 생성한다. get_default_ax의 래퍼 함수.
137
138
 
138
139
  Args:
@@ -1103,14 +1104,9 @@ def pairplot(
1103
1104
  g.fig.suptitle(title, fontsize=config.font_size * 1.5, fontweight="bold")
1104
1105
 
1105
1106
  g.map_lower(
1106
- func=sb.kdeplot, fill=True, alpha=config.fill_alpha, linewidth=linewidth
1107
+ func=sb.kdeplot, fill=True, alpha=config.fill_alpha
1107
1108
  )
1108
- g.map_upper(func=sb.scatterplot, linewidth=linewidth)
1109
-
1110
- # KDE 대각선에도 linewidth 적용
1111
- for ax in g.axes.diag: # type: ignore
1112
- for line in ax.get_lines():
1113
- line.set_linewidth(linewidth)
1109
+ g.map_upper(func=sb.scatterplot)
1114
1110
 
1115
1111
  plt.tight_layout()
1116
1112
 
@@ -1768,25 +1764,14 @@ def ols_residplot(
1768
1764
  fig, ax = get_default_ax(width + 150 if mse else width, height, 1, 1, dpi) # type: ignore
1769
1765
  outparams = True
1770
1766
 
1771
- # 산점도 seaborn으로 그리기
1772
- sb.scatterplot(x=y_pred, y=resid, ax=ax, s=20, edgecolor="white", **params)
1773
-
1774
- # 기준선 (잔차 = 0)
1775
- ax.axhline(0, color="gray", linestyle="--", linewidth=linewidth * 0.7) # type: ignore
1776
-
1777
- # LOWESS 스무딩 (선택적)
1778
- if lowess:
1779
- lowess_result = sm_lowess(resid, y_pred, frac=0.6667)
1780
- ax.plot( # type: ignore
1781
- lowess_result[:, 0],
1782
- lowess_result[:, 1], # type: ignore
1783
- color="red",
1784
- linewidth=linewidth,
1785
- label="LOWESS",
1786
- ) # type: ignore
1787
-
1788
- ax.set_xlabel("Fitted values") # type: ignore
1789
- ax.set_ylabel("Residuals") # type: ignore
1767
+ sb.residplot(
1768
+ x=y_pred,
1769
+ y=resid,
1770
+ lowess=True, # 잔차의 추세선 표시
1771
+ line_kws={"color": "red", "linewidth": linewidth * 0.7}, # 추세선 스타일
1772
+ scatter_kws={"edgecolor": "white", "alpha": config.alpha},
1773
+ **params
1774
+ )
1790
1775
 
1791
1776
  if mse:
1792
1777
  mse_val = mean_squared_error(y, y_pred)
@@ -1916,8 +1901,7 @@ def ols_qqplot(
1916
1901
 
1917
1902
  # 선 굵기 조정
1918
1903
  for line in ax.get_lines(): # type: ignore
1919
- if line.get_linestyle() == "--" or line.get_color() == "r": # type: ignore
1920
- line.set_linewidth(linewidth) # type: ignore
1904
+ line.set_linewidth(linewidth) # type: ignore
1921
1905
 
1922
1906
  finalize_plot(ax, callback, outparams, save_path, True, title) # type: ignore
1923
1907
 
@@ -0,0 +1,313 @@
1
+ from IPython.display import display
2
+
3
+ from pandas import DataFrame, merge
4
+ import seaborn as sb
5
+ import numpy as np
6
+
7
+ import statsmodels.api as sm
8
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
9
+
10
+ from sklearn.base import BaseEstimator, TransformerMixin
11
+ from sklearn.model_selection import learning_curve
12
+
13
+ # 성능 평가 지표 모듈
14
+ from sklearn.metrics import (
15
+ r2_score,
16
+ mean_absolute_error,
17
+ mean_squared_error,
18
+ mean_absolute_percentage_error,
19
+ )
20
+
21
+ from .hs_plot import create_figure, finalize_plot
22
+
23
+
24
# --------------------------------------------------------
# VIF-based multicollinearity remover
# --------------------------------------------------------
class VIFSelector(BaseEstimator, TransformerMixin):
    """VIF (Variance Inflation Factor) based multicollinearity remover.

    Iteratively drops the column with the highest VIF and recomputes,
    until every remaining column's VIF is at or below ``threshold``.

    Args:
        threshold (float): VIF threshold (default: 10.0).
        check_cols (list or None): columns to use for the VIF computation
            (default: None, meaning all columns).

    Attributes:
        drop_cols_ (list): columns removed during ``fit``.
        vif_cols_ (list): columns considered for the VIF computation.
    """

    def __init__(self, threshold=10.0, check_cols=None):
        self.threshold = threshold
        self.check_cols = check_cols

    def _compute_vifs(self, X: DataFrame) -> DataFrame:
        """Return per-column VIFs as a DataFrame sorted descending by VIF."""
        # Add an intercept; VIF is computed against the design matrix
        # including the constant, hence the i + 1 column offset below.
        exog = sm.add_constant(X, prepend=True)

        vifs = {}
        for i, col in enumerate(X.columns):
            try:
                vifs[col] = float(variance_inflation_factor(exog.values, i + 1))
            except Exception:
                # Singular/degenerate matrices: treat as infinitely
                # collinear so the column is eliminated first.
                vifs[col] = float("inf")

        vdf = DataFrame(vifs.items(), columns=["Variable", "VIF"])
        return vdf.sort_values("VIF", ascending=False)

    def fit(self, X, y=None):
        """Determine which columns to drop. Rows containing NaN are ignored."""
        df = X.copy().dropna()

        self.vif_cols_ = self.check_cols if self.check_cols else df.columns.tolist()
        X_vif = df[self.vif_cols_].copy()

        self.drop_cols_ = []

        # Greedy elimination: drop the single worst offender, then
        # recompute, because all VIFs change after each removal.
        while X_vif.shape[1] > 0:
            vdf = self._compute_vifs(X_vif)
            max_vif = vdf.iloc[0]["VIF"]
            max_col = vdf.iloc[0]["Variable"]

            if max_vif <= self.threshold:
                break

            X_vif = X_vif.drop(columns=[max_col])
            self.drop_cols_.append(max_col)

        return self

    def transform(self, X):
        """Drop the columns identified in ``fit``; unknown columns are ignored."""
        return X.drop(columns=self.drop_cols_, errors="ignore")
92
+
93
+
94
# --------------------------------------------------------
# Regression performance metrics
# --------------------------------------------------------
def get_scores(
    estimator,
    x_test: DataFrame,
    y_test: DataFrame | np.ndarray,
) -> DataFrame:
    """Compute regression performance metrics for a fitted estimator.

    Args:
        estimator: fitted scikit-learn regressor (plain estimator or a
            Pipeline whose final step is named "model").
        x_test: test feature data (DataFrame).
        y_test: ground-truth target values (DataFrame or ndarray).

    Returns:
        DataFrame: a single row of metrics (R2, MAE, MSE, RMSE, MAPE, MPE),
        indexed by the model's class name.
    """
    # Report the final model's class name when a Pipeline is passed.
    if hasattr(estimator, "named_steps"):
        classname = estimator.named_steps["model"].__class__.__name__
    else:
        classname = estimator.__class__.__name__

    y_pred = estimator.predict(x_test)

    # Compute MSE once and reuse it for RMSE instead of recomputing.
    mse = mean_squared_error(y_test, y_pred)

    score_df = DataFrame(
        {
            "결정계수(R2)": r2_score(y_test, y_pred),
            "평균절대오차(MAE)": mean_absolute_error(y_test, y_pred),
            "평균제곱오차(MSE)": mse,
            "평균오차(RMSE)": np.sqrt(mse),
            "평균 절대 백분오차 비율(MAPE)": mean_absolute_percentage_error(
                y_test, y_pred
            ),
            # NOTE(review): MPE divides by y_test — yields inf/nan when the
            # target contains zeros; confirm targets are strictly nonzero.
            "평균 비율 오차(MPE)": np.mean((y_test - y_pred) / y_test * 100),
        },
        index=[classname],
    )

    return score_df
135
+
136
+
137
# --------------------------------------------------------
# Learning-curve based over/underfitting diagnosis
# --------------------------------------------------------
def learning_cv(
    estimator,
    x,
    y,
    scoring="neg_root_mean_squared_error",
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    n_jobs=-1,
) -> DataFrame:
    """Diagnose over/underfitting from a learning curve.

    Args:
        estimator: scikit-learn estimator (pipeline recommended).
        x: features (DataFrame or ndarray).
        y: target — ``y.mean()``/``y.std()`` are called, so a pandas
            Series (or ndarray) is assumed; confirm for other types.
        scoring: scoring passed to learning_curve
            (default: neg_root_mean_squared_error).
        cv: number of cross-validation folds (default: 5).
        train_sizes: training-set fractions
            (default: np.linspace(0.1, 1.0, 10)).
        n_jobs: parallel workers (default: -1, all CPUs).

    Returns:
        DataFrame: one-row diagnosis table indexed by the model class name.
    """
    train_sizes, train_scores, cv_scores = learning_curve(  # type: ignore
        estimator=estimator,
        X=x,
        y=y,
        train_sizes=train_sizes,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        shuffle=True,
        random_state=52,
    )

    # Report the final model's class name when a Pipeline is passed.
    if hasattr(estimator, "named_steps"):
        classname = estimator.named_steps["model"].__class__.__name__
    else:
        classname = estimator.__class__.__name__

    # neg RMSE → RMSE
    train_rmse = -train_scores
    cv_rmse = -cv_scores

    # Mean / standard deviation per train size
    train_mean = train_rmse.mean(axis=1)
    cv_mean = cv_rmse.mean(axis=1)
    cv_std = cv_rmse.std(axis=1)

    # Quantitative judgement at the last (largest) train size
    final_train = train_mean[-1]
    final_cv = cv_mean[-1]
    final_std = cv_std[-1]
    gap_ratio = final_train / final_cv
    var_ratio = final_std / final_cv

    # -----------------
    # Underfitting baseline (some_threshold)
    # -----------------
    # Baseline RMSE of a mean-only predictor
    y_mean = y.mean()
    rmse_naive = np.sqrt(np.mean((y - y_mean) ** 2))

    # Variance-based baseline
    std_y = y.std()

    # Minimum explanatory power (R²) baseline
    min_r2 = 0.10
    rmse_r2 = np.sqrt((1 - min_r2) * np.var(y))

    # Final threshold: the most lenient of the three baselines.
    # Ideally some_threshold would encode domain knowledge about the
    # minimum acceptable performance of this model.
    some_threshold = min(rmse_naive, std_y, rmse_r2)

    # -----------------
    # Judgement logic
    # -----------------
    if gap_ratio >= 0.95 and final_cv > some_threshold:
        status = "⚠️ 과소적합 (bias 큼)"
    elif gap_ratio <= 0.8:
        status = "⚠️ 과대적합 (variance 큼)"
    elif gap_ratio <= 0.95 and var_ratio <= 0.10:
        status = "✅ 일반화 양호"
    elif var_ratio > 0.15:
        status = "⚠️ 데이터 부족 / 분산 큼"
    else:
        status = "⚠️ 판단 유보"

    # -----------------
    # Quantitative result table
    # -----------------
    result_df = DataFrame(
        {
            "Train RMSE": [final_train],
            "CV RMSE 평균": [final_cv],
            "CV RMSE 표준편차": [final_std],
            "Train/CV 비율": [gap_ratio],
            "CV 변동성 비율": [var_ratio],
            "판정 결과": [status],
        },
        index=[classname],
    )

    # -----------------
    # Learning-curve plot
    # -----------------
    fig, ax = create_figure()

    sb.lineplot(
        x=train_sizes,
        y=train_mean,
        marker="o",
        markeredgecolor="#ffffff",
        label="Train RMSE",
    )
    sb.lineplot(
        x=train_sizes,
        y=cv_mean,
        marker="o",
        markeredgecolor="#ffffff",
        label="CV RMSE",
    )

    # BUG FIX: the two label strings were swapped — the y axis carries the
    # RMSE values, not the x axis.
    ax.set_xlabel("학습곡선 (Learning Curve)", fontsize=8, labelpad=5)  # type: ignore
    ax.set_ylabel("RMSE", fontsize=8, labelpad=5)  # type: ignore
    ax.grid(True, alpha=0.3)  # type: ignore

    finalize_plot(ax)

    return result_df
271
+
272
+
273
def get_score_cv(
    estimator,
    x_test: DataFrame,
    y_test: DataFrame | np.ndarray,
    x_origin: DataFrame,
    y_origin: DataFrame | np.ndarray,
    scoring="neg_root_mean_squared_error",
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    n_jobs=-1,
) -> DataFrame:
    """Regression metrics combined with a learning-curve diagnosis.

    Args:
        estimator: fitted scikit-learn regressor.
        x_test: test feature data (DataFrame).
        y_test: ground-truth target values (DataFrame or ndarray).
        x_origin: full feature data for the learning curve (DataFrame).
        y_origin: full target values for the learning curve
            (DataFrame or ndarray).
        scoring: learning-curve scoring
            (default: neg_root_mean_squared_error).
        cv: learning-curve CV folds (default: 5).
        train_sizes: learning-curve train fractions
            (default: np.linspace(0.1, 1.0, 10)).
        n_jobs: learning-curve parallel workers (default: -1, all CPUs).

    Returns:
        DataFrame: metrics row joined on index with the overfitting verdict.
    """
    # Test-set metrics for the fitted model.
    metrics = get_scores(estimator, x_test, y_test)

    # Learning-curve diagnosis over the full data.
    diagnosis = learning_cv(
        estimator,
        x_origin,
        y_origin,
        scoring=scoring,
        cv=cv,
        train_sizes=train_sizes,
        n_jobs=n_jobs,
    )

    # Both frames are indexed by the model class name, so an index join
    # yields a single combined row.
    return merge(metrics, diagnosis, left_index=True, right_index=True)