hossam 0.4.18__tar.gz → 0.4.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {hossam-0.4.18/hossam.egg-info → hossam-0.4.19}/PKG-INFO +2 -1
  2. {hossam-0.4.18 → hossam-0.4.19}/hossam/__init__.py +68 -27
  3. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_classroom.py +27 -4
  4. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_plot.py +13 -29
  5. hossam-0.4.19/hossam/hs_reg.py +313 -0
  6. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_stats.py +211 -219
  7. {hossam-0.4.18 → hossam-0.4.19/hossam.egg-info}/PKG-INFO +2 -1
  8. {hossam-0.4.18 → hossam-0.4.19}/hossam.egg-info/SOURCES.txt +1 -1
  9. {hossam-0.4.18 → hossam-0.4.19}/hossam.egg-info/requires.txt +1 -0
  10. {hossam-0.4.18 → hossam-0.4.19}/pyproject.toml +3 -2
  11. hossam-0.4.18/hossam/hs_cluster copy.py +0 -1060
  12. {hossam-0.4.18 → hossam-0.4.19}/LICENSE +0 -0
  13. {hossam-0.4.18 → hossam-0.4.19}/MANIFEST.in +0 -0
  14. {hossam-0.4.18 → hossam-0.4.19}/README.md +0 -0
  15. {hossam-0.4.18 → hossam-0.4.19}/hossam/NotoSansKR-Regular.ttf +0 -0
  16. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_cluster.py +0 -0
  17. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_gis.py +0 -0
  18. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_prep.py +0 -0
  19. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_study.py +0 -0
  20. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_timeserise.py +0 -0
  21. {hossam-0.4.18 → hossam-0.4.19}/hossam/hs_util.py +0 -0
  22. {hossam-0.4.18 → hossam-0.4.19}/hossam/leekh.png +0 -0
  23. {hossam-0.4.18 → hossam-0.4.19}/hossam.egg-info/dependency_links.txt +0 -0
  24. {hossam-0.4.18 → hossam-0.4.19}/hossam.egg-info/top_level.txt +0 -0
  25. {hossam-0.4.18 → hossam-0.4.19}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hossam
3
- Version: 0.4.18
3
+ Version: 0.4.19
4
4
  Summary: Hossam Data Helper
5
5
  Author-email: Lee Kwang-Ho <leekh4232@gmail.com>
6
6
  License-Expression: MIT
@@ -40,6 +40,7 @@ Requires-Dist: xgboost
40
40
  Requires-Dist: lightgbm
41
41
  Requires-Dist: catboost
42
42
  Requires-Dist: kneed
43
+ Requires-Dist: shap
43
44
  Dynamic: license-file
44
45
 
45
46
  ---
@@ -9,11 +9,19 @@ from . import hs_prep
9
9
  from . import hs_stats
10
10
  from . import hs_timeserise
11
11
  from . import hs_util
12
+ from . import hs_reg
12
13
  from . import hs_cluster
13
14
  from . import hs_study
14
15
  from .hs_util import load_info
15
16
  from .hs_util import _load_data_remote as load_data
16
17
  from .hs_plot import visualize_silhouette
18
+ from .hs_stats import ttest_ind as hs_ttest_ind
19
+ from .hs_stats import outlier_table as hs_outlier_table
20
+ from .hs_stats import oneway_anova as hs_oneway_anova
21
+ from .hs_reg import learning_cv as hs_learning_cv
22
+ from .hs_reg import get_scores as hs_get_scores
23
+ from .hs_reg import get_score_cv as hs_get_score_cv
24
+ from .hs_reg import VIFSelector
17
25
 
18
26
  # py-modules
19
27
  import sys
@@ -31,7 +39,29 @@ except Exception:
31
39
 
32
40
  my_dpi = hs_plot.config.dpi
33
41
 
34
- __all__ = ["my_dpi", "load_data", "load_info", "hs_classroom", "hs_gis", "hs_plot", "hs_prep", "hs_stats", "hs_timeserise", "hs_util", "hs_cluster", "hs_study", "visualize_silhouette"]
42
+ __all__ = [
43
+ "my_dpi",
44
+ "load_data",
45
+ "load_info",
46
+ "hs_classroom",
47
+ "hs_gis",
48
+ "hs_plot",
49
+ "hs_prep",
50
+ "hs_stats",
51
+ "hs_timeserise",
52
+ "hs_util",
53
+ "hs_cluster",
54
+ "hs_reg",
55
+ "hs_study",
56
+ "visualize_silhouette",
57
+ "hs_ttest_ind",
58
+ "hs_outlier_table",
59
+ "hs_oneway_anova",
60
+ "hs_learning_cv",
61
+ "hs_get_scores",
62
+ "hs_get_score_cv",
63
+ "VIFSelector",
64
+ ]
35
65
 
36
66
 
37
67
  def check_pypi_latest(package_name: str):
@@ -51,7 +81,7 @@ def check_pypi_latest(package_name: str):
51
81
  "package": package_name,
52
82
  "installed": installed,
53
83
  "latest": latest,
54
- "outdated": installed != latest
84
+ "outdated": installed != latest,
55
85
  }
56
86
 
57
87
 
@@ -67,21 +97,23 @@ def _init_korean_font():
67
97
  fprop = fm.FontProperties(fname=str(font_path))
68
98
  fname = fprop.get_name()
69
99
 
70
- plt.rcParams.update({
71
- "font.family": fname,
72
- "font.size": hs_plot.config.font_size,
73
- "font.weight": hs_plot.config.font_weight,
74
- "axes.unicode_minus": False,
75
- "text.antialiased": True,
76
- "lines.antialiased": True,
77
- "patch.antialiased": True,
78
- "figure.dpi": hs_plot.config.dpi,
79
- "savefig.dpi": hs_plot.config.dpi * 2,
80
- "text.hinting": "auto",
81
- "text.hinting_factor": 8,
82
- "pdf.fonttype": 42,
83
- "ps.fonttype": 42,
84
- })
100
+ plt.rcParams.update(
101
+ {
102
+ "font.family": fname,
103
+ "font.size": hs_plot.config.font_size,
104
+ "font.weight": hs_plot.config.font_weight,
105
+ "axes.unicode_minus": False,
106
+ "text.antialiased": True,
107
+ "lines.antialiased": True,
108
+ "patch.antialiased": True,
109
+ "figure.dpi": hs_plot.config.dpi,
110
+ "savefig.dpi": hs_plot.config.dpi * 2,
111
+ "text.hinting": "auto",
112
+ "text.hinting_factor": 8,
113
+ "pdf.fonttype": 42,
114
+ "ps.fonttype": 42,
115
+ }
116
+ )
85
117
 
86
118
  print(
87
119
  "\n✅ 시각화를 위한 한글 글꼴(NotoSansKR-Regular)이 자동 적용되었습니다."
@@ -103,6 +135,8 @@ def _init():
103
135
  f"🔖 Version: {__version__}",
104
136
  ]
105
137
 
138
+
139
+
106
140
  for msg in messages:
107
141
  print(f"{msg}")
108
142
 
@@ -119,29 +153,36 @@ def _init():
119
153
 
120
154
  _init_korean_font()
121
155
 
156
+ # 각 열의 넓이 제한 없음
122
157
  pd.set_option("display.max_colwidth", None)
158
+ # 출력 너비 제한 없음 (가로 스크롤될 수 있음)
123
159
  pd.set_option("display.width", None)
124
160
  # 컬럼 생략 금지
125
161
  pd.set_option("display.max_columns", None)
126
162
  # 행 최대 출력 수 100개로 수정
127
163
  pd.set_option("display.max_rows", 100)
164
+ # 소수점 자리수 3자리로 설정
165
+ pd.options.display.float_format = "{:.3f}".format
128
166
 
129
167
  from IPython.display import display, HTML
130
168
 
131
- display(HTML("""
132
- <style>
133
- .dataframe td, .dataframe th {
134
- white-space: nowrap;
135
- font-size: 0.85em;
136
- padding: 2px 3px;
137
- }
138
-
169
+ display(
170
+ HTML(
171
+ """
172
+ <style>
139
173
  .dataframe tr:hover {
140
174
  background-color: #ffff99 !important;
141
175
  border: 1px solid #ffcc00;
142
176
  }
143
177
  </style>
144
- """))
178
+ """
179
+ )
180
+ )
181
+
182
+ import multiprocessing as mp
145
183
 
184
+ def is_parallel_worker():
185
+ return mp.current_process().name != "MainProcess"
146
186
 
147
- _init()
187
+ if not is_parallel_worker():
188
+ _init()
@@ -6,6 +6,7 @@ import math
6
6
  from pandas import DataFrame, qcut, concat, to_numeric
7
7
  from kmodes.kmodes import KModes
8
8
  from matplotlib import pyplot as plt
9
+ from prompt_toolkit.formatted_text.ansi import i
9
10
  import seaborn as sns
10
11
  from .hs_util import load_data, pretty_table
11
12
  from .hs_plot import config
@@ -19,6 +20,7 @@ def cluster_students(
19
20
  n_groups: int,
20
21
  score_cols: list | None = None,
21
22
  interest_col: str | None = None,
23
+ interest_ignore: str | None = None,
22
24
  max_iter: int = 200,
23
25
  score_metric: str = 'total'
24
26
  ) -> DataFrame:
@@ -39,6 +41,8 @@ def cluster_students(
39
41
  None일 경우 점수 기반 균형 조정을 하지 않습니다. 기본값: None
40
42
  interest_col: 관심사 정보가 있는 컬럼명.
41
43
  None일 경우 관심사 기반 군집화를 하지 않습니다. 기본값: None
44
+ interest_ignore: 관심사 군집화에서 제외할 값.
45
+ 지정된 값은 별도 군집에서 제외됩니다. 기본값: None
42
46
  max_iter: 균형 조정 최대 반복 횟수. 기본값: 200
43
47
  score_metric: 점수 기준 선택 ('total' 또는 'average').
44
48
  'total'이면 총점, 'average'이면 평균점수 기준. 기본값: 'total'
@@ -151,8 +155,18 @@ def cluster_students(
151
155
  if actual_n_groups < 2:
152
156
  actual_n_groups = 2
153
157
 
158
+ df_ignore = None
159
+
154
160
  # ===== 3단계: 관심사 기반 1차 군집 =====
155
161
  if interest_col is not None:
162
+ df_main[interest_col] = df_main[interest_col].fillna('미정')
163
+
164
+ if interest_ignore is not None:
165
+ df_ignore = df_main[df_main[interest_col] == interest_ignore].copy()
166
+ df_main = df_main[df_main[interest_col] != interest_ignore].copy()
167
+
168
+ print(df_ignore)
169
+
156
170
  X_interest = df_main[[interest_col]].to_numpy()
157
171
 
158
172
  kmodes_interest = KModes(
@@ -184,12 +198,18 @@ def cluster_students(
184
198
  df_main = _balance_group_sizes_only(df_main, actual_n_groups, min_size, max_size)
185
199
 
186
200
  # ===== 5단계: 극단값 포함 병합 =====
187
- if df_outlier is not None and len(df_outlier) > 0:
201
+ result = df_main
202
+
203
+ if (df_outlier is not None and len(df_outlier) > 0):
188
204
  # '조'는 숫자형 유지: 극단값은 0으로 표시
189
205
  df_outlier['조'] = 0
190
- result = concat([df_main, df_outlier], ignore_index=True)
191
- else:
192
- result = df_main
206
+ result = concat([result, df_outlier], ignore_index=True)
207
+
208
+ if (df_ignore is not None and len(df_ignore) > 0):
209
+ # '조'는 숫자형 유지: 제외된 학생은 -1로 표시
210
+ df_ignore['조'] = -1
211
+ result = concat([result, df_ignore], ignore_index=True)
212
+
193
213
 
194
214
  # 평균점수는 이미 계산됨 (score_cols 있을 때)
195
215
 
@@ -694,6 +714,7 @@ def analyze_classroom(
694
714
  n_groups: int,
695
715
  score_cols: list | None = None,
696
716
  interest_col: str | None = None,
717
+ interest_ignore: str | None = None,
697
718
  max_iter: int = 200,
698
719
  score_metric: str = 'average',
699
720
  name_col: str = '학생이름',
@@ -713,6 +734,7 @@ def analyze_classroom(
713
734
  n_groups: 목표 조의 개수.
714
735
  score_cols: 성적 계산에 사용할 점수 컬럼명 리스트. 기본값: None
715
736
  interest_col: 관심사 정보가 있는 컬럼명. 기본값: None
737
+ interest_ignore: 관심사 군집화에서 제외할 값. 기본값: None
716
738
  max_iter: 균형 조정 최대 반복 횟수. 기본값: 200
717
739
  score_metric: 점수 기준 선택 ('total' 또는 'average'). 기본값: 'average'
718
740
  name_col: 학생 이름 컬럼명. 기본값: '학생이름'
@@ -740,6 +762,7 @@ def analyze_classroom(
740
762
  n_groups=n_groups,
741
763
  score_cols=score_cols,
742
764
  interest_col=interest_col,
765
+ interest_ignore=interest_ignore,
743
766
  max_iter=max_iter,
744
767
  score_metric=score_metric
745
768
  )
@@ -8,6 +8,7 @@ from itertools import combinations
8
8
  import numpy as np
9
9
  import seaborn as sb
10
10
  import matplotlib.pyplot as plt
11
+ from matplotlib.figure import Figure # type: ignore
11
12
  from matplotlib.pyplot import Axes # type: ignore
12
13
  from pandas import Series, DataFrame
13
14
  from math import sqrt
@@ -132,7 +133,7 @@ def create_figure(
132
133
  ws: int | None = None,
133
134
  hs: int | None = None,
134
135
  title: str | None = None,
135
- ):
136
+ ) -> tuple[Figure, Axes]:
136
137
  """기본 크기의 Figure와 Axes를 생성한다. get_default_ax의 래퍼 함수.
137
138
 
138
139
  Args:
@@ -1103,14 +1104,9 @@ def pairplot(
1103
1104
  g.fig.suptitle(title, fontsize=config.font_size * 1.5, fontweight="bold")
1104
1105
 
1105
1106
  g.map_lower(
1106
- func=sb.kdeplot, fill=True, alpha=config.fill_alpha, linewidth=linewidth
1107
+ func=sb.kdeplot, fill=True, alpha=config.fill_alpha
1107
1108
  )
1108
- g.map_upper(func=sb.scatterplot, linewidth=linewidth)
1109
-
1110
- # KDE 대각선에도 linewidth 적용
1111
- for ax in g.axes.diag: # type: ignore
1112
- for line in ax.get_lines():
1113
- line.set_linewidth(linewidth)
1109
+ g.map_upper(func=sb.scatterplot)
1114
1110
 
1115
1111
  plt.tight_layout()
1116
1112
 
@@ -1768,25 +1764,14 @@ def ols_residplot(
1768
1764
  fig, ax = get_default_ax(width + 150 if mse else width, height, 1, 1, dpi) # type: ignore
1769
1765
  outparams = True
1770
1766
 
1771
- # 산점도 seaborn으로 그리기
1772
- sb.scatterplot(x=y_pred, y=resid, ax=ax, s=20, edgecolor="white", **params)
1773
-
1774
- # 기준선 (잔차 = 0)
1775
- ax.axhline(0, color="gray", linestyle="--", linewidth=linewidth * 0.7) # type: ignore
1776
-
1777
- # LOWESS 스무딩 (선택적)
1778
- if lowess:
1779
- lowess_result = sm_lowess(resid, y_pred, frac=0.6667)
1780
- ax.plot( # type: ignore
1781
- lowess_result[:, 0],
1782
- lowess_result[:, 1], # type: ignore
1783
- color="red",
1784
- linewidth=linewidth,
1785
- label="LOWESS",
1786
- ) # type: ignore
1787
-
1788
- ax.set_xlabel("Fitted values") # type: ignore
1789
- ax.set_ylabel("Residuals") # type: ignore
1767
+ sb.residplot(
1768
+ x=y_pred,
1769
+ y=resid,
1770
+ lowess=True, # 잔차의 추세선 표시
1771
+ line_kws={"color": "red", "linewidth": linewidth * 0.7}, # 추세선 스타일
1772
+ scatter_kws={"edgecolor": "white", "alpha": config.alpha},
1773
+ **params
1774
+ )
1790
1775
 
1791
1776
  if mse:
1792
1777
  mse_val = mean_squared_error(y, y_pred)
@@ -1916,8 +1901,7 @@ def ols_qqplot(
1916
1901
 
1917
1902
  # 선 굵기 조정
1918
1903
  for line in ax.get_lines(): # type: ignore
1919
- if line.get_linestyle() == "--" or line.get_color() == "r": # type: ignore
1920
- line.set_linewidth(linewidth) # type: ignore
1904
+ line.set_linewidth(linewidth) # type: ignore
1921
1905
 
1922
1906
  finalize_plot(ax, callback, outparams, save_path, True, title) # type: ignore
1923
1907
 
@@ -0,0 +1,313 @@
1
+ from IPython.display import display
2
+
3
+ from pandas import DataFrame, merge
4
+ import seaborn as sb
5
+ import numpy as np
6
+
7
+ import statsmodels.api as sm
8
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
9
+
10
+ from sklearn.base import BaseEstimator, TransformerMixin
11
+ from sklearn.model_selection import learning_curve
12
+
13
+ # 성능 평가 지표 모듈
14
+ from sklearn.metrics import (
15
+ r2_score,
16
+ mean_absolute_error,
17
+ mean_squared_error,
18
+ mean_absolute_percentage_error,
19
+ )
20
+
21
+ from .hs_plot import create_figure, finalize_plot
22
+
23
+
24
# --------------------------------------------------------
# VIF-based multicollinearity remover
# --------------------------------------------------------
class VIFSelector(BaseEstimator, TransformerMixin):
    """VIF (Variance Inflation Factor) based multicollinearity remover.

    Iteratively drops the column with the highest VIF and recomputes,
    until every remaining column's VIF is at or below ``threshold``.

    Args:
        threshold (float): VIF threshold (default: 10.0).
        check_cols (list or None): columns to use for the VIF computation
            (default: None, meaning all columns).

    Attributes:
        drop_cols_ (list): columns removed during ``fit``.
        vif_cols_ (list): columns considered for the VIF computation.
    """

    def __init__(self, threshold=10.0, check_cols=None):
        self.threshold = threshold
        self.check_cols = check_cols

    def _compute_vifs(self, X: DataFrame) -> DataFrame:
        """Return per-column VIFs as a DataFrame sorted descending by VIF."""
        # Add an intercept; VIF is computed against the design matrix
        # including the constant, hence the i + 1 column offset below.
        exog = sm.add_constant(X, prepend=True)

        vifs = {}
        for i, col in enumerate(X.columns):
            try:
                vifs[col] = float(variance_inflation_factor(exog.values, i + 1))
            except Exception:
                # Singular/degenerate matrices: treat as infinitely
                # collinear so the column is eliminated first.
                vifs[col] = float("inf")

        vdf = DataFrame(vifs.items(), columns=["Variable", "VIF"])
        return vdf.sort_values("VIF", ascending=False)

    def fit(self, X, y=None):
        """Determine which columns to drop. Rows containing NaN are ignored."""
        df = X.copy().dropna()

        self.vif_cols_ = self.check_cols if self.check_cols else df.columns.tolist()
        X_vif = df[self.vif_cols_].copy()

        self.drop_cols_ = []

        # Greedy elimination: drop the single worst offender, then
        # recompute, because all VIFs change after each removal.
        while X_vif.shape[1] > 0:
            vdf = self._compute_vifs(X_vif)
            max_vif = vdf.iloc[0]["VIF"]
            max_col = vdf.iloc[0]["Variable"]

            if max_vif <= self.threshold:
                break

            X_vif = X_vif.drop(columns=[max_col])
            self.drop_cols_.append(max_col)

        return self

    def transform(self, X):
        """Drop the columns identified in ``fit``; unknown columns are ignored."""
        return X.drop(columns=self.drop_cols_, errors="ignore")
92
+
93
+
94
# --------------------------------------------------------
# Regression performance metrics
# --------------------------------------------------------
def get_scores(
    estimator,
    x_test: DataFrame,
    y_test: DataFrame | np.ndarray,
) -> DataFrame:
    """Compute regression performance metrics for a fitted estimator.

    Args:
        estimator: fitted scikit-learn regressor (plain estimator or a
            Pipeline whose final step is named "model").
        x_test: test feature data (DataFrame).
        y_test: ground-truth target values (DataFrame or ndarray).

    Returns:
        DataFrame: a single row of metrics (R2, MAE, MSE, RMSE, MAPE, MPE),
        indexed by the model's class name.
    """
    # Report the final model's class name when a Pipeline is passed.
    if hasattr(estimator, "named_steps"):
        classname = estimator.named_steps["model"].__class__.__name__
    else:
        classname = estimator.__class__.__name__

    y_pred = estimator.predict(x_test)

    # Compute MSE once and reuse it for RMSE instead of recomputing.
    mse = mean_squared_error(y_test, y_pred)

    score_df = DataFrame(
        {
            "결정계수(R2)": r2_score(y_test, y_pred),
            "평균절대오차(MAE)": mean_absolute_error(y_test, y_pred),
            "평균제곱오차(MSE)": mse,
            "평균오차(RMSE)": np.sqrt(mse),
            "평균 절대 백분오차 비율(MAPE)": mean_absolute_percentage_error(
                y_test, y_pred
            ),
            # NOTE(review): MPE divides by y_test — yields inf/nan when the
            # target contains zeros; confirm targets are strictly nonzero.
            "평균 비율 오차(MPE)": np.mean((y_test - y_pred) / y_test * 100),
        },
        index=[classname],
    )

    return score_df
135
+
136
+
137
# --------------------------------------------------------
# Learning-curve based over/underfitting diagnosis
# --------------------------------------------------------
def learning_cv(
    estimator,
    x,
    y,
    scoring="neg_root_mean_squared_error",
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    n_jobs=-1,
) -> DataFrame:
    """Diagnose over/underfitting from a learning curve.

    Args:
        estimator: scikit-learn estimator (pipeline recommended).
        x: features (DataFrame or ndarray).
        y: target — ``y.mean()``/``y.std()`` are called, so a pandas
            Series (or ndarray) is assumed; confirm for other types.
        scoring: scoring passed to learning_curve
            (default: neg_root_mean_squared_error).
        cv: number of cross-validation folds (default: 5).
        train_sizes: training-set fractions
            (default: np.linspace(0.1, 1.0, 10)).
        n_jobs: parallel workers (default: -1, all CPUs).

    Returns:
        DataFrame: one-row diagnosis table indexed by the model class name.
    """
    train_sizes, train_scores, cv_scores = learning_curve(  # type: ignore
        estimator=estimator,
        X=x,
        y=y,
        train_sizes=train_sizes,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        shuffle=True,
        random_state=52,
    )

    # Report the final model's class name when a Pipeline is passed.
    if hasattr(estimator, "named_steps"):
        classname = estimator.named_steps["model"].__class__.__name__
    else:
        classname = estimator.__class__.__name__

    # neg RMSE → RMSE
    train_rmse = -train_scores
    cv_rmse = -cv_scores

    # Mean / standard deviation per train size
    train_mean = train_rmse.mean(axis=1)
    cv_mean = cv_rmse.mean(axis=1)
    cv_std = cv_rmse.std(axis=1)

    # Quantitative judgement at the last (largest) train size
    final_train = train_mean[-1]
    final_cv = cv_mean[-1]
    final_std = cv_std[-1]
    gap_ratio = final_train / final_cv
    var_ratio = final_std / final_cv

    # -----------------
    # Underfitting baseline (some_threshold)
    # -----------------
    # Baseline RMSE of a mean-only predictor
    y_mean = y.mean()
    rmse_naive = np.sqrt(np.mean((y - y_mean) ** 2))

    # Variance-based baseline
    std_y = y.std()

    # Minimum explanatory power (R²) baseline
    min_r2 = 0.10
    rmse_r2 = np.sqrt((1 - min_r2) * np.var(y))

    # Final threshold: the most lenient of the three baselines.
    # Ideally some_threshold would encode domain knowledge about the
    # minimum acceptable performance of this model.
    some_threshold = min(rmse_naive, std_y, rmse_r2)

    # -----------------
    # Judgement logic
    # -----------------
    if gap_ratio >= 0.95 and final_cv > some_threshold:
        status = "⚠️ 과소적합 (bias 큼)"
    elif gap_ratio <= 0.8:
        status = "⚠️ 과대적합 (variance 큼)"
    elif gap_ratio <= 0.95 and var_ratio <= 0.10:
        status = "✅ 일반화 양호"
    elif var_ratio > 0.15:
        status = "⚠️ 데이터 부족 / 분산 큼"
    else:
        status = "⚠️ 판단 유보"

    # -----------------
    # Quantitative result table
    # -----------------
    result_df = DataFrame(
        {
            "Train RMSE": [final_train],
            "CV RMSE 평균": [final_cv],
            "CV RMSE 표준편차": [final_std],
            "Train/CV 비율": [gap_ratio],
            "CV 변동성 비율": [var_ratio],
            "판정 결과": [status],
        },
        index=[classname],
    )

    # -----------------
    # Learning-curve plot
    # -----------------
    fig, ax = create_figure()

    sb.lineplot(
        x=train_sizes,
        y=train_mean,
        marker="o",
        markeredgecolor="#ffffff",
        label="Train RMSE",
    )
    sb.lineplot(
        x=train_sizes,
        y=cv_mean,
        marker="o",
        markeredgecolor="#ffffff",
        label="CV RMSE",
    )

    # BUG FIX: the two label strings were swapped — the y axis carries the
    # RMSE values, not the x axis.
    ax.set_xlabel("학습곡선 (Learning Curve)", fontsize=8, labelpad=5)  # type: ignore
    ax.set_ylabel("RMSE", fontsize=8, labelpad=5)  # type: ignore
    ax.grid(True, alpha=0.3)  # type: ignore

    finalize_plot(ax)

    return result_df
271
+
272
+
273
def get_score_cv(
    estimator,
    x_test: DataFrame,
    y_test: DataFrame | np.ndarray,
    x_origin: DataFrame,
    y_origin: DataFrame | np.ndarray,
    scoring="neg_root_mean_squared_error",
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    n_jobs=-1,
) -> DataFrame:
    """Regression metrics combined with a learning-curve diagnosis.

    Args:
        estimator: fitted scikit-learn regressor.
        x_test: test feature data (DataFrame).
        y_test: ground-truth target values (DataFrame or ndarray).
        x_origin: full feature data for the learning curve (DataFrame).
        y_origin: full target values for the learning curve
            (DataFrame or ndarray).
        scoring: learning-curve scoring
            (default: neg_root_mean_squared_error).
        cv: learning-curve CV folds (default: 5).
        train_sizes: learning-curve train fractions
            (default: np.linspace(0.1, 1.0, 10)).
        n_jobs: learning-curve parallel workers (default: -1, all CPUs).

    Returns:
        DataFrame: metrics row joined on index with the overfitting verdict.
    """
    # Test-set metrics for the fitted model.
    metrics = get_scores(estimator, x_test, y_test)

    # Learning-curve diagnosis over the full data.
    diagnosis = learning_cv(
        estimator,
        x_origin,
        y_origin,
        scoring=scoring,
        cv=cv,
        train_sizes=train_sizes,
        n_jobs=n_jobs,
    )

    # Both frames are indexed by the model class name, so an index join
    # yields a single combined row.
    return merge(metrics, diagnosis, left_index=True, right_index=True)