hossam 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hossam/data_loader.py +7 -9
- hossam/hs_gis.py +17 -18
- hossam/hs_plot.py +220 -220
- hossam/hs_prep.py +56 -25
- hossam/hs_stats.py +199 -67
- hossam/hs_util.py +4 -6
- {hossam-0.4.3.dist-info → hossam-0.4.5.dist-info}/METADATA +1 -1
- hossam-0.4.5.dist-info/RECORD +16 -0
- hossam-0.4.3.dist-info/RECORD +0 -16
- {hossam-0.4.3.dist-info → hossam-0.4.5.dist-info}/WHEEL +0 -0
- {hossam-0.4.3.dist-info → hossam-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {hossam-0.4.3.dist-info → hossam-0.4.5.dist-info}/top_level.txt +0 -0
hossam/hs_stats.py
CHANGED
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
+
 from __future__ import annotations
+from typing import overload, Tuple, Literal, Union, Any
 
 # -------------------------------------------------------------
 import numpy as np
@@ -22,12 +24,13 @@ from scipy.stats import (
     normaltest,
     bartlett,
     levene,
-    ttest_1samp,
+    ttest_1samp,  # type: ignore
     ttest_ind as scipy_ttest_ind,
-    ttest_rel,
+    ttest_rel,  # type: ignore
     wilcoxon,
     pearsonr,
     spearmanr,
+    chi2
 )
 
 import statsmodels.api as sm
@@ -36,10 +39,71 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
 from statsmodels.stats.multitest import multipletests
 from statsmodels.stats.stattools import durbin_watson
 from statsmodels.regression.linear_model import RegressionResultsWrapper
+from statsmodels.discrete.discrete_model import BinaryResultsWrapper
 from statsmodels.discrete.discrete_model import BinaryResults
 
 from pingouin import anova, pairwise_tukey, welch_anova, pairwise_gameshowell
 
+from .hs_plot import ols_residplot, ols_qqplot
+
+# ===================================================================
+# MCAR test (randomness of missing values)
+# ===================================================================
+def mcar_test(data: DataFrame, columns: list | str | None = None) -> DataFrame:
+    if isinstance(columns, str):
+        columns = [c.strip() for c in columns.split(",")]
+
+    cols = data.columns if columns is None else columns
+    df = data[cols]
+
+    # use only the columns that contain missing values
+    cols_with_na = [c for c in df.columns if df[c].isna().any()]
+    if len(cols_with_na) < 2:
+        raise ValueError("The MCAR test requires at least two variables with missing values.")
+
+    X = df[cols_with_na].to_numpy()
+    n, p = X.shape
+
+    # estimate the mean and covariance from complete cases
+    complete = ~np.isnan(X).any(axis=1)
+    if complete.sum() < p + 1:
+        raise ValueError("Not enough complete cases to run the MCAR test.")
+
+    mu = X[complete].mean(axis=0)
+    S = np.cov(X[complete], rowvar=False)
+    S_inv = np.linalg.pinv(S)
+
+    chi_sq = 0.0
+    dfree = 0
+
+    for i in range(n):
+        obs = ~np.isnan(X[i])
+        if obs.sum() == p:
+            continue  # skip complete cases
+        diff = X[i, obs] - mu[obs]
+        S_obs = S[np.ix_(obs, obs)]
+        S_obs_inv = np.linalg.pinv(S_obs)
+
+        chi_sq += diff @ S_obs_inv @ diff
+        dfree += obs.sum()
+
+    dfree -= p  # Little's adjustment
+
+    p_value = 1 - chi2.cdf(chi_sq, dfree)
+    is_mcar = p_value > 0.05
+
+    return DataFrame([{
+        "statistic": chi_sq,
+        "dof": dfree,
+        "p-value": p_value,
+        "is_mcar": is_mcar,
+        "interpretation": (
+            "Missing values appear to be completely at random (MCAR) → safe to drop them"
+            if is_mcar else
+            "Missing values are not MCAR → dropping them may introduce bias"
+        )
+    }])
+
 # ===================================================================
 # Missing Values Analysis
 # ===================================================================
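The new function implements a Little-style MCAR statistic: complete cases supply the mean and covariance estimates, each incomplete row i contributes d_i² = (x_{i,obs} − μ̂_obs)ᵀ Σ̂_obs⁻¹ (x_{i,obs} − μ̂_obs), and the degrees of freedom are the total count of observed values over those rows minus p. A minimal usage sketch follows; the `from hossam import hs_stats` import path mirrors the `hs_stats.ols(...)` docstring examples later in this diff and is an assumption.

```python
# Usage sketch for the new mcar_test; assumes hossam >= 0.4.5 and that the
# module is importable as hs_stats (as in the ols() docstring examples).
import numpy as np
from pandas import DataFrame
from hossam import hs_stats

df = DataFrame({
    "age":    [23, 31, np.nan, 44, 52, 37, 38, 29, 41, 60],
    "income": [3.1, 2.7, 4.2, 5.0, np.nan, 2.8, 3.9, 4.4, 3.3, 5.8],
    "score":  [70, 65, 80, 62, 75, 72, np.nan, 68, 77, 81],
})

# Needs at least two columns with NaNs and more than p complete cases.
result = hs_stats.mcar_test(df)
print(result[["statistic", "dof", "p-value", "is_mcar"]])
```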
@@ -219,6 +283,8 @@ def describe(data: DataFrame, *fields: str, columns: list | None = None):
     Rows include the following statistics:
 
     - count (float): number of non-missing values
+    - na_count (int): number of missing values
+    - na_rate (float): ratio of missing values (%)
     - mean (float): mean
     - std (float): standard deviation
     - min (float): minimum
@@ -267,9 +333,13 @@ def describe(data: DataFrame, *fields: str, columns: list | None = None):
 
     # compute the descriptive statistics
     desc = data[list(fields)].describe().T
-
-
-
+
+    # add the number of missing values per column (na_count)
+    na_counts = data[list(fields)].isnull().sum()
+    desc.insert(1, 'na_count', na_counts)
+
+    # add the ratio of missing values (na_rate)
+    desc.insert(2, 'na_rate', (na_counts / len(data)) * 100)
 
     # compute additional statistics
     additional_stats = []
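The two `insert` calls slot the missing-value columns right after `count` in the transposed describe table. A standalone sketch of the same pattern:

```python
# Standalone sketch of the na_count / na_rate columns added to describe().
import numpy as np
from pandas import DataFrame

data = DataFrame({"x": [1.0, 2.0, np.nan, 4.0], "y": [np.nan, 1.5, 2.5, np.nan]})

desc = data.describe().T                    # rows = columns, first column = count
na_counts = data.isnull().sum()
desc.insert(1, "na_count", na_counts)       # number of missing values
desc.insert(2, "na_rate", na_counts / len(data) * 100)  # missing ratio in %
print(desc[["count", "na_count", "na_rate"]])
```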
@@ -305,29 +375,29 @@ def describe(data: DataFrame, *fields: str, columns: list | None = None):
         outlier_rate = (outlier_count / len(data)) * 100
 
         # classify the distribution shape (based on skewness)
-        abs_skew = abs(skew)
-        if abs_skew < 0.5:
+        abs_skew = abs(skew)  # type: ignore
+        if abs_skew < 0.5:  # type: ignore
             dist = "nearly symmetric"
-        elif abs_skew < 1.0:
-            if skew > 0:
+        elif abs_skew < 1.0:  # type: ignore
+            if skew > 0:  # type: ignore
                 dist = "weak right tail"
             else:
                 dist = "weak left tail"
-        elif abs_skew < 2.0:
-            if skew > 0:
+        elif abs_skew < 2.0:  # type: ignore
+            if skew > 0:  # type: ignore
                 dist = "moderate right tail"
             else:
                 dist = "moderate left tail"
         else:
-            if skew > 0:
+            if skew > 0:  # type: ignore
                 dist = "extreme right tail"
             else:
                 dist = "extreme left tail"
 
         # decide whether a log transform is needed
-        if abs_skew < 0.5:
+        if abs_skew < 0.5:  # type: ignore
             log_need = "low"
-        elif abs_skew < 1.0:
+        elif abs_skew < 1.0:  # type: ignore
             log_need = "medium"
         else:
             log_need = "high"
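The skewness bands above cut at |skew| = 0.5, 1.0, and 2.0, and the same cuts drive the log-transform flag. A self-contained sketch of that decision table on a deliberately skewed sample:

```python
# Worked sketch of the skewness bands used in describe() (cuts at 0.5 / 1.0 / 2.0).
import numpy as np
from scipy.stats import skew as sp_skew

x = np.random.default_rng(1).lognormal(size=500)  # strongly right-skewed sample
s = float(sp_skew(x))
a = abs(s)

if a < 0.5:
    band = "nearly symmetric"
elif a < 1.0:
    band = f"weak {'right' if s > 0 else 'left'} tail"
elif a < 2.0:
    band = f"moderate {'right' if s > 0 else 'left'} tail"
else:
    band = f"extreme {'right' if s > 0 else 'left'} tail"

log_need = "low" if a < 0.5 else "medium" if a < 1.0 else "high"
print(f"skew={s:.2f} -> {band}; log transform need: {log_need}")
```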
@@ -403,7 +473,7 @@ def category_describe(data: DataFrame, *fields: str):
     """
     if not fields:
         # select the nominal (categorical) columns: object, category, bool dtypes
-        fields = data.select_dtypes(include=['object', 'category', 'bool']).columns
+        fields = data.select_dtypes(include=['object', 'category', 'bool']).columns  # type: ignore
 
     result = []
     summary = []
@@ -660,7 +730,7 @@ def equal_var_test(data: DataFrame, columns: list | str | None = None, normal_di
         normality_result = normal_test(data[numeric_cols], method="n")
         # check whether every column follows a normal distribution
         all_normal = normality_result["is_normal"].all()
-        normal_dist = all_normal
+        normal_dist = all_normal  # type: ignore
 
     try:
         if normal_dist:
@@ -759,7 +829,7 @@ def ttest_1samp(data, mean_value: float = 0.0) -> DataFrame:
     else:
         for a in alternative:
             try:
-                s, p = ttest_1samp(col_data, mean_value, alternative=a)
+                s, p = ttest_1samp(col_data, mean_value, alternative=a)  # type: ignore
 
                 itp = None
 
@@ -869,26 +939,26 @@ def ttest_ind(x, y, equal_var: bool | None = None) -> DataFrame:
 
     for a in alternative:
         try:
-            s, p = scipy_ttest_ind(x_data, y_data, equal_var=equal_var, alternative=a)
+            s, p = scipy_ttest_ind(x_data, y_data, equal_var=equal_var, alternative=a)  # type: ignore
             n = "t-test_ind" if equal_var else "Welch's t-test"
 
             # interpret the test result
             itp = None
 
             if a == "two-sided":
-                itp = fmt.format("==" if p > 0.05 else "!=")
+                itp = fmt.format("==" if p > 0.05 else "!=")  # type: ignore
             elif a == "less":
-                itp = fmt.format(">=" if p > 0.05 else "<")
+                itp = fmt.format(">=" if p > 0.05 else "<")  # type: ignore
             else:
-                itp = fmt.format("<=" if p > 0.05 else ">")
+                itp = fmt.format("<=" if p > 0.05 else ">")  # type: ignore
 
             result.append({
                 "test": n,
                 "alternative": a,
-                "statistic": round(s, 3),
-                "p-value": round(p, 4),
-                "H0": p > 0.05,
-                "H1": p <= 0.05,
+                "statistic": round(s, 3),  # type: ignore
+                "p-value": round(p, 4),  # type: ignore
+                "H0": p > 0.05,  # type: ignore
+                "H1": p <= 0.05,  # type: ignore
                 "interpretation": itp,
                 "equal_var_checked": var_checked
             })
@@ -998,7 +1068,7 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
     for a in alternative:
         try:
             if parametric:
-                s, p = ttest_rel(x_data, y_data, alternative=a)
+                s, p = ttest_rel(x_data, y_data, alternative=a)  # type: ignore
                 n = "t-test_paired"
             else:
                 # Wilcoxon signed-rank test (non-parametric test for paired samples)
@@ -1008,19 +1078,19 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
             itp = None
 
             if a == "two-sided":
-                itp = fmt.format("==" if p > 0.05 else "!=")
+                itp = fmt.format("==" if p > 0.05 else "!=")  # type: ignore
             elif a == "less":
-                itp = fmt.format(">=" if p > 0.05 else "<")
+                itp = fmt.format(">=" if p > 0.05 else "<")  # type: ignore
             else:
-                itp = fmt.format("<=" if p > 0.05 else ">")
+                itp = fmt.format("<=" if p > 0.05 else ">")  # type: ignore
 
             result.append({
                 "test": n,
                 "alternative": a,
-                "statistic": round(s, 3) if not np.isnan(s) else s,
-                "p-value": round(p, 4) if not np.isnan(p) else p,
-                "H0": p > 0.05,
-                "H1": p <= 0.05,
+                "statistic": round(s, 3) if not np.isnan(s) else s,  # type: ignore
+                "p-value": round(p, 4) if not np.isnan(p) else p,  # type: ignore
+                "H0": p > 0.05,  # type: ignore
+                "H1": p <= 0.05,  # type: ignore
                 "interpretation": itp,
                 "normality_checked": var_checked
             })
@@ -1047,7 +1117,7 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
 # ===================================================================
 def vif_filter(
     data: DataFrame,
-    yname: str = None,
+    yname: str | None = None,
     ignore: list | None = None,
     threshold: float = 10.0,
     verbose: bool = False,
@@ -1112,7 +1182,7 @@ def vif_filter(
     for i, col in enumerate(X_clean.columns, start=0):
         # the first exog column is the constant, so the variable index is i + 1
         try:
-            vifs[col] = float(variance_inflation_factor(exog.values, i + 1))
+            vifs[col] = float(variance_inflation_factor(exog.values, i + 1))  # type: ignore
         except Exception:
             # if the computation fails, treat the VIF as infinite so the column is removed first
             vifs[col] = float("inf")
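The `i + 1` offset matters because `sm.add_constant` puts the intercept in exog column 0, so variable j of the design matrix sits at exog column j + 1. A minimal, self-contained check of the same convention (the variable names are illustrative only):

```python
# Minimal check of the exog-index convention used by vif_filter():
# column 0 of exog is the constant, so variable i lives at exog column i + 1.
import numpy as np
from pandas import DataFrame
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(0)
X = DataFrame({"a": rng.normal(size=100), "b": rng.normal(size=100)})
X["c"] = 0.9 * X["a"] + rng.normal(scale=0.1, size=100)  # nearly collinear with "a"

exog = sm.add_constant(X, has_constant="add")
for i, col in enumerate(X.columns, start=0):
    print(col, round(float(variance_inflation_factor(exog.values, i + 1)), 2))
# "a" and "c" should show large VIFs; with threshold=10.0, vif_filter would drop one.
```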
@@ -1150,7 +1220,7 @@ def vif_filter(
 # ===================================================================
 # Compute a trend line for x, y data.
 # ===================================================================
-def trend(x: any, y: any, degree: int = 1, value_count: int = 100) -> Tuple[np.ndarray, np.ndarray]:
+def trend(x: Any, y: Any, degree: int = 1, value_count: int = 100) -> Tuple[np.ndarray, np.ndarray]:
     """Compute a trend line for the given x, y data.
 
     Args:
@@ -1192,7 +1262,10 @@ def trend(x: any, y: any, degree: int = 1, value_count: int = 100) -> Tuple[np.ndarray, np.ndarray]:
 # ===================================================================
 # OLS regression summary report
 # ===================================================================
-def ols_report(fit, data, full=False, alpha=0.05):
+def ols_report(fit, data, full=False, alpha=0.05) -> Union[
+    Tuple[DataFrame, DataFrame],
+    Tuple[DataFrame, DataFrame, str, str, list[str], str]
+]:
     """Convert an OLS fit result into a summary report.
 
     Args:
@@ -1211,6 +1284,7 @@ def ols_report(fit, data, full=False, alpha=0.05):
     - regression equation string (`equation_text`, str): the equation including the intercept and coefficients.
 
     When full=False:
+    - performance metrics table (`pdf`, DataFrame): R, R², Adj. R², F, p-value, Durbin-Watson.
     - regression coefficients table (`rdf`, DataFrame)
 
     Examples:
@@ -1250,7 +1324,7 @@ def ols_report(fit, data, full=False, alpha=0.05):
     for i, col in enumerate(indi_df.columns, start=1):  # index 0 is the constant, so start at 1
         try:
             with np.errstate(divide='ignore', invalid='ignore'):
-                vif_value = variance_inflation_factor(indi_df_const.values, i)
+                vif_value = variance_inflation_factor(indi_df_const.values, i)  # type: ignore
             # handle inf or very large values
             if np.isinf(vif_value) or vif_value > 1e10:
                 vif_dict[col] = np.inf
@@ -1378,7 +1452,19 @@ def ols_report(fit, data, full=False, alpha=0.05):
 # ===================================================================
 # OLS linear regression
 # ===================================================================
-def ols(df: DataFrame, yname: str, report=False):
+def ols(df: DataFrame, yname: str, report: bool | str | int = False) -> Union[
+    RegressionResultsWrapper,
+    Tuple[RegressionResultsWrapper, DataFrame, DataFrame],
+    Tuple[
+        RegressionResultsWrapper,
+        DataFrame,
+        DataFrame,
+        str,
+        str,
+        list[str],
+        str
+    ]
+]:
     """Run a linear regression and return the fitted result.
 
     Performs OLS (Ordinary Least Squares) linear regression.
@@ -1387,7 +1473,7 @@ def ols(df: DataFrame, yname: str, report=False):
     Args:
         df (DataFrame): DataFrame containing both the dependent and independent variables.
         yname (str): name of the dependent-variable column.
-        report: report mode. One of the following values:
+        report (bool | str | int): report mode. One of the following values:
             - False (default): no report; return only the fit object.
             - 1 or 'summary': return the summary report (full=False).
             - 2 or 'full': return the full report (full=True).
@@ -1426,10 +1512,10 @@ def ols(df: DataFrame, yname: str, report=False):
     fit = hs_stats.ols(df, 'target')
 
     # return the summary report
-    fit, pdf, rdf = hs_stats.ols(df, 'target', report=
+    fit, pdf, rdf = hs_stats.ols(df, 'target', report='summary')
 
     # return the full report
-    fit, pdf, rdf, result_report, model_report, var_reports, eq = hs_stats.ols(df, 'target', report=
+    fit, pdf, rdf, result_report, model_report, var_reports, eq = hs_stats.ols(df, 'target', report='full')
     ```
     """
     x = df.drop(yname, axis=1)
@@ -1445,11 +1531,11 @@ def ols(df: DataFrame, yname: str, report=False):
         return linear_fit
     elif report == 1 or report == 'summary':
         # summary report (full=False)
-        pdf, rdf = ols_report(linear_fit, df, full=False, alpha=0.05)
+        pdf, rdf = ols_report(linear_fit, df, full=False, alpha=0.05)  # type: ignore
         return linear_fit, pdf, rdf
     elif report == 2 or report == 'full' or report is True:
         # full report (full=True)
-        pdf, rdf, result_report, model_report, variable_reports, equation_text = ols_report(linear_fit, df, full=True, alpha=0.05)
+        pdf, rdf, result_report, model_report, variable_reports, equation_text = ols_report(linear_fit, df, full=True, alpha=0.05)  # type: ignore
         return linear_fit, pdf, rdf, result_report, model_report, variable_reports, equation_text
     else:
         # default: no report
@@ -1459,15 +1545,31 @@ def ols(df: DataFrame, yname: str, report=False):
 # ===================================================================
 # Logistic regression summary report
 # ===================================================================
-def logit_report(fit, data, threshold=0.5, full=False, alpha=0.05):
+def logit_report(
+    fit: BinaryResultsWrapper,
+    data: DataFrame,
+    threshold: float = 0.5,
+    full: Union[bool, str, int] = False,
+    alpha: float = 0.05
+) -> Union[
+    Tuple[DataFrame, DataFrame],
+    Tuple[
+        DataFrame,
+        DataFrame,
+        str,
+        str,
+        list[str],
+        np.ndarray
+    ]
+]:
     """Convert a logistic regression fit result into a detailed report.
 
     Args:
         fit: statsmodels Logit result object (must support `fit.summary()` and predicted probabilities).
-        data: DataFrame containing both the dependent and independent variables.
-        threshold: cutoff for turning predicted probabilities into binary classes. Default 0.5.
-        full: if True return six values, otherwise return only the two main ones (cdf, rdf). Default False.
-        alpha: significance level. Default 0.05.
+        data (DataFrame): DataFrame containing both the dependent and independent variables.
+        threshold (float): cutoff for turning predicted probabilities into binary classes. Default 0.5.
+        full (bool | str | int): if True return six values, otherwise return only the two main ones (cdf, rdf). Default False.
+        alpha (float): significance level. Default 0.05.
 
     Returns:
         tuple: when full=True, contains the following elements.
@@ -1555,7 +1657,7 @@ def logit_report(fit, data, threshold=0.5, full=False, alpha=0.05):
     vif_dict = {}
     x_const = sm.add_constant(x, has_constant="add")
     for i, col in enumerate(x.columns, start=1):  # index 0 is the constant, so start at 1
-        vif_dict[col] = variance_inflation_factor(x_const.values, i)
+        vif_dict[col] = variance_inflation_factor(x_const.values, i)  # type: ignore
 
     for idx, row in tbl.iterrows():
         name = idx
@@ -1652,7 +1754,25 @@ def logit_report(fit, data, threshold=0.5, full=False, alpha=0.05):
 # ===================================================================
 # Logistic regression
 # ===================================================================
-def logit(df: DataFrame, yname: str, report=False):
+def logit(
+    df: DataFrame,
+    yname: str,
+    report: Union[bool, str, int] = False
+) -> Union[
+    BinaryResultsWrapper,
+    Tuple[
+        BinaryResultsWrapper,
+        DataFrame
+    ],
+    Tuple[
+        BinaryResultsWrapper,
+        DataFrame,
+        DataFrame,
+        str,
+        str,
+        list[str]
+    ]
+]:
     """Run a logistic regression and return the fitted result.
 
     Performs logistic regression when the dependent variable is binary.
@@ -1718,13 +1838,13 @@ def logit(df: DataFrame, yname: str, report=False):
         return logit_fit
     elif report == 1 or report == 'summary':
         # summary report (full=False)
-        cdf, rdf = logit_report(logit_fit, df, threshold=0.5, full=False, alpha=0.05)
+        cdf, rdf = logit_report(logit_fit, df, threshold=0.5, full=False, alpha=0.05)  # type: ignore
         # the summary includes only result_report and variable_reports
        # a simplified version returning just the result and variable_reports
         return logit_fit, rdf
     elif report == 2 or report == 'full' or report is True:
         # full report (full=True)
-        cdf, rdf, result_report, model_report, variable_reports, cm = logit_report(logit_fit, df, threshold=0.5, full=True, alpha=0.05)
+        cdf, rdf, result_report, model_report, variable_reports, cm = logit_report(logit_fit, df, threshold=0.5, full=True, alpha=0.05)  # type: ignore
         return logit_fit, cdf, rdf, result_report, model_report, variable_reports
     else:
         # default: no report
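Read against the dispatch above: `report='summary'` returns two values and `report='full'` returns six. A hedged usage sketch on synthetic binary data (the import path is assumed, as in the ols() docstring examples):

```python
# Usage sketch for the logit() report modes; return shapes mirror the dispatch above.
import numpy as np
from pandas import DataFrame
from hossam import hs_stats  # assumed import path

rng = np.random.default_rng(42)
df = DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
df["target"] = (df["x1"] + 0.5 * df["x2"] + rng.normal(size=200) > 0).astype(int)

fit = hs_stats.logit(df, "target")                         # fit object only
fit, rdf = hs_stats.logit(df, "target", report="summary")  # fit + coefficient table
fit, cdf, rdf, result_report, model_report, variable_reports = hs_stats.logit(
    df, "target", report="full"
)
```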
@@ -1734,7 +1854,7 @@ def logit(df: DataFrame, yname: str, report=False):
 # ===================================================================
 # Linearity Test
 # ===================================================================
-def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05) -> DataFrame:
+def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05, plot: bool = False, title: str | None = None, save_path: str | None = None) -> DataFrame:
     """Evaluate the linearity of a regression model with the Ramsey RESET test.
 
     Runs the Ramsey RESET (Regression Specification Error Test) on the fitted model to
@@ -1747,6 +1867,9 @@ def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05) -> DataFrame:
             With power=2, the square of the fitted values is added.
             Higher powers detect higher-order nonlinearity.
         alpha (float, optional): significance level. Default 0.05.
+        plot (bool, optional): if True, draw the residual plot. Default False.
+        title (str, optional): plot title. Default None.
+        save_path (str, optional): path to save the plot. Default None.
 
     Returns:
         DataFrame: DataFrame containing the linearity test result.
@@ -1829,13 +1952,16 @@ def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05) -> DataFrame:
         "interpretation": [interpretation]
     })
 
+    if plot:
+        ols_residplot(fit, lowess=True, mse=True, title=title, save_path=save_path)
+
     return result_df
 
 
 # ===================================================================
 # Normality Test
 # ===================================================================
-def ols_normality_test(fit, alpha: float = 0.05) -> DataFrame:
+def ols_normality_test(fit, alpha: float = 0.05, plot: bool = False, title: str | None = None, save_path: str | None = None) -> DataFrame:
     """Test the normality of the regression residuals.
 
     Evaluates whether the residuals follow a normal distribution using the Shapiro-Wilk and Jarque-Bera tests.
@@ -1844,6 +1970,9 @@ def ols_normality_test(fit, alpha: float = 0.05) -> DataFrame:
     Args:
         fit: regression model object (statsmodels RegressionResultsWrapper).
         alpha (float, optional): significance level. Default 0.05.
+        plot (bool, optional): if True, draw a Q-Q plot. Default False.
+        title (str, optional): plot title. Default None.
+        save_path (str, optional): path to save the plot. Default None.
 
     Returns:
         DataFrame: DataFrame containing the normality test result.
@@ -1900,7 +2029,7 @@ def ols_normality_test(fit, alpha: float = 0.05) -> DataFrame:
     # 2. Jarque-Bera test (always performed)
     try:
         stat_jb, p_jb = jarque_bera(residuals)
-        significant_jb = p_jb <= alpha
+        significant_jb = p_jb <= alpha  # type: ignore
 
         if significant_jb:
             interpretation_jb = f"normality violated (p={p_jb:.4f} <= {alpha})"
@@ -1922,6 +2051,10 @@ def ols_normality_test(fit, alpha: float = 0.05) -> DataFrame:
     if not results:
         raise ValueError("The normality test could not be performed.")
 
+
+    if plot:
+        ols_qqplot(fit, title=title, save_path=save_path)
+
     result_df = DataFrame(results)
     return result_df
 
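Both diagnostics now accept the same plotting flags and delegate to `ols_residplot` and `ols_qqplot` from `hs_plot`. A sketch of calling them together on a toy OLS fit (import path assumed as before):

```python
# Sketch: running both diagnostics with the new plot flags on a toy OLS fit.
import numpy as np
from pandas import DataFrame
from hossam import hs_stats  # assumed import path

rng = np.random.default_rng(7)
df = DataFrame({"x": rng.normal(size=150)})
df["target"] = 2.0 * df["x"] + rng.normal(scale=0.5, size=150)

fit = hs_stats.ols(df, "target")  # fit object only (report=False)
hs_stats.ols_linearity_test(fit, power=2, plot=True, title="RESET residual plot")
hs_stats.ols_normality_test(fit, plot=True, save_path="qqplot.png")
```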
@@ -2229,8 +2362,8 @@ def corr_pairwise(
         corr_val, pval = np.nan, np.nan
 
     # 4) significance and strength
-    significant = False if np.isnan(pval) else pval <= alpha
-    abs_r = abs(corr_val) if not np.isnan(corr_val) else 0
+    significant = False if np.isnan(pval) else pval <= alpha  # type: ignore
+    abs_r = abs(corr_val) if not np.isnan(corr_val) else 0  # type: ignore
     if abs_r > 0.7:
         strength = "strong"
     elif abs_r > 0.3:
@@ -2397,13 +2530,13 @@ def oneway_anova(data: DataFrame, dv: str, between: str, alpha: float = 0.05) ->
     anova_df['significant'] = anova_df['p-unc'] <= alpha
 
     # check whether the ANOVA result is significant
-    p_unc = float(anova_df.loc[0, 'p-unc'])
+    p_unc = float(anova_df.loc[0, 'p-unc'])  # type: ignore
     anova_significant = p_unc <= alpha
 
     # build the ANOVA report sentence
     def _safe_get(col: str, default: float = np.nan) -> float:
         try:
-            return float(anova_df.loc[0, col]) if col in anova_df.columns else default
+            return float(anova_df.loc[0, col]) if col in anova_df.columns else default  # type: ignore
         except Exception:
             return default
 
@@ -2718,7 +2851,7 @@ def predict(fit, data: DataFrame | Series) -> DataFrame | Series | float:
 
     # for a Series input, return a single value
     if is_series:
-        return float(predictions.iloc[0])
+        return float(predictions.iloc[0])  # type: ignore
 
     # for a DataFrame input
     if isinstance(data, DataFrame):
@@ -2791,8 +2924,7 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05
 
     # if fields is not given, use every numeric column except dv
     if not fields:
-        fields = [col for col in data.columns
-                  if is_numeric_dtype(data[col]) and col != dv]
+        fields = [col for col in data.columns if is_numeric_dtype(data[col]) and col != dv]  # type: ignore
 
     # check that dv is numeric
     if not is_numeric_dtype(data[dv]):
@@ -2820,8 +2952,8 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05
     normal_y_result = normal_test(data[[dv]], columns=[dv], method=method_y)
 
     # normality decision (assume a normal distribution if p > alpha)
-    normal_x = normal_x_result.loc[var, 'p-val'] > alpha if var in normal_x_result.index else False
-    normal_y = normal_y_result.loc[dv, 'p-val'] > alpha if dv in normal_y_result.index else False
+    normal_x = normal_x_result.loc[var, 'p-val'] > alpha if var in normal_x_result.index else False  # type: ignore
+    normal_y = normal_y_result.loc[dv, 'p-val'] > alpha if dv in normal_y_result.index else False  # type: ignore
 
     # Pearson (both normal) vs Spearman (either non-normal)
     if normal_x and normal_y:
@@ -2833,8 +2965,8 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05
 
     # compute Cohen's d (convert the correlation coefficient to an effect size)
     # d = 2*r / sqrt(1-r^2)
-    if r**2 < 1:
-        d = (2 * r) / np.sqrt(1 - r**2)
+    if r ** 2 < 1:  # type: ignore
+        d = (2 * r) / np.sqrt(1 - r ** 2)  # type: ignore
     else:
         d = 0
 
hossam/hs_util.py
CHANGED
@@ -122,11 +122,9 @@ def pretty_table(data: DataFrame, tablefmt="simple", headers: str = "keys") -> N
     ```
     """
 
-    tabulate.WIDE_CHARS_MODE = False
+    tabulate.WIDE_CHARS_MODE = False  # type: ignore
     print(
-        tabulate(
-            data, headers=headers, tablefmt=tablefmt, showindex=True, numalign="right"
-        )
+        tabulate(data, headers=headers, tablefmt=tablefmt, showindex=True, numalign="right")  # type: ignore
     )
 
 
@@ -167,7 +165,7 @@ def __data_info(
 
     if info:
         print("\n✅ Table info")
-        pretty_table(data.info(), tablefmt="pretty")
+        pretty_table(data.info(), tablefmt="pretty")  # type: ignore
 
     print("\n✅ Top 5 rows")
     pretty_table(data.head(), tablefmt="pretty")
@@ -229,7 +227,7 @@ def load_data(key: str,
     elif k.endswith(".csv"):
         origin = read_csv(key)
     else:
-        origin = _load_data_remote(key, local)
+        origin = _load_data_remote(key, local)  # type: ignore
 
     if origin is None:
         raise RuntimeError("Data loading failed: origin is None")
hossam-0.4.5.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
+hossam/__init__.py,sha256=OkMeP15jt6aCy7QNXMtkO0YRVvgOQYumkb7GuVKrbcs,2712
+hossam/data_loader.py,sha256=K0-MJaVeedF5x8mSp22X2rD_CZ-T185EhoUFEqzP8Ss,6352
+hossam/hs_classroom.py,sha256=rgayol3U5PSo4rLfdbClfiAtG21bFrASaSW56PUsjus,27144
+hossam/hs_gis.py,sha256=DVmndBK-_7GMK3J1_on3ieEQk1S0MfUZ8_wlX-cDdZQ,11581
+hossam/hs_plot.py,sha256=3j9B69pl-zQM_09lTXxLKAMaDM0vwOTsUWbzcU8hCK8,86228
+hossam/hs_prep.py,sha256=kCmFxnMyFZ5tLUfoE8msbwTracajHAmruJbFj6A6eIU,38020
+hossam/hs_stats.py,sha256=uGYkEk8Rb8qMoZ5FiZ7Yg6jssLIGl_EBbmwvvSYljhQ,115780
+hossam/hs_timeserise.py,sha256=gSj3cPgOGLOZEXhfW1anXbwpoJja847ZY9F8l9piJPE,42601
+hossam/hs_util.py,sha256=xuNXC6FJSAmyAbcRAUMsigCKHXM25t3H90nFMgq7IBs,8482
+hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
+hossam-0.4.5.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
+hossam-0.4.5.dist-info/METADATA,sha256=HM5qrrvaFZWAyUlhgV_BLPHAcxEZdZ4gp2p3V4X4pzo,3676
+hossam-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+hossam-0.4.5.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
+hossam-0.4.5.dist-info/RECORD,,
hossam-0.4.3.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
-hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
-hossam/__init__.py,sha256=OkMeP15jt6aCy7QNXMtkO0YRVvgOQYumkb7GuVKrbcs,2712
-hossam/data_loader.py,sha256=oUIsqbHQoRiHA_1tdElDaYo1ipmUB5fYSXYMB5gLOl0,6395
-hossam/hs_classroom.py,sha256=rgayol3U5PSo4rLfdbClfiAtG21bFrASaSW56PUsjus,27144
-hossam/hs_gis.py,sha256=DLogaf5nxJBbG-d8QoH2g8UfZ1omMtmEXDYgNg8jtT0,11410
-hossam/hs_plot.py,sha256=A_nS8dP4cijp7LZs253SWxfBUp5qvvTlSPGKjDj0BIA,83712
-hossam/hs_prep.py,sha256=2ptFFxV4G1IFmy-B89TqXaPkA8jROZutr2XIkaXNHW4,36006
-hossam/hs_stats.py,sha256=qAor-RE5qNsytoZW1mriK3yql9PVif5bBGyG64YC2PM,110780
-hossam/hs_timeserise.py,sha256=gSj3cPgOGLOZEXhfW1anXbwpoJja847ZY9F8l9piJPE,42601
-hossam/hs_util.py,sha256=8byLj_VR93vS__lyf0xgQKArgMy9qFm2VvZVSCxfQX0,8444
-hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
-hossam-0.4.3.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
-hossam-0.4.3.dist-info/METADATA,sha256=0VAI5TJKWSFwZriKBYnf5a4MSB5cdOLUh9lV_vYDPJY,3676
-hossam-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-hossam-0.4.3.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
-hossam-0.4.3.dist-info/RECORD,,
{hossam-0.4.3.dist-info → hossam-0.4.5.dist-info}/WHEEL
File without changes

{hossam-0.4.3.dist-info → hossam-0.4.5.dist-info}/licenses/LICENSE
File without changes

{hossam-0.4.3.dist-info → hossam-0.4.5.dist-info}/top_level.txt
File without changes