hossam 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hossam/hs_stats.py CHANGED
@@ -1,5 +1,7 @@
  # -*- coding: utf-8 -*-
+
  from __future__ import annotations
+ from typing import overload, Tuple, Literal, Union, Any
 
  # -------------------------------------------------------------
  import numpy as np
@@ -22,12 +24,13 @@ from scipy.stats import (
      normaltest,
      bartlett,
      levene,
-     ttest_1samp,
+     ttest_1samp,  # type: ignore
      ttest_ind as scipy_ttest_ind,
-     ttest_rel,
+     ttest_rel,  # type: ignore
      wilcoxon,
      pearsonr,
      spearmanr,
+     chi2
  )
 
  import statsmodels.api as sm
@@ -36,10 +39,71 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
  from statsmodels.stats.multitest import multipletests
  from statsmodels.stats.stattools import durbin_watson
  from statsmodels.regression.linear_model import RegressionResultsWrapper
+ from statsmodels.discrete.discrete_model import BinaryResultsWrapper
  from statsmodels.discrete.discrete_model import BinaryResults
 
  from pingouin import anova, pairwise_tukey, welch_anova, pairwise_gameshowell
 
+ from .hs_plot import ols_residplot, ols_qqplot
+
+ # ===================================================================
+ # MCAR (missing completely at random) test
+ # ===================================================================
+ def mcar_test(data: DataFrame, columns: list | str | None = None) -> DataFrame:
+     if isinstance(columns, str):
+         columns = [c.strip() for c in columns.split(",")]
+
+     cols = data.columns if columns is None else columns
+     df = data[cols]
+
+     # use only the columns that contain missing values
+     cols_with_na = [c for c in df.columns if df[c].isna().any()]
+     if len(cols_with_na) < 2:
+         raise ValueError("The MCAR test requires at least two variables with missing values.")
+
+     X = df[cols_with_na].to_numpy()
+     n, p = X.shape
+
+     # estimate the mean and covariance from the complete cases
+     complete = ~np.isnan(X).any(axis=1)
+     if complete.sum() < p + 1:
+         raise ValueError("Not enough complete cases to perform the MCAR test.")
+
+     mu = X[complete].mean(axis=0)
+     S = np.cov(X[complete], rowvar=False)
+     S_inv = np.linalg.pinv(S)
+
+     chi_sq = 0.0
+     dfree = 0
+
+     for i in range(n):
+         obs = ~np.isnan(X[i])
+         if obs.sum() == p:
+             continue  # skip complete cases
+         diff = X[i, obs] - mu[obs]
+         S_obs = S[np.ix_(obs, obs)]
+         S_obs_inv = np.linalg.pinv(S_obs)
+
+         chi_sq += diff @ S_obs_inv @ diff
+         dfree += obs.sum()
+
+     dfree -= p  # Little's adjustment
+
+     p_value = 1 - chi2.cdf(chi_sq, dfree)
+     is_mcar = p_value > 0.05
+
+     return DataFrame([{
+         "statistic": chi_sq,
+         "dof": dfree,
+         "p-value": p_value,
+         "is_mcar": is_mcar,
+         "interpretation": (
+             "Missing values appear completely at random (MCAR) → rows may be dropped"
+             if is_mcar else
+             "Missing values are not MCAR → dropping rows may introduce bias"
+         )
+     }])
+
  # ===================================================================
  # Missing Values Analysis
  # ===================================================================
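A quick way to sanity-check the new test is to inject missingness completely at random and confirm a large p-value. A minimal, hypothetical usage sketch (the `hossam.hs_stats` import path is assumed from this diff; the data is synthetic):

```python
import numpy as np
from pandas import DataFrame
from hossam.hs_stats import mcar_test  # import path assumed from this diff

rng = np.random.default_rng(0)
df = DataFrame(rng.normal(size=(200, 3)), columns=["a", "b", "c"])

# knock out ~10% of two columns independently of the values -> MCAR by construction
df.loc[df.sample(frac=0.1, random_state=1).index, "a"] = np.nan
df.loc[df.sample(frac=0.1, random_state=2).index, "b"] = np.nan

print(mcar_test(df))  # expect a large p-value and is_mcar == True
```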
@@ -219,6 +283,8 @@ def describe(data: DataFrame, *fields: str, columns: list | None = None):
      The rows include the following statistics:
 
      - count (float): number of non-missing values
+     - na_count (int): number of missing values
+     - na_rate (float): percentage of missing values (%)
      - mean (float): mean
      - std (float): standard deviation
      - min (float): minimum
@@ -267,9 +333,13 @@ def describe(data: DataFrame, *fields: str, columns: list | None = None):
 
      # compute the descriptive statistics
      desc = data[list(fields)].describe().T
-     # add the per-column missing-value count (null_count)
-     null_counts = data[list(fields)].isnull().sum()
-     desc.insert(1, 'null_count', null_counts)
+
+     # add the per-column missing-value count (na_count)
+     na_counts = data[list(fields)].isnull().sum()
+     desc.insert(1, 'na_count', na_counts)
+
+     # add the missing-value rate (na_rate)
+     desc.insert(2, 'na_rate', (na_counts / len(data)) * 100)
 
      # compute the additional statistics
      additional_stats = []
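For readers who want the same columns outside `describe`, the insertion logic above reduces to three lines of plain pandas (toy frame, hypothetical column names):

```python
import numpy as np
import pandas as pd

data = pd.DataFrame({"x": [1.0, np.nan, 3.0, 4.0], "y": [np.nan, np.nan, 1.0, 2.0]})

desc = data.describe().T
na_counts = data.isnull().sum()
desc.insert(1, "na_count", na_counts)                     # right after 'count'
desc.insert(2, "na_rate", (na_counts / len(data)) * 100)  # percent of all rows
print(desc[["count", "na_count", "na_rate"]])
```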
@@ -305,29 +375,29 @@ def describe(data: DataFrame, *fields: str, columns: list | None = None):
          outlier_rate = (outlier_count / len(data)) * 100
 
          # classify the distribution shape (based on skewness)
-         abs_skew = abs(skew)
-         if abs_skew < 0.5:
+         abs_skew = abs(skew)  # type: ignore
+         if abs_skew < 0.5:  # type: ignore
              dist = "nearly symmetric"
-         elif abs_skew < 1.0:
-             if skew > 0:
+         elif abs_skew < 1.0:  # type: ignore
+             if skew > 0:  # type: ignore
                  dist = "weak right tail"
              else:
                  dist = "weak left tail"
-         elif abs_skew < 2.0:
-             if skew > 0:
+         elif abs_skew < 2.0:  # type: ignore
+             if skew > 0:  # type: ignore
                  dist = "moderate right tail"
              else:
                  dist = "moderate left tail"
          else:
-             if skew > 0:
+             if skew > 0:  # type: ignore
                  dist = "extreme right tail"
              else:
                  dist = "extreme left tail"
 
          # assess the need for a log transform
-         if abs_skew < 0.5:
+         if abs_skew < 0.5:  # type: ignore
              log_need = "low"
-         elif abs_skew < 1.0:
+         elif abs_skew < 1.0:  # type: ignore
              log_need = "medium"
          else:
              log_need = "high"
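The branching above boils down to two threshold ladders on |skew|. A distilled sketch (labels rendered in English; this helper is illustrative, not part of the package):

```python
# Distilled version of the skewness thresholds used in describe() above.
def classify_skew(skew: float) -> tuple[str, str]:
    """Return (distribution label, log-transform need) for a skewness value."""
    abs_skew = abs(skew)
    side = "right" if skew > 0 else "left"
    if abs_skew < 0.5:
        dist = "nearly symmetric"
    elif abs_skew < 1.0:
        dist = f"weak {side} tail"
    elif abs_skew < 2.0:
        dist = f"moderate {side} tail"
    else:
        dist = f"extreme {side} tail"
    log_need = "low" if abs_skew < 0.5 else ("medium" if abs_skew < 1.0 else "high")
    return dist, log_need

print(classify_skew(1.7))  # ('moderate right tail', 'high')
```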
@@ -403,7 +473,7 @@ def category_describe(data: DataFrame, *fields: str):
      """
      if not fields:
          # select nominal (categorical) columns: object, category, and bool dtypes
-         fields = data.select_dtypes(include=['object', 'category', 'bool']).columns
+         fields = data.select_dtypes(include=['object', 'category', 'bool']).columns  # type: ignore
 
      result = []
      summary = []
@@ -660,7 +730,7 @@ def equal_var_test(data: DataFrame, columns: list | str | None = None, normal_di
          normality_result = normal_test(data[numeric_cols], method="n")
          # check whether every column follows a normal distribution
          all_normal = normality_result["is_normal"].all()
-         normal_dist = all_normal
+         normal_dist = all_normal  # type: ignore
 
      try:
          if normal_dist:
@@ -759,7 +829,7 @@ def ttest_1samp(data, mean_value: float = 0.0) -> DataFrame:
      else:
          for a in alternative:
              try:
-                 s, p = ttest_1samp(col_data, mean_value, alternative=a)
+                 s, p = ttest_1samp(col_data, mean_value, alternative=a)  # type: ignore
 
                  itp = None
 
@@ -869,26 +939,26 @@ def ttest_ind(x, y, equal_var: bool | None = None) -> DataFrame:
 
      for a in alternative:
          try:
-             s, p = scipy_ttest_ind(x_data, y_data, equal_var=equal_var, alternative=a)
+             s, p = scipy_ttest_ind(x_data, y_data, equal_var=equal_var, alternative=a)  # type: ignore
              n = "t-test_ind" if equal_var else "Welch's t-test"
 
              # interpret the test result
              itp = None
 
              if a == "two-sided":
-                 itp = fmt.format("==" if p > 0.05 else "!=")
+                 itp = fmt.format("==" if p > 0.05 else "!=")  # type: ignore
              elif a == "less":
-                 itp = fmt.format(">=" if p > 0.05 else "<")
+                 itp = fmt.format(">=" if p > 0.05 else "<")  # type: ignore
              else:
-                 itp = fmt.format("<=" if p > 0.05 else ">")
+                 itp = fmt.format("<=" if p > 0.05 else ">")  # type: ignore
 
              result.append({
                  "test": n,
                  "alternative": a,
-                 "statistic": round(s, 3),
-                 "p-value": round(p, 4),
-                 "H0": p > 0.05,
-                 "H1": p <= 0.05,
+                 "statistic": round(s, 3),  # type: ignore
+                 "p-value": round(p, 4),  # type: ignore
+                 "H0": p > 0.05,  # type: ignore
+                 "H1": p <= 0.05,  # type: ignore
                  "interpretation": itp,
                  "equal_var_checked": var_checked
              })
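A hedged call sketch for the wrapper (import path and result columns assumed from this diff; `equal_var=None` lets the function decide between Student's and Welch's variants):

```python
import numpy as np
from pandas import Series
from hossam.hs_stats import ttest_ind  # import path assumed from this diff

rng = np.random.default_rng(42)
x = Series(rng.normal(10.0, 2.0, size=50))
y = Series(rng.normal(10.8, 2.0, size=50))

result = ttest_ind(x, y, equal_var=None)  # one row per alternative hypothesis
print(result[["test", "alternative", "p-value", "interpretation"]])
```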
@@ -998,7 +1068,7 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
      for a in alternative:
          try:
              if parametric:
-                 s, p = ttest_rel(x_data, y_data, alternative=a)
+                 s, p = ttest_rel(x_data, y_data, alternative=a)  # type: ignore
                  n = "t-test_paired"
              else:
                  # Wilcoxon signed-rank test (non-parametric test for paired samples)
@@ -1008,19 +1078,19 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
              itp = None
 
              if a == "two-sided":
-                 itp = fmt.format("==" if p > 0.05 else "!=")
+                 itp = fmt.format("==" if p > 0.05 else "!=")  # type: ignore
              elif a == "less":
-                 itp = fmt.format(">=" if p > 0.05 else "<")
+                 itp = fmt.format(">=" if p > 0.05 else "<")  # type: ignore
              else:
-                 itp = fmt.format("<=" if p > 0.05 else ">")
+                 itp = fmt.format("<=" if p > 0.05 else ">")  # type: ignore
 
              result.append({
                  "test": n,
                  "alternative": a,
-                 "statistic": round(s, 3) if not np.isnan(s) else s,
-                 "p-value": round(p, 4) if not np.isnan(p) else p,
-                 "H0": p > 0.05,
-                 "H1": p <= 0.05,
+                 "statistic": round(s, 3) if not np.isnan(s) else s,  # type: ignore
+                 "p-value": round(p, 4) if not np.isnan(p) else p,  # type: ignore
+                 "H0": p > 0.05,  # type: ignore
+                 "H1": p <= 0.05,  # type: ignore
                  "interpretation": itp,
                  "normality_checked": var_checked
              })
@@ -1047,7 +1117,7 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
  # ===================================================================
  def vif_filter(
      data: DataFrame,
-     yname: str = None,
+     yname: str | None = None,
      ignore: list | None = None,
      threshold: float = 10.0,
      verbose: bool = False,
@@ -1112,7 +1182,7 @@ def vif_filter(
      for i, col in enumerate(X_clean.columns, start=0):
          # the first column of exog is the constant, so the variable index is i + 1
          try:
-             vifs[col] = float(variance_inflation_factor(exog.values, i + 1))
+             vifs[col] = float(variance_inflation_factor(exog.values, i + 1))  # type: ignore
          except Exception:
              # if the computation fails, treat the VIF as infinite so the column is removed first
              vifs[col] = float("inf")
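`variance_inflation_factor` is the public statsmodels helper used here; it regresses column `i` of `exog` on the remaining columns and reports 1/(1-R²). A self-contained sketch of the same constant-offset pattern:

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(0)
X = pd.DataFrame({"a": rng.normal(size=100), "b": rng.normal(size=100)})
X["c"] = X["a"] * 0.9 + rng.normal(scale=0.1, size=100)  # nearly collinear with "a"

exog = sm.add_constant(X)
# column 0 is the constant, so variable i lives at index i + 1
vifs = {col: float(variance_inflation_factor(exog.values, i + 1))
        for i, col in enumerate(X.columns)}
print(vifs)  # "a" and "c" should show inflated values
```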
@@ -1150,7 +1220,7 @@ def vif_filter(
  # ===================================================================
  # Compute a trend line for x, y data.
  # ===================================================================
- def trend(x: any, y: any, degree: int = 1, value_count: int = 100) -> Tuple[np.ndarray, np.ndarray]:
+ def trend(x: Any, y: Any, degree: int = 1, value_count: int = 100) -> Tuple[np.ndarray, np.ndarray]:
      """Compute a trend line for x, y data.
 
      Args:
@@ -1192,7 +1262,10 @@ def trend(x: any, y: any, degree: int = 1, value_count: int = 100) -> Tuple[np.n
  # ===================================================================
  # Linear regression summary report
  # ===================================================================
- def ols_report(fit, data, full=False, alpha=0.05):
+ def ols_report(fit, data, full=False, alpha=0.05) -> Union[
+     Tuple[DataFrame, DataFrame],
+     Tuple[DataFrame, DataFrame, str, str, list[str], str]
+ ]:
      """Convert a fitted linear regression result into a summary report.
 
      Args:
@@ -1211,6 +1284,7 @@ def ols_report(fit, data, full=False, alpha=0.05):
      - Regression equation string (`equation_text`, str): the equation including the intercept and coefficients.
 
      When full=False:
+     - Performance metrics table (`pdf`, DataFrame): R, R², Adj. R², F, p-value, Durbin-Watson.
      - Coefficient table (`rdf`, DataFrame)
 
      Examples:
@@ -1250,7 +1324,7 @@ def ols_report(fit, data, full=False, alpha=0.05):
      for i, col in enumerate(indi_df.columns, start=1):  # the constant is index 0, so start at 1
          try:
              with np.errstate(divide='ignore', invalid='ignore'):
-                 vif_value = variance_inflation_factor(indi_df_const.values, i)
+                 vif_value = variance_inflation_factor(indi_df_const.values, i)  # type: ignore
              # handle inf or very large values
              if np.isinf(vif_value) or vif_value > 1e10:
                  vif_dict[col] = np.inf
@@ -1378,7 +1452,19 @@ def ols_report(fit, data, full=False, alpha=0.05):
  # ===================================================================
  # Linear regression
  # ===================================================================
- def ols(df: DataFrame, yname: str, report=False):
+ def ols(df: DataFrame, yname: str, report: bool | str | int = False) -> Union[
+     RegressionResultsWrapper,
+     Tuple[RegressionResultsWrapper, DataFrame, DataFrame],
+     Tuple[
+         RegressionResultsWrapper,
+         DataFrame,
+         DataFrame,
+         str,
+         str,
+         list[str],
+         str
+     ]
+ ]:
      """Run a linear regression analysis and return the fitted result.
 
      Performs an OLS (Ordinary Least Squares) linear regression.
@@ -1387,7 +1473,7 @@ def ols(df: DataFrame, yname: str, report=False):
      Args:
          df (DataFrame): DataFrame containing both the dependent and independent variables.
          yname (str): Name of the dependent-variable column.
-         report: Report mode. One of the following:
+         report (bool | str | int): Report mode. One of the following:
              - False (default): no report; only the fit object is returned.
              - 1 or 'summary': returns the summary report (full=False).
              - 2 or 'full': returns the full report (full=True).
@@ -1426,10 +1512,10 @@ def ols(df: DataFrame, yname: str, report=False):
      fit = hs_stats.ols(df, 'target')
 
      # return the summary report
-     fit, pdf, rdf = hs_stats.ols(df, 'target', report=1)
+     fit, pdf, rdf = hs_stats.ols(df, 'target', report='summary')
 
      # return the full report
-     fit, pdf, rdf, result_report, model_report, var_reports, eq = hs_stats.ols(df, 'target', report=2)
+     fit, pdf, rdf, result_report, model_report, var_reports, eq = hs_stats.ols(df, 'target', report='full')
      ```
      """
      x = df.drop(yname, axis=1)
@@ -1445,11 +1531,11 @@ def ols(df: DataFrame, yname: str, report=False):
          return linear_fit
      elif report == 1 or report == 'summary':
          # summary report (full=False)
-         pdf, rdf = ols_report(linear_fit, df, full=False, alpha=0.05)
+         pdf, rdf = ols_report(linear_fit, df, full=False, alpha=0.05)  # type: ignore
          return linear_fit, pdf, rdf
      elif report == 2 or report == 'full' or report is True:
          # full report (full=True)
-         pdf, rdf, result_report, model_report, variable_reports, equation_text = ols_report(linear_fit, df, full=True, alpha=0.05)
+         pdf, rdf, result_report, model_report, variable_reports, equation_text = ols_report(linear_fit, df, full=True, alpha=0.05)  # type: ignore
          return linear_fit, pdf, rdf, result_report, model_report, variable_reports, equation_text
      else:
          # default: no report
@@ -1459,15 +1545,31 @@ def ols(df: DataFrame, yname: str, report=False):
  # ===================================================================
  # Logistic regression summary report
  # ===================================================================
- def logit_report(fit, data, threshold=0.5, full=False, alpha=0.05):
+ def logit_report(
+     fit: BinaryResultsWrapper,
+     data: DataFrame,
+     threshold: float = 0.5,
+     full: Union[bool, str, int] = False,
+     alpha: float = 0.05
+ ) -> Union[
+     Tuple[DataFrame, DataFrame],
+     Tuple[
+         DataFrame,
+         DataFrame,
+         str,
+         str,
+         list[str],
+         np.ndarray
+     ]
+ ]:
      """Convert a fitted logistic regression result into a detailed report.
 
      Args:
          fit: statsmodels Logit result object (must support `fit.summary()` and predicted probabilities).
-         data: DataFrame containing both the dependent and independent variables.
-         threshold: Threshold for converting predicted probabilities into binary classes. Default 0.5.
-         full: If True, returns six values; if False, returns only the two main ones (cdf, rdf). Default False.
-         alpha: Significance level. Default 0.05.
+         data (DataFrame): DataFrame containing both the dependent and independent variables.
+         threshold (float): Threshold for converting predicted probabilities into binary classes. Default 0.5.
+         full (bool | str | int): If True, returns six values; if False, returns only the two main ones (cdf, rdf). Default False.
+         alpha (float): Significance level. Default 0.05.
 
      Returns:
          tuple: When full=True, contains the following elements.
@@ -1555,7 +1657,7 @@ def logit_report(fit, data, threshold=0.5, full=False, alpha=0.05):
      vif_dict = {}
      x_const = sm.add_constant(x, has_constant="add")
      for i, col in enumerate(x.columns, start=1):  # the constant is index 0, so start at 1
-         vif_dict[col] = variance_inflation_factor(x_const.values, i)
+         vif_dict[col] = variance_inflation_factor(x_const.values, i)  # type: ignore
 
      for idx, row in tbl.iterrows():
          name = idx
@@ -1652,7 +1754,25 @@ def logit_report(fit, data, threshold=0.5, full=False, alpha=0.05):
  # ===================================================================
  # Logistic regression
  # ===================================================================
- def logit(df: DataFrame, yname: str, report=False):
+ def logit(
+     df: DataFrame,
+     yname: str,
+     report: Union[bool, str, int] = False
+ ) -> Union[
+     BinaryResultsWrapper,
+     Tuple[
+         BinaryResultsWrapper,
+         DataFrame
+     ],
+     Tuple[
+         BinaryResultsWrapper,
+         DataFrame,
+         DataFrame,
+         str,
+         str,
+         list[str]
+     ]
+ ]:
      """Run a logistic regression analysis and return the fitted result.
 
      Performs logistic regression when the dependent variable is binary.
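A hedged sketch of the new report modes (return arities taken from this diff's dispatch code; the data is synthetic and the module path is assumed):

```python
import numpy as np
import pandas as pd
from hossam import hs_stats  # module path assumed from this diff

rng = np.random.default_rng(7)
df = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
df["target"] = (df["x1"] + rng.normal(scale=0.5, size=200) > 0).astype(int)

logit_fit = hs_stats.logit(df, "target")                         # fit object only
logit_fit, rdf = hs_stats.logit(df, "target", report="summary")  # fit + coefficient table
```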
@@ -1718,13 +1838,13 @@ def logit(df: DataFrame, yname: str, report=False):
          return logit_fit
      elif report == 1 or report == 'summary':
          # summary report (full=False)
-         cdf, rdf = logit_report(logit_fit, df, threshold=0.5, full=False, alpha=0.05)
+         cdf, rdf = logit_report(logit_fit, df, threshold=0.5, full=False, alpha=0.05)  # type: ignore
          # the summary includes only result_report and variable_reports
          # a simplified version producing only result and variable_reports
          return logit_fit, rdf
      elif report == 2 or report == 'full' or report is True:
          # full report (full=True)
-         cdf, rdf, result_report, model_report, variable_reports, cm = logit_report(logit_fit, df, threshold=0.5, full=True, alpha=0.05)
+         cdf, rdf, result_report, model_report, variable_reports, cm = logit_report(logit_fit, df, threshold=0.5, full=True, alpha=0.05)  # type: ignore
          return logit_fit, cdf, rdf, result_report, model_report, variable_reports
      else:
          # default: no report
@@ -1734,7 +1854,7 @@ def logit(df: DataFrame, yname: str, report=False):
  # ===================================================================
  # Linearity Test
  # ===================================================================
- def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05) -> DataFrame:
+ def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05, plot: bool = False, title: str | None = None, save_path: str | None = None) -> DataFrame:
      """Evaluate the linearity of a regression model with the Ramsey RESET test.
 
      Performs the Ramsey RESET (Regression Specification Error Test) on the fitted regression model
@@ -1747,6 +1867,9 @@ def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05) -> DataFrame:
              With power=2, the squared predicted values are added.
              Higher powers detect higher-order non-linearity.
          alpha (float, optional): Significance level. Default 0.05.
+         plot (bool, optional): If True, draws the residual plot. Default False.
+         title (str, optional): Plot title. Default None.
+         save_path (str, optional): Path to save the plot. Default None.
 
      Returns:
          DataFrame: DataFrame containing the linearity-test results.
@@ -1829,13 +1952,16 @@ def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05) -> DataFrame:
          "interpretation": [interpretation]
      })
 
+     if plot:
+         ols_residplot(fit, lowess=True, mse=True, title=title, save_path=save_path)
+
      return result_df
 
 
  # ===================================================================
  # Normality Test
  # ===================================================================
- def ols_normality_test(fit, alpha: float = 0.05) -> DataFrame:
+ def ols_normality_test(fit, alpha: float = 0.05, plot: bool = False, title: str | None = None, save_path: str | None = None) -> DataFrame:
      """Test the normality of the regression residuals.
 
      Evaluates whether the model residuals follow a normal distribution, using the Shapiro-Wilk and Jarque-Bera tests.
@@ -1844,6 +1970,9 @@ def ols_normality_test(fit, alpha: float = 0.05) -> DataFrame:
      Args:
          fit: Regression model object (statsmodels RegressionResultsWrapper).
          alpha (float, optional): Significance level. Default 0.05.
+         plot (bool, optional): If True, draws the Q-Q plot. Default False.
+         title (str, optional): Plot title. Default None.
+         save_path (str, optional): Path to save the plot. Default None.
 
      Returns:
          DataFrame: DataFrame containing the normality-test results.
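Both diagnostics now accept the same three plotting keywords; a hedged end-to-end sketch (synthetic data; per the hunks above, the functions draw via `ols_residplot`/`ols_qqplot` internally):

```python
import numpy as np
import pandas as pd
from hossam import hs_stats  # module path assumed from this diff

rng = np.random.default_rng(1)
df = pd.DataFrame({"x1": rng.normal(size=120), "x2": rng.normal(size=120)})
df["target"] = 2.0 * df["x1"] - 0.5 * df["x2"] + rng.normal(scale=0.3, size=120)

fit = hs_stats.ols(df, "target")
hs_stats.ols_linearity_test(fit, plot=True, title="RESET + residuals")
hs_stats.ols_normality_test(fit, plot=True, save_path="qqplot.png")
```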
@@ -1900,7 +2029,7 @@ def ols_normality_test(fit, alpha: float = 0.05) -> DataFrame:
      # 2. Jarque-Bera test (always performed)
      try:
          stat_jb, p_jb = jarque_bera(residuals)
-         significant_jb = p_jb <= alpha
+         significant_jb = p_jb <= alpha  # type: ignore
 
          if significant_jb:
              interpretation_jb = f"normality violated (p={p_jb:.4f} <= {alpha})"
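The scipy call is standard: `jarque_bera` returns a (statistic, p-value) pair testing skewness and kurtosis jointly. A standalone check:

```python
import numpy as np
from scipy.stats import jarque_bera

residuals = np.random.default_rng(3).normal(size=500)
stat_jb, p_jb = jarque_bera(residuals)
print(f"JB={stat_jb:.3f}, p={p_jb:.4f}")  # large p -> no evidence against normality
```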
@@ -1922,6 +2051,10 @@ def ols_normality_test(fit, alpha: float = 0.05) -> DataFrame:
      if not results:
          raise ValueError("The normality test could not be performed.")
 
+
+     if plot:
+         ols_qqplot(fit, title=title, save_path=save_path)
+
      result_df = DataFrame(results)
      return result_df
 
@@ -2229,8 +2362,8 @@ def corr_pairwise(
          corr_val, pval = np.nan, np.nan
 
      # 4) significance and strength
-     significant = False if np.isnan(pval) else pval <= alpha
-     abs_r = abs(corr_val) if not np.isnan(corr_val) else 0
+     significant = False if np.isnan(pval) else pval <= alpha  # type: ignore
+     abs_r = abs(corr_val) if not np.isnan(corr_val) else 0  # type: ignore
      if abs_r > 0.7:
          strength = "strong"
      elif abs_r > 0.3:
@@ -2397,13 +2530,13 @@ def oneway_anova(data: DataFrame, dv: str, between: str, alpha: float = 0.05) ->
      anova_df['significant'] = anova_df['p-unc'] <= alpha
 
      # check whether the ANOVA result is significant
-     p_unc = float(anova_df.loc[0, 'p-unc'])
+     p_unc = float(anova_df.loc[0, 'p-unc'])  # type: ignore
      anova_significant = p_unc <= alpha
 
      # build the ANOVA report sentence
      def _safe_get(col: str, default: float = np.nan) -> float:
          try:
-             return float(anova_df.loc[0, col]) if col in anova_df.columns else default
+             return float(anova_df.loc[0, col]) if col in anova_df.columns else default  # type: ignore
          except Exception:
              return default
 
@@ -2718,7 +2851,7 @@ def predict(fit, data: DataFrame | Series) -> DataFrame | Series | float:
 
      # for Series input, return a single value
      if is_series:
-         return float(predictions.iloc[0])
+         return float(predictions.iloc[0])  # type: ignore
 
      # for DataFrame input
      if isinstance(data, DataFrame):
@@ -2791,8 +2924,7 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05
 
      # if fields is not given, use every numeric column except dv
      if not fields:
-         fields = [col for col in data.columns
-                   if is_numeric_dtype(data[col]) and col != dv]
+         fields = [col for col in data.columns if is_numeric_dtype(data[col]) and col != dv]  # type: ignore
 
      # check that dv is numeric
      if not is_numeric_dtype(data[dv]):
@@ -2820,8 +2952,8 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05
          normal_y_result = normal_test(data[[dv]], columns=[dv], method=method_y)
 
          # normality decision (assume a normal distribution when p > alpha)
-         normal_x = normal_x_result.loc[var, 'p-val'] > alpha if var in normal_x_result.index else False
-         normal_y = normal_y_result.loc[dv, 'p-val'] > alpha if dv in normal_y_result.index else False
+         normal_x = normal_x_result.loc[var, 'p-val'] > alpha if var in normal_x_result.index else False  # type: ignore
+         normal_y = normal_y_result.loc[dv, 'p-val'] > alpha if dv in normal_y_result.index else False  # type: ignore
 
          # Pearson (both normal) vs Spearman (either non-normal)
          if normal_x and normal_y:
@@ -2833,8 +2965,8 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05
 
          # compute Cohen's d (converted from the correlation coefficient)
          # d = 2*r / sqrt(1-r^2)
-         if r**2 < 1:
-             d = (2 * r) / np.sqrt(1 - r**2)
+         if r ** 2 < 1:  # type: ignore
+             d = (2 * r) / np.sqrt(1 - r ** 2)  # type: ignore
          else:
              d = 0
 
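The guard `r**2 < 1` avoids a zero division when |r| = 1; for any other r the conversion is straightforward arithmetic:

```python
import numpy as np

# worked check of d = 2r / sqrt(1 - r^2)
r = 0.5
d = (2 * r) / np.sqrt(1 - r ** 2)
print(round(float(d), 3))  # 1.155 -> a large Cohen's d from a moderate correlation
```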
hossam/hs_util.py CHANGED
@@ -122,11 +122,9 @@ def pretty_table(data: DataFrame, tablefmt="simple", headers: str = "keys") -> N
      ```
      """
 
-     tabulate.WIDE_CHARS_MODE = False
+     tabulate.WIDE_CHARS_MODE = False  # type: ignore
      print(
-         tabulate(
-             data, headers=headers, tablefmt=tablefmt, showindex=True, numalign="right"
-         )
+         tabulate(data, headers=headers, tablefmt=tablefmt, showindex=True, numalign="right")  # type: ignore
      )
 
 
@@ -167,7 +165,7 @@ def __data_info(
 
      if info:
          print("\n✅ Table info")
-         pretty_table(data.info(), tablefmt="pretty")
+         pretty_table(data.info(), tablefmt="pretty")  # type: ignore
 
      print("\n✅ Top 5 rows")
      pretty_table(data.head(), tablefmt="pretty")
173
171
  pretty_table(data.head(), tablefmt="pretty")
@@ -229,7 +227,7 @@ def load_data(key: str,
229
227
  elif k.endswith(".csv"):
230
228
  origin = read_csv(key)
231
229
  else:
232
- origin = _load_data_remote(key, local)
230
+ origin = _load_data_remote(key, local) # type: ignore
233
231
 
234
232
  if origin is None:
235
233
  raise RuntimeError("Data loading failed: origin is None")
hossam-0.4.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hossam
- Version: 0.4.3
+ Version: 0.4.5
  Summary: Hossam Data Helper
  Author-email: Lee Kwang-Ho <leekh4232@gmail.com>
  License-Expression: MIT
hossam-0.4.5.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+ hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
+ hossam/__init__.py,sha256=OkMeP15jt6aCy7QNXMtkO0YRVvgOQYumkb7GuVKrbcs,2712
+ hossam/data_loader.py,sha256=K0-MJaVeedF5x8mSp22X2rD_CZ-T185EhoUFEqzP8Ss,6352
+ hossam/hs_classroom.py,sha256=rgayol3U5PSo4rLfdbClfiAtG21bFrASaSW56PUsjus,27144
+ hossam/hs_gis.py,sha256=DVmndBK-_7GMK3J1_on3ieEQk1S0MfUZ8_wlX-cDdZQ,11581
+ hossam/hs_plot.py,sha256=3j9B69pl-zQM_09lTXxLKAMaDM0vwOTsUWbzcU8hCK8,86228
+ hossam/hs_prep.py,sha256=kCmFxnMyFZ5tLUfoE8msbwTracajHAmruJbFj6A6eIU,38020
+ hossam/hs_stats.py,sha256=uGYkEk8Rb8qMoZ5FiZ7Yg6jssLIGl_EBbmwvvSYljhQ,115780
+ hossam/hs_timeserise.py,sha256=gSj3cPgOGLOZEXhfW1anXbwpoJja847ZY9F8l9piJPE,42601
+ hossam/hs_util.py,sha256=xuNXC6FJSAmyAbcRAUMsigCKHXM25t3H90nFMgq7IBs,8482
+ hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
+ hossam-0.4.5.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
+ hossam-0.4.5.dist-info/METADATA,sha256=HM5qrrvaFZWAyUlhgV_BLPHAcxEZdZ4gp2p3V4X4pzo,3676
+ hossam-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ hossam-0.4.5.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
+ hossam-0.4.5.dist-info/RECORD,,
hossam-0.4.3.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
- hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
- hossam/__init__.py,sha256=OkMeP15jt6aCy7QNXMtkO0YRVvgOQYumkb7GuVKrbcs,2712
- hossam/data_loader.py,sha256=oUIsqbHQoRiHA_1tdElDaYo1ipmUB5fYSXYMB5gLOl0,6395
- hossam/hs_classroom.py,sha256=rgayol3U5PSo4rLfdbClfiAtG21bFrASaSW56PUsjus,27144
- hossam/hs_gis.py,sha256=DLogaf5nxJBbG-d8QoH2g8UfZ1omMtmEXDYgNg8jtT0,11410
- hossam/hs_plot.py,sha256=A_nS8dP4cijp7LZs253SWxfBUp5qvvTlSPGKjDj0BIA,83712
- hossam/hs_prep.py,sha256=2ptFFxV4G1IFmy-B89TqXaPkA8jROZutr2XIkaXNHW4,36006
- hossam/hs_stats.py,sha256=qAor-RE5qNsytoZW1mriK3yql9PVif5bBGyG64YC2PM,110780
- hossam/hs_timeserise.py,sha256=gSj3cPgOGLOZEXhfW1anXbwpoJja847ZY9F8l9piJPE,42601
- hossam/hs_util.py,sha256=8byLj_VR93vS__lyf0xgQKArgMy9qFm2VvZVSCxfQX0,8444
- hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
- hossam-0.4.3.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
- hossam-0.4.3.dist-info/METADATA,sha256=0VAI5TJKWSFwZriKBYnf5a4MSB5cdOLUh9lV_vYDPJY,3676
- hossam-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- hossam-0.4.3.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
- hossam-0.4.3.dist-info/RECORD,,