hossam-0.4.4-py3-none-any.whl → hossam-0.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hossam/hs_prep.py CHANGED
@@ -5,6 +5,7 @@
  import joblib
  import numpy as np
  from itertools import combinations
+ from typing import Any

  import pandas as pd
  import jenkspy
@@ -19,7 +20,7 @@ from .hs_util import pretty_table
  # Standardize continuous variables (Z-score)
  # ===================================================================
  def standard_scaler(
- data: any, yname: str | None = None, save_path: str | None = None, load_path: str | None = None
+ data: Any, yname: str | None = None, save_path: str | None = None, load_path: str | None = None
  ) -> DataFrame:
  """Perform Standard Scaling on continuous variables.

@@ -54,7 +55,7 @@ def standard_scaler(
  sdata = scaler.transform(arr) if load_path else scaler.fit_transform(arr)
  if save_path:
  joblib.dump(value=scaler, filename=save_path)
- return sdata
+ return sdata # type: ignore

  df = data.copy()

@@ -90,7 +91,7 @@ def standard_scaler(
  # Normalize continuous variables to values between 0 and 1
  # ===================================================================
  def minmax_scaler(
- data: any, yname: str | None = None, save_path: str | None = None, load_path: str | None = None
+ data: Any, yname: str | None = None, save_path: str | None = None, load_path: str | None = None
  ) -> DataFrame:
  """Perform MinMax Scaling on continuous variables.

@@ -123,7 +124,7 @@ def minmax_scaler(
  sdata = scaler.transform(arr) if load_path else scaler.fit_transform(arr)
  if save_path:
  joblib.dump(scaler, save_path)
- return sdata
+ return sdata # type: ignore

  df = data.copy()

@@ -158,7 +159,7 @@ def minmax_scaler(
  # ===================================================================
  # Set the specified columns as categorical data
  # ===================================================================
- def set_category(data: DataFrame, *args: str, columns: list = None) -> DataFrame:
+ def set_category(data: DataFrame, *args: str, columns: list | None = None) -> DataFrame:
  """Set category data.

  Args:
@@ -173,7 +174,7 @@ def set_category(data: DataFrame, *args: str, columns: list = None) -> DataFrame
  if columns is not None:
  if args:
  raise ValueError("The args and columns arguments cannot be used together.")
- args = columns
+ args = columns # type: ignore

  df = data.copy()

@@ -226,7 +227,7 @@ def unmelt(
  # ===================================================================
  # Return an outlier table for the specified variables
  # ===================================================================
- def outlier_table(data: DataFrame, *fields: str, columns: list = None) -> DataFrame:
+ def outlier_table(data: DataFrame, *fields: str, columns: list | None = None) -> DataFrame:
  """Compute quartiles and IQR-based outlier boundaries for numeric columns.

  If no `fields` are passed, every numeric column of the DataFrame is used.
@@ -246,7 +247,7 @@ def outlier_table(data: DataFrame, *fields: str, columns: list = None) -> DataFr
  """
  # If the columns argument is given, it takes precedence over args.
  if columns is not None:
- if args:
+ if args: # type: ignore
  raise ValueError("The args and columns arguments cannot be used together.")
  args = columns

@@ -286,7 +287,7 @@ def outlier_table(data: DataFrame, *fields: str, columns: list = None) -> DataFr
  # ===================================================================
  # Replace outliers with a substitute value (NaN, 0) or the median
  # ===================================================================
- def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns: list = None) -> DataFrame:
+ def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns: list | None = None) -> DataFrame:
  """Replace values that exceed the outlier boundaries with the boundary values.

  Args:
@@ -305,7 +306,7 @@ def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns
  """
  # If the columns argument is given, it takes precedence over args.
  if columns is not None:
- if args:
+ if args: # type: ignore
  raise ValueError("The args and columns arguments cannot be used together.")
  args = columns

@@ -354,7 +355,7 @@ def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns
  # ===================================================================
  # Return a preprocessed DataFrame with outliers removed
  # ===================================================================
- def drop_outliner(data: DataFrame, *fields: str, columns: list = None) -> DataFrame:
+ def drop_outliner(data: DataFrame, *fields: str, columns: list | None = None) -> DataFrame:
  """Convert outliers to missing values, then drop them all.

  Args:
@@ -367,7 +368,7 @@ def drop_outliner(data: DataFrame, *fields: str, columns: list = None) -> DataFr
  """
  # If the columns argument is given, it takes precedence over args.
  if columns is not None:
- if args:
+ if args: # type: ignore
  raise ValueError("The args and columns arguments cannot be used together.")
  args = columns

@@ -378,7 +379,7 @@ def drop_outliner(data: DataFrame, *fields: str, columns: list = None) -> DataFr
  # ===================================================================
  # Convert categorical variables to dummy variables (one-hot encoding)
  # ===================================================================
- def get_dummies(data: DataFrame, *args: str, columns: list = None, drop_first: bool = True, dtype: str = "int") -> DataFrame:
+ def get_dummies(data: DataFrame, *args: str, columns: list | None = None, drop_first: bool = True, dtype: str = "int") -> DataFrame:
  """Convert nominal variables to dummy variables.

  If column names are given, only those columns are converted to dummy variables,
@@ -409,7 +410,7 @@ def get_dummies(data: DataFrame, *args: str, columns: list = None, drop_first: b
  if columns is not None:
  if args:
  raise ValueError("The args and columns arguments cannot be used together.")
- args = columns
+ args = columns # type: ignore

  if not args:
  # If args is empty, automatically select every non-numeric column
@@ -417,13 +418,13 @@ def get_dummies(data: DataFrame, *args: str, columns: list = None, drop_first: b
  for f in data.columns:
  if not pd.api.types.is_numeric_dtype(data[f]):
  cols_to_convert.append(f)
- args = cols_to_convert
+ args = cols_to_convert # type: ignore
  else:
  # If args is given, use only those columns (after checking they exist)
- args = [c for c in args if c in data.columns]
+ args = [c for c in args if c in data.columns] # type: ignore

  # Use pandas.get_dummies (no recursion issue)
- return pd.get_dummies(data, columns=args, drop_first=drop_first, dtype=dtype) if args else data.copy()
+ return pd.get_dummies(data, columns=args, drop_first=drop_first, dtype=dtype) if args else data.copy() # type: ignore


  # ===================================================================
@@ -630,7 +631,7 @@ def bin_continuous(
  if apply_labels:
  # Use numeric indices (0, 1, 2, ...)
  numeric_labels = list(range(len(edges) - 1))
- df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False)
+ df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False) # type: ignore
  else:
  # Apply string labels
  if labels is None:
@@ -645,9 +646,9 @@ def bin_continuous(
  except:
  pass
  auto_labels.append(f"{left}~{right}")
- df[new_col] = pd.cut(series, bins=edges, labels=auto_labels, include_lowest=True, ordered=False)
+ df[new_col] = pd.cut(series, bins=edges, labels=auto_labels, include_lowest=True, ordered=False) # type: ignore
  else:
- df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False)
+ df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False) # type: ignore

  df[new_col] = df[new_col].astype("category")
  return df
@@ -671,26 +672,24 @@ def bin_continuous(
  n_bins = len(edges) - 1
  if apply_labels:
  numeric_labels = list(range(n_bins))
- df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False)
+ df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False) # type: ignore
  else:
  if labels is None:
  position_labels = [f"Q{i+1}" for i in range(n_bins)]
- df[new_col] = pd.cut(
- series, bins=edges, labels=position_labels, include_lowest=True, ordered=False
- )
+ df[new_col] = pd.cut(series, bins=edges, labels=position_labels, include_lowest=True, ordered=False) # type: ignore
  else:
- df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False)
+ df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False) # type: ignore
  df[new_col] = df[new_col].astype("category")
  return df

  # Natural breaks (Jenks) - fall back to quantiles if the dependency is missing
  if method_key in {"natural_breaks", "natural", "jenks"}:
  k = bins if isinstance(bins, int) and bins > 1 else 5
- series_nonnull = series.dropna()
+ series_nonnull = series.dropna() # type: ignore
  k = min(k, max(2, series_nonnull.nunique()))
  edges = None
  try:
- edges = jenkspy.jenks_breaks(series_nonnull.to_list(), nb_class=k)
+ edges = jenkspy.jenks_breaks(series_nonnull.to_list(), nb_class=k) # type: ignore
  edges[0] = -np.inf
  edges[-1] = np.inf
  except Exception:
@@ -730,7 +729,7 @@ def bin_continuous(
  if apply_labels:
  # Use numeric indices
  numeric_labels = list(range(len(cut_edges) - 1))
- df[new_col] = pd.cut(series, bins=cut_edges, labels=numeric_labels, include_lowest=True, ordered=False)
+ df[new_col] = pd.cut(series, bins=cut_edges, labels=numeric_labels, include_lowest=True, ordered=False) # type: ignore
  else:
  if labels is None:
  auto_labels = []
@@ -744,9 +743,9 @@ def bin_continuous(
  except:
  pass
  auto_labels.append(f"{left}~{right}")
- df[new_col] = pd.cut(series, bins=cut_edges, labels=auto_labels, include_lowest=True, ordered=False)
+ df[new_col] = pd.cut(series, bins=cut_edges, labels=auto_labels, include_lowest=True, ordered=False) # type: ignore
  else:
- df[new_col] = pd.cut(series, bins=cut_edges, labels=labels, include_lowest=True, ordered=False)
+ df[new_col] = pd.cut(series, bins=cut_edges, labels=labels, include_lowest=True, ordered=False) # type: ignore
  df[new_col] = df[new_col].astype("category")
  return df
hossam/hs_stats.py CHANGED
@@ -1,7 +1,7 @@
  # -*- coding: utf-8 -*-

  from __future__ import annotations
- from typing import overload, Tuple, Literal, Union
+ from typing import overload, Tuple, Literal, Union, Any

  # -------------------------------------------------------------
  import numpy as np
@@ -24,9 +24,9 @@ from scipy.stats import (
  normaltest,
  bartlett,
  levene,
- ttest_1samp,
+ ttest_1samp, # type: ignore
  ttest_ind as scipy_ttest_ind,
- ttest_rel,
+ ttest_rel, # type: ignore
  wilcoxon,
  pearsonr,
  spearmanr,
@@ -375,29 +375,29 @@ def describe(data: DataFrame, *fields: str, columns: list | None = None):
  outlier_rate = (outlier_count / len(data)) * 100

  # Classify the distribution shape (based on skewness)
- abs_skew = abs(skew)
- if abs_skew < 0.5:
+ abs_skew = abs(skew) # type: ignore
+ if abs_skew < 0.5: # type: ignore
  dist = "nearly symmetric"
- elif abs_skew < 1.0:
- if skew > 0:
+ elif abs_skew < 1.0: # type: ignore
+ if skew > 0: # type: ignore
  dist = "slight right tail"
  else:
  dist = "slight left tail"
- elif abs_skew < 2.0:
- if skew > 0:
+ elif abs_skew < 2.0: # type: ignore
+ if skew > 0: # type: ignore
  dist = "moderate right tail"
  else:
  dist = "moderate left tail"
  else:
- if skew > 0:
+ if skew > 0: # type: ignore
  dist = "extreme right tail"
  else:
  dist = "extreme left tail"

  # Assess the need for a log transform
- if abs_skew < 0.5:
+ if abs_skew < 0.5: # type: ignore
  log_need = "low"
- elif abs_skew < 1.0:
+ elif abs_skew < 1.0: # type: ignore
  log_need = "medium"
  else:
  log_need = "high"
@@ -473,7 +473,7 @@ def category_describe(data: DataFrame, *fields: str):
  """
  if not fields:
  # Select nominal (categorical) columns: object, category, bool dtypes
- fields = data.select_dtypes(include=['object', 'category', 'bool']).columns
+ fields = data.select_dtypes(include=['object', 'category', 'bool']).columns # type: ignore

  result = []
  summary = []
@@ -730,7 +730,7 @@ def equal_var_test(data: DataFrame, columns: list | str | None = None, normal_di
  normality_result = normal_test(data[numeric_cols], method="n")
  # Check whether every column follows a normal distribution
  all_normal = normality_result["is_normal"].all()
- normal_dist = all_normal
+ normal_dist = all_normal # type: ignore

  try:
  if normal_dist:
@@ -829,7 +829,7 @@ def ttest_1samp(data, mean_value: float = 0.0) -> DataFrame:
  else:
  for a in alternative:
  try:
- s, p = ttest_1samp(col_data, mean_value, alternative=a)
+ s, p = ttest_1samp(col_data, mean_value, alternative=a) # type: ignore

  itp = None

@@ -939,26 +939,26 @@ def ttest_ind(x, y, equal_var: bool | None = None) -> DataFrame:

  for a in alternative:
  try:
- s, p = scipy_ttest_ind(x_data, y_data, equal_var=equal_var, alternative=a)
+ s, p = scipy_ttest_ind(x_data, y_data, equal_var=equal_var, alternative=a) # type: ignore
  n = "t-test_ind" if equal_var else "Welch's t-test"

  # Interpret the test result
  itp = None

  if a == "two-sided":
- itp = fmt.format("==" if p > 0.05 else "!=")
+ itp = fmt.format("==" if p > 0.05 else "!=") # type: ignore
  elif a == "less":
- itp = fmt.format(">=" if p > 0.05 else "<")
+ itp = fmt.format(">=" if p > 0.05 else "<") # type: ignore
  else:
- itp = fmt.format("<=" if p > 0.05 else ">")
+ itp = fmt.format("<=" if p > 0.05 else ">") # type: ignore

  result.append({
  "test": n,
  "alternative": a,
- "statistic": round(s, 3),
- "p-value": round(p, 4),
- "H0": p > 0.05,
- "H1": p <= 0.05,
+ "statistic": round(s, 3), # type: ignore
+ "p-value": round(p, 4), # type: ignore
+ "H0": p > 0.05, # type: ignore
+ "H1": p <= 0.05, # type: ignore
  "interpretation": itp,
  "equal_var_checked": var_checked
  })
@@ -1068,7 +1068,7 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
  for a in alternative:
  try:
  if parametric:
- s, p = ttest_rel(x_data, y_data, alternative=a)
+ s, p = ttest_rel(x_data, y_data, alternative=a) # type: ignore
  n = "t-test_paired"
  else:
  # Wilcoxon signed-rank test (non-parametric test for paired samples)
@@ -1078,19 +1078,19 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
  itp = None

  if a == "two-sided":
- itp = fmt.format("==" if p > 0.05 else "!=")
+ itp = fmt.format("==" if p > 0.05 else "!=") # type: ignore
  elif a == "less":
- itp = fmt.format(">=" if p > 0.05 else "<")
+ itp = fmt.format(">=" if p > 0.05 else "<") # type: ignore
  else:
- itp = fmt.format("<=" if p > 0.05 else ">")
+ itp = fmt.format("<=" if p > 0.05 else ">") # type: ignore

  result.append({
  "test": n,
  "alternative": a,
- "statistic": round(s, 3) if not np.isnan(s) else s,
- "p-value": round(p, 4) if not np.isnan(p) else p,
- "H0": p > 0.05,
- "H1": p <= 0.05,
+ "statistic": round(s, 3) if not np.isnan(s) else s, # type: ignore
+ "p-value": round(p, 4) if not np.isnan(p) else p, # type: ignore
+ "H0": p > 0.05, # type: ignore
+ "H1": p <= 0.05, # type: ignore
  "interpretation": itp,
  "normality_checked": var_checked
  })
@@ -1117,7 +1117,7 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
  # ===================================================================
  def vif_filter(
  data: DataFrame,
- yname: str = None,
+ yname: str | None = None,
  ignore: list | None = None,
  threshold: float = 10.0,
  verbose: bool = False,
@@ -1182,7 +1182,7 @@ def vif_filter(
  for i, col in enumerate(X_clean.columns, start=0):
  # The first column of exog is the constant, so the variable index is +1
  try:
- vifs[col] = float(variance_inflation_factor(exog.values, i + 1))
+ vifs[col] = float(variance_inflation_factor(exog.values, i + 1))# type: ignore
  except Exception:
  # If the calculation fails, treat it as infinity so the column is removed first
  vifs[col] = float("inf")
@@ -1220,7 +1220,7 @@ def vif_filter(
  # ===================================================================
  # Compute a trend line for the x, y data.
  # ===================================================================
- def trend(x: any, y: any, degree: int = 1, value_count: int = 100) -> Tuple[np.ndarray, np.ndarray]:
+ def trend(x: Any, y: Any, degree: int = 1, value_count: int = 100) -> Tuple[np.ndarray, np.ndarray]:
  """Compute a trend line for the x, y data.

  Args:
@@ -1324,7 +1324,7 @@ def ols_report(fit, data, full=False, alpha=0.05) -> Union[
  for i, col in enumerate(indi_df.columns, start=1): # the constant is at index 0, so start from 1
  try:
  with np.errstate(divide='ignore', invalid='ignore'):
- vif_value = variance_inflation_factor(indi_df_const.values, i)
+ vif_value = variance_inflation_factor(indi_df_const.values, i) # type: ignore
  # Handle inf or very large values
  if np.isinf(vif_value) or vif_value > 1e10:
  vif_dict[col] = np.inf
@@ -1531,11 +1531,11 @@ def ols(df: DataFrame, yname: str, report: bool | str | int = False) -> Union[
  return linear_fit
  elif report == 1 or report == 'summary':
  # Summary report (full=False)
- pdf, rdf = ols_report(linear_fit, df, full=False, alpha=0.05)
+ pdf, rdf = ols_report(linear_fit, df, full=False, alpha=0.05) # type: ignore
  return linear_fit, pdf, rdf
  elif report == 2 or report == 'full' or report is True:
  # Full report (full=True)
- pdf, rdf, result_report, model_report, variable_reports, equation_text = ols_report(linear_fit, df, full=True, alpha=0.05)
+ pdf, rdf, result_report, model_report, variable_reports, equation_text = ols_report(linear_fit, df, full=True, alpha=0.05) # type: ignore
  return linear_fit, pdf, rdf, result_report, model_report, variable_reports, equation_text
  else:
  # Default: no report
@@ -1657,7 +1657,7 @@ def logit_report(
  vif_dict = {}
  x_const = sm.add_constant(x, has_constant="add")
  for i, col in enumerate(x.columns, start=1): # the constant is at index 0, so start from 1
- vif_dict[col] = variance_inflation_factor(x_const.values, i)
+ vif_dict[col] = variance_inflation_factor(x_const.values, i) # type: ignore

  for idx, row in tbl.iterrows():
  name = idx
@@ -1770,7 +1770,7 @@ def logit(
  DataFrame,
  str,
  str,
- List[str]
+ list[str]
  ]
  ]:
  """Run a logistic regression and return the fitted results.
@@ -1838,13 +1838,13 @@ def logit(
  return logit_fit
  elif report == 1 or report == 'summary':
  # Summary report (full=False)
- cdf, rdf = logit_report(logit_fit, df, threshold=0.5, full=False, alpha=0.05)
+ cdf, rdf = logit_report(logit_fit, df, threshold=0.5, full=False, alpha=0.05) # type: ignore
  # The summary includes only result_report and variable_reports
  # As a simplified version, generate only result and variable_reports
  return logit_fit, rdf
  elif report == 2 or report == 'full' or report is True:
  # Full report (full=True)
- cdf, rdf, result_report, model_report, variable_reports, cm = logit_report(logit_fit, df, threshold=0.5, full=True, alpha=0.05)
+ cdf, rdf, result_report, model_report, variable_reports, cm = logit_report(logit_fit, df, threshold=0.5, full=True, alpha=0.05) # type: ignore
  return logit_fit, cdf, rdf, result_report, model_report, variable_reports
  else:
  # Default: no report
@@ -1854,7 +1854,7 @@ def logit(
  # ===================================================================
  # Linearity test
  # ===================================================================
- def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05, plot: bool = False, title: str = None, save_path: str = None) -> DataFrame:
+ def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05, plot: bool = False, title: str | None = None, save_path: str | None = None) -> DataFrame:
  """Assess the linearity of a regression model with the Ramsey RESET test.

  Runs the Ramsey RESET (Regression Specification Error Test) on the fitted regression model to
@@ -1961,7 +1961,7 @@ def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05, plot: bool = Fa
  # ===================================================================
  # Normality test
  # ===================================================================
- def ols_normality_test(fit, alpha: float = 0.05, plot: bool = False, title: str = None, save_path: str = None) -> DataFrame:
+ def ols_normality_test(fit, alpha: float = 0.05, plot: bool = False, title: str | None = None, save_path: str | None = None) -> DataFrame:
  """Test the normality of the regression model's residuals.

  Assesses whether the residuals follow a normal distribution using the Shapiro-Wilk and Jarque-Bera tests.
@@ -2029,7 +2029,7 @@ def ols_normality_test(fit, alpha: float = 0.05, plot: bool = False, title: str
  # 2. Jarque-Bera test (always performed)
  try:
  stat_jb, p_jb = jarque_bera(residuals)
- significant_jb = p_jb <= alpha
+ significant_jb = p_jb <= alpha # type: ignore

  if significant_jb:
  interpretation_jb = f"Normality violated (p={p_jb:.4f} <= {alpha})"
@@ -2362,8 +2362,8 @@ def corr_pairwise(
  corr_val, pval = np.nan, np.nan

  # 4) Significance and strength
- significant = False if np.isnan(pval) else pval <= alpha
- abs_r = abs(corr_val) if not np.isnan(corr_val) else 0
+ significant = False if np.isnan(pval) else pval <= alpha # type: ignore
+ abs_r = abs(corr_val) if not np.isnan(corr_val) else 0 # type: ignore
  if abs_r > 0.7:
  strength = "strong"
  elif abs_r > 0.3:
@@ -2530,13 +2530,13 @@ def oneway_anova(data: DataFrame, dv: str, between: str, alpha: float = 0.05) ->
  anova_df['significant'] = anova_df['p-unc'] <= alpha

  # Check whether the ANOVA result is significant
- p_unc = float(anova_df.loc[0, 'p-unc'])
+ p_unc = float(anova_df.loc[0, 'p-unc']) # type: ignore
  anova_significant = p_unc <= alpha

  # Build the ANOVA report sentence
  def _safe_get(col: str, default: float = np.nan) -> float:
  try:
- return float(anova_df.loc[0, col]) if col in anova_df.columns else default
+ return float(anova_df.loc[0, col]) if col in anova_df.columns else default # type: ignore
  except Exception:
  return default

@@ -2851,7 +2851,7 @@ def predict(fit, data: DataFrame | Series) -> DataFrame | Series | float:

  # For Series input, return a single value
  if is_series:
- return float(predictions.iloc[0])
+ return float(predictions.iloc[0]) # type: ignore

  # For DataFrame input
  if isinstance(data, DataFrame):
@@ -2924,8 +2924,7 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05

  # If fields are not specified, use every numeric column except dv
  if not fields:
- fields = [col for col in data.columns
- if is_numeric_dtype(data[col]) and col != dv]
+ fields = [col for col in data.columns if is_numeric_dtype(data[col]) and col != dv] # type: ignore

  # Check that dv is numeric
  if not is_numeric_dtype(data[dv]):
@@ -2953,8 +2952,8 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05
  normal_y_result = normal_test(data[[dv]], columns=[dv], method=method_y)

  # Normality decision (assume a normal distribution if p > alpha)
- normal_x = normal_x_result.loc[var, 'p-val'] > alpha if var in normal_x_result.index else False
- normal_y = normal_y_result.loc[dv, 'p-val'] > alpha if dv in normal_y_result.index else False
+ normal_x = normal_x_result.loc[var, 'p-val'] > alpha if var in normal_x_result.index else False # type: ignore
+ normal_y = normal_y_result.loc[dv, 'p-val'] > alpha if dv in normal_y_result.index else False # type: ignore

  # Pearson (both normal) vs Spearman (either non-normal)
  if normal_x and normal_y:
@@ -2966,8 +2965,8 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05

  # Compute Cohen's d (convert the correlation coefficient to an effect size)
  # d = 2*r / sqrt(1-r^2)
- if r**2 < 1:
- d = (2 * r) / np.sqrt(1 - r**2)
+ if r ** 2 < 1: # type: ignore
+ d = (2 * r) / np.sqrt(1 - r ** 2) # type: ignore
  else:
  d = 0
hossam/hs_util.py CHANGED
@@ -122,11 +122,9 @@ def pretty_table(data: DataFrame, tablefmt="simple", headers: str = "keys") -> N
  ```
  """

- tabulate.WIDE_CHARS_MODE = False
+ tabulate.WIDE_CHARS_MODE = False # type: ignore
  print(
- tabulate(
- data, headers=headers, tablefmt=tablefmt, showindex=True, numalign="right"
- )
+ tabulate(data, headers=headers, tablefmt=tablefmt, showindex=True, numalign="right") # type: ignore
  )


@@ -167,7 +165,7 @@ def __data_info(

  if info:
  print("\n✅ Table info")
- pretty_table(data.info(), tablefmt="pretty")
+ pretty_table(data.info(), tablefmt="pretty") # type: ignore

  print("\n✅ Top 5 rows")
  pretty_table(data.head(), tablefmt="pretty")
@@ -229,7 +227,7 @@ def load_data(key: str,
  elif k.endswith(".csv"):
  origin = read_csv(key)
  else:
- origin = _load_data_remote(key, local)
+ origin = _load_data_remote(key, local) # type: ignore

  if origin is None:
  raise RuntimeError("Data loading failed: origin is None")
hossam-0.4.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hossam
- Version: 0.4.4
+ Version: 0.4.5
  Summary: Hossam Data Helper
  Author-email: Lee Kwang-Ho <leekh4232@gmail.com>
  License-Expression: MIT
hossam-0.4.5.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+ hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
+ hossam/__init__.py,sha256=OkMeP15jt6aCy7QNXMtkO0YRVvgOQYumkb7GuVKrbcs,2712
+ hossam/data_loader.py,sha256=K0-MJaVeedF5x8mSp22X2rD_CZ-T185EhoUFEqzP8Ss,6352
+ hossam/hs_classroom.py,sha256=rgayol3U5PSo4rLfdbClfiAtG21bFrASaSW56PUsjus,27144
+ hossam/hs_gis.py,sha256=DVmndBK-_7GMK3J1_on3ieEQk1S0MfUZ8_wlX-cDdZQ,11581
+ hossam/hs_plot.py,sha256=3j9B69pl-zQM_09lTXxLKAMaDM0vwOTsUWbzcU8hCK8,86228
+ hossam/hs_prep.py,sha256=kCmFxnMyFZ5tLUfoE8msbwTracajHAmruJbFj6A6eIU,38020
+ hossam/hs_stats.py,sha256=uGYkEk8Rb8qMoZ5FiZ7Yg6jssLIGl_EBbmwvvSYljhQ,115780
+ hossam/hs_timeserise.py,sha256=gSj3cPgOGLOZEXhfW1anXbwpoJja847ZY9F8l9piJPE,42601
+ hossam/hs_util.py,sha256=xuNXC6FJSAmyAbcRAUMsigCKHXM25t3H90nFMgq7IBs,8482
+ hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
+ hossam-0.4.5.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
+ hossam-0.4.5.dist-info/METADATA,sha256=HM5qrrvaFZWAyUlhgV_BLPHAcxEZdZ4gp2p3V4X4pzo,3676
+ hossam-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ hossam-0.4.5.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
+ hossam-0.4.5.dist-info/RECORD,,
hossam-0.4.4.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
- hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
- hossam/__init__.py,sha256=OkMeP15jt6aCy7QNXMtkO0YRVvgOQYumkb7GuVKrbcs,2712
- hossam/data_loader.py,sha256=oUIsqbHQoRiHA_1tdElDaYo1ipmUB5fYSXYMB5gLOl0,6395
- hossam/hs_classroom.py,sha256=rgayol3U5PSo4rLfdbClfiAtG21bFrASaSW56PUsjus,27144
- hossam/hs_gis.py,sha256=DLogaf5nxJBbG-d8QoH2g8UfZ1omMtmEXDYgNg8jtT0,11410
- hossam/hs_plot.py,sha256=tsJMi2q9SzHRSs25dXsHkkImW-Jk7su1M6TbKwX9koU,83887
- hossam/hs_prep.py,sha256=ocZNGzHzqgasVNLcb_LClTZaAeTYiIg4mzrixeEzBQU,37693
- hossam/hs_stats.py,sha256=LpUG8U9ybnh6qSMW2SKCSDJZTeMhLH2xH2Pj4i7U6TU,114889
- hossam/hs_timeserise.py,sha256=gSj3cPgOGLOZEXhfW1anXbwpoJja847ZY9F8l9piJPE,42601
- hossam/hs_util.py,sha256=8byLj_VR93vS__lyf0xgQKArgMy9qFm2VvZVSCxfQX0,8444
- hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
- hossam-0.4.4.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
- hossam-0.4.4.dist-info/METADATA,sha256=R6qOrcnZhbTzUrRK2x9vNksDjw8rVK1DVZrbRIPSPQQ,3676
- hossam-0.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- hossam-0.4.4.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
- hossam-0.4.4.dist-info/RECORD,,