hossam 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- hossam/data_loader.py +7 -9
- hossam/hs_gis.py +17 -18
- hossam/hs_plot.py +207 -210
- hossam/hs_prep.py +29 -30
- hossam/hs_stats.py +54 -55
- hossam/hs_util.py +4 -6
- {hossam-0.4.4.dist-info → hossam-0.4.5.dist-info}/METADATA +1 -1
- hossam-0.4.5.dist-info/RECORD +16 -0
- hossam-0.4.4.dist-info/RECORD +0 -16
- {hossam-0.4.4.dist-info → hossam-0.4.5.dist-info}/WHEEL +0 -0
- {hossam-0.4.4.dist-info → hossam-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {hossam-0.4.4.dist-info → hossam-0.4.5.dist-info}/top_level.txt +0 -0
hossam/hs_prep.py
CHANGED
@@ -5,6 +5,7 @@
 import joblib
 import numpy as np
 from itertools import combinations
+from typing import Any

 import pandas as pd
 import jenkspy
@@ -19,7 +20,7 @@ from .hs_util import pretty_table
 # Convert continuous variables via standardization (Z-score)
 # ===================================================================
 def standard_scaler(
-    data:
+    data: Any, yname: str | None = None, save_path: str | None = None, load_path: str | None = None
 ) -> DataFrame:
     """Performs Standard Scaling on continuous variables.

@@ -54,7 +55,7 @@ def standard_scaler(
         sdata = scaler.transform(arr) if load_path else scaler.fit_transform(arr)
         if save_path:
             joblib.dump(value=scaler, filename=save_path)
-        return sdata
+        return sdata  # type: ignore

     df = data.copy()

@@ -90,7 +91,7 @@ def standard_scaler(
 # Normalize continuous variables to values between 0 and 1
 # ===================================================================
 def minmax_scaler(
-    data:
+    data: Any, yname: str | None = None, save_path: str | None = None, load_path: str | None = None
 ) -> DataFrame:
     """Performs MinMax Scaling on continuous variables.

@@ -123,7 +124,7 @@ def minmax_scaler(
         sdata = scaler.transform(arr) if load_path else scaler.fit_transform(arr)
         if save_path:
             joblib.dump(scaler, save_path)
-        return sdata
+        return sdata  # type: ignore

     df = data.copy()

@@ -158,7 +159,7 @@ def minmax_scaler(
 # ===================================================================
 # Mark the specified columns as categorical data
 # ===================================================================
-def set_category(data: DataFrame, *args: str, columns: list = None) -> DataFrame:
+def set_category(data: DataFrame, *args: str, columns: list | None = None) -> DataFrame:
     """Sets the category data.

     Args:
@@ -173,7 +174,7 @@ def set_category(data: DataFrame, *args: str, columns: list = None) -> DataFrame
     if columns is not None:
         if args:
             raise ValueError("The args and columns arguments cannot be used together.")
-        args = columns
+        args = columns  # type: ignore

     df = data.copy()

@@ -226,7 +227,7 @@ def unmelt(
 # ===================================================================
 # Return an outlier table for the specified variables
 # ===================================================================
-def outlier_table(data: DataFrame, *fields: str, columns: list = None) -> DataFrame:
+def outlier_table(data: DataFrame, *fields: str, columns: list | None = None) -> DataFrame:
     """Computes quartile and IQR-based outlier bounds for numeric columns.

     If no `fields` are passed, all numeric columns of the data frame are used.
@@ -246,7 +247,7 @@ def outlier_table(data: DataFrame, *fields: str, columns: list = None) -> DataFr
     """
     # If the columns argument is given, it takes precedence over args.
     if columns is not None:
-        if args:
+        if args:  # type: ignore
             raise ValueError("The args and columns arguments cannot be used together.")
         args = columns

@@ -286,7 +287,7 @@ def outlier_table(data: DataFrame, *fields: str, columns: list = None) -> DataFr
 # ===================================================================
 # Replace outliers with a substitute value (NaN, 0) or the median
 # ===================================================================
-def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns: list = None) -> DataFrame:
+def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns: list | None = None) -> DataFrame:
    """Replaces data beyond the outlier bounds with the boundary values.

     Args:
@@ -305,7 +306,7 @@ def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns
     """
     # If the columns argument is given, it takes precedence over args.
     if columns is not None:
-        if args:
+        if args:  # type: ignore
             raise ValueError("The args and columns arguments cannot be used together.")
         args = columns

@@ -354,7 +355,7 @@ def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns
 # ===================================================================
 # Return a preprocessed data frame with the outliers removed
 # ===================================================================
-def drop_outliner(data: DataFrame, *fields: str, columns: list = None) -> DataFrame:
+def drop_outliner(data: DataFrame, *fields: str, columns: list | None = None) -> DataFrame:
     """Converts outliers to missing values, then drops them all.

     Args:
@@ -367,7 +368,7 @@ def drop_outliner(data: DataFrame, *fields: str, columns: list = None) -> DataFr
     """
     # If the columns argument is given, it takes precedence over args.
     if columns is not None:
-        if args:
+        if args:  # type: ignore
             raise ValueError("The args and columns arguments cannot be used together.")
         args = columns

@@ -378,7 +379,7 @@ def drop_outliner(data: DataFrame, *fields: str, columns: list = None) -> DataFr
 # ===================================================================
 # Convert categorical variables to dummy variables (one-hot encoding)
 # ===================================================================
-def get_dummies(data: DataFrame, *args: str, columns: list = None, drop_first: bool = True, dtype: str = "int") -> DataFrame:
+def get_dummies(data: DataFrame, *args: str, columns: list | None = None, drop_first: bool = True, dtype: str = "int") -> DataFrame:
     """Converts nominal variables to dummy variables.

     If column names are specified, only those columns are converted to dummy variables,
@@ -409,7 +410,7 @@ def get_dummies(data: DataFrame, *args: str, columns: list = None, drop_first: b
     if columns is not None:
         if args:
             raise ValueError("The args and columns arguments cannot be used together.")
-        args = columns
+        args = columns  # type: ignore

     if not args:
         # With no args, automatically select every non-numeric column
@@ -417,13 +418,13 @@ def get_dummies(data: DataFrame, *args: str, columns: list = None, drop_first: b
         for f in data.columns:
             if not pd.api.types.is_numeric_dtype(data[f]):
                 cols_to_convert.append(f)
-        args = cols_to_convert
+        args = cols_to_convert  # type: ignore
     else:
         # With args, use only those columns (checking they exist)
-        args = [c for c in args if c in data.columns]
+        args = [c for c in args if c in data.columns]  # type: ignore

     # Use pandas.get_dummies (no recursion issue)
-    return pd.get_dummies(data, columns=args, drop_first=drop_first, dtype=dtype) if args else data.copy()
+    return pd.get_dummies(data, columns=args, drop_first=drop_first, dtype=dtype) if args else data.copy()  # type: ignore


 # ===================================================================
@@ -630,7 +631,7 @@ def bin_continuous(
     if apply_labels:
         # Use numeric indexes (0, 1, 2, ...)
         numeric_labels = list(range(len(edges) - 1))
-        df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False)
+        df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False)  # type: ignore
     else:
         # Apply string labels
         if labels is None:
@@ -645,9 +646,9 @@ def bin_continuous(
                 except:
                     pass
                 auto_labels.append(f"{left}~{right}")
-            df[new_col] = pd.cut(series, bins=edges, labels=auto_labels, include_lowest=True, ordered=False)
+            df[new_col] = pd.cut(series, bins=edges, labels=auto_labels, include_lowest=True, ordered=False)  # type: ignore
         else:
-            df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False)
+            df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False)  # type: ignore

     df[new_col] = df[new_col].astype("category")
     return df
@@ -671,26 +672,24 @@ def bin_continuous(
         n_bins = len(edges) - 1
         if apply_labels:
             numeric_labels = list(range(n_bins))
-            df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False)
+            df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False)  # type: ignore
         else:
             if labels is None:
                 position_labels = [f"Q{i+1}" for i in range(n_bins)]
-                df[new_col] = pd.cut(
-                    series, bins=edges, labels=position_labels, include_lowest=True, ordered=False
-                )
+                df[new_col] = pd.cut(series, bins=edges, labels=position_labels, include_lowest=True, ordered=False)  # type: ignore
             else:
-                df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False)
+                df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False)  # type: ignore
         df[new_col] = df[new_col].astype("category")
         return df

     # Natural breaks (Jenks); fall back to quantiles if the dependency is missing
     if method_key in {"natural_breaks", "natural", "jenks"}:
         k = bins if isinstance(bins, int) and bins > 1 else 5
-        series_nonnull = series.dropna()
+        series_nonnull = series.dropna()  # type: ignore
         k = min(k, max(2, series_nonnull.nunique()))
         edges = None
         try:
-            edges = jenkspy.jenks_breaks(series_nonnull.to_list(), nb_class=k)
+            edges = jenkspy.jenks_breaks(series_nonnull.to_list(), nb_class=k)  # type: ignore
             edges[0] = -np.inf
             edges[-1] = np.inf
         except Exception:
@@ -730,7 +729,7 @@ def bin_continuous(
     if apply_labels:
         # Use numeric indexes
         numeric_labels = list(range(len(cut_edges) - 1))
-        df[new_col] = pd.cut(series, bins=cut_edges, labels=numeric_labels, include_lowest=True, ordered=False)
+        df[new_col] = pd.cut(series, bins=cut_edges, labels=numeric_labels, include_lowest=True, ordered=False)  # type: ignore
     else:
         if labels is None:
             auto_labels = []
@@ -744,9 +743,9 @@ def bin_continuous(
                 except:
                     pass
                 auto_labels.append(f"{left}~{right}")
-            df[new_col] = pd.cut(series, bins=cut_edges, labels=auto_labels, include_lowest=True, ordered=False)
+            df[new_col] = pd.cut(series, bins=cut_edges, labels=auto_labels, include_lowest=True, ordered=False)  # type: ignore
         else:
-            df[new_col] = pd.cut(series, bins=cut_edges, labels=labels, include_lowest=True, ordered=False)
+            df[new_col] = pd.cut(series, bins=cut_edges, labels=labels, include_lowest=True, ordered=False)  # type: ignore
     df[new_col] = df[new_col].astype("category")
     return df

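Almost every signature change in hs_prep.py follows one pattern: defaults written as `columns: list = None` rely on implicit Optional, which PEP 484 deprecates and strict checkers such as mypy and Pyright reject, so 0.4.5 spells the union out as `list | None`. The added `# type: ignore` comments cover the remaining complaints, mostly where a name bound by `*args: str` (a `tuple[str, ...]`) is rebound to a list. A minimal sketch of the pattern, assuming only what the diff shows (the function name `select_cols` is hypothetical, not from the package):

    from pandas import DataFrame

    # 0.4.4 style: implicit Optional; a strict checker rejects the default
    # because None is not a value of type "list".
    # def select_cols(data: DataFrame, *args: str, columns: list = None) -> DataFrame: ...

    # 0.4.5 style: None is part of the declared parameter type.
    def select_cols(data: DataFrame, *args: str, columns: list | None = None) -> DataFrame:
        if columns is not None:
            if args:
                raise ValueError("The args and columns arguments cannot be used together.")
            args = columns  # type: ignore  # rebinds a tuple[str, ...] name to a list
        return data[list(args)] if args else data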
hossam/hs_stats.py
CHANGED
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-

 from __future__ import annotations
-from typing import overload, Tuple, Literal, Union
+from typing import overload, Tuple, Literal, Union, Any

 # -------------------------------------------------------------
 import numpy as np
@@ -24,9 +24,9 @@ from scipy.stats import (
     normaltest,
     bartlett,
     levene,
-    ttest_1samp,
+    ttest_1samp,  # type: ignore
     ttest_ind as scipy_ttest_ind,
-    ttest_rel,
+    ttest_rel,  # type: ignore
     wilcoxon,
     pearsonr,
     spearmanr,
@@ -375,29 +375,29 @@ def describe(data: DataFrame, *fields: str, columns: list | None = None):
     outlier_rate = (outlier_count / len(data)) * 100

     # Classify the distribution shape (by skewness)
-    abs_skew = abs(skew)
-    if abs_skew < 0.5:
+    abs_skew = abs(skew)  # type: ignore
+    if abs_skew < 0.5:  # type: ignore
         dist = "almost symmetric"
-    elif abs_skew < 1.0:
-        if skew > 0:
+    elif abs_skew < 1.0:  # type: ignore
+        if skew > 0:  # type: ignore
             dist = "weak right tail"
         else:
             dist = "weak left tail"
-    elif abs_skew < 2.0:
-        if skew > 0:
+    elif abs_skew < 2.0:  # type: ignore
+        if skew > 0:  # type: ignore
             dist = "moderate right tail"
         else:
             dist = "moderate left tail"
     else:
-        if skew > 0:
+        if skew > 0:  # type: ignore
             dist = "extreme right tail"
         else:
             dist = "extreme left tail"

     # Assess the need for a log transform
-    if abs_skew < 0.5:
+    if abs_skew < 0.5:  # type: ignore
         log_need = "low"
-    elif abs_skew < 1.0:
+    elif abs_skew < 1.0:  # type: ignore
         log_need = "moderate"
     else:
         log_need = "high"
@@ -473,7 +473,7 @@ def category_describe(data: DataFrame, *fields: str):
     """
     if not fields:
         # Select nominal (categorical) columns: object, category, bool dtypes
-        fields = data.select_dtypes(include=['object', 'category', 'bool']).columns
+        fields = data.select_dtypes(include=['object', 'category', 'bool']).columns  # type: ignore

     result = []
     summary = []
@@ -730,7 +730,7 @@ def equal_var_test(data: DataFrame, columns: list | str | None = None, normal_di
     normality_result = normal_test(data[numeric_cols], method="n")
     # Check whether every column follows a normal distribution
     all_normal = normality_result["is_normal"].all()
-    normal_dist = all_normal
+    normal_dist = all_normal  # type: ignore

     try:
         if normal_dist:
@@ -829,7 +829,7 @@ def ttest_1samp(data, mean_value: float = 0.0) -> DataFrame:
     else:
         for a in alternative:
             try:
-                s, p = ttest_1samp(col_data, mean_value, alternative=a)
+                s, p = ttest_1samp(col_data, mean_value, alternative=a)  # type: ignore

                 itp = None

@@ -939,26 +939,26 @@ def ttest_ind(x, y, equal_var: bool | None = None) -> DataFrame:

     for a in alternative:
         try:
-            s, p = scipy_ttest_ind(x_data, y_data, equal_var=equal_var, alternative=a)
+            s, p = scipy_ttest_ind(x_data, y_data, equal_var=equal_var, alternative=a)  # type: ignore
             n = "t-test_ind" if equal_var else "Welch's t-test"

             # Interpret the test result
             itp = None

             if a == "two-sided":
-                itp = fmt.format("==" if p > 0.05 else "!=")
+                itp = fmt.format("==" if p > 0.05 else "!=")  # type: ignore
             elif a == "less":
-                itp = fmt.format(">=" if p > 0.05 else "<")
+                itp = fmt.format(">=" if p > 0.05 else "<")  # type: ignore
             else:
-                itp = fmt.format("<=" if p > 0.05 else ">")
+                itp = fmt.format("<=" if p > 0.05 else ">")  # type: ignore

             result.append({
                 "test": n,
                 "alternative": a,
-                "statistic": round(s, 3),
-                "p-value": round(p, 4),
-                "H0": p > 0.05,
-                "H1": p <= 0.05,
+                "statistic": round(s, 3),  # type: ignore
+                "p-value": round(p, 4),  # type: ignore
+                "H0": p > 0.05,  # type: ignore
+                "H1": p <= 0.05,  # type: ignore
                 "interpretation": itp,
                 "equal_var_checked": var_checked
             })
@@ -1068,7 +1068,7 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
     for a in alternative:
         try:
             if parametric:
-                s, p = ttest_rel(x_data, y_data, alternative=a)
+                s, p = ttest_rel(x_data, y_data, alternative=a)  # type: ignore
                 n = "t-test_paired"
             else:
                 # Wilcoxon signed-rank test (non-parametric test for paired samples)
@@ -1078,19 +1078,19 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
             itp = None

             if a == "two-sided":
-                itp = fmt.format("==" if p > 0.05 else "!=")
+                itp = fmt.format("==" if p > 0.05 else "!=")  # type: ignore
             elif a == "less":
-                itp = fmt.format(">=" if p > 0.05 else "<")
+                itp = fmt.format(">=" if p > 0.05 else "<")  # type: ignore
             else:
-                itp = fmt.format("<=" if p > 0.05 else ">")
+                itp = fmt.format("<=" if p > 0.05 else ">")  # type: ignore

             result.append({
                 "test": n,
                 "alternative": a,
-                "statistic": round(s, 3) if not np.isnan(s) else s,
-                "p-value": round(p, 4) if not np.isnan(p) else p,
-                "H0": p > 0.05,
-                "H1": p <= 0.05,
+                "statistic": round(s, 3) if not np.isnan(s) else s,  # type: ignore
+                "p-value": round(p, 4) if not np.isnan(p) else p,  # type: ignore
+                "H0": p > 0.05,  # type: ignore
+                "H1": p <= 0.05,  # type: ignore
                 "interpretation": itp,
                 "normality_checked": var_checked
             })
@@ -1117,7 +1117,7 @@ def ttest_rel(x, y, parametric: bool | None = None) -> DataFrame:
 # ===================================================================
 def vif_filter(
     data: DataFrame,
-    yname: str = None,
+    yname: str | None = None,
     ignore: list | None = None,
     threshold: float = 10.0,
     verbose: bool = False,
@@ -1182,7 +1182,7 @@ def vif_filter(
     for i, col in enumerate(X_clean.columns, start=0):
         # The first column of exog is the constant term, so the variable index is +1
         try:
-            vifs[col] = float(variance_inflation_factor(exog.values, i + 1))
+            vifs[col] = float(variance_inflation_factor(exog.values, i + 1))  # type: ignore
         except Exception:
             # On calculation failure, treat as infinity so it becomes a removal candidate first
             vifs[col] = float("inf")
@@ -1220,7 +1220,7 @@ def vif_filter(
 # ===================================================================
 # Compute the trend line for x, y data.
 # ===================================================================
-def trend(x:
+def trend(x: Any, y: Any, degree: int = 1, value_count: int = 100) -> Tuple[np.ndarray, np.ndarray]:
     """Computes the trend line for x, y data.

     Args:
@@ -1324,7 +1324,7 @@ def ols_report(fit, data, full=False, alpha=0.05) -> Union[
     for i, col in enumerate(indi_df.columns, start=1):  # the constant term is index 0, so start at 1
         try:
             with np.errstate(divide='ignore', invalid='ignore'):
-                vif_value = variance_inflation_factor(indi_df_const.values, i)
+                vif_value = variance_inflation_factor(indi_df_const.values, i)  # type: ignore
                 # Handle inf or very large values
                 if np.isinf(vif_value) or vif_value > 1e10:
                     vif_dict[col] = np.inf
@@ -1531,11 +1531,11 @@ def ols(df: DataFrame, yname: str, report: bool | str | int = False) -> Union[
         return linear_fit
     elif report == 1 or report == 'summary':
         # Summary report (full=False)
-        pdf, rdf = ols_report(linear_fit, df, full=False, alpha=0.05)
+        pdf, rdf = ols_report(linear_fit, df, full=False, alpha=0.05)  # type: ignore
         return linear_fit, pdf, rdf
     elif report == 2 or report == 'full' or report is True:
         # Full report (full=True)
-        pdf, rdf, result_report, model_report, variable_reports, equation_text = ols_report(linear_fit, df, full=True, alpha=0.05)
+        pdf, rdf, result_report, model_report, variable_reports, equation_text = ols_report(linear_fit, df, full=True, alpha=0.05)  # type: ignore
         return linear_fit, pdf, rdf, result_report, model_report, variable_reports, equation_text
     else:
         # Default: no report
@@ -1657,7 +1657,7 @@ def logit_report(
     vif_dict = {}
     x_const = sm.add_constant(x, has_constant="add")
     for i, col in enumerate(x.columns, start=1):  # the constant term is index 0, so start at 1
-        vif_dict[col] = variance_inflation_factor(x_const.values, i)
+        vif_dict[col] = variance_inflation_factor(x_const.values, i)  # type: ignore

     for idx, row in tbl.iterrows():
         name = idx
@@ -1770,7 +1770,7 @@ def logit(
         DataFrame,
         str,
         str,
-
+        list[str]
     ]
 ]:
     """Performs logistic regression analysis and returns the fitted result.
@@ -1838,13 +1838,13 @@ def logit(
         return logit_fit
     elif report == 1 or report == 'summary':
         # Summary report (full=False)
-        cdf, rdf = logit_report(logit_fit, df, threshold=0.5, full=False, alpha=0.05)
+        cdf, rdf = logit_report(logit_fit, df, threshold=0.5, full=False, alpha=0.05)  # type: ignore
         # The summary includes only result_report and variable_reports
         # Generate a simple version with only result and variable_reports
         return logit_fit, rdf
     elif report == 2 or report == 'full' or report is True:
         # Full report (full=True)
-        cdf, rdf, result_report, model_report, variable_reports, cm = logit_report(logit_fit, df, threshold=0.5, full=True, alpha=0.05)
+        cdf, rdf, result_report, model_report, variable_reports, cm = logit_report(logit_fit, df, threshold=0.5, full=True, alpha=0.05)  # type: ignore
         return logit_fit, cdf, rdf, result_report, model_report, variable_reports
     else:
         # Default: no report
@@ -1854,7 +1854,7 @@ def logit(
 # ===================================================================
 # Linearity Test
 # ===================================================================
-def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05, plot: bool = False, title: str = None, save_path: str = None) -> DataFrame:
+def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05, plot: bool = False, title: str | None = None, save_path: str | None = None) -> DataFrame:
     """Evaluates the linearity of a regression model with the Ramsey RESET test.

     Performs the Ramsey RESET (Regression Specification Error Test) on the fitted regression model and
@@ -1961,7 +1961,7 @@ def ols_linearity_test(fit, power: int = 2, alpha: float = 0.05, plot: bool = Fa
 # ===================================================================
 # Normality Test
 # ===================================================================
-def ols_normality_test(fit, alpha: float = 0.05, plot: bool = False, title: str = None, save_path: str = None) -> DataFrame:
+def ols_normality_test(fit, alpha: float = 0.05, plot: bool = False, title: str | None = None, save_path: str | None = None) -> DataFrame:
     """Tests the normality of the regression model's residuals.

     Evaluates whether the model residuals follow a normal distribution using the Shapiro-Wilk and Jarque-Bera tests.
@@ -2029,7 +2029,7 @@ def ols_normality_test(fit, alpha: float = 0.05, plot: bool = False, title: str
     # 2. Jarque-Bera test (always performed)
     try:
         stat_jb, p_jb = jarque_bera(residuals)
-        significant_jb = p_jb <= alpha
+        significant_jb = p_jb <= alpha  # type: ignore

         if significant_jb:
             interpretation_jb = f"Normality violated (p={p_jb:.4f} <= {alpha})"
@@ -2362,8 +2362,8 @@ def corr_pairwise(
         corr_val, pval = np.nan, np.nan

     # 4) Significance and strength
-    significant = False if np.isnan(pval) else pval <= alpha
-    abs_r = abs(corr_val) if not np.isnan(corr_val) else 0
+    significant = False if np.isnan(pval) else pval <= alpha  # type: ignore
+    abs_r = abs(corr_val) if not np.isnan(corr_val) else 0  # type: ignore
     if abs_r > 0.7:
         strength = "strong"
     elif abs_r > 0.3:
@@ -2530,13 +2530,13 @@ def oneway_anova(data: DataFrame, dv: str, between: str, alpha: float = 0.05) ->
     anova_df['significant'] = anova_df['p-unc'] <= alpha

     # Check whether the ANOVA result is significant
-    p_unc = float(anova_df.loc[0, 'p-unc'])
+    p_unc = float(anova_df.loc[0, 'p-unc'])  # type: ignore
     anova_significant = p_unc <= alpha

     # Build the ANOVA report sentence
     def _safe_get(col: str, default: float = np.nan) -> float:
         try:
-            return float(anova_df.loc[0, col]) if col in anova_df.columns else default
+            return float(anova_df.loc[0, col]) if col in anova_df.columns else default  # type: ignore
         except Exception:
             return default

@@ -2851,7 +2851,7 @@ def predict(fit, data: DataFrame | Series) -> DataFrame | Series | float:

     # For Series input, return a single value
     if is_series:
-        return float(predictions.iloc[0])
+        return float(predictions.iloc[0])  # type: ignore

     # For DataFrame input
     if isinstance(data, DataFrame):
@@ -2924,8 +2924,7 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05

     # If fields is not given, use all numeric columns except dv
     if not fields:
-        fields = [col for col in data.columns
-                  if is_numeric_dtype(data[col]) and col != dv]
+        fields = [col for col in data.columns if is_numeric_dtype(data[col]) and col != dv]  # type: ignore

     # Check that dv is numeric
     if not is_numeric_dtype(data[dv]):
@@ -2953,8 +2952,8 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05
     normal_y_result = normal_test(data[[dv]], columns=[dv], method=method_y)

     # Normality decision (assume normal if p > alpha)
-    normal_x = normal_x_result.loc[var, 'p-val'] > alpha if var in normal_x_result.index else False
-    normal_y = normal_y_result.loc[dv, 'p-val'] > alpha if dv in normal_y_result.index else False
+    normal_x = normal_x_result.loc[var, 'p-val'] > alpha if var in normal_x_result.index else False  # type: ignore
+    normal_y = normal_y_result.loc[dv, 'p-val'] > alpha if dv in normal_y_result.index else False  # type: ignore

     # Pearson (both normal) vs Spearman (either non-normal)
     if normal_x and normal_y:
@@ -2966,8 +2965,8 @@ def corr_effect_size(data: DataFrame, dv: str, *fields: str, alpha: float = 0.05

     # Compute Cohen's d (convert correlation to effect size)
     # d = 2*r / sqrt(1-r^2)
-    if r**2 < 1:
-        d = (2 * r) / np.sqrt(1 - r**2)
+    if r ** 2 < 1:  # type: ignore
+        d = (2 * r) / np.sqrt(1 - r ** 2)  # type: ignore
     else:
         d = 0

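The hs_stats.py changes are the same two moves: explicit `str | None` defaults and `# type: ignore` on calls into scipy and statsmodels whose stubs do not match how the code uses the results (for example, `ttest_1samp` returns a result object that the code unpacks like a plain `(statistic, pvalue)` tuple). A small sketch of the two options, assuming only public scipy.stats behavior (not the package's code):

    import numpy as np
    from scipy.stats import ttest_1samp

    sample = np.array([1.2, 0.8, 1.1, 0.9, 1.3])

    # What 0.4.5 does: keep tuple unpacking, which works at runtime,
    # and silence the checker with a comment.
    s, p = ttest_1samp(sample, popmean=1.0, alternative="two-sided")  # type: ignore

    # An ignore-free alternative: read the named fields of the result object.
    res = ttest_1samp(sample, popmean=1.0, alternative="two-sided")
    s2, p2 = float(res.statistic), float(res.pvalue)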
hossam/hs_util.py
CHANGED
@@ -122,11 +122,9 @@ def pretty_table(data: DataFrame, tablefmt="simple", headers: str = "keys") -> N
     ```
     """

-    tabulate.WIDE_CHARS_MODE = False
+    tabulate.WIDE_CHARS_MODE = False  # type: ignore
     print(
-        tabulate(
-            data, headers=headers, tablefmt=tablefmt, showindex=True, numalign="right"
-        )
+        tabulate(data, headers=headers, tablefmt=tablefmt, showindex=True, numalign="right")  # type: ignore
     )


@@ -167,7 +165,7 @@ def __data_info(

     if info:
         print("\n✅ Table info")
-        pretty_table(data.info(), tablefmt="pretty")
+        pretty_table(data.info(), tablefmt="pretty")  # type: ignore

     print("\n✅ Top 5 rows")
     pretty_table(data.head(), tablefmt="pretty")
@@ -229,7 +227,7 @@ def load_data(key: str,
     elif k.endswith(".csv"):
         origin = read_csv(key)
     else:
-        origin = _load_data_remote(key, local)
+        origin = _load_data_remote(key, local)  # type: ignore

     if origin is None:
         raise RuntimeError("Data loading failed: origin is None")
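One hs_util.py suppression is worth calling out: `pandas.DataFrame.info()` prints its report to stdout and returns None, so `pretty_table(data.info(), tablefmt="pretty")` actually tabulates None; the new `# type: ignore` hides the type mismatch rather than fixing it. The underlying fact, in plain pandas:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    out = df.info()      # the report is printed here as a side effect
    print(out is None)   # True: there is nothing left to tabulate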
hossam-0.4.5.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
+hossam/__init__.py,sha256=OkMeP15jt6aCy7QNXMtkO0YRVvgOQYumkb7GuVKrbcs,2712
+hossam/data_loader.py,sha256=K0-MJaVeedF5x8mSp22X2rD_CZ-T185EhoUFEqzP8Ss,6352
+hossam/hs_classroom.py,sha256=rgayol3U5PSo4rLfdbClfiAtG21bFrASaSW56PUsjus,27144
+hossam/hs_gis.py,sha256=DVmndBK-_7GMK3J1_on3ieEQk1S0MfUZ8_wlX-cDdZQ,11581
+hossam/hs_plot.py,sha256=3j9B69pl-zQM_09lTXxLKAMaDM0vwOTsUWbzcU8hCK8,86228
+hossam/hs_prep.py,sha256=kCmFxnMyFZ5tLUfoE8msbwTracajHAmruJbFj6A6eIU,38020
+hossam/hs_stats.py,sha256=uGYkEk8Rb8qMoZ5FiZ7Yg6jssLIGl_EBbmwvvSYljhQ,115780
+hossam/hs_timeserise.py,sha256=gSj3cPgOGLOZEXhfW1anXbwpoJja847ZY9F8l9piJPE,42601
+hossam/hs_util.py,sha256=xuNXC6FJSAmyAbcRAUMsigCKHXM25t3H90nFMgq7IBs,8482
+hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
+hossam-0.4.5.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
+hossam-0.4.5.dist-info/METADATA,sha256=HM5qrrvaFZWAyUlhgV_BLPHAcxEZdZ4gp2p3V4X4pzo,3676
+hossam-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+hossam-0.4.5.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
+hossam-0.4.5.dist-info/RECORD,,
hossam-0.4.4.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
-hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
-hossam/__init__.py,sha256=OkMeP15jt6aCy7QNXMtkO0YRVvgOQYumkb7GuVKrbcs,2712
-hossam/data_loader.py,sha256=oUIsqbHQoRiHA_1tdElDaYo1ipmUB5fYSXYMB5gLOl0,6395
-hossam/hs_classroom.py,sha256=rgayol3U5PSo4rLfdbClfiAtG21bFrASaSW56PUsjus,27144
-hossam/hs_gis.py,sha256=DLogaf5nxJBbG-d8QoH2g8UfZ1omMtmEXDYgNg8jtT0,11410
-hossam/hs_plot.py,sha256=tsJMi2q9SzHRSs25dXsHkkImW-Jk7su1M6TbKwX9koU,83887
-hossam/hs_prep.py,sha256=ocZNGzHzqgasVNLcb_LClTZaAeTYiIg4mzrixeEzBQU,37693
-hossam/hs_stats.py,sha256=LpUG8U9ybnh6qSMW2SKCSDJZTeMhLH2xH2Pj4i7U6TU,114889
-hossam/hs_timeserise.py,sha256=gSj3cPgOGLOZEXhfW1anXbwpoJja847ZY9F8l9piJPE,42601
-hossam/hs_util.py,sha256=8byLj_VR93vS__lyf0xgQKArgMy9qFm2VvZVSCxfQX0,8444
-hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
-hossam-0.4.4.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
-hossam-0.4.4.dist-info/METADATA,sha256=R6qOrcnZhbTzUrRK2x9vNksDjw8rVK1DVZrbRIPSPQQ,3676
-hossam-0.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-hossam-0.4.4.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
-hossam-0.4.4.dist-info/RECORD,,
{hossam-0.4.4.dist-info → hossam-0.4.5.dist-info}/WHEEL
File without changes

{hossam-0.4.4.dist-info → hossam-0.4.5.dist-info}/licenses/LICENSE
File without changes

{hossam-0.4.4.dist-info → hossam-0.4.5.dist-info}/top_level.txt
File without changes