hossam 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hossam/hs_prep.py CHANGED
@@ -5,22 +5,29 @@
5
5
  import joblib
6
6
  import numpy as np
7
7
  from itertools import combinations
8
- from typing import Any
8
+ from typing import Any, Callable
9
9
 
10
10
  import pandas as pd
11
11
  import jenkspy
12
12
  from pandas import DataFrame
13
13
  from sklearn.preprocessing import StandardScaler, MinMaxScaler
14
14
  from sklearn.impute import SimpleImputer
15
-
15
+ from sklearn.decomposition import PCA
16
16
 
17
17
  from .hs_util import pretty_table
18
+ from .hs_plot import config, pca_plot
19
+
20
+ RANDOM_STATE = 52
21
+
18
22
 
19
23
  # ===================================================================
20
24
  # 연속형 변수를 표준정규화(Z-score)로 변환한다
21
25
  # ===================================================================
22
26
  def standard_scaler(
23
- data: Any, yname: str | None = None, save_path: str | None = None, load_path: str | None = None
27
+ data: Any,
28
+ yname: str | None = None,
29
+ save_path: str | None = None,
30
+ load_path: str | None = None,
24
31
  ) -> DataFrame:
25
32
  """연속형 변수에 대해 Standard Scaling을 수행한다.
26
33
 
@@ -55,7 +62,7 @@ def standard_scaler(
55
62
  sdata = scaler.transform(arr) if load_path else scaler.fit_transform(arr)
56
63
  if save_path:
57
64
  joblib.dump(value=scaler, filename=save_path)
58
- return sdata # type: ignore
65
+ return sdata # type: ignore
59
66
 
60
67
  df = data.copy()
61
68
 
@@ -91,7 +98,10 @@ def standard_scaler(
91
98
  # 연속형 변수를 0부터 1 사이의 값으로 정규화한다
92
99
  # ===================================================================
93
100
  def minmax_scaler(
94
- data: Any, yname: str | None = None, save_path: str | None = None, load_path: str | None = None
101
+ data: Any,
102
+ yname: str | None = None,
103
+ save_path: str | None = None,
104
+ load_path: str | None = None,
95
105
  ) -> DataFrame:
96
106
  """연속형 변수에 대해 MinMax Scaling을 수행한다.
97
107
 
@@ -124,7 +134,7 @@ def minmax_scaler(
124
134
  sdata = scaler.transform(arr) if load_path else scaler.fit_transform(arr)
125
135
  if save_path:
126
136
  joblib.dump(scaler, save_path)
127
- return sdata # type: ignore
137
+ return sdata # type: ignore
128
138
 
129
139
  df = data.copy()
130
140
 
@@ -174,7 +184,7 @@ def set_category(data: DataFrame, *args: str, columns: list | None = None) -> Da
174
184
  if columns is not None:
175
185
  if args:
176
186
  raise ValueError("args와 columns 인자는 중복 사용할 수 없습니다.")
177
- args = columns # type: ignore
187
+ args = columns # type: ignore
178
188
 
179
189
  df = data.copy()
180
190
 
@@ -217,17 +227,22 @@ def unmelt(
217
227
  ```
218
228
  """
219
229
  # 그룹별로 값들을 리스트로 모음
220
- grouped = data.groupby(id_vars, observed=True)[value_vars].apply(lambda x: x.tolist())
230
+ grouped = data.groupby(id_vars, observed=True)[value_vars].apply(
231
+ lambda x: x.tolist()
232
+ )
221
233
  series_dict = {}
222
234
  for idx, values in grouped.items():
223
235
  series_dict[str(idx)] = pd.Series(values)
224
236
 
225
237
  return DataFrame(series_dict)
226
238
 
239
+
227
240
  # ===================================================================
228
241
  # 지정된 변수의 이상치 테이블로 반환한다
229
242
  # ===================================================================
230
- def outlier_table(data: DataFrame, *fields: str, columns: list | None = None) -> DataFrame:
243
+ def outlier_table(
244
+ data: DataFrame, *fields: str, columns: list | None = None
245
+ ) -> DataFrame:
231
246
  """수치형 컬럼에 대한 사분위수 및 IQR 기반 이상치 경계를 계산한다.
232
247
 
233
248
  전달된 `fields`가 없으면 데이터프레임의 모든 수치형 컬럼을 대상으로 한다.
@@ -247,11 +262,15 @@ def outlier_table(data: DataFrame, *fields: str, columns: list | None = None) ->
247
262
  """
248
263
  # columns 인자가 있으면 args보다 우선한다.
249
264
  if columns is not None:
250
- if args: # type: ignore
265
+ if args: # type: ignore
251
266
  raise ValueError("args와 columns 인자는 중복 사용할 수 없습니다.")
252
267
  args = columns
253
268
 
254
- target_fields = list(fields) if fields else list(data.select_dtypes(include=[np.number]).columns)
269
+ target_fields = (
270
+ list(fields)
271
+ if fields
272
+ else list(data.select_dtypes(include=[np.number]).columns)
273
+ )
255
274
  result = []
256
275
  for f in target_fields:
257
276
  if f not in data.columns:
@@ -287,7 +306,9 @@ def outlier_table(data: DataFrame, *fields: str, columns: list | None = None) ->
287
306
  # ===================================================================
288
307
  # 이상치를 대체값(NaN, 0) 또는 중앙값으로 교체한다
289
308
  # ===================================================================
290
- def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns: list | None = None) -> DataFrame:
309
+ def replace_outliner(
310
+ data: DataFrame, method: str = "nan", *fields: str, columns: list | None = None
311
+ ) -> DataFrame:
291
312
  """이상치 경계값을 넘어가는 데이터를 경계값으로 대체한다.
292
313
 
293
314
  Args:
@@ -306,7 +327,7 @@ def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns
306
327
  """
307
328
  # columns 인자가 있으면 args보다 우선한다.
308
329
  if columns is not None:
309
- if args: # type: ignore
330
+ if args: # type: ignore
310
331
  raise ValueError("args와 columns 인자는 중복 사용할 수 없습니다.")
311
332
  args = columns
312
333
 
@@ -331,7 +352,9 @@ def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns
331
352
  # 이상치가 발견된 필드에 대해서만 처리
332
353
  for f in outliner_table.index:
333
354
  if method == "outline":
334
- df.loc[df[f] < outliner_table.loc[f, "DOWN"], f] = outliner_table.loc[f, "DOWN"]
355
+ df.loc[df[f] < outliner_table.loc[f, "DOWN"], f] = outliner_table.loc[
356
+ f, "DOWN"
357
+ ]
335
358
  df.loc[df[f] > outliner_table.loc[f, "UP"], f] = outliner_table.loc[f, "UP"]
336
359
  else:
337
360
  df.loc[df[f] < outliner_table.loc[f, "DOWN"], f] = np.nan
@@ -344,7 +367,9 @@ def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns
344
367
  df_imr = imr.fit_transform(df.values)
345
368
  df = DataFrame(df_imr, index=data.index, columns=df.columns)
346
369
  elif method not in {"nan", "outline"}:
347
- raise ValueError("method는 'nan', 'outline', 'mean', 'median', 'most' 중 하나여야 합니다.")
370
+ raise ValueError(
371
+ "method는 'nan', 'outline', 'mean', 'median', 'most' 중 하나여야 합니다."
372
+ )
348
373
 
349
374
  # 분리했던 카테고리 타입을 다시 병합
350
375
  if category_fields:
@@ -352,10 +377,13 @@ def replace_outliner(data: DataFrame, method: str = "nan", *fields: str, columns
352
377
 
353
378
  return df
354
379
 
380
+
355
381
  # ===================================================================
356
382
  # 중빈 이상치를 제거한 연처리된 데이터프레임을 반환한다
357
383
  # ===================================================================
358
- def drop_outliner(data: DataFrame, *fields: str, columns: list | None = None) -> DataFrame:
384
+ def drop_outliner(
385
+ data: DataFrame, *fields: str, columns: list | None = None
386
+ ) -> DataFrame:
359
387
  """이상치를 결측치로 변환한 후 모두 삭제한다.
360
388
 
361
389
  Args:
@@ -368,7 +396,7 @@ def drop_outliner(data: DataFrame, *fields: str, columns: list | None = None) ->
368
396
  """
369
397
  # columns 인자가 있으면 args보다 우선한다.
370
398
  if columns is not None:
371
- if args: # type: ignore
399
+ if args: # type: ignore
372
400
  raise ValueError("args와 columns 인자는 중복 사용할 수 없습니다.")
373
401
  args = columns
374
402
 
@@ -379,7 +407,13 @@ def drop_outliner(data: DataFrame, *fields: str, columns: list | None = None) ->
379
407
  # ===================================================================
380
408
  # 범주 변수를 더미 변수(One-Hot 인코딩)로 변환한다
381
409
  # ===================================================================
382
- def get_dummies(data: DataFrame, *args: str, columns: list | None = None, drop_first: bool = True, dtype: str = "int") -> DataFrame:
410
+ def get_dummies(
411
+ data: DataFrame,
412
+ *args: str,
413
+ columns: list | None = None,
414
+ drop_first: bool = True,
415
+ dtype: str = "int",
416
+ ) -> DataFrame:
383
417
  """명목형 변수를 더미 변수로 변환한다.
384
418
 
385
419
  컬럼명을 지정하면 그 컬럼들만 더미 변수로 변환하고,
@@ -410,7 +444,7 @@ def get_dummies(data: DataFrame, *args: str, columns: list | None = None, drop_f
410
444
  if columns is not None:
411
445
  if args:
412
446
  raise ValueError("args와 columns 인자는 중복 사용할 수 없습니다.")
413
- args = columns # type: ignore
447
+ args = columns # type: ignore
414
448
 
415
449
  if not args:
416
450
  # args가 없으면 숫자 타입이 아닌 모든 컬럼 자동 선택
@@ -418,13 +452,13 @@ def get_dummies(data: DataFrame, *args: str, columns: list | None = None, drop_f
418
452
  for f in data.columns:
419
453
  if not pd.api.types.is_numeric_dtype(data[f]):
420
454
  cols_to_convert.append(f)
421
- args = cols_to_convert # type: ignore
455
+ args = cols_to_convert # type: ignore
422
456
  else:
423
457
  # args가 있으면 그 컬럼들만 사용 (존재 여부 확인)
424
- args = [c for c in args if c in data.columns] # type: ignore
458
+ args = [c for c in args if c in data.columns] # type: ignore
425
459
 
426
460
  # pandas.get_dummies 사용 (재귀 문제 없음)
427
- return pd.get_dummies(data, columns=args, drop_first=drop_first, dtype=dtype) if args else data.copy() # type: ignore
461
+ return pd.get_dummies(data, columns=args, drop_first=drop_first, dtype=dtype) if args else data.copy() # type: ignore
428
462
 
429
463
 
430
464
  # ===================================================================
@@ -561,7 +595,12 @@ def bin_continuous(
561
595
  new_col = new_col or f"{field}_bin"
562
596
  method_key = (method or "").lower()
563
597
 
564
- def _cut(edges: list[float], default_labels: list[str] | None = None, right: bool = False, ordered: bool = True):
598
+ def _cut(
599
+ edges: list[float],
600
+ default_labels: list[str] | None = None,
601
+ right: bool = False,
602
+ ordered: bool = True,
603
+ ):
565
604
  nonlocal labels
566
605
  use_labels = None
567
606
 
@@ -631,24 +670,32 @@ def bin_continuous(
631
670
  if apply_labels:
632
671
  # 숫자 인덱스 사용 (0, 1, 2, ...)
633
672
  numeric_labels = list(range(len(edges) - 1))
634
- df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False) # type: ignore
673
+ df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False) # type: ignore
635
674
  else:
636
675
  # 문자 레이블 적용
637
676
  if labels is None:
638
677
  auto_labels = []
639
678
  for i in range(len(edges) - 1):
640
679
  left = f"{edges[i]:.2f}" if edges[i] != -np.inf else "-∞"
641
- right = f"{edges[i+1]:.2f}" if edges[i+1] != np.inf else "∞"
680
+ right = f"{edges[i+1]:.2f}" if edges[i + 1] != np.inf else "∞"
642
681
  # 정수값인 경우 소수점 제거
643
682
  try:
644
- left = str(int(float(left))) if float(left) == int(float(left)) else left
645
- right = str(int(float(right))) if float(right) == int(float(right)) else right
683
+ left = (
684
+ str(int(float(left)))
685
+ if float(left) == int(float(left))
686
+ else left
687
+ )
688
+ right = (
689
+ str(int(float(right)))
690
+ if float(right) == int(float(right))
691
+ else right
692
+ )
646
693
  except:
647
694
  pass
648
695
  auto_labels.append(f"{left}~{right}")
649
- df[new_col] = pd.cut(series, bins=edges, labels=auto_labels, include_lowest=True, ordered=False) # type: ignore
696
+ df[new_col] = pd.cut(series, bins=edges, labels=auto_labels, include_lowest=True, ordered=False) # type: ignore
650
697
  else:
651
- df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False) # type: ignore
698
+ df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False) # type: ignore
652
699
 
653
700
  df[new_col] = df[new_col].astype("category")
654
701
  return df
@@ -657,39 +704,45 @@ def bin_continuous(
657
704
  if method_key in {"quantile", "qcut", "equal_freq"}:
658
705
  k = bins if isinstance(bins, int) and bins > 0 else 4
659
706
  # apply_labels=False일 때 기본 레이블을 사분위수 위치(Q1~)로 설정
660
- default_q_labels = labels if labels is not None else [f"Q{i+1}" for i in range(k)]
707
+ default_q_labels = (
708
+ labels if labels is not None else [f"Q{i+1}" for i in range(k)]
709
+ )
661
710
  try:
662
711
  if apply_labels:
663
712
  # 숫자 인덱스 사용
664
713
  numeric_labels = list(range(k))
665
- df[new_col] = pd.qcut(series, q=k, labels=numeric_labels, duplicates="drop")
714
+ df[new_col] = pd.qcut(
715
+ series, q=k, labels=numeric_labels, duplicates="drop"
716
+ )
666
717
  else:
667
718
  # 사분위수 위치 기반 문자 레이블(Q1, Q2, ...)
668
- df[new_col] = pd.qcut(series, q=k, labels=default_q_labels, duplicates="drop")
719
+ df[new_col] = pd.qcut(
720
+ series, q=k, labels=default_q_labels, duplicates="drop"
721
+ )
669
722
  except ValueError:
670
723
  _, edges = pd.cut(series, bins=k, include_lowest=True, retbins=True)
671
724
  # apply_labels=True: 숫자 인덱스 / False: 문자 레이블
672
725
  n_bins = len(edges) - 1
673
726
  if apply_labels:
674
727
  numeric_labels = list(range(n_bins))
675
- df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False) # type: ignore
728
+ df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False) # type: ignore
676
729
  else:
677
730
  if labels is None:
678
731
  position_labels = [f"Q{i+1}" for i in range(n_bins)]
679
- df[new_col] = pd.cut(series, bins=edges, labels=position_labels, include_lowest=True, ordered=False) # type: ignore
732
+ df[new_col] = pd.cut(series, bins=edges, labels=position_labels, include_lowest=True, ordered=False) # type: ignore
680
733
  else:
681
- df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False) # type: ignore
734
+ df[new_col] = pd.cut(series, bins=edges, labels=labels, include_lowest=True, ordered=False) # type: ignore
682
735
  df[new_col] = df[new_col].astype("category")
683
736
  return df
684
737
 
685
738
  # 자연 구간화 (Jenks) - 의존성 없으면 분위수로 폴백
686
739
  if method_key in {"natural_breaks", "natural", "jenks"}:
687
740
  k = bins if isinstance(bins, int) and bins > 1 else 5
688
- series_nonnull = series.dropna() # type: ignore
741
+ series_nonnull = series.dropna() # type: ignore
689
742
  k = min(k, max(2, series_nonnull.nunique()))
690
743
  edges = None
691
744
  try:
692
- edges = jenkspy.jenks_breaks(series_nonnull.to_list(), nb_class=k) # type: ignore
745
+ edges = jenkspy.jenks_breaks(series_nonnull.to_list(), nb_class=k) # type: ignore
693
746
  edges[0] = -np.inf
694
747
  edges[-1] = np.inf
695
748
  except Exception:
@@ -706,18 +759,32 @@ def bin_continuous(
706
759
  if apply_labels:
707
760
  # 숫자 인덱스 사용
708
761
  numeric_labels = list(range(len(edges) - 1))
709
- df[new_col] = pd.cut(series, bins=edges, labels=numeric_labels, include_lowest=True, ordered=False)
762
+ df[new_col] = pd.cut(
763
+ series,
764
+ bins=edges,
765
+ labels=numeric_labels,
766
+ include_lowest=True,
767
+ ordered=False,
768
+ )
710
769
  df[new_col] = df[new_col].astype("category")
711
770
  else:
712
771
  if labels is None:
713
772
  auto_labels = []
714
773
  for i in range(len(edges) - 1):
715
774
  left = f"{edges[i]:.2f}" if edges[i] != -np.inf else "-∞"
716
- right = f"{edges[i+1]:.2f}" if edges[i+1] != np.inf else "∞"
775
+ right = f"{edges[i+1]:.2f}" if edges[i + 1] != np.inf else "∞"
717
776
  # 정수값인 경우 소수점 제거
718
777
  try:
719
- left = str(int(float(left))) if float(left) == int(float(left)) else left
720
- right = str(int(float(right))) if float(right) == int(float(right)) else right
778
+ left = (
779
+ str(int(float(left)))
780
+ if float(left) == int(float(left))
781
+ else left
782
+ )
783
+ right = (
784
+ str(int(float(right)))
785
+ if float(right) == int(float(right))
786
+ else right
787
+ )
721
788
  except:
722
789
  pass
723
790
  auto_labels.append(f"{left}~{right}")
@@ -729,23 +796,37 @@ def bin_continuous(
729
796
  if apply_labels:
730
797
  # 숫자 인덱스 사용
731
798
  numeric_labels = list(range(len(cut_edges) - 1))
732
- df[new_col] = pd.cut(series, bins=cut_edges, labels=numeric_labels, include_lowest=True, ordered=False) # type: ignore
799
+ df[new_col] = pd.cut(series, bins=cut_edges, labels=numeric_labels, include_lowest=True, ordered=False) # type: ignore
733
800
  else:
734
801
  if labels is None:
735
802
  auto_labels = []
736
803
  for i in range(len(cut_edges) - 1):
737
- left = f"{cut_edges[i]:.2f}" if cut_edges[i] != -np.inf else "-∞"
738
- right = f"{cut_edges[i+1]:.2f}" if cut_edges[i+1] != np.inf else ""
804
+ left = (
805
+ f"{cut_edges[i]:.2f}" if cut_edges[i] != -np.inf else "-∞"
806
+ )
807
+ right = (
808
+ f"{cut_edges[i+1]:.2f}"
809
+ if cut_edges[i + 1] != np.inf
810
+ else "∞"
811
+ )
739
812
  # 정수값인 경우 소수점 제거
740
813
  try:
741
- left = str(int(float(left))) if float(left) == int(float(left)) else left
742
- right = str(int(float(right))) if float(right) == int(float(right)) else right
814
+ left = (
815
+ str(int(float(left)))
816
+ if float(left) == int(float(left))
817
+ else left
818
+ )
819
+ right = (
820
+ str(int(float(right)))
821
+ if float(right) == int(float(right))
822
+ else right
823
+ )
743
824
  except:
744
825
  pass
745
826
  auto_labels.append(f"{left}~{right}")
746
- df[new_col] = pd.cut(series, bins=cut_edges, labels=auto_labels, include_lowest=True, ordered=False) # type: ignore
827
+ df[new_col] = pd.cut(series, bins=cut_edges, labels=auto_labels, include_lowest=True, ordered=False) # type: ignore
747
828
  else:
748
- df[new_col] = pd.cut(series, bins=cut_edges, labels=labels, include_lowest=True, ordered=False) # type: ignore
829
+ df[new_col] = pd.cut(series, bins=cut_edges, labels=labels, include_lowest=True, ordered=False) # type: ignore
749
830
  df[new_col] = df[new_col].astype("category")
750
831
  return df
751
832
 
@@ -764,7 +845,9 @@ def bin_continuous(
764
845
  # ===================================================================
765
846
  # 지정된 변수에 로그 먼저 변환을 적용한다
766
847
  # ===================================================================
767
- def log_transform(data: DataFrame, *fields: str, columns: list | None = None) -> DataFrame:
848
+ def log_transform(
849
+ data: DataFrame, *fields: str, columns: list | None = None
850
+ ) -> DataFrame:
768
851
  """수치형 변수에 대해 로그 변환을 수행한다.
769
852
 
770
853
  자연로그(ln)를 사용하여 변환하며, 0 또는 음수 값이 있을 경우
@@ -803,7 +886,7 @@ def log_transform(data: DataFrame, *fields: str, columns: list | None = None) ->
803
886
  if columns is not None:
804
887
  if fields:
805
888
  raise ValueError("fields와 columns 인자는 중복 사용할 수 없습니다.")
806
- fields = columns # type: ignore
889
+ fields = columns # type: ignore
807
890
 
808
891
  # 대상 컬럼 결정
809
892
  if not fields:
@@ -818,7 +901,14 @@ def log_transform(data: DataFrame, *fields: str, columns: list | None = None) ->
818
901
  if col not in df.columns:
819
902
  continue
820
903
 
821
- if df[col].dtype not in ['int', 'int32', 'int64', 'float', 'float32', 'float64']:
904
+ if df[col].dtype not in [
905
+ "int",
906
+ "int32",
907
+ "int64",
908
+ "float",
909
+ "float32",
910
+ "float64",
911
+ ]:
822
912
  continue
823
913
 
824
914
  # 최소값 확인
@@ -837,7 +927,9 @@ def log_transform(data: DataFrame, *fields: str, columns: list | None = None) ->
837
927
  # ===================================================================
838
928
  # 변수 간의 상호작용 항을 추가한 데이터프레임을 반환한다
839
929
  # ===================================================================
840
- def add_interaction(data: DataFrame, pairs: list[tuple[str, str]] | None = None) -> DataFrame:
930
+ def add_interaction(
931
+ data: DataFrame, pairs: list[tuple[str, str]] | None = None
932
+ ) -> DataFrame:
841
933
  """데이터프레임에 상호작용(interaction) 항을 추가한다.
842
934
 
843
935
  수치형 및 명목형 변수 간의 상호작용 항을 생성하여 데이터프레임에 추가한다.
@@ -879,8 +971,11 @@ def add_interaction(data: DataFrame, pairs: list[tuple[str, str]] | None = None)
879
971
 
880
972
  # pairs가 제공되면 그것을 사용, 아니면 모든 수치형 컬럼의 2-way 상호작용 생성
881
973
  if pairs is not None:
882
- cols_to_interact = [(col1, col2) for col1, col2 in pairs
883
- if col1 in df.columns and col2 in df.columns]
974
+ cols_to_interact = [
975
+ (col1, col2)
976
+ for col1, col2 in pairs
977
+ if col1 in df.columns and col2 in df.columns
978
+ ]
884
979
  else:
885
980
  numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
886
981
  cols_to_interact = list(combinations(numeric_cols, 2))
@@ -919,8 +1014,98 @@ def add_interaction(data: DataFrame, pairs: list[tuple[str, str]] | None = None)
919
1014
  interaction_col_name = f"{col1}_{col2}"
920
1015
  # 문자열로 변환 후 결합 (결측치 처리 포함)
921
1016
  df[interaction_col_name] = (
922
- df[col1].astype(str).fillna("nan") + "_" +
923
- df[col2].astype(str).fillna("nan")
1017
+ df[col1].astype(str).fillna("nan")
1018
+ + "_"
1019
+ + df[col2].astype(str).fillna("nan")
924
1020
  )
925
1021
 
926
1022
  return df
1023
+
1024
+
1025
+ # ===================================================================
1026
+ # PCA 분석
1027
+ # ===================================================================
1028
+ def pca(
1029
+ data: DataFrame,
1030
+ n_components: int | float | str = 0.8,
1031
+ yname: str | None = None,
1032
+ random_state: int = RANDOM_STATE,
1033
+ plot: bool = False,
1034
+ fields: list | tuple | list[list] | tuple[list] | list[tuple] | tuple[tuple] | None = None,
1035
+ palette: str | None = None,
1036
+ width: int = config.width,
1037
+ height: int = config.height,
1038
+ linewidth: float = config.line_width,
1039
+ dpi: int = config.dpi,
1040
+ save_path: str | None = None,
1041
+ callback: Callable | None = None,
1042
+ ) -> tuple[PCA, DataFrame, DataFrame]:
1043
+ """
1044
+ 주성분 분석(PCA)을 수행하고, 주성분 데이터프레임과 설명된 분산 비율 데이터프레임을 반환합니다.
1045
+
1046
+ Args:
1047
+ data (DataFrame): 입력 데이터프레임.
1048
+ n_components (int|float|str): 주성분의 수 또는 설명할 분산 비율. (기본값 0.8)
1049
+ yname (str|None): 종속 변수 이름. 종속변수 이름이 주어진 경우 해당 컬럼은 제외하고 처리합니다. (기본값 None)
1050
+ random_state (int): 랜덤 시드. (기본값 RANDOM_STATE)
1051
+ plot (bool): 주성분 설명력 그래프를 출력할지 여부. (기본값 False)
1052
+ fields (list|tuple|list[list]|tuple[list]|list[tuple]|tuple[tuple]|None): 산점도에 표시할 변수 목록.
1053
+ palette (str|None): 팔레트 이름.
1054
+ width (int): 캔버스 가로 픽셀.
1055
+ height (int): 캔버스 세로 픽셀.
1056
+ linewidth (float): 선 굵기.
1057
+ dpi (int): 그림 크기 및 해상도.
1058
+ save_path (str|None): 저장 경로.
1059
+ callback (Callable|None): Axes 후처리 콜백.
1060
+
1061
+ Returns:
1062
+ tuple[DataFrame, DataFrame]: 주성분 데이터프레임과 설명된 분산 비율 데이터프레임.
1063
+ """
1064
+
1065
+ df = data.copy()
1066
+ yfield = None
1067
+
1068
+ if yname is not None:
1069
+ if yname not in df.columns:
1070
+ raise ValueError(f"yname '{yname}'이(가) 데이터프레임에 존재하지 않습니다.")
1071
+
1072
+ yfield = df[[yname]].copy()
1073
+ df = df.drop(columns=[yname])
1074
+
1075
+ estimator = PCA(n_components=n_components, random_state=random_state)
1076
+ pca = estimator.fit_transform(df)
1077
+
1078
+ n = pca.shape[1]
1079
+ cols = [f"PC{i+1}" for i in range(n)]
1080
+
1081
+ pca_df = DataFrame(pca, columns=cols)
1082
+
1083
+ if yfield is not None:
1084
+ pca_df[yname] = yfield
1085
+
1086
+ # 주성분 로딩 행렬 구성
1087
+ loadings = pd.DataFrame(estimator.components_, columns=df.columns.tolist(), index=cols)
1088
+ loadings.T.index.name = "Features"
1089
+
1090
+ # 주성분별 설명력과 누적합을 데이터프레임에 추가 구성
1091
+ loadings[f"[Explained Variance]"] = estimator.explained_variance_ratio_
1092
+ loadings[f"[Cumulative Variance]"] = estimator.explained_variance_ratio_.cumsum()
1093
+ loadings = loadings.T
1094
+
1095
+ if plot:
1096
+ pca_plot(
1097
+ estimator=estimator,
1098
+ data = data,
1099
+ yname = yname,
1100
+ fields = fields,
1101
+ hue = yname,
1102
+ palette = palette,
1103
+ width = width,
1104
+ height = height,
1105
+ linewidth = linewidth,
1106
+ dpi = dpi,
1107
+ save_path = save_path,
1108
+ callback = callback,
1109
+ )
1110
+
1111
+ return estimator, pca_df, loadings
hossam/hs_util.py CHANGED
@@ -474,3 +474,23 @@ def load_data(key: str,
474
474
  raise RuntimeError("Data loading failed: origin is None")
475
475
 
476
476
  return __data_info(origin, index_col, timeindex, info, categories)
477
+
478
+
479
+ # ===================================================================
480
+ # 2차원 리스트 여부 확인
481
+ # ===================================================================
482
+ def is_2d(x) -> bool:
483
+ """
484
+ 주어진 객체가 2차원 리스트인지 확인합니다.
485
+
486
+ Args:
487
+ x: 확인할 객체
488
+
489
+ Returns:
490
+ bool: 객체가 2차원 리스트인 경우 True, 그렇지 않은 경우 False
491
+ """
492
+ return (
493
+ isinstance(x, (list, tuple)) and
494
+ len(x) > 0 and
495
+ all(isinstance(i, (list, tuple)) for i in x)
496
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hossam
3
- Version: 0.4.15
3
+ Version: 0.4.17
4
4
  Summary: Hossam Data Helper
5
5
  Author-email: Lee Kwang-Ho <leekh4232@gmail.com>
6
6
  License-Expression: MIT
@@ -1,17 +1,17 @@
1
1
  hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
2
2
  hossam/__init__.py,sha256=lJ-_g2HAmFnixOmKjCv7_cMSdiYwbM6SNlHEtptUlUI,4045
3
3
  hossam/hs_classroom.py,sha256=Sb1thy49LKn2zU90aiOVwHWhyWSMHLZbZX7eXmQlquc,27523
4
- hossam/hs_cluster.py,sha256=umgJRRr9vXkaFVIs6V_K3SrOo6Q5AEqVYmQSMaGj7rY,31945
4
+ hossam/hs_cluster.py,sha256=yFlEaLz-cEueurw5nvKuUogdwepOEU3jN7woaYeN-cM,37581
5
5
  hossam/hs_gis.py,sha256=DVmndBK-_7GMK3J1_on3ieEQk1S0MfUZ8_wlX-cDdZQ,11581
6
- hossam/hs_plot.py,sha256=Xv5aBJ8PibAW9QP-cvv0mHM7MQkByNs0mZKArrDcWwY,94918
7
- hossam/hs_prep.py,sha256=ypuX97mCxpo7CLoI_S79bUw7th0ok5LCZjt4vzRaGiI,38326
6
+ hossam/hs_plot.py,sha256=nuN8DoQbMRLMxFRO1VD1zhg2yA1NHqFGrjuvS4FLahY,102773
7
+ hossam/hs_prep.py,sha256=lsK_BxuOXNdTTNED-nkw3EY2vt5nQGMvkWvVLqNkwM0,43255
8
8
  hossam/hs_stats.py,sha256=MDS3rvaXDP8aYwcE36JTetWiZgE4fkXnNo0vwlXu-pA,119890
9
9
  hossam/hs_study.py,sha256=ZzL76_V0IHnk_YUTbWncIIBruOj2Sz3xs91snS6cpu0,2776
10
10
  hossam/hs_timeserise.py,sha256=NzGV4bJmVQr3qUFySOP25qENItmloYjgh3VgwSbSmXc,43163
11
- hossam/hs_util.py,sha256=ptl-2W7-0Ad_BemZMR8cFnDt6-SHCRRCk1Gh7giFjSs,16149
11
+ hossam/hs_util.py,sha256=LJvoTFSLX_uADJrC1ONGM93DX2HS2jcNBnPariLPZko,16704
12
12
  hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
13
- hossam-0.4.15.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
14
- hossam-0.4.15.dist-info/METADATA,sha256=XhcpBu7ZhjNeWCaPfJQxhmCj8IOHFqY0IJOX8SjJ_qk,3803
15
- hossam-0.4.15.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
16
- hossam-0.4.15.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
17
- hossam-0.4.15.dist-info/RECORD,,
13
+ hossam-0.4.17.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
14
+ hossam-0.4.17.dist-info/METADATA,sha256=camh3iJFBxxmzX1tHg71MWwNcqcY3a_GSKbNhf8iR7E,3803
15
+ hossam-0.4.17.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
16
+ hossam-0.4.17.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
17
+ hossam-0.4.17.dist-info/RECORD,,