hossam 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hossam/hs_plot.py CHANGED
@@ -1,7 +1,8 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  from __future__ import annotations
3
3
  from types import SimpleNamespace
4
- from typing import Callable
4
+ from typing import Callable, Literal
5
+ from itertools import combinations
5
6
 
6
7
  # ===================================================================
7
8
  import numpy as np
@@ -15,6 +16,7 @@ from pandas import DataFrame
15
16
  # ===================================================================
16
17
  from scipy.stats import t
17
18
  from scipy.spatial import ConvexHull
19
+ from scipy.cluster.hierarchy import dendrogram as sch_dendrogram
18
20
  from statsmodels.graphics.gofplots import qqplot as sm_qqplot
19
21
  from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess
20
22
 
@@ -23,6 +25,7 @@ from statannotations.Annotator import Annotator
23
25
 
24
26
  # ===================================================================
25
27
  from sklearn.cluster import AgglomerativeClustering, KMeans
28
+ from sklearn.decomposition import PCA
26
29
 
27
30
  from sklearn.metrics import (
28
31
  mean_squared_error,
@@ -34,6 +37,8 @@ from sklearn.metrics import (
34
37
  silhouette_samples,
35
38
  )
36
39
 
40
+ from .hs_util import is_2d
41
+
37
42
  # ===================================================================
38
43
  config = SimpleNamespace(
39
44
  dpi=200,
@@ -304,7 +309,7 @@ def lineplot(
304
309
  # 상자그림(boxplot)을 그린다
305
310
  # ===================================================================
306
311
  def boxplot(
307
- df: DataFrame,
312
+ df: DataFrame | None = None,
308
313
  xname: str | None = None,
309
314
  yname: str | None = None,
310
315
  title: str | None = None,
@@ -326,7 +331,7 @@ def boxplot(
326
331
  """상자그림(boxplot)을 그린다.
327
332
 
328
333
  Args:
329
- df (DataFrame): 시각화할 데이터.
334
+ df (DataFrame|None): 시각화할 데이터.
330
335
  xname (str|None): x축 범주 컬럼명.
331
336
  yname (str|None): y축 값 컬럼명.
332
337
  title (str|None): 그래프 제목.
@@ -354,13 +359,20 @@ def boxplot(
354
359
  fig, ax = get_default_ax(width, height, 1, 1, dpi) # type: ignore
355
360
  outparams = True
356
361
 
357
- if xname is not None and yname is not None:
362
+ if xname is not None or yname is not None:
363
+ if xname is not None and yname is None:
364
+ orient = "h"
365
+ elif xname is None and yname is not None:
366
+ orient = "v"
367
+
368
+
358
369
  boxplot_kwargs = {
359
370
  "data": df,
360
371
  "x": xname,
361
372
  "y": yname,
362
373
  "orient": orient,
363
374
  "ax": ax,
375
+ "linewidth": linewidth,
364
376
  }
365
377
 
366
378
  # hue 파라미터 확인 (params에 있을 수 있음)
@@ -372,12 +384,12 @@ def boxplot(
372
384
  boxplot_kwargs["color"] = sb.color_palette(palette)[0]
373
385
 
374
386
  boxplot_kwargs.update(params)
375
- sb.boxplot(**boxplot_kwargs, linewidth=linewidth)
387
+ sb.boxplot(**boxplot_kwargs)
376
388
 
377
389
  # 통계 검정 추가
378
390
  if stat_test is not None:
379
391
  if stat_pairs is None:
380
- stat_pairs = [df[xname].dropna().unique().tolist()]
392
+ stat_pairs = [df[xname].dropna().unique().tolist()] # type: ignore
381
393
 
382
394
  annotator = Annotator(
383
395
  ax, data=df, x=xname, y=yname, pairs=stat_pairs, orient=orient
@@ -749,7 +761,7 @@ def stackplot(
749
761
  # 산점도를 그린다
750
762
  # ===================================================================
751
763
  def scatterplot(
752
- df: DataFrame,
764
+ df: DataFrame | None,
753
765
  xname: str,
754
766
  yname: str,
755
767
  hue=None,
@@ -769,7 +781,7 @@ def scatterplot(
769
781
  """산점도를 그린다.
770
782
 
771
783
  Args:
772
- df (DataFrame): 시각화할 데이터.
784
+ df (DataFrame | None): 시각화할 데이터.
773
785
  xname (str): x축 컬럼.
774
786
  yname (str): y축 컬럼.
775
787
  hue (str|None): 범주 컬럼.
@@ -796,12 +808,12 @@ def scatterplot(
796
808
 
797
809
  if outline and hue is not None:
798
810
  # 군집별 값의 종류별로 반복 수행
799
- for c in df[hue].unique():
811
+ for c in df[hue].unique(): # type: ignore
800
812
  if c == -1:
801
813
  continue
802
814
 
803
815
  # 한 종류만 필터링한 결과에서 두 변수만 선택
804
- df_c = df.loc[df[hue] == c, [xname, yname]]
816
+ df_c = df.loc[df[hue] == c, [xname, yname]] # type: ignore
805
817
 
806
818
  try:
807
819
  # 외각선 좌표 계산
@@ -842,15 +854,15 @@ def scatterplot(
842
854
  else:
843
855
  # 핵심벡터
844
856
  scatterplot_kwargs["edgecolor"] = "#ffffff"
845
- sb.scatterplot(data=df[df[vector] == "core"], **scatterplot_kwargs)
857
+ sb.scatterplot(data=df[df[vector] == "core"], **scatterplot_kwargs) # type: ignore
846
858
 
847
859
  # 외곽백터
848
860
  scatterplot_kwargs["edgecolor"] = "#000000"
849
861
  scatterplot_kwargs["s"] = 25
850
862
  scatterplot_kwargs["marker"] = "^"
851
863
  scatterplot_kwargs["linewidth"] = 0.8
852
- sb.scatterplot(data=df[df[vector] == "border"], **scatterplot_kwargs)
853
-
864
+ sb.scatterplot(data=df[df[vector] == "border"], **scatterplot_kwargs) # type: ignore
865
+
854
866
  # 노이즈벡터
855
867
  scatterplot_kwargs["edgecolor"] = None
856
868
  scatterplot_kwargs["s"] = 25
@@ -2644,6 +2656,9 @@ def silhouette_plot(
2644
2656
  finalize_plot(ax, callback, outparams, save_path, True, title) # type: ignore
2645
2657
 
2646
2658
 
2659
+ # ===================================================================
2660
+ # 군집분석 결과 시각화
2661
+ # ===================================================================
2647
2662
  def cluster_plot(
2648
2663
  estimator: KMeans | AgglomerativeClustering | None = None,
2649
2664
  data: DataFrame | None = None,
@@ -2755,6 +2770,9 @@ def cluster_plot(
2755
2770
  )
2756
2771
 
2757
2772
 
2773
+ # ===================================================================
2774
+ # 군집분석 결과의 실루엣 플롯과 군집 산점도를 한 화면에 함께 시각화
2775
+ # ===================================================================
2758
2776
  def visualize_silhouette(
2759
2777
  estimator: KMeans | AgglomerativeClustering,
2760
2778
  data: DataFrame,
@@ -2819,3 +2837,216 @@ def visualize_silhouette(
2819
2837
  )
2820
2838
 
2821
2839
  finalize_plot(ax)
2840
+
2841
+
2842
+
2843
+ # ===================================================================
2844
+ # 덴드로그램 시각화
2845
+ # ===================================================================
2846
def dandrogram(
    estimator: AgglomerativeClustering,
    p: int = 30,
    count_sort: Literal["ascending", "descending", False] = "ascending",
    title: str | None = None,
    width: int = config.width,
    height: int = config.height,
    dpi: int = config.dpi,
    save_path: str | None = None,
    callback: Callable | None = None,
    ax: Axes | None = None,
) -> None:
    """Draw a dendrogram for a fitted AgglomerativeClustering model.

    A SciPy-style linkage matrix ``[child_a, child_b, distance, sample_count]``
    is rebuilt from the estimator's ``children_`` and ``distances_`` attributes
    and handed to ``scipy.cluster.hierarchy.dendrogram``.

    Note: the public name is spelled ``dandrogram``; it is kept as-is so that
    existing callers continue to work.

    Args:
        estimator (AgglomerativeClustering): Fitted clustering model. It must
            expose ``distances_``.
        p (int): Passed to SciPy as ``p``. When ``p > 0`` the plot is truncated
            with ``truncate_mode="lastp"``; otherwise no truncation is applied.
            Defaults to 30.
        count_sort (str|bool): ``'ascending'``, ``'descending'`` or ``False``;
            forwarded unchanged to SciPy's ``count_sort``.
        title (str|None): Plot title.
        width (int): Canvas width.
        height (int): Canvas height.
        dpi (int): Figure resolution.
        save_path (str|None): Path forwarded to ``finalize_plot``.
        callback (Callable|None): Axes post-processing callback forwarded to
            ``finalize_plot``.
        ax (Axes|None): Externally supplied Axes. A new one is created when None.

    Returns:
        None

    Raises:
        AttributeError: If the estimator has no ``distances_`` attribute.
    """
    # Fail early with an actionable message instead of an opaque attribute
    # lookup error in the middle of building the linkage matrix.
    if not hasattr(estimator, "distances_"):
        raise AttributeError(
            "estimator has no 'distances_' attribute; fit "
            "AgglomerativeClustering with compute_distances=True or with a "
            "distance_threshold before drawing a dendrogram"
        )

    children = estimator.children_  # type: ignore
    n_samples = len(estimator.labels_)

    # counts[i] = number of original samples contained in the cluster that is
    # created by merge step i.
    counts = np.zeros(children.shape[0])
    for i, merge in enumerate(children):
        # An index below n_samples is a leaf (one sample); anything else refers
        # to the cluster produced by an earlier merge step.
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )

    linkage_matrix = np.column_stack(
        [children, estimator.distances_, counts]
    ).astype(float)

    # outparams tells finalize_plot whether the Axes was created here.
    outparams = False
    if ax is None:
        _, ax = get_default_ax(width, height, 1, 1, dpi)  # type: ignore
        outparams = True

    sch_dendrogram(
        linkage_matrix,
        ax=ax,
        p=p,
        truncate_mode="lastp" if p > 0 else None,
        leaf_rotation=0,
        leaf_font_size=8,
        count_sort=count_sort,
        color_threshold=None,
        above_threshold_color="grey",
    )

    finalize_plot(ax, callback, outparams, save_path, True, title)  # type: ignore
2913
+
2914
+
2915
+ # ===================================================================
2916
+ # PCA 분석 결과에 대한 biplot 시각화
2917
+ # ===================================================================
2918
def pca_plot(
    estimator: PCA,
    data: DataFrame,
    yname: str | None = None,
    fields: list | tuple | list[list] | tuple[list] | list[tuple] | tuple[tuple] | None = None,
    hue: str | None = None,
    palette: str | None = None,
    width: int = config.width,
    height: int = config.height,
    linewidth: float = config.line_width,
    dpi: int = config.dpi,
    save_path: str | None = None,
    callback: Callable | None = None,
) -> None:
    """Draw PCA biplots (scaled scores plus loading arrows).

    For every requested pair of principal components, the transformed samples
    are drawn with ``scatterplot`` after each axis is rescaled by its value
    range, and one arrow per original feature is overlaid using the
    corresponding rows of ``estimator.components_``.

    Args:
        estimator (PCA): Fitted PCA object.
        data (DataFrame): Data to project. Apart from ``yname``, every column
            is passed to ``estimator.transform``.
        yname (str|None): Target column. When present in ``data`` it is removed
            before the transform and re-attached to the score table.
        fields (list|tuple|None): A pair of component column names
            (e.g. ``("PC1", "PC2")``) or a sequence of such pairs. When None,
            every 2-combination of the component columns is plotted.
        hue (str|None): Column of the score table (``PC*`` columns plus
            ``yname``) used for colouring.
        palette (str|None): Palette name.
        width (int): Canvas width.
        height (int): Canvas height.
        linewidth (float): Line width; arrows use 75% of it.
        dpi (int): Figure resolution.
        save_path (str|None): Path forwarded to ``scatterplot``.
        callback (Callable|None): Axes post-processing callback, invoked after
            the loading arrows have been drawn.

    Returns:
        None
    """
    features = data.copy()

    # Separate the target column (if any) from the features. Raw values are
    # kept so that re-attaching them below is positional, not index-aligned.
    target = None
    if yname is not None and yname in features.columns:
        target = features[yname].to_numpy()
        features = features.drop(columns=[yname])

    # Names of the columns that actually go into the transform; these line up
    # with the columns of estimator.components_.
    feature_names = features.columns.tolist()

    score = estimator.transform(features)

    pca_df = DataFrame(
        data=score,
        columns=[f"PC{i + 1}" for i in range(estimator.n_components_)],
    )

    if target is not None:
        pca_df[yname] = target

    # Default: every pair of principal-component columns.
    if fields is None:
        pc_names = [name for name in pca_df.columns.tolist() if name != yname]
        fields = list(combinations(pc_names, 2))
        if not fields:
            # Fewer than two components: there is no pair to draw.
            return

    # A single pair is wrapped so the loop below always iterates over pairs.
    if not is_2d(fields):
        fields = [fields]  # type: ignore

    components = estimator.components_
    # components_ has one row per component and one column per input feature;
    # the biplot needs one arrow per feature.
    n_features = components.shape[1]

    def unit_scale(values) -> float:
        """Return 1 / (max - min), or 1.0 when the values have zero spread."""
        span = values.max() - values.min()
        return 1.0 / span if span != 0 else 1.0

    # NOTE(review): the same save_path is forwarded for every pair, so later
    # figures presumably overwrite earlier ones -- confirm against scatterplot.
    for field_group in fields:  # type: ignore
        x_index = int(pca_df.columns.get_loc(field_group[0]))  # type: ignore
        y_index = int(pca_df.columns.get_loc(field_group[1]))  # type: ignore

        xs = score[:, x_index]
        ys = score[:, y_index]

        # Bind the current component indices as defaults so the callback does
        # not depend on loop variables that change between iterations.
        def draw_loadings(ax, x_idx: int = x_index, y_idx: int = y_index) -> None:
            for i in range(n_features):
                ax.arrow(
                    0,
                    0,
                    components[x_idx, i],
                    components[y_idx, i],
                    color="r",
                    head_width=0.007,
                    head_length=0.007,
                    linewidth=linewidth * 0.75,
                    alpha=0.75,
                )
                ax.text(
                    components[x_idx, i] * 1.15,
                    components[y_idx, i] * 1.15,
                    f"{feature_names[i]} ({components[x_idx, i]:.2f})",
                    color="b",
                    ha="center",
                    va="center",
                )

            if callback is not None:
                callback(ax)

        title = "PCA Biplot - " + ", ".join(field_group)

        tdf = DataFrame({
            field_group[0]: xs * unit_scale(xs),
            field_group[1]: ys * unit_scale(ys),
        })

        scatterplot(
            df=tdf,
            xname=field_group[0],
            yname=field_group[1],
            hue=pca_df[hue] if hue is not None else None,
            outline=False,
            palette=palette,
            width=width,
            height=height,
            linewidth=linewidth,
            dpi=dpi,
            save_path=save_path,
            title=title,
            callback=draw_loadings,
        )
3052
+