hossam 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hossam/hs_plot.py CHANGED
@@ -1,7 +1,8 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  from __future__ import annotations
3
3
  from types import SimpleNamespace
4
- from typing import Callable
4
+ from typing import Callable, Literal
5
+ from itertools import combinations
5
6
 
6
7
  # ===================================================================
7
8
  import numpy as np
@@ -15,6 +16,7 @@ from pandas import DataFrame
15
16
  # ===================================================================
16
17
  from scipy.stats import t
17
18
  from scipy.spatial import ConvexHull
19
+ from scipy.cluster.hierarchy import dendrogram as sch_dendrogram
18
20
  from statsmodels.graphics.gofplots import qqplot as sm_qqplot
19
21
  from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess
20
22
 
@@ -22,7 +24,8 @@ from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess
22
24
  from statannotations.Annotator import Annotator
23
25
 
24
26
  # ===================================================================
25
- from sklearn.cluster._kmeans import KMeans
27
+ from sklearn.cluster import AgglomerativeClustering, KMeans
28
+ from sklearn.decomposition import PCA
26
29
 
27
30
  from sklearn.metrics import (
28
31
  mean_squared_error,
@@ -34,6 +37,8 @@ from sklearn.metrics import (
34
37
  silhouette_samples,
35
38
  )
36
39
 
40
+ from .hs_util import is_2d
41
+
37
42
  # ===================================================================
38
43
  config = SimpleNamespace(
39
44
  dpi=200,
@@ -749,7 +754,7 @@ def stackplot(
749
754
  # 산점도를 그린다
750
755
  # ===================================================================
751
756
  def scatterplot(
752
- df: DataFrame,
757
+ df: DataFrame | None,
753
758
  xname: str,
754
759
  yname: str,
755
760
  hue=None,
@@ -769,7 +774,7 @@ def scatterplot(
769
774
  """산점도를 그린다.
770
775
 
771
776
  Args:
772
- df (DataFrame): 시각화할 데이터.
777
+ df (DataFrame | None): 시각화할 데이터.
773
778
  xname (str): x축 컬럼.
774
779
  yname (str): y축 컬럼.
775
780
  hue (str|None): 범주 컬럼.
@@ -796,12 +801,12 @@ def scatterplot(
796
801
 
797
802
  if outline and hue is not None:
798
803
  # 군집별 값의 종류별로 반복 수행
799
- for c in df[hue].unique():
804
+ for c in df[hue].unique(): # type: ignore
800
805
  if c == -1:
801
806
  continue
802
807
 
803
808
  # 한 종류만 필터링한 결과에서 두 변수만 선택
804
- df_c = df.loc[df[hue] == c, [xname, yname]]
809
+ df_c = df.loc[df[hue] == c, [xname, yname]] # type: ignore
805
810
 
806
811
  try:
807
812
  # 외각선 좌표 계산
@@ -2559,7 +2564,7 @@ def distribution_plot(
2559
2564
 
2560
2565
 
2561
2566
  def silhouette_plot(
2562
- estimator: KMeans,
2567
+ estimator: KMeans | AgglomerativeClustering,
2563
2568
  data: DataFrame,
2564
2569
  title: str | None = None,
2565
2570
  width: int = config.width,
@@ -2574,7 +2579,7 @@ def silhouette_plot(
2574
2579
  군집분석 결과의 실루엣 플롯을 시각화함.
2575
2580
 
2576
2581
  Args:
2577
- estimator (KMeans): 학습된 KMeans 군집 모델 객체.
2582
+ estimator (KMeans | AgglomerativeClustering): 학습된 KMeans 또는 AgglomerativeClustering 군집 모델 객체.
2578
2583
  data (DataFrame): 군집분석에 사용된 입력 데이터 (n_samples, n_features).
2579
2584
  title (str, optional): 플롯 제목. None이면 자동 생성.
2580
2585
  width (int, optional): 플롯 가로 크기 (inch 단위).
@@ -2605,7 +2610,15 @@ def silhouette_plot(
2605
2610
  y_lower = 10
2606
2611
 
2607
2612
  # 클러스터링 갯수별로 fill_betweenx( )형태의 막대 그래프 표현.
2608
- for i in range(estimator.n_clusters): # type: ignore
2613
+ n_clusters: int = 0
2614
+ if hasattr(estimator, "n_clusters") and estimator.n_clusters is not None: # type: ignore
2615
+ n_clusters = estimator.n_clusters # type: ignore
2616
+ elif hasattr(estimator, "n_clusters_") and estimator.n_clusters_ is not None: # type: ignore
2617
+ n_clusters = estimator.n_clusters_ # type: ignore
2618
+ else:
2619
+ n_clusters = len(np.unique(estimator.labels_)) # type: ignore
2620
+
2621
+ for i in range(n_clusters): # type: ignore
2609
2622
  ith_cluster_sil_values = sil_values[estimator.labels_ == i] # type: ignore
2610
2623
  ith_cluster_sil_values.sort() # type: ignore
2611
2624
 
@@ -2626,18 +2639,21 @@ def silhouette_plot(
2626
2639
  ax.set_xlabel("The silhouette coefficient values") # type: ignore
2627
2640
  ax.set_ylabel("Cluster label") # type: ignore
2628
2641
  ax.set_xlim([-0.1, 1]) # type: ignore
2629
- ax.set_ylim([0, len(data) + (estimator.n_clusters + 1) * 10]) # type: ignore
2642
+ ax.set_ylim([0, len(data) + (n_clusters + 1) * 10]) # type: ignore
2630
2643
  ax.set_yticks([]) # type: ignore
2631
2644
  ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1]) # type: ignore
2632
2645
 
2633
2646
  if title is None:
2634
- title = "Number of Cluster : " + str(estimator.n_clusters) + ", Silhouette Score :" + str(round(sil_avg, 3)) # type: ignore
2647
+ title = "Number of Cluster : " + str(n_clusters) + ", Silhouette Score :" + str(round(sil_avg, 3)) # type: ignore
2635
2648
 
2636
2649
  finalize_plot(ax, callback, outparams, save_path, True, title) # type: ignore
2637
2650
 
2638
2651
 
2652
+ # ===================================================================
2653
+ # 군집분석 결과 시각화
2654
+ # ===================================================================
2639
2655
  def cluster_plot(
2640
- estimator: KMeans | None = None,
2656
+ estimator: KMeans | AgglomerativeClustering | None = None,
2641
2657
  data: DataFrame | None = None,
2642
2658
  xname: str | None = None,
2643
2659
  yname: str | None = None,
@@ -2747,8 +2763,11 @@ def cluster_plot(
2747
2763
  )
2748
2764
 
2749
2765
 
2766
+ # ===================================================================
2767
+ # 군집분석 결과의 실루엣 플롯과 군집 산점도를 한 화면에 함께 시각화
2768
+ # ===================================================================
2750
2769
  def visualize_silhouette(
2751
- estimator: KMeans,
2770
+ estimator: KMeans | AgglomerativeClustering,
2752
2771
  data: DataFrame,
2753
2772
  xname: str | None = None,
2754
2773
  yname: str | None = None,
@@ -2765,7 +2784,7 @@ def visualize_silhouette(
2765
2784
  군집분석 결과의 실루엣 플롯과 군집 산점도를 한 화면에 함께 시각화함.
2766
2785
 
2767
2786
  Args:
2768
- estimator (KMeans): 학습된 KMeans 군집 모델 객체.
2787
+ estimator (KMeans | AgglomerativeClustering): 학습된 KMeans 또는 AgglomerativeClustering 군집 모델 객체.
2769
2788
  data (DataFrame): 군집분석에 사용된 입력 데이터 (n_samples, n_features).
2770
2789
  xname (str, optional): 산점도 x축에 사용할 컬럼명. None이면 첫 번째 컬럼 사용.
2771
2790
  yname (str, optional): 산점도 y축에 사용할 컬럼명. None이면 두 번째 컬럼 사용.
@@ -2811,3 +2830,211 @@ def visualize_silhouette(
2811
2830
  )
2812
2831
 
2813
2832
  finalize_plot(ax)
2833
+
2834
+
2835
+
2836
+ # ===================================================================
2837
+ # 덴드로그램 시각화
2838
+ # ===================================================================
2839
def dandrogram(
    estimator: AgglomerativeClustering,
    p: int = 30,
    count_sort: Literal["ascending", "descending", False] = "ascending",
    title: str | None = None,
    width: int = config.width,
    height: int = config.height,
    dpi: int = config.dpi,
    save_path: str | None = None,
    callback: Callable | None = None,
    ax: Axes | None = None,
) -> None:
    """Draw a dendrogram for a fitted AgglomerativeClustering model.

    A SciPy-style linkage matrix is assembled from the estimator's
    ``children_``, ``distances_`` and per-merge sample counts, then handed to
    ``scipy.cluster.hierarchy.dendrogram``.

    Args:
        estimator (AgglomerativeClustering): Fitted clustering model. It must
            expose ``distances_``.
        p (int): Forwarded to SciPy as ``p``. When ``p > 0`` the tree is
            truncated with ``truncate_mode="lastp"``; otherwise no truncation
            is applied. Defaults to 30.
        count_sort (str | bool): ``"ascending"``, ``"descending"`` or
            ``False``; forwarded to SciPy's ``count_sort``.
        title (str | None): Plot title passed to ``finalize_plot``.
        width (int): Canvas width used when a new Axes is created.
        height (int): Canvas height used when a new Axes is created.
        dpi (int): Resolution used when a new Axes is created.
        save_path (str | None): Output path passed to ``finalize_plot``.
        callback (Callable | None): Axes post-processing hook passed to
            ``finalize_plot``.
        ax (Axes | None): Existing Axes to draw on. A new one is created when
            None.

    Returns:
        None

    Raises:
        AttributeError: If ``estimator`` has no ``distances_`` attribute.
    """
    # Fail early with an actionable message instead of an opaque attribute
    # lookup error halfway through building the linkage matrix.
    if not hasattr(estimator, "distances_"):
        raise AttributeError(
            "estimator has no 'distances_' attribute; fit "
            "AgglomerativeClustering with compute_distances=True or with "
            "distance_threshold set before drawing a dendrogram."
        )

    children = estimator.children_  # type: ignore
    n_samples = len(estimator.labels_)

    # Number of original samples under each merge node. Child indices below
    # n_samples are leaves; larger indices refer to an earlier merge row.
    counts = np.zeros(children.shape[0])
    for i, merge in enumerate(children):
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )

    linkage_matrix = np.column_stack(
        [children, estimator.distances_, counts]
    ).astype(float)

    # finalize_plot receives True only when this function created the Axes.
    outparams = ax is None
    if outparams:
        _, ax = get_default_ax(width, height, 1, 1, dpi)  # type: ignore

    sch_dendrogram(
        linkage_matrix,
        ax=ax,
        p=p,
        truncate_mode="lastp" if p > 0 else None,
        leaf_rotation=0,
        leaf_font_size=8,
        count_sort=count_sort,
        color_threshold=None,
        above_threshold_color="grey",
    )

    finalize_plot(ax, callback, outparams, save_path, True, title)  # type: ignore
2906
+
2907
+
2908
+ # ===================================================================
2909
+ # PCA 분석 결과에 대한 biplot 시각화
2910
+ # ===================================================================
2911
def pca_plot(
    estimator: PCA,
    data: DataFrame,
    yname: str | None = None,
    fields: list | tuple | list[list] | tuple[list] | list[tuple] | tuple[tuple] | None = None,
    hue: str | None = None,
    palette: str | None = None,
    width: int = config.width,
    height: int = config.height,
    linewidth: float = config.line_width,
    dpi: int = config.dpi,
    save_path: str | None = None,
    callback: Callable | None = None,
) -> None:
    """Draw PCA biplots (scaled scores plus feature loading arrows).

    ``data`` (minus ``yname`` when given) is projected with
    ``estimator.transform``. For every requested pair of component columns a
    scatter plot of the range-scaled scores is drawn through ``scatterplot``,
    and one arrow per input feature is overlaid using ``estimator.components_``.

    Args:
        estimator (PCA): Fitted PCA object.
        data (DataFrame): Data to project.
        yname (str | None): Target column. When present in ``data`` it is
            removed before the transform and appended to the projected frame.
        fields (list | tuple | None): A pair of projected column names (for
            example ``("PC1", "PC2")``) or a sequence of such pairs. When
            None, every 2-combination of the component columns is plotted.
        hue (str | None): Grouping column. It is looked up in the projected
            frame, which holds only the ``PC*`` columns and ``yname``.
        palette (str | None): Palette name forwarded to ``scatterplot``.
        width (int): Canvas width forwarded to ``scatterplot``.
        height (int): Canvas height forwarded to ``scatterplot``.
        linewidth (float): Line width; arrows use 0.75 times this value.
        dpi (int): Resolution forwarded to ``scatterplot``.
        save_path (str | None): Output path forwarded to ``scatterplot``.
        callback (Callable | None): Extra Axes hook, called after the loading
            arrows are drawn.

    Returns:
        None
    """
    df = data.copy()

    # Split off the target column so only the features reach the estimator.
    # Values are kept as a plain array so the later assignment is positional
    # and does not depend on the index of `data`.
    yvalues = None
    if yname is not None and yname in df.columns:
        yvalues = df[yname].to_numpy()
        df = df.drop(columns=[yname])

    # Captured after the drop so names line up with components_ columns.
    feature_names = df.columns.tolist()

    score = estimator.transform(df)

    pca_df = DataFrame(
        data=score,
        columns=[f"PC{i + 1}" for i in range(estimator.n_components_)],
    )

    if yvalues is not None:
        pca_df[yname] = yvalues

    # Default: every pair of component columns.
    if fields is None:
        component_cols = [c for c in pca_df.columns.tolist() if c != yname]
        fields = list(combinations(component_cols, 2))

    # A single pair is wrapped so the loop below always sees a list of pairs.
    if not is_2d(fields):
        fields = [fields]  # type: ignore

    components = estimator.components_

    def _make_loading_callback(x_index: int, y_index: int) -> Callable:
        """Build the Axes hook that draws loading arrows for one pair."""

        def _draw_loadings(ax) -> None:
            # One arrow per input feature; rows of components_ are components
            # and columns are features.
            for name, load_x, load_y in zip(
                feature_names, components[x_index], components[y_index]
            ):
                ax.arrow(
                    0,
                    0,
                    load_x,
                    load_y,
                    color="r",
                    head_width=0.007,
                    head_length=0.007,
                    linewidth=linewidth * 0.75,
                    alpha=0.75,
                )
                ax.text(
                    load_x * 1.15,
                    load_y * 1.15,
                    f"{name} ({load_x:.2f})",
                    color="b",
                    ha="center",
                    va="center",
                )

            if callback is not None:
                callback(ax)

        return _draw_loadings

    for field_group in fields:  # type: ignore
        x_index = int(pca_df.columns.get_loc(field_group[0]))  # type: ignore
        y_index = int(pca_df.columns.get_loc(field_group[1]))  # type: ignore

        xs = score[:, x_index]
        ys = score[:, y_index]

        # Scale scores by their range; a constant column keeps a scale of 1.0
        # instead of dividing by zero.
        x_span = xs.max() - xs.min()
        y_span = ys.max() - ys.min()
        scalex = 1.0 / x_span if x_span else 1.0
        scaley = 1.0 / y_span if y_span else 1.0

        title = "PCA Biplot - " + ", ".join(map(str, field_group))

        # NOTE(review): the same save_path is passed for every pair; if
        # scatterplot writes a file, later pairs presumably overwrite earlier
        # ones. Confirm against scatterplot/finalize_plot.
        scatterplot(
            df=None,
            xname=xs * scalex,  # type: ignore
            yname=ys * scaley,  # type: ignore
            hue=pca_df[hue] if hue is not None else None,
            outline=False,
            palette=palette,
            width=width,
            height=height,
            linewidth=linewidth,
            dpi=dpi,
            save_path=save_path,
            title=title,
            callback=_make_loading_callback(x_index, y_index),
        )
3040
+