hossam 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hossam/hs_cluster.py +283 -120
- hossam/hs_plot.py +241 -14
- hossam/hs_prep.py +241 -56
- hossam/hs_util.py +20 -0
- {hossam-0.4.15.dist-info → hossam-0.4.17.dist-info}/METADATA +1 -1
- {hossam-0.4.15.dist-info → hossam-0.4.17.dist-info}/RECORD +9 -9
- {hossam-0.4.15.dist-info → hossam-0.4.17.dist-info}/WHEEL +0 -0
- {hossam-0.4.15.dist-info → hossam-0.4.17.dist-info}/licenses/LICENSE +0 -0
- {hossam-0.4.15.dist-info → hossam-0.4.17.dist-info}/top_level.txt +0 -0
hossam/hs_plot.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
from types import SimpleNamespace
|
|
4
|
-
from typing import Callable
|
|
4
|
+
from typing import Callable, Literal
|
|
5
|
+
from itertools import combinations
|
|
5
6
|
|
|
6
7
|
# ===================================================================
|
|
7
8
|
import numpy as np
|
|
@@ -15,6 +16,7 @@ from pandas import DataFrame
|
|
|
15
16
|
# ===================================================================
|
|
16
17
|
from scipy.stats import t
|
|
17
18
|
from scipy.spatial import ConvexHull
|
|
19
|
+
from scipy.cluster.hierarchy import dendrogram as sch_dendrogram
|
|
18
20
|
from statsmodels.graphics.gofplots import qqplot as sm_qqplot
|
|
19
21
|
from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess
|
|
20
22
|
|
|
@@ -22,7 +24,8 @@ from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess
|
|
|
22
24
|
from statannotations.Annotator import Annotator
|
|
23
25
|
|
|
24
26
|
# ===================================================================
|
|
25
|
-
from sklearn.cluster
|
|
27
|
+
from sklearn.cluster import AgglomerativeClustering, KMeans
|
|
28
|
+
from sklearn.decomposition import PCA
|
|
26
29
|
|
|
27
30
|
from sklearn.metrics import (
|
|
28
31
|
mean_squared_error,
|
|
@@ -34,6 +37,8 @@ from sklearn.metrics import (
|
|
|
34
37
|
silhouette_samples,
|
|
35
38
|
)
|
|
36
39
|
|
|
40
|
+
from .hs_util import is_2d
|
|
41
|
+
|
|
37
42
|
# ===================================================================
|
|
38
43
|
config = SimpleNamespace(
|
|
39
44
|
dpi=200,
|
|
@@ -749,7 +754,7 @@ def stackplot(
|
|
|
749
754
|
# 산점도를 그린다
|
|
750
755
|
# ===================================================================
|
|
751
756
|
def scatterplot(
|
|
752
|
-
df: DataFrame,
|
|
757
|
+
df: DataFrame | None,
|
|
753
758
|
xname: str,
|
|
754
759
|
yname: str,
|
|
755
760
|
hue=None,
|
|
@@ -769,7 +774,7 @@ def scatterplot(
|
|
|
769
774
|
"""산점도를 그린다.
|
|
770
775
|
|
|
771
776
|
Args:
|
|
772
|
-
df (DataFrame): 시각화할 데이터.
|
|
777
|
+
df (DataFrame | None): 시각화할 데이터.
|
|
773
778
|
xname (str): x축 컬럼.
|
|
774
779
|
yname (str): y축 컬럼.
|
|
775
780
|
hue (str|None): 범주 컬럼.
|
|
@@ -796,12 +801,12 @@ def scatterplot(
|
|
|
796
801
|
|
|
797
802
|
if outline and hue is not None:
|
|
798
803
|
# 군집별 값의 종류별로 반복 수행
|
|
799
|
-
for c in df[hue].unique():
|
|
804
|
+
for c in df[hue].unique(): # type: ignore
|
|
800
805
|
if c == -1:
|
|
801
806
|
continue
|
|
802
807
|
|
|
803
808
|
# 한 종류만 필터링한 결과에서 두 변수만 선택
|
|
804
|
-
df_c = df.loc[df[hue] == c, [xname, yname]]
|
|
809
|
+
df_c = df.loc[df[hue] == c, [xname, yname]] # type: ignore
|
|
805
810
|
|
|
806
811
|
try:
|
|
807
812
|
# 외각선 좌표 계산
|
|
@@ -2559,7 +2564,7 @@ def distribution_plot(
|
|
|
2559
2564
|
|
|
2560
2565
|
|
|
2561
2566
|
def silhouette_plot(
|
|
2562
|
-
estimator: KMeans,
|
|
2567
|
+
estimator: KMeans | AgglomerativeClustering,
|
|
2563
2568
|
data: DataFrame,
|
|
2564
2569
|
title: str | None = None,
|
|
2565
2570
|
width: int = config.width,
|
|
@@ -2574,7 +2579,7 @@ def silhouette_plot(
|
|
|
2574
2579
|
군집분석 결과의 실루엣 플롯을 시각화함.
|
|
2575
2580
|
|
|
2576
2581
|
Args:
|
|
2577
|
-
estimator (KMeans): 학습된 KMeans 군집 모델 객체.
|
|
2582
|
+
estimator (KMeans | AgglomerativeClustering): 학습된 KMeans 또는 AgglomerativeClustering 군집 모델 객체.
|
|
2578
2583
|
data (DataFrame): 군집분석에 사용된 입력 데이터 (n_samples, n_features).
|
|
2579
2584
|
title (str, optional): 플롯 제목. None이면 자동 생성.
|
|
2580
2585
|
width (int, optional): 플롯 가로 크기 (inch 단위).
|
|
@@ -2605,7 +2610,15 @@ def silhouette_plot(
|
|
|
2605
2610
|
y_lower = 10
|
|
2606
2611
|
|
|
2607
2612
|
# 클러스터링 갯수별로 fill_betweenx( )형태의 막대 그래프 표현.
|
|
2608
|
-
|
|
2613
|
+
n_clusters: int = 0
|
|
2614
|
+
if hasattr(estimator, "n_clusters") and estimator.n_clusters is not None: # type: ignore
|
|
2615
|
+
n_clusters = estimator.n_clusters # type: ignore
|
|
2616
|
+
elif hasattr(estimator, "n_clusters_") and estimator.n_clusters_ is not None: # type: ignore
|
|
2617
|
+
n_clusters = estimator.n_clusters_ # type: ignore
|
|
2618
|
+
else:
|
|
2619
|
+
n_clusters = len(np.unique(estimator.labels_)) # type: ignore
|
|
2620
|
+
|
|
2621
|
+
for i in range(n_clusters): # type: ignore
|
|
2609
2622
|
ith_cluster_sil_values = sil_values[estimator.labels_ == i] # type: ignore
|
|
2610
2623
|
ith_cluster_sil_values.sort() # type: ignore
|
|
2611
2624
|
|
|
@@ -2626,18 +2639,21 @@ def silhouette_plot(
|
|
|
2626
2639
|
ax.set_xlabel("The silhouette coefficient values") # type: ignore
|
|
2627
2640
|
ax.set_ylabel("Cluster label") # type: ignore
|
|
2628
2641
|
ax.set_xlim([-0.1, 1]) # type: ignore
|
|
2629
|
-
ax.set_ylim([0, len(data) + (
|
|
2642
|
+
ax.set_ylim([0, len(data) + (n_clusters + 1) * 10]) # type: ignore
|
|
2630
2643
|
ax.set_yticks([]) # type: ignore
|
|
2631
2644
|
ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1]) # type: ignore
|
|
2632
2645
|
|
|
2633
2646
|
if title is None:
|
|
2634
|
-
title = "Number of Cluster : " + str(
|
|
2647
|
+
title = "Number of Cluster : " + str(n_clusters) + ", Silhouette Score :" + str(round(sil_avg, 3)) # type: ignore
|
|
2635
2648
|
|
|
2636
2649
|
finalize_plot(ax, callback, outparams, save_path, True, title) # type: ignore
|
|
2637
2650
|
|
|
2638
2651
|
|
|
2652
|
+
# ===================================================================
|
|
2653
|
+
# 군집분석 결과 시각화
|
|
2654
|
+
# ===================================================================
|
|
2639
2655
|
def cluster_plot(
|
|
2640
|
-
estimator: KMeans | None = None,
|
|
2656
|
+
estimator: KMeans | AgglomerativeClustering | None = None,
|
|
2641
2657
|
data: DataFrame | None = None,
|
|
2642
2658
|
xname: str | None = None,
|
|
2643
2659
|
yname: str | None = None,
|
|
@@ -2747,8 +2763,11 @@ def cluster_plot(
|
|
|
2747
2763
|
)
|
|
2748
2764
|
|
|
2749
2765
|
|
|
2766
|
+
# ===================================================================
|
|
2767
|
+
# 군집분석 결과의 실루엣 플롯과 군집 산점도를 한 화면에 함께 시각화
|
|
2768
|
+
# ===================================================================
|
|
2750
2769
|
def visualize_silhouette(
|
|
2751
|
-
estimator: KMeans,
|
|
2770
|
+
estimator: KMeans | AgglomerativeClustering,
|
|
2752
2771
|
data: DataFrame,
|
|
2753
2772
|
xname: str | None = None,
|
|
2754
2773
|
yname: str | None = None,
|
|
@@ -2765,7 +2784,7 @@ def visualize_silhouette(
|
|
|
2765
2784
|
군집분석 결과의 실루엣 플롯과 군집 산점도를 한 화면에 함께 시각화함.
|
|
2766
2785
|
|
|
2767
2786
|
Args:
|
|
2768
|
-
estimator (KMeans): 학습된 KMeans 군집 모델 객체.
|
|
2787
|
+
estimator (KMeans | AgglomerativeClustering): 학습된 KMeans 또는 AgglomerativeClustering 군집 모델 객체.
|
|
2769
2788
|
data (DataFrame): 군집분석에 사용된 입력 데이터 (n_samples, n_features).
|
|
2770
2789
|
xname (str, optional): 산점도 x축에 사용할 컬럼명. None이면 첫 번째 컬럼 사용.
|
|
2771
2790
|
yname (str, optional): 산점도 y축에 사용할 컬럼명. None이면 두 번째 컬럼 사용.
|
|
@@ -2811,3 +2830,211 @@ def visualize_silhouette(
|
|
|
2811
2830
|
)
|
|
2812
2831
|
|
|
2813
2832
|
finalize_plot(ax)
|
|
2833
|
+
|
|
2834
|
+
|
|
2835
|
+
|
|
2836
|
+
# ===================================================================
|
|
2837
|
+
# 덴드로그램 시각화
|
|
2838
|
+
# ===================================================================
|
|
2839
|
+
def dandrogram(
    estimator: AgglomerativeClustering,
    p: int = 30,
    count_sort: Literal["ascending", "descending", False] = "ascending",
    title: str | None = None,
    width: int = config.width,
    height: int = config.height,
    dpi: int = config.dpi,
    save_path: str | None = None,
    callback: Callable | None = None,
    ax: Axes | None = None
) -> None:
    """Draw a dendrogram for a fitted AgglomerativeClustering model.

    A SciPy-style linkage matrix is assembled from the estimator's
    ``children_`` and ``distances_`` attributes plus the number of samples
    under each merge node, and is then rendered with
    ``scipy.cluster.hierarchy.dendrogram``.

    Args:
        estimator (AgglomerativeClustering): Fitted model. It must expose
            ``distances_``, i.e. it has to be fitted with
            ``compute_distances=True`` or with a ``distance_threshold``.
        p (int): Forwarded to SciPy as ``p``. When ``p > 0`` the tree is
            truncated with ``truncate_mode="lastp"``; otherwise the full
            tree is drawn. Defaults to 30.
        count_sort ("ascending" | "descending" | False): Forwarded to SciPy
            as ``count_sort``.
        title (str | None): Plot title passed to ``finalize_plot``.
        width (int): Figure width passed to ``get_default_ax``.
        height (int): Figure height passed to ``get_default_ax``.
        dpi (int): Figure resolution passed to ``get_default_ax``.
        save_path (str | None): Output path passed to ``finalize_plot``.
        callback (Callable | None): Post-processing hook passed to
            ``finalize_plot``.
        ax (Axes | None): Existing Axes to draw on. A new one is created
            when None.

    Returns:
        None

    Raises:
        AttributeError: If the estimator has no ``distances_`` attribute.
    """
    # scikit-learn only stores merge distances on request; fail early with an
    # actionable message instead of the bare attribute lookup error.
    if not hasattr(estimator, "distances_"):
        raise AttributeError(
            "estimator has no 'distances_' attribute; fit "
            "AgglomerativeClustering with compute_distances=True "
            "(or set distance_threshold) before drawing a dendrogram."
        )

    children = estimator.children_  # type: ignore
    n_samples = len(estimator.labels_)

    # counts[i] = number of original samples contained in merge node i.
    counts = np.zeros(children.shape[0])

    for i, merge in enumerate(children):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                # Indices below n_samples refer to original samples (leaves).
                current_count += 1
            else:
                # Larger indices refer to an earlier merge node.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # Columns: child a, child b, merge distance, sample count.
    linkage_matrix = np.column_stack(
        [children, estimator.distances_, counts]
    ).astype(float)

    # True only when this function created the Axes itself.
    outparams = False

    if ax is None:
        fig, ax = get_default_ax(width, height, 1, 1, dpi)  # type: ignore
        outparams = True

    sch_dendrogram(
        linkage_matrix,
        ax=ax,
        p=p,
        truncate_mode="lastp" if p > 0 else None,
        leaf_rotation=0,
        leaf_font_size=8,
        count_sort=count_sort,
        color_threshold=None,
        above_threshold_color="grey",
    )

    finalize_plot(ax, callback, outparams, save_path, True, title)  # type: ignore
|
|
2906
|
+
|
|
2907
|
+
|
|
2908
|
+
# ===================================================================
|
|
2909
|
+
# PCA 분석 결과에 대한 biplot 시각화
|
|
2910
|
+
# ===================================================================
|
|
2911
|
+
def pca_plot(
    estimator: PCA,
    data: DataFrame,
    yname: str | None = None,
    fields: list | tuple | list[list] | tuple[list] | list[tuple] | tuple[tuple] | None = None,
    hue: str | None = None,
    palette: str | None = None,
    width: int = config.width,
    height: int = config.height,
    linewidth: float = config.line_width,
    dpi: int = config.dpi,
    save_path: str | None = None,
    callback: Callable | None = None,
) -> None:
    """Draw PCA biplots (scaled scores plus feature loading arrows).

    ``data`` (minus the optional ``yname`` column) is projected with
    ``estimator.transform``. For every requested pair of principal
    components the scores are scaled by the reciprocal of their range and
    drawn through ``scatterplot``; one arrow per original feature is then
    overlaid using the corresponding entries of ``estimator.components_``.

    Args:
        estimator (PCA): Fitted PCA object.
        data (DataFrame): Data to project. Apart from ``yname`` its columns
            are passed to ``estimator.transform`` unchanged.
        yname (str | None): Dependent-variable column. When present in
            ``data`` it is excluded from the projection and appended to the
            score frame.
        fields (sequence | None): One pair, or a sequence of pairs, of score
            column names (``"PC1"``, ``"PC2"``, ...) to plot. When None,
            every 2-combination of the principal components is plotted.
        hue (str | None): Column of the score frame used for colouring. Only
            the ``PC*`` columns and ``yname`` exist in that frame.
        palette (str | None): Palette name forwarded to ``scatterplot``.
        width (int): Figure width forwarded to ``scatterplot``.
        height (int): Figure height forwarded to ``scatterplot``.
        linewidth (float): Line width forwarded to ``scatterplot``; arrows
            use 75% of it.
        dpi (int): Figure resolution forwarded to ``scatterplot``.
        save_path (str | None): Output path forwarded to ``scatterplot``.
            NOTE(review): the same path is passed for every pair; confirm
            whether later figures overwrite earlier ones.
        callback (Callable | None): Extra Axes hook, invoked after the
            loading arrows have been drawn.

    Returns:
        None
    """
    df = data.copy()

    # Split off the dependent variable as a plain array so that attaching it
    # to the score frame is positional and not index-aligned.
    yvalues = None
    if yname is not None and yname in data.columns:
        yvalues = df[yname].to_numpy()
        df = df.drop(columns=[yname])

    # Feature names must be taken after dropping yname so that position i
    # matches column i of estimator.components_.
    feature_names = df.columns.tolist()

    score = estimator.transform(df)

    pca_df = DataFrame(
        data=score,
        columns=[f"PC{i+1}" for i in range(estimator.n_components_)],
    )

    if yvalues is not None:
        pca_df[yname] = yvalues

    # Default: every pair of principal components.
    if fields is None:
        feature_cols = pca_df.columns.tolist()
        if yname is not None and yname in feature_cols:
            feature_cols.remove(yname)
        fields = list(combinations(feature_cols, 2))

    # Accept a single pair as well as a sequence of pairs.
    if not is_2d(fields):
        fields = [fields]  # type: ignore

    components = estimator.components_
    # components_ is (n_components, n_features): one arrow per feature.
    n_features = components.shape[1]

    def _draw_loadings(ax, x_index: int, y_index: int) -> None:
        """Overlay one loading arrow and label per feature, then run the user hook."""
        for i in range(n_features):
            ax.arrow(
                0,
                0,
                components[x_index, i],
                components[y_index, i],
                color="r",
                head_width=0.007,
                head_length=0.007,
                linewidth=linewidth * 0.75,
                alpha=0.75,
            )
            ax.text(
                components[x_index, i] * 1.15,
                components[y_index, i] * 1.15,
                f"{feature_names[i]} ({components[x_index, i]:.2f})",
                color="b",
                ha="center",
                va="center",
            )

        if callback is not None:
            callback(ax)

    for field_group in fields:  # type: ignore
        x_index = int(pca_df.columns.get_loc(field_group[0]))  # type: ignore
        y_index = int(pca_df.columns.get_loc(field_group[1]))  # type: ignore

        xs = score[:, x_index]
        ys = score[:, y_index]

        # Scale each axis by the reciprocal of its range.
        scalex = 1.0 / (xs.max() - xs.min())
        scaley = 1.0 / (ys.max() - ys.min())

        title = "PCA Biplot" + " - " + ", ".join(field_group)

        scatterplot(
            df=None,
            xname=xs * scalex,
            yname=ys * scaley,
            hue=pca_df[hue] if hue is not None else None,
            outline=False,
            palette=palette,
            width=width,
            height=height,
            linewidth=linewidth,
            dpi=dpi,
            save_path=save_path,
            title=title,
            # Bind the current indices explicitly instead of relying on
            # late-bound closure variables.
            callback=lambda ax, xi=x_index, yi=y_index: _draw_loadings(ax, xi, yi),
        )
|
|
3040
|
+
|