hossam 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hossam/__init__.py +19 -0
- hossam/hs_cluster copy.py +1060 -0
- hossam/hs_cluster.py +369 -128
- hossam/hs_plot.py +244 -13
- hossam/hs_prep.py +241 -56
- hossam/hs_stats.py +39 -2
- hossam/hs_util.py +20 -0
- {hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/METADATA +1 -1
- hossam-0.4.18.dist-info/RECORD +18 -0
- hossam-0.4.16.dist-info/RECORD +0 -17
- {hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/WHEEL +0 -0
- {hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/licenses/LICENSE +0 -0
- {hossam-0.4.16.dist-info → hossam-0.4.18.dist-info}/top_level.txt +0 -0
hossam/hs_plot.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
from types import SimpleNamespace
|
|
4
|
-
from typing import Callable
|
|
4
|
+
from typing import Callable, Literal
|
|
5
|
+
from itertools import combinations
|
|
5
6
|
|
|
6
7
|
# ===================================================================
|
|
7
8
|
import numpy as np
|
|
@@ -15,6 +16,7 @@ from pandas import DataFrame
|
|
|
15
16
|
# ===================================================================
|
|
16
17
|
from scipy.stats import t
|
|
17
18
|
from scipy.spatial import ConvexHull
|
|
19
|
+
from scipy.cluster.hierarchy import dendrogram as sch_dendrogram
|
|
18
20
|
from statsmodels.graphics.gofplots import qqplot as sm_qqplot
|
|
19
21
|
from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess
|
|
20
22
|
|
|
@@ -23,6 +25,7 @@ from statannotations.Annotator import Annotator
|
|
|
23
25
|
|
|
24
26
|
# ===================================================================
|
|
25
27
|
from sklearn.cluster import AgglomerativeClustering, KMeans
|
|
28
|
+
from sklearn.decomposition import PCA
|
|
26
29
|
|
|
27
30
|
from sklearn.metrics import (
|
|
28
31
|
mean_squared_error,
|
|
@@ -34,6 +37,8 @@ from sklearn.metrics import (
|
|
|
34
37
|
silhouette_samples,
|
|
35
38
|
)
|
|
36
39
|
|
|
40
|
+
from .hs_util import is_2d
|
|
41
|
+
|
|
37
42
|
# ===================================================================
|
|
38
43
|
config = SimpleNamespace(
|
|
39
44
|
dpi=200,
|
|
@@ -304,7 +309,7 @@ def lineplot(
|
|
|
304
309
|
# 상자그림(boxplot)을 그린다
|
|
305
310
|
# ===================================================================
|
|
306
311
|
def boxplot(
|
|
307
|
-
df: DataFrame,
|
|
312
|
+
df: DataFrame | None = None,
|
|
308
313
|
xname: str | None = None,
|
|
309
314
|
yname: str | None = None,
|
|
310
315
|
title: str | None = None,
|
|
@@ -326,7 +331,7 @@ def boxplot(
|
|
|
326
331
|
"""상자그림(boxplot)을 그린다.
|
|
327
332
|
|
|
328
333
|
Args:
|
|
329
|
-
df (DataFrame): 시각화할 데이터.
|
|
334
|
+
df (DataFrame|None): 시각화할 데이터.
|
|
330
335
|
xname (str|None): x축 범주 컬럼명.
|
|
331
336
|
yname (str|None): y축 값 컬럼명.
|
|
332
337
|
title (str|None): 그래프 제목.
|
|
@@ -354,13 +359,20 @@ def boxplot(
|
|
|
354
359
|
fig, ax = get_default_ax(width, height, 1, 1, dpi) # type: ignore
|
|
355
360
|
outparams = True
|
|
356
361
|
|
|
357
|
-
if xname is not None
|
|
362
|
+
if xname is not None or yname is not None:
|
|
363
|
+
if xname is not None and yname is None:
|
|
364
|
+
orient = "h"
|
|
365
|
+
elif xname is None and yname is not None:
|
|
366
|
+
orient = "v"
|
|
367
|
+
|
|
368
|
+
|
|
358
369
|
boxplot_kwargs = {
|
|
359
370
|
"data": df,
|
|
360
371
|
"x": xname,
|
|
361
372
|
"y": yname,
|
|
362
373
|
"orient": orient,
|
|
363
374
|
"ax": ax,
|
|
375
|
+
"linewidth": linewidth,
|
|
364
376
|
}
|
|
365
377
|
|
|
366
378
|
# hue 파라미터 확인 (params에 있을 수 있음)
|
|
@@ -372,12 +384,12 @@ def boxplot(
|
|
|
372
384
|
boxplot_kwargs["color"] = sb.color_palette(palette)[0]
|
|
373
385
|
|
|
374
386
|
boxplot_kwargs.update(params)
|
|
375
|
-
sb.boxplot(**boxplot_kwargs
|
|
387
|
+
sb.boxplot(**boxplot_kwargs)
|
|
376
388
|
|
|
377
389
|
# 통계 검정 추가
|
|
378
390
|
if stat_test is not None:
|
|
379
391
|
if stat_pairs is None:
|
|
380
|
-
stat_pairs = [df[xname].dropna().unique().tolist()]
|
|
392
|
+
stat_pairs = [df[xname].dropna().unique().tolist()] # type: ignore
|
|
381
393
|
|
|
382
394
|
annotator = Annotator(
|
|
383
395
|
ax, data=df, x=xname, y=yname, pairs=stat_pairs, orient=orient
|
|
@@ -749,7 +761,7 @@ def stackplot(
|
|
|
749
761
|
# 산점도를 그린다
|
|
750
762
|
# ===================================================================
|
|
751
763
|
def scatterplot(
|
|
752
|
-
df: DataFrame,
|
|
764
|
+
df: DataFrame | None,
|
|
753
765
|
xname: str,
|
|
754
766
|
yname: str,
|
|
755
767
|
hue=None,
|
|
@@ -769,7 +781,7 @@ def scatterplot(
|
|
|
769
781
|
"""산점도를 그린다.
|
|
770
782
|
|
|
771
783
|
Args:
|
|
772
|
-
df (DataFrame): 시각화할 데이터.
|
|
784
|
+
df (DataFrame | None): 시각화할 데이터.
|
|
773
785
|
xname (str): x축 컬럼.
|
|
774
786
|
yname (str): y축 컬럼.
|
|
775
787
|
hue (str|None): 범주 컬럼.
|
|
@@ -796,12 +808,12 @@ def scatterplot(
|
|
|
796
808
|
|
|
797
809
|
if outline and hue is not None:
|
|
798
810
|
# 군집별 값의 종류별로 반복 수행
|
|
799
|
-
for c in df[hue].unique():
|
|
811
|
+
for c in df[hue].unique(): # type: ignore
|
|
800
812
|
if c == -1:
|
|
801
813
|
continue
|
|
802
814
|
|
|
803
815
|
# 한 종류만 필터링한 결과에서 두 변수만 선택
|
|
804
|
-
df_c = df.loc[df[hue] == c, [xname, yname]]
|
|
816
|
+
df_c = df.loc[df[hue] == c, [xname, yname]] # type: ignore
|
|
805
817
|
|
|
806
818
|
try:
|
|
807
819
|
# 외각선 좌표 계산
|
|
@@ -842,15 +854,15 @@ def scatterplot(
|
|
|
842
854
|
else:
|
|
843
855
|
# 핵심벡터
|
|
844
856
|
scatterplot_kwargs["edgecolor"] = "#ffffff"
|
|
845
|
-
sb.scatterplot(data=df[df[vector] == "core"], **scatterplot_kwargs)
|
|
857
|
+
sb.scatterplot(data=df[df[vector] == "core"], **scatterplot_kwargs) # type: ignore
|
|
846
858
|
|
|
847
859
|
# 외곽백터
|
|
848
860
|
scatterplot_kwargs["edgecolor"] = "#000000"
|
|
849
861
|
scatterplot_kwargs["s"] = 25
|
|
850
862
|
scatterplot_kwargs["marker"] = "^"
|
|
851
863
|
scatterplot_kwargs["linewidth"] = 0.8
|
|
852
|
-
sb.scatterplot(data=df[df[vector] == "border"], **scatterplot_kwargs)
|
|
853
|
-
|
|
864
|
+
sb.scatterplot(data=df[df[vector] == "border"], **scatterplot_kwargs) # type: ignore
|
|
865
|
+
|
|
854
866
|
# 노이즈벡터
|
|
855
867
|
scatterplot_kwargs["edgecolor"] = None
|
|
856
868
|
scatterplot_kwargs["s"] = 25
|
|
@@ -2644,6 +2656,9 @@ def silhouette_plot(
|
|
|
2644
2656
|
finalize_plot(ax, callback, outparams, save_path, True, title) # type: ignore
|
|
2645
2657
|
|
|
2646
2658
|
|
|
2659
|
+
# ===================================================================
|
|
2660
|
+
# 군집분석 결과 시각화
|
|
2661
|
+
# ===================================================================
|
|
2647
2662
|
def cluster_plot(
|
|
2648
2663
|
estimator: KMeans | AgglomerativeClustering | None = None,
|
|
2649
2664
|
data: DataFrame | None = None,
|
|
@@ -2755,6 +2770,9 @@ def cluster_plot(
|
|
|
2755
2770
|
)
|
|
2756
2771
|
|
|
2757
2772
|
|
|
2773
|
+
# ===================================================================
|
|
2774
|
+
# 군집분석 결과의 실루엣 플롯과 군집 산점도를 한 화면에 함께 시각화
|
|
2775
|
+
# ===================================================================
|
|
2758
2776
|
def visualize_silhouette(
|
|
2759
2777
|
estimator: KMeans | AgglomerativeClustering,
|
|
2760
2778
|
data: DataFrame,
|
|
@@ -2819,3 +2837,216 @@ def visualize_silhouette(
|
|
|
2819
2837
|
)
|
|
2820
2838
|
|
|
2821
2839
|
finalize_plot(ax)
|
|
2840
|
+
|
|
2841
|
+
|
|
2842
|
+
|
|
2843
|
+
# ===================================================================
|
|
2844
|
+
# 덴드로그램 시각화
|
|
2845
|
+
# ===================================================================
|
|
2846
|
+
def dandrogram(
    estimator: AgglomerativeClustering,
    p: int = 30,
    count_sort: Literal["ascending", "descending", False] = "ascending",
    title: str | None = None,
    width: int = config.width,
    height: int = config.height,
    dpi: int = config.dpi,
    save_path: str | None = None,
    callback: Callable | None = None,
    ax: Axes | None = None
) -> None:
    """Draw a dendrogram for a fitted agglomerative clustering model.

    A SciPy-style linkage matrix is rebuilt from the estimator's
    ``children_`` and ``distances_`` attributes plus the number of samples
    under each merge node, and is then rendered with
    ``scipy.cluster.hierarchy.dendrogram``.

    NOTE: the function name is spelled ``dandrogram``; it is kept unchanged
    so existing callers keep working.

    Args:
        estimator (AgglomerativeClustering): Fitted model. It must expose
            ``distances_``, which scikit-learn only computes when the model
            is created with ``compute_distances=True`` or a
            ``distance_threshold``.
        p (int): Number of last merged clusters to show. When ``p > 0`` the
            tree is truncated with ``truncate_mode="lastp"``; otherwise the
            full tree is drawn. Defaults to 30.
        count_sort ("ascending" | "descending" | False): Child ordering
            passed straight through to SciPy's ``dendrogram``.
        title (str|None): Plot title.
        width (int): Canvas width in pixels.
        height (int): Canvas height in pixels.
        dpi (int): Figure resolution.
        save_path (str|None): Path handed to ``finalize_plot`` for saving.
        callback (Callable|None): Axes post-processing hook handed to
            ``finalize_plot``.
        ax (Axes|None): Axes supplied by the caller. A new one is created
            when None.

    Returns:
        None

    Raises:
        AttributeError: If the estimator has no ``distances_`` attribute.
    """
    children = estimator.children_  # type: ignore

    # Fail early with an actionable message instead of a bare attribute error.
    distances = getattr(estimator, "distances_", None)
    if distances is None:
        raise AttributeError(
            "estimator has no 'distances_'; fit AgglomerativeClustering with "
            "compute_distances=True or a distance_threshold to draw a dendrogram."
        )

    # Count the samples under every merge node: indices below n_samples are
    # leaves, higher indices refer to an earlier merge (row index - n_samples).
    n_samples = len(estimator.labels_)
    counts = np.zeros(children.shape[0])
    for i, merge in enumerate(children):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # Columns: child a, child b, merge distance, samples in the new cluster.
    linkage_matrix = np.column_stack([children, distances, counts]).astype(float)

    # outparams tells finalize_plot whether this function created the Axes.
    outparams = False
    if ax is None:
        fig, ax = get_default_ax(width, height, 1, 1, dpi)  # type: ignore
        outparams = True

    sch_dendrogram(
        linkage_matrix,
        ax=ax,
        p=p,
        truncate_mode="lastp" if p > 0 else None,
        leaf_rotation=0,
        leaf_font_size=8,
        count_sort=count_sort,
        color_threshold=None,
        above_threshold_color="grey",
    )

    finalize_plot(ax, callback, outparams, save_path, True, title)  # type: ignore
|
|
2913
|
+
|
|
2914
|
+
|
|
2915
|
+
# ===================================================================
|
|
2916
|
+
# PCA 분석 결과에 대한 biplot 시각화
|
|
2917
|
+
# ===================================================================
|
|
2918
|
+
def pca_plot(
    estimator: PCA,
    data: DataFrame,
    yname: str | None = None,
    fields: list | tuple | list[list] | tuple[list] | list[tuple] | tuple[tuple] | None = None,
    hue: str | None = None,
    palette: str | None = None,
    width: int = config.width,
    height: int = config.height,
    linewidth: float = config.line_width,
    dpi: int = config.dpi,
    save_path: str | None = None,
    callback: Callable | None = None,
) -> None:
    """Draw PCA biplots (scaled scores plus feature loading vectors).

    For every requested pair of principal components, the transformed
    scores are rescaled by the inverse of their range and drawn with
    ``scatterplot``. One arrow per original feature is then overlaid, using
    the matching entries of ``estimator.components_``.

    Args:
        estimator (PCA): Fitted PCA object.
        data (DataFrame): Data to project. ``yname`` is removed before the
            transform. When the estimator exposes ``feature_names_in_`` and
            all of those columns are present, only those columns are used,
            in fitted order.
        yname (str | None): Dependent-variable column name.
        fields (list | tuple | nested list/tuple | None): Pair(s) of
            principal-component names such as ``("PC1", "PC2")``. When None,
            every 2-combination of the components is plotted.
        hue (str | None): Grouping column name. It is looked up among the
            PCA result columns first and then in ``data``.
        palette (str | None): Palette name.
        width (int): Canvas width in pixels.
        height (int): Canvas height in pixels.
        linewidth (float): Line width; arrows use 75% of it.
        dpi (int): Figure resolution.
        save_path (str | None): Save path, forwarded unchanged to every
            ``scatterplot`` call.
        callback (Callable | None): Axes post-processing hook, invoked after
            the loading arrows are drawn.

    Returns:
        None
    """
    features = data.copy()

    # Split off the dependent variable. Keep raw values so that re-attaching
    # them to the RangeIndex-ed PCA frame does not align on `data`'s index.
    y_values = None
    if yname is not None and yname in features.columns:
        y_values = features[yname].to_numpy()
        features = features.drop(columns=[yname])

    # Prefer the exact columns (and order) the estimator was fitted on, so
    # that extra columns such as `hue` do not break the transform.
    fitted_names = getattr(estimator, "feature_names_in_", None)
    if fitted_names is not None and set(fitted_names).issubset(features.columns):
        features = features[list(fitted_names)]

    # Names used to label loading vectors; taken AFTER dropping `yname` so
    # they line up with the columns of `components_`.
    feature_names = features.columns.tolist()

    score = estimator.transform(features)

    pc_names = [f"PC{i+1}" for i in range(estimator.n_components_)]
    pca_df = DataFrame(data=score, columns=pc_names)
    if y_values is not None:
        pca_df[yname] = y_values

    # Default: every pair of principal components.
    if fields is None:
        fields = list(combinations(pc_names, 2))

    if not is_2d(fields):
        fields = [fields]  # type: ignore

    # Resolve the grouping values once; positional index matches `score`.
    hue_values = None
    if hue is not None:
        if hue in pca_df.columns:
            hue_values = pca_df[hue]
        else:
            hue_values = data[hue].reset_index(drop=True)

    components = estimator.components_

    def _loading_overlay(x_index: int, y_index: int) -> Callable:
        """Build an Axes callback that draws the loading vectors for one pair."""

        def _draw(ax) -> None:
            # components_ is (n_components, n_features): one arrow per feature.
            for i, name in enumerate(feature_names):
                ax.arrow(
                    0,
                    0,
                    components[x_index, i],
                    components[y_index, i],
                    color="r",
                    head_width=0.007,
                    head_length=0.007,
                    linewidth=linewidth * 0.75,
                    alpha=0.75,
                )
                ax.text(
                    components[x_index, i] * 1.15,
                    components[y_index, i] * 1.15,
                    f"{name} ({components[x_index, i]:.2f})",
                    color="b",
                    ha="center",
                    va="center",
                )

            if callback is not None:
                callback(ax)

        return _draw

    # NOTE(review): the same save_path is passed for every pair, so a later
    # plot presumably overwrites an earlier one — confirm against scatterplot.
    for field_group in fields:  # type: ignore
        x_index = int(pca_df.columns.get_loc(field_group[0]))  # type: ignore
        y_index = int(pca_df.columns.get_loc(field_group[1]))  # type: ignore

        xs = score[:, x_index]
        ys = score[:, y_index]

        # Rescale scores to unit range; a constant component keeps scale 1
        # instead of dividing by zero.
        x_range = xs.max() - xs.min()
        y_range = ys.max() - ys.min()
        scalex = 1.0 / x_range if x_range != 0 else 1.0
        scaley = 1.0 / y_range if y_range != 0 else 1.0

        title = "PCA Biplot" + " - " + ", ".join(field_group)

        tdf = DataFrame({
            field_group[0]: xs * scalex,
            field_group[1]: ys * scaley,
        })

        scatterplot(
            df=tdf,
            xname=field_group[0],
            yname=field_group[1],
            hue=hue_values,
            outline=False,
            palette=palette,
            width=width,
            height=height,
            linewidth=linewidth,
            dpi=dpi,
            save_path=save_path,
            title=title,
            callback=_loading_overlay(x_index, y_index),
        )
|
|
3052
|
+
|