hossam 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hossam/hs_cluster.py CHANGED
@@ -23,6 +23,7 @@ from sklearn.metrics import silhouette_score, adjusted_rand_score
23
23
  # hossam 패키지 참조
24
24
  # ===================================================================
25
25
  from . import hs_plot
26
+ from .hs_util import is_2d
26
27
 
27
28
  RANDOM_STATE = 52
28
29
 
@@ -32,10 +33,11 @@ RANDOM_STATE = 52
32
33
  # ===================================================================
33
34
  def kmeans_fit(
34
35
  data: DataFrame,
35
- n_clusters: int,
36
+ n_clusters: int | None = None,
37
+ k_range: list | tuple = [2, 11],
36
38
  random_state: int = RANDOM_STATE,
37
39
  plot: bool = False,
38
- fields: list[list[str]] | None = None,
40
+ fields: list[str] | tuple[str] | tuple[tuple[str]] | list[list[str]] | None = None,
39
41
  **params,
40
42
  ) -> tuple[KMeans, DataFrame, float]:
41
43
  """
@@ -43,7 +45,7 @@ def kmeans_fit(
43
45
 
44
46
  Args:
45
47
  data (DataFrame): 군집화할 데이터프레임.
46
- n_clusters (int): 군집 개수.
48
+ n_clusters (int | None): 군집 개수.
47
49
  random_state (int, optional): 랜덤 시드. 기본값은 RANDOM_STATE.
48
50
  plot (bool, optional): True면 결과를 시각화함. 기본값 False.
49
51
  fields (list[list[str]] | None, optional): 시각화할 필드 쌍 리스트. 기본값 None이면 수치형 컬럼의 모든 조합 사용.
@@ -55,18 +57,36 @@ def kmeans_fit(
55
57
  float: 실루엣 점수
56
58
  """
57
59
  df = data.copy()
60
+
61
+ if n_clusters is None:
62
+ n_clusters = kmeans_best_k(data=df, k_range=k_range, random_state=random_state, plot=False)
63
+ print(f"Best k found: {n_clusters}")
64
+
58
65
  kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, **params)
59
66
  kmeans.fit(data)
60
67
  df["cluster"] = kmeans.predict(df)
61
68
  score = float(silhouette_score(X=data, labels=df["cluster"]))
62
69
 
63
70
  if plot:
64
- cluster_plot(
65
- estimator=kmeans,
66
- data=data,
67
- fields=fields,
68
- title=f"K-Means Clustering (k={n_clusters})",
69
- )
71
+
72
+ if not is_2d(fields):
73
+ fields = [fields] # type: ignore
74
+
75
+ # cluster_plot(
76
+ # estimator=kmeans,
77
+ # data=data,
78
+ # fields=fields,
79
+ # title=f"K-Means Clustering (k={n_clusters})",
80
+ # )
81
+ for f in fields: # type: ignore
82
+ hs_plot.visualize_silhouette(
83
+ estimator=kmeans,
84
+ data=data,
85
+ xname=f[0], # type: ignore
86
+ yname=f[1], # type: ignore
87
+ title=f"K-Means Clustering (k={n_clusters})",
88
+ outline=True,
89
+ )
70
90
 
71
91
  return kmeans, df, score
72
92
 
@@ -544,6 +564,9 @@ def persona(
544
564
  persona_dict[("", f"count")] = len(group)
545
565
 
546
566
  for field in fields:
567
+ if field == cluster:
568
+ continue
569
+
547
570
  # 명목형일 경우 최빈값 사용
548
571
  if df[field].dtype == "object" or df[field].dtype.name == "category":
549
572
  persona_dict[(field, "mode")] = group[field].mode()[0]
@@ -779,7 +802,9 @@ def dbscan_eps(
779
802
 
780
803
  return best_eps, eps_grid
781
804
 
782
-
805
+ # ===================================================================
806
+ # DBSCAN 군집화 모델을 적합하고 최적의 eps 값을 탐지하는 함수.
807
+ # ===================================================================
783
808
  def dbscan_fit(
784
809
  data: DataFrame,
785
810
  eps: float | list | np.ndarray | None = None,
@@ -789,6 +814,25 @@ def dbscan_fit(
789
814
  plot: bool = True,
790
815
  **params,
791
816
  ) -> tuple[DBSCAN, DataFrame, DataFrame]:
817
+ """
818
+ DBSCAN 군집화 모델을 적합하고 최적의 eps 값을 탐지하는 함수.
819
+
820
+ Args:
821
+ data (DataFrame): 군집화할 데이터프레임.
822
+ eps (float | list | np.ndarray | None, optional): eps 값 또는 리스트.
823
+ None이면 최적의 eps 값을 탐지함. 기본값 None.
824
+ min_samples (int, optional): 핵심점이 되기 위한 최소 샘플수. 기본값 5.
825
+ ari_threshold (float, optional): 안정 구간 탐지를 위한 ARI 임계값. 기본값 0.9.
826
+ noise_diff_threshold (float, optional): 안정 구간 탐지를 위한 노이즈 비율 변화 임계값. 기본값 0.05.
827
+ plot (bool, optional): True면 결과를 시각화함. 기본값 True.
828
+ **params: DBSCAN에 전달할 추가 파라미터.
829
+
830
+ Returns:
831
+ tuple: (estimator, cluster_df, result_df)
832
+ - estimator: 적합된 DBSCAN 모델 또는 모델 리스트(최적 eps가 여러 개인 경우).
833
+ - cluster_df: 클러스터 및 벡터 유형이 포함된 데이터 프레임 또는 데이터 프레임 리스트(최적 eps가 여러 개인 경우).
834
+ - result_df: eps 값에 따른 군집화 요약 통계 데이터 프레임.
835
+ """
792
836
 
793
837
  # eps 값이 지정되지 않은 경우 최적의 eps 탐지
794
838
  if eps is None:
@@ -874,18 +918,52 @@ def dbscan_fit(
874
918
  # result_dfs에서 recommand가 best에 해당하는 인덱스와 같은 위치의 추정기만 추출
875
919
  best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index) # type: ignore
876
920
 
877
- for i in range(len(estimators) - 1, -1, -1):
878
- if i not in best_indexes:
879
- del estimators[i]
880
- del cluster_dfs[i]
921
+ # for i in range(len(estimators) - 1, -1, -1):
922
+ # if i not in best_indexes:
923
+ # del estimators[i]
924
+ # del cluster_dfs[i]
881
925
 
882
926
  pbar.update(1)
883
927
 
884
- return (
885
- estimators[0] if len(estimators) == 1 else estimators, # type: ignore
886
- cluster_dfs[0] if len(cluster_dfs) == 1 else cluster_dfs,
887
- result_dfs, # type: ignore
888
- )
928
+ # best 모델 선정: recommand=='best'인 인덱스의 estimator/cluster_df만 반환
929
+ if len(estimators) == 1:
930
+
931
+ if plot:
932
+ hs_plot.scatterplot(
933
+ df=cluster_dfs[0],
934
+ xname=cluster_dfs[0].columns[0],
935
+ yname=cluster_dfs[0].columns[1],
936
+ hue="cluster",
937
+ vector="vector",
938
+ title=f"DBSCAN Clustering (eps={estimators[0].eps}, min_samples={estimators[0].min_samples})",
939
+ outline=True
940
+ )
941
+
942
+ return estimators[0], cluster_dfs[0], result_dfs # type: ignore
943
+
944
+ # recommand=='best'인 인덱스 추출 (여러 개면 첫 번째)
945
+ best_indexes = list(result_dfs[result_dfs["recommand"] == "best"].index) # type: ignore
946
+ if not best_indexes:
947
+ # fallback: 첫 번째
948
+ best_index = 0
949
+ else:
950
+ best_index = best_indexes[0]
951
+
952
+ best_estimator = estimators[best_index]
953
+ best_cluster_df = cluster_dfs[best_index]
954
+
955
+ if plot:
956
+ hs_plot.scatterplot(
957
+ df=best_cluster_df,
958
+ xname=best_cluster_df.columns[0],
959
+ yname=best_cluster_df.columns[1],
960
+ hue="cluster",
961
+ vector="vector",
962
+ title=f"DBSCAN Clustering (eps={best_estimator.eps}, min_samples={best_estimator.min_samples})",
963
+ outline=True
964
+ )
965
+
966
+ return best_estimator, best_cluster_df, result_dfs # type: ignore
889
967
 
890
968
 
891
969
  # ===================================================================
@@ -950,8 +1028,8 @@ def agg_fit(
950
1028
 
951
1029
  Returns:
952
1030
  tuple: (estimator(s), df(s), score_df)
953
- - estimator(s): 적합된 AgglomerativeClustering 모델 또는 모델 리스트.
954
- - df(s): 클러스터 결과가 포함된 데이터 프레임 또는 데이터 프레임 리스트.
1031
+ - estimator(s): 적합된 AgglomerativeClustering 모델 또는 모델 리스트 (n_clusters가 리스트일 때 리턴도 리스트로 처리됨).
1032
+ - df(s): 클러스터 결과가 포함된 데이터 프레임 또는 데이터 프레임 리스트 (n_clusters가 리스트일 때 리턴도 리스트로 처리됨).
955
1033
  - score_df: 각 군집 개수에 대한 실루엣 점수 데이터프레임.
956
1034
 
957
1035
  Examples:
hossam/hs_plot.py CHANGED
@@ -309,7 +309,7 @@ def lineplot(
309
309
  # 상자그림(boxplot)을 그린다
310
310
  # ===================================================================
311
311
  def boxplot(
312
- df: DataFrame,
312
+ df: DataFrame | None = None,
313
313
  xname: str | None = None,
314
314
  yname: str | None = None,
315
315
  title: str | None = None,
@@ -331,7 +331,7 @@ def boxplot(
331
331
  """상자그림(boxplot)을 그린다.
332
332
 
333
333
  Args:
334
- df (DataFrame): 시각화할 데이터.
334
+ df (DataFrame|None): 시각화할 데이터.
335
335
  xname (str|None): x축 범주 컬럼명.
336
336
  yname (str|None): y축 값 컬럼명.
337
337
  title (str|None): 그래프 제목.
@@ -359,13 +359,20 @@ def boxplot(
359
359
  fig, ax = get_default_ax(width, height, 1, 1, dpi) # type: ignore
360
360
  outparams = True
361
361
 
362
- if xname is not None and yname is not None:
362
+ if xname is not None or yname is not None:
363
+ if xname is not None and yname is None:
364
+ orient = "h"
365
+ elif xname is None and yname is not None:
366
+ orient = "v"
367
+
368
+
363
369
  boxplot_kwargs = {
364
370
  "data": df,
365
371
  "x": xname,
366
372
  "y": yname,
367
373
  "orient": orient,
368
374
  "ax": ax,
375
+ "linewidth": linewidth,
369
376
  }
370
377
 
371
378
  # hue 파라미터 확인 (params에 있을 수 있음)
@@ -377,12 +384,12 @@ def boxplot(
377
384
  boxplot_kwargs["color"] = sb.color_palette(palette)[0]
378
385
 
379
386
  boxplot_kwargs.update(params)
380
- sb.boxplot(**boxplot_kwargs, linewidth=linewidth)
387
+ sb.boxplot(**boxplot_kwargs)
381
388
 
382
389
  # 통계 검정 추가
383
390
  if stat_test is not None:
384
391
  if stat_pairs is None:
385
- stat_pairs = [df[xname].dropna().unique().tolist()]
392
+ stat_pairs = [df[xname].dropna().unique().tolist()] # type: ignore
386
393
 
387
394
  annotator = Annotator(
388
395
  ax, data=df, x=xname, y=yname, pairs=stat_pairs, orient=orient
@@ -847,15 +854,15 @@ def scatterplot(
847
854
  else:
848
855
  # 핵심벡터
849
856
  scatterplot_kwargs["edgecolor"] = "#ffffff"
850
- sb.scatterplot(data=df[df[vector] == "core"], **scatterplot_kwargs)
857
+ sb.scatterplot(data=df[df[vector] == "core"], **scatterplot_kwargs) # type: ignore
851
858
 
852
859
  # 외곽백터
853
860
  scatterplot_kwargs["edgecolor"] = "#000000"
854
861
  scatterplot_kwargs["s"] = 25
855
862
  scatterplot_kwargs["marker"] = "^"
856
863
  scatterplot_kwargs["linewidth"] = 0.8
857
- sb.scatterplot(data=df[df[vector] == "border"], **scatterplot_kwargs)
858
-
864
+ sb.scatterplot(data=df[df[vector] == "border"], **scatterplot_kwargs) # type: ignore
865
+
859
866
  # 노이즈벡터
860
867
  scatterplot_kwargs["edgecolor"] = None
861
868
  scatterplot_kwargs["s"] = 25
@@ -3022,10 +3029,15 @@ def pca_plot(
3022
3029
  if field_group is not None:
3023
3030
  title += " - " + ", ".join(field_group)
3024
3031
 
3032
+ tdf = DataFrame({
3033
+ field_group[0]: xs * scalex,
3034
+ field_group[1]: ys * scaley,
3035
+ })
3036
+
3025
3037
  scatterplot(
3026
- df=None,
3027
- xname=xs * scalex,
3028
- yname=ys * scaley,
3038
+ df=tdf,
3039
+ xname=field_group[0],
3040
+ yname=field_group[1],
3029
3041
  hue=pca_df[hue] if hue is not None else None,
3030
3042
  outline=False,
3031
3043
  palette=palette,
hossam/hs_stats.py CHANGED
@@ -244,6 +244,41 @@ def outlier_table(data: DataFrame, *fields: str):
244
244
  outlier_count = ((data[f] < down) | (data[f] > up)).sum()
245
245
  outlier_rate = (outlier_count / len(data)) * 100
246
246
 
247
+ # 왜도
248
+ skew = data[f].skew()
249
+
250
+ # 이상치 개수 및 비율
251
+ outlier_count = ((data[f] < down) | (data[f] > up)).sum()
252
+ outlier_rate = (outlier_count / len(data)) * 100
253
+
254
+ # 분포 특성 판정 (왜도 기준)
255
+ abs_skew = abs(skew) # type: ignore
256
+ if abs_skew < 0.5: # type: ignore
257
+ dist = "거의 대칭"
258
+ elif abs_skew < 1.0: # type: ignore
259
+ if skew > 0: # type: ignore
260
+ dist = "약한 우측 꼬리"
261
+ else:
262
+ dist = "약한 좌측 꼬리"
263
+ elif abs_skew < 2.0: # type: ignore
264
+ if skew > 0: # type: ignore
265
+ dist = "중간 우측 꼬리"
266
+ else:
267
+ dist = "중간 좌측 꼬리"
268
+ else:
269
+ if skew > 0: # type: ignore
270
+ dist = "극단 우측 꼬리"
271
+ else:
272
+ dist = "극단 좌측 꼬리"
273
+
274
+ # 로그변환 필요성 판정
275
+ if abs_skew < 0.5: # type: ignore
276
+ log_need = "낮음"
277
+ elif abs_skew < 1.0: # type: ignore
278
+ log_need = "중간"
279
+ else:
280
+ log_need = "높음"
281
+
247
282
  iq = {
248
283
  "field": f,
249
284
  "q1": q1,
@@ -254,9 +289,11 @@ def outlier_table(data: DataFrame, *fields: str):
254
289
  "down": down,
255
290
  "min": min_value,
256
291
  "max": max_value,
257
- "skew": skew,
258
292
  "outlier_count": outlier_count,
259
- "outlier_rate": outlier_rate
293
+ "outlier_rate": outlier_rate,
294
+ "skew": skew,
295
+ "dist": dist,
296
+ "log_need": log_need
260
297
  }
261
298
 
262
299
  result.append(iq)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hossam
3
- Version: 0.4.17
3
+ Version: 0.4.18
4
4
  Summary: Hossam Data Helper
5
5
  Author-email: Lee Kwang-Ho <leekh4232@gmail.com>
6
6
  License-Expression: MIT
@@ -0,0 +1,18 @@
1
+ hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
2
+ hossam/__init__.py,sha256=_v_Gj01j6nNtDfcxS9B2MB8ZPQ_PVnY9jTkvgRJdVxc,4492
3
+ hossam/hs_classroom.py,sha256=Sb1thy49LKn2zU90aiOVwHWhyWSMHLZbZX7eXmQlquc,27523
4
+ hossam/hs_cluster copy.py,sha256=1pylX0lV_FR1ZhcYN1sosMcAxj5WGLjwgHHjkkCPsCY,39761
5
+ hossam/hs_cluster.py,sha256=wrDPsF14tHwRuwBEUz765j_I87JH5HFtZZoUuGf-W-4,41190
6
+ hossam/hs_gis.py,sha256=DVmndBK-_7GMK3J1_on3ieEQk1S0MfUZ8_wlX-cDdZQ,11581
7
+ hossam/hs_plot.py,sha256=KW9NtD-ecFvas5h58ayEPzV0Eu2Dtt6twhj4z4LhPzA,103127
8
+ hossam/hs_prep.py,sha256=lsK_BxuOXNdTTNED-nkw3EY2vt5nQGMvkWvVLqNkwM0,43255
9
+ hossam/hs_stats.py,sha256=1h3RPOVJ_5EGO5_1yHTE8zpuJyy85xOLPqWmR-xqqmo,121161
10
+ hossam/hs_study.py,sha256=ZzL76_V0IHnk_YUTbWncIIBruOj2Sz3xs91snS6cpu0,2776
11
+ hossam/hs_timeserise.py,sha256=NzGV4bJmVQr3qUFySOP25qENItmloYjgh3VgwSbSmXc,43163
12
+ hossam/hs_util.py,sha256=LJvoTFSLX_uADJrC1ONGM93DX2HS2jcNBnPariLPZko,16704
13
+ hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
14
+ hossam-0.4.18.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
15
+ hossam-0.4.18.dist-info/METADATA,sha256=FUrD5dnNDWgA8dChQq8gmFg1lQ47YKiIdqhItMl6Vi8,3803
16
+ hossam-0.4.18.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
17
+ hossam-0.4.18.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
18
+ hossam-0.4.18.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- hossam/NotoSansKR-Regular.ttf,sha256=0SCufUQwcVWrWTu75j4Lt_V2bgBJIBXl1p8iAJJYkVY,6185516
2
- hossam/__init__.py,sha256=lJ-_g2HAmFnixOmKjCv7_cMSdiYwbM6SNlHEtptUlUI,4045
3
- hossam/hs_classroom.py,sha256=Sb1thy49LKn2zU90aiOVwHWhyWSMHLZbZX7eXmQlquc,27523
4
- hossam/hs_cluster.py,sha256=yFlEaLz-cEueurw5nvKuUogdwepOEU3jN7woaYeN-cM,37581
5
- hossam/hs_gis.py,sha256=DVmndBK-_7GMK3J1_on3ieEQk1S0MfUZ8_wlX-cDdZQ,11581
6
- hossam/hs_plot.py,sha256=nuN8DoQbMRLMxFRO1VD1zhg2yA1NHqFGrjuvS4FLahY,102773
7
- hossam/hs_prep.py,sha256=lsK_BxuOXNdTTNED-nkw3EY2vt5nQGMvkWvVLqNkwM0,43255
8
- hossam/hs_stats.py,sha256=MDS3rvaXDP8aYwcE36JTetWiZgE4fkXnNo0vwlXu-pA,119890
9
- hossam/hs_study.py,sha256=ZzL76_V0IHnk_YUTbWncIIBruOj2Sz3xs91snS6cpu0,2776
10
- hossam/hs_timeserise.py,sha256=NzGV4bJmVQr3qUFySOP25qENItmloYjgh3VgwSbSmXc,43163
11
- hossam/hs_util.py,sha256=LJvoTFSLX_uADJrC1ONGM93DX2HS2jcNBnPariLPZko,16704
12
- hossam/leekh.png,sha256=1PB5NQ24SDoHA5KMiBBsWpSa3iniFcwFTuGwuOsTHfI,6395
13
- hossam-0.4.17.dist-info/licenses/LICENSE,sha256=nIqzhlcFY_2D6QtFsYjwU7BWkafo-rUJOQpDZ-DsauI,941
14
- hossam-0.4.17.dist-info/METADATA,sha256=camh3iJFBxxmzX1tHg71MWwNcqcY3a_GSKbNhf8iR7E,3803
15
- hossam-0.4.17.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
16
- hossam-0.4.17.dist-info/top_level.txt,sha256=_-7bwjhthHplWhywEaHIJX2yL11CQCaLjCNSBlk6wiQ,7
17
- hossam-0.4.17.dist-info/RECORD,,