pandas-plots 0.11.14__tar.gz → 0.11.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pandas-plots
3
- Version: 0.11.14
3
+ Version: 0.11.16
4
4
  Summary: A collection of helper for table handling and vizualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -22,7 +22,7 @@ License-File: LICENSE
22
22
  Requires-Dist: pandas>=2.0.0
23
23
  Requires-Dist: plotly>=5.18.0
24
24
  Requires-Dist: matplotlib>=3.8.2
25
- Requires-Dist: matplotlib-venn>=0.11.10
25
+ Requires-Dist: matplotlib-venn==0.11.10
26
26
  Requires-Dist: seaborn>=0.13.2
27
27
  Requires-Dist: Jinja2>=3.1.4
28
28
  Requires-Dist: requests>=2.32.0
@@ -83,6 +83,7 @@ tbl.show_num_df(
83
83
  - `describe_df()` an alternative version of pandas `describe()` function
84
84
  - `descr_db()` a very short descr for a `duckdb` relation
85
85
  - `pivot_df()` gets a pivot table of a 3 column dataframe (or 2 columns if no weights are given)
86
+ - `print_summary()` shows statistics for a pandas DataFrame or Series
86
87
 
87
88
  - `pls` for plotly visualizations
88
89
  - `plot_box()` auto annotated boxplot w/ violin option
@@ -49,6 +49,7 @@ tbl.show_num_df(
49
49
  - `describe_df()` an alternative version of pandas `describe()` function
50
50
  - `descr_db()` a very short descr for a `duckdb` relation
51
51
  - `pivot_df()` gets a pivot table of a 3 column dataframe (or 2 columns if no weights are given)
52
+ - `print_summary()` shows statistics for a pandas DataFrame or Series
52
53
 
53
54
  - `pls` for plotly visualizations
54
55
  - `plot_box()` auto annotated boxplot w/ violin option
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = pandas-plots
3
- version = 0.11.14
3
+ version = 0.11.16
4
4
  author = smeisegeier
5
5
  author_email = dexterDSDo@googlemail.com
6
6
  description = A collection of helper for table handling and vizualization
@@ -28,7 +28,7 @@ install_requires =
28
28
  pandas >= 2.0.0
29
29
  plotly >= 5.18.0
30
30
  matplotlib >= 3.8.2
31
- matplotlib-venn >= 0.11.10
31
+ matplotlib-venn == 0.11.10
32
32
  seaborn >= 0.13.2
33
33
  Jinja2 >= 3.1.4
34
34
  requests >= 2.32.0
@@ -11,6 +11,7 @@ from matplotlib import pyplot as plt
11
11
  from plotly import express as px
12
12
 
13
13
  from .hlp import *
14
+ from .tbl import print_summary
14
15
 
15
16
 
16
17
  def _set_caption(caption: str) -> str:
@@ -757,6 +758,7 @@ def plot_box(
757
758
  height: int = 200,
758
759
  width: int = 1200,
759
760
  annotations: bool = True,
761
+ summary: bool = True,
760
762
  caption: str = None,
761
763
  title: str = None,
762
764
  violin: bool = False,
@@ -776,6 +778,7 @@ def plot_box(
776
778
  violin: Use violin plot or not
777
779
  x_min: The minimum value for the x-axis scale (max and min must be set)
778
780
  x_max: The maximum value for the x-axis scale (max and min must be set)
781
+ summary: Whether to add a summary table to the plot
779
782
 
780
783
  Returns:
781
784
  None
@@ -885,6 +888,8 @@ def plot_box(
885
888
  )
886
889
 
887
890
  fig.show("png")
891
+ if summary:
892
+ print_summary(ser)
888
893
  return
889
894
 
890
895
 
@@ -896,6 +901,7 @@ def plot_boxes(
896
901
  height: int = 600,
897
902
  width: int = 800,
898
903
  annotations: bool = True,
904
+ summary: bool = True,
899
905
  title: str = None,
900
906
  ) -> None:
901
907
  """
@@ -909,6 +915,7 @@ def plot_boxes(
909
915
  height (int): The height of the plot.
910
916
  width (int): The width of the plot.
911
917
  annotations (bool): Whether to add annotations to the plot.
918
+ summary (bool): Whether to add a summary to the plot.
912
919
 
913
920
  Returns:
914
921
  None
@@ -1022,6 +1029,8 @@ def plot_boxes(
1022
1029
  fig.update_yaxes(title_text=df.columns[1])
1023
1030
 
1024
1031
  fig.show("png")
1032
+ if summary:
1033
+ print_summary(df)
1025
1034
  return
1026
1035
 
1027
1036
 
@@ -112,18 +112,19 @@ def describe_df(
112
112
  header = f"🟠 {col}({len(unis):_}|{df[col].dtype})"
113
113
  return unis, header
114
114
 
115
- # * show all columns
115
+ # HACK: this block somehow interferes with the plotly renderer, so it is run even when use_columns=False
116
116
  if use_columns:
117
117
  print("--- column uniques (all)")
118
118
  print(f"🟠 index {wrap_text(df.index.tolist()[:top_n_uniques])}")
119
- for col in df.columns[:]:
120
- _u, _h = get_uniques_header(col)
121
- # * check col type
122
- is_str = df.loc[:, col].dtype.kind == "O"
123
- # * wrap output
124
- print(
125
- f"{_h} {wrap_text(_u[:top_n_uniques], max_items_in_line=70, use_apo=is_str)}"
126
- )
119
+ for col in df.columns[:]:
120
+ _u, _h = get_uniques_header(col)
121
+ # * check col type
122
+ is_str = df.loc[:, col].dtype.kind == "O"
123
+ # * wrap output
124
+ if use_columns:
125
+ print(
126
+ f"{_h} {wrap_text(_u[:top_n_uniques], max_items_in_line=70, use_apo=is_str)}"
127
+ )
127
128
 
128
129
  print("--- column stats (numeric)")
129
130
  # * only show numerics
@@ -131,9 +132,10 @@ def describe_df(
131
132
  _u, _h = get_uniques_header(col)
132
133
 
133
134
  # * extra care for scipy metrics, these are very vulnerable to nan
134
- print(
135
- f"{_h} min: {round(df[col].min(),3):_} | max: {round(df[col].max(),3):_} | median: {round(df[col].median(),3):_} | mean: {round(df[col].mean(),3):_} | std: {round(df[col].std(),3):_} | cv: {round(df[col].std() / df[col].mean(),3):_} | sum: {round(df[col].sum(),3):_} | skew: {round(stats.skew(df[col].dropna().tolist()),3)} | kurto: {round(stats.kurtosis(df[col].dropna().tolist()),3)}"
136
- )
135
+ # print(
136
+ # f"{_h} min: {round(df[col].min(),3):_} | max: {round(df[col].max(),3):_} | median: {round(df[col].median(),3):_} | mean: {round(df[col].mean(),3):_} | std: {round(df[col].std(),3):_} | cv: {round(df[col].std() / df[col].mean(),3):_} | sum: {round(df[col].sum(),3):_} | skew: {round(stats.skew(df[col].dropna().tolist()),3)} | kurto: {round(stats.kurtosis(df[col].dropna().tolist()),3)}"
137
+ # )
138
+ print_summary(df[col], _h)
137
139
 
138
140
  # * show first 3 rows
139
141
  display(df[:3])
@@ -619,3 +621,57 @@ def show_num_df(
619
621
  )
620
622
 
621
623
  return out
624
+
625
+ def print_summary(df: pd.DataFrame | pd.Series, name: str="🟠 "):
626
+ """
627
+ Print statistical summary for a pandas DataFrame or Series.
628
+
629
+ The function computes and prints various statistics for each numeric column in a DataFrame
630
+ or for a Series. Statistics include minimum, lower bound, 25th percentile (Q1), median, mean,
631
+ 75th percentile (Q3), upper bound, maximum, standard deviation, coefficient of variation,
632
+ sum, skewness, and kurtosis. The interquartile range (IQR) is used to compute the lower
633
+ and upper bounds, which are adjusted not to exceed the min and max of the data.
634
+
635
+ Args:
636
+ df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
637
+ in DataFrame are considered.
638
+ """
639
+ if df.empty:
640
+ return
641
+
642
+ def print_summary_ser(ser: pd.Series, name: str=""):
643
+ # Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
644
+ iqr_value = stats.iqr(ser)
645
+
646
+ # Using the iqr function, we still calculate the bounds manually
647
+ q1 = stats.scoreatpercentile(ser, 25)
648
+ q3 = stats.scoreatpercentile(ser, 75)
649
+
650
+ # Calculate upper bound directly
651
+ min = round(ser.min(),3)
652
+ med = round(ser.median(),3)
653
+ upper = round(q3 + 1.5 * iqr_value,3)
654
+ lower = round(q1 - 1.5 * iqr_value,3)
655
+ mean = round(ser.mean(),3)
656
+ std = round(ser.std(),3)
657
+ cv = round(ser.std() / ser.mean(),3)
658
+ max = round(ser.max(),3)
659
+ sum = round(ser.sum(),3)
660
+ skew = round(stats.skew(ser.dropna().tolist()),3)
661
+ kurto = round(stats.kurtosis(ser.dropna().tolist()),3)
662
+
663
+ lower = min if lower < min else lower
664
+ upper = max if upper > max else upper
665
+
666
+ # * extra care for scipy metrics, these are very vulnarable to nan
667
+ print(
668
+ f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
669
+
670
+ if isinstance(df, pd.Series):
671
+ print_summary_ser(df, name)
672
+ return
673
+ if isinstance(df, pd.DataFrame):
674
+ # * only show numerics
675
+ for col in df.select_dtypes("number").columns:
676
+ print_summary_ser(ser=df[col], name=col)
677
+ return
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pandas-plots
3
- Version: 0.11.14
3
+ Version: 0.11.16
4
4
  Summary: A collection of helper for table handling and vizualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -22,7 +22,7 @@ License-File: LICENSE
22
22
  Requires-Dist: pandas>=2.0.0
23
23
  Requires-Dist: plotly>=5.18.0
24
24
  Requires-Dist: matplotlib>=3.8.2
25
- Requires-Dist: matplotlib-venn>=0.11.10
25
+ Requires-Dist: matplotlib-venn==0.11.10
26
26
  Requires-Dist: seaborn>=0.13.2
27
27
  Requires-Dist: Jinja2>=3.1.4
28
28
  Requires-Dist: requests>=2.32.0
@@ -83,6 +83,7 @@ tbl.show_num_df(
83
83
  - `describe_df()` an alternative version of pandas `describe()` function
84
84
  - `descr_db()` a very short descr for a `duckdb` relation
85
85
  - `pivot_df()` gets a pivot table of a 3 column dataframe (or 2 columns if no weights are given)
86
+ - `print_summary()` shows statistics for a pandas DataFrame or Series
86
87
 
87
88
  - `pls` for plotly visualizations
88
89
  - `plot_box()` auto annotated boxplot w/ violin option
@@ -1,7 +1,7 @@
1
1
  pandas>=2.0.0
2
2
  plotly>=5.18.0
3
3
  matplotlib>=3.8.2
4
- matplotlib-venn>=0.11.10
4
+ matplotlib-venn==0.11.10
5
5
  seaborn>=0.13.2
6
6
  Jinja2>=3.1.4
7
7
  requests>=2.32.0
File without changes