pcorr 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pcorr-0.5.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mark
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pcorr-0.5.1/PKG-INFO ADDED
@@ -0,0 +1,139 @@
1
+ Metadata-Version: 2.4
2
+ Name: pcorr
3
+ Version: 0.5.1
4
+ Summary: Pairwise Pearson correlations with p-values and multiple-comparison correction
5
+ Author: Mark
6
+ License-Expression: MIT
7
+ Project-URL: Repository, https://github.com/Cyber200potato/pcorr
8
+ Project-URL: Issues, https://github.com/Cyber200potato/pcorr/issues
9
+ Keywords: correlation,pearson,p-value,statistics,multiple-comparisons
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering
21
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
22
+ Requires-Python: >=3.8
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: numpy>=1.20
26
+ Requires-Dist: pandas>=1.2
27
+ Requires-Dist: scipy>=1.6
28
+ Provides-Extra: full
29
+ Requires-Dist: statsmodels>=0.13; extra == "full"
30
+ Provides-Extra: test
31
+ Requires-Dist: pytest>=7.0; extra == "test"
32
+ Requires-Dist: statsmodels>=0.13; extra == "test"
33
+ Dynamic: license-file
34
+
35
+ # pcorr
36
+
37
+ Compute all pairwise Pearson/Spearman correlations between numeric columns in a `pandas.DataFrame` — like `pandas.DataFrame.corr()`, but with **p-values for each pair** and **multiple-comparison correction** in one call.
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ pip install -e . # core: numpy, pandas, scipy
43
+ pip install -e ".[full]" # + statsmodels (fdr_bh, holm, sidak, ...)
44
+ ```
45
+
46
+ Without statsmodels only `method="bonferroni"` and `method="none"` are available.
47
+
48
+ If the package is published, you can install it as:
49
+
50
+ ```bash
51
+ pip install pcorr
52
+ pip install "pcorr[full]"
53
+ ```
54
+
55
+ ## Usage
56
+
57
+ ```python
58
+ import pandas as pd
59
+ from pcorr import corr_pairwise, corr_table, show_table
60
+
61
+ df = pd.read_csv("data.csv")
62
+
63
+ # Long (tidy) format: one row per pair
64
+ table = corr_pairwise(df, method="fdr_bh", alpha=0.05)
65
+ print(table)
66
+ # var1 var2 n r p_corrected p_value significant
67
+ # 0 x y 200 0.967 0.0000 0.0000 True
68
+ # 1 x z 200 -0.894 0.0000 0.0000 True
69
+ # ...
70
+
71
+ # Two square tables (r and p) rendered as lower-triangle output
72
+ tables = corr_table(df, method="bonferroni")
73
+ tables["r"] # coefficients
74
+ tables["p"] # p-values (raw when method="none", otherwise corrected)
75
+
76
+ # Convenience display (renders nicely in Jupyter, prints in console)
77
+ show_table(df, method="bonferroni")
78
+ ```
79
+
80
+ ## API
81
+
82
+ - `corr_pairwise(df, ...)` — tidy table: one row per column pair.
83
+ - `corr_table(df, ...)` — two square tables (coefficients and p-values) in a lower-triangle style.
84
+ - `show_table(df, ...)` — displays/prints `corr_table(...)`.
85
+ - `corr_matrices(df, ...)` — alias for `corr_table(...)` (compatibility).
86
+
87
+ ## Output format
88
+
89
+ ### corr_pairwise
90
+
91
+ Always returns:
92
+
93
+ - `var1`, `var2` — column names
94
+ - `n` — number of valid observations in the pair (after pairwise NaN deletion)
95
+ - `r` — correlation coefficient
96
+ - `significant` — `p < alpha` for the p-value used for significance
97
+
98
+ P-value columns depend on `method`:
99
+
100
+ - `method="none"` returns `p_value` (raw p-value)
101
+ - any correction (e.g. `bonferroni`, `fdr_bh`, `holm`) returns `p_corrected` and `p_value` (raw)
102
+
103
+ Raw p-values example:
104
+
105
+ ```python
106
+ from pcorr import corr_pairwise
107
+ out = corr_pairwise(df, method="none")
108
+ ```
109
+
110
+ ### corr_table / corr_matrices
111
+
112
+ Returns a dict with two `DataFrame`s:
113
+
114
+ - `tables["r"]` — coefficients (lower triangle filled), diagonal `"1"`, upper triangle blank
115
+ - `tables["p"]` — p-values in the same layout, diagonal `"—"`
116
+ - raw p-values when `method="none"`
117
+ - corrected p-values for any correction method
118
+
119
+ ## Parameters
120
+
121
+ | parameter | meaning |
122
+ |-----------|------------------------------------------------------------------|
123
+ | `columns` | which columns to use (default: all numeric) |
124
+ | `corr` | `"pearson"` or `"spearman"` |
125
+ | `method` | `bonferroni`, `fdr_bh`, `holm`, `sidak`, `none`, ... |
126
+ | `alpha` | threshold for `significant` |
127
+ | `min_n` | minimum valid observations per pair (pairwise NaN deletion) |
128
+ | `round_to`| rounding for r/p columns (None disables rounding) |
129
+
130
+ ## Notes
131
+
132
+ - Pairwise NaN deletion: each pair has its own `n`.
133
+ - Constant columns and pairs with `n < min_n` are handled without crashing.
134
+ - `corr_pairwise` is sorted by the p-value used for ranking (corrected when applicable).
135
+
136
+ ## P-value correction methods
137
+
138
+ - `method="none"` and `method="bonferroni"` work without statsmodels.
139
+ - Other methods use `statsmodels.stats.multitest.multipletests` and require statsmodels (install via the `full` extra).
pcorr-0.5.1/README.md ADDED
@@ -0,0 +1,105 @@
1
+ # pcorr
2
+
3
+ Compute all pairwise Pearson/Spearman correlations between numeric columns in a `pandas.DataFrame` — like `pandas.DataFrame.corr()`, but with **p-values for each pair** and **multiple-comparison correction** in one call.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install -e . # core: numpy, pandas, scipy
9
+ pip install -e ".[full]" # + statsmodels (fdr_bh, holm, sidak, ...)
10
+ ```
11
+
12
+ Without statsmodels only `method="bonferroni"` and `method="none"` are available.
13
+
14
+ To install the released package from PyPI instead of a source checkout:
15
+
16
+ ```bash
17
+ pip install pcorr
18
+ pip install "pcorr[full]"
19
+ ```
20
+
21
+ ## Usage
22
+
23
+ ```python
24
+ import pandas as pd
25
+ from pcorr import corr_pairwise, corr_table, show_table
26
+
27
+ df = pd.read_csv("data.csv")
28
+
29
+ # Long (tidy) format: one row per pair
30
+ table = corr_pairwise(df, method="fdr_bh", alpha=0.05)
31
+ print(table)
32
+ # var1 var2 n r p_corrected p_value significant
33
+ # 0 x y 200 0.967 0.0000 0.0000 True
34
+ # 1 x z 200 -0.894 0.0000 0.0000 True
35
+ # ...
36
+
37
+ # Two square tables (r and p) rendered as lower-triangle output
38
+ tables = corr_table(df, method="bonferroni")
39
+ tables["r"] # coefficients
40
+ tables["p"] # p-values (raw when method="none", otherwise corrected)
41
+
42
+ # Convenience display (renders nicely in Jupyter, prints in console)
43
+ show_table(df, method="bonferroni")
44
+ ```
45
+
46
+ ## API
47
+
48
+ - `corr_pairwise(df, ...)` — tidy table: one row per column pair.
49
+ - `corr_table(df, ...)` — two square tables (coefficients and p-values) in a lower-triangle style.
50
+ - `show_table(df, ...)` — displays/prints `corr_table(...)`.
51
+ - `corr_matrices(df, ...)` — alias for `corr_table(...)` (compatibility).
52
+
53
+ ## Output format
54
+
55
+ ### corr_pairwise
56
+
57
+ Always returns:
58
+
59
+ - `var1`, `var2` — column names
60
+ - `n` — number of valid observations in the pair (after pairwise NaN deletion)
61
+ - `r` — correlation coefficient
62
+ - `significant` — `True` when the reported p-value (`p_corrected` if a correction is applied, otherwise `p_value`) is below `alpha`
63
+
64
+ P-value columns depend on `method`:
65
+
66
+ - `method="none"` returns `p_value` (raw p-value)
67
+ - any correction (e.g. `bonferroni`, `fdr_bh`, `holm`) returns `p_corrected` and `p_value` (raw)
68
+
69
+ Raw p-values example:
70
+
71
+ ```python
72
+ from pcorr import corr_pairwise
73
+ out = corr_pairwise(df, method="none")
74
+ ```
75
+
76
+ ### corr_table / corr_matrices
77
+
78
+ Returns a dict with two `DataFrame`s:
79
+
80
+ - `tables["r"]` — coefficients (lower triangle filled), diagonal `"1"`, upper triangle blank
81
+ - `tables["p"]` — p-values in the same layout, diagonal `"—"`
82
+ - raw p-values when `method="none"`
83
+ - corrected p-values for any correction method
84
+
85
+ ## Parameters
86
+
87
+ | parameter | meaning |
88
+ |-----------|------------------------------------------------------------------|
89
+ | `columns` | which columns to use (default: all numeric) |
90
+ | `corr` | `"pearson"` or `"spearman"` |
91
+ | `method` | `bonferroni`, `fdr_bh`, `holm`, `sidak`, `none`, ... |
92
+ | `alpha` | threshold for `significant` |
93
+ | `min_n` | minimum valid observations per pair (pairwise NaN deletion) |
94
+ | `round_to`| rounding for r/p columns (None disables rounding) |
95
+
96
+ ## Notes
97
+
98
+ - Pairwise NaN deletion: each pair has its own `n`.
99
+ - Constant columns and pairs with `n < min_n` are handled without crashing.
100
+ - `corr_pairwise` is sorted by the p-value used for ranking (corrected when applicable).
101
+
102
+ ## P-value correction methods
103
+
104
+ - `method="none"` and `method="bonferroni"` work without statsmodels.
105
+ - Other methods use `statsmodels.stats.multitest.multipletests` and require statsmodels (install via the `full` extra).
@@ -0,0 +1,19 @@
1
+ """pcorr — pairwise Pearson/Spearman correlations with p-values and correction.
2
+
3
+ Combines the convenience of a pandas-style "all pairs" correlation matrix
4
+ with scipy's per-pair p-values, plus optional multiple-comparison correction.
5
+ """
6
+
7
+ from .core import corr_pairwise, corr_table, corr_matrices, show_table
8
+
9
# importlib.metadata ships with the standard library on every interpreter this
# package supports (requires-python >= 3.8), so no backport fallback is needed.
from importlib.metadata import PackageNotFoundError, version

try:
    __version__ = version("pcorr")
except PackageNotFoundError:
    # No installed distribution metadata for "pcorr" was found; fall back to a
    # placeholder so that `pcorr.__version__` is always defined.
    __version__ = "0.0.0"

__all__ = ["corr_pairwise", "corr_table", "corr_matrices", "show_table"]
@@ -0,0 +1,324 @@
1
+ """Pairwise Pearson/Spearman correlations with p-values and multiple-comparison correction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from itertools import combinations
6
+ from typing import Iterable, Optional
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from scipy.stats import pearsonr, spearmanr
11
+
12
# statsmodels is an optional dependency: remember whether its multiple-testing
# helper could be imported so callers can fail with a clear message later.
try:
    from statsmodels.stats.multitest import multipletests
except ImportError:  # pragma: no cover
    _HAS_STATSMODELS = False
else:
    _HAS_STATSMODELS = True


# Names accepted for the `method` argument ("none" means no correction).
_SUPPORTED_METHODS = {
    "none",
    "bonferroni",
    "sidak",
    "holm",
    "holm-sidak",
    "simes-hochberg",
    "hommel",
    "fdr_bh",
    "fdr_by",
    "fdr_tsbh",
    "fdr_tsbky",
}

# Maps the `corr` argument to the scipy function that computes (r, p).
_CORR_FUNCS = dict(pearson=pearsonr, spearman=spearmanr)
28
+
29
+
30
def _select_columns(df: pd.DataFrame, columns: Optional[Iterable[str]]) -> list:
    """Resolve the list of column labels to correlate.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    columns : iterable of str or None
        Explicit column labels, or None to use every numeric column of `df`.

    Returns
    -------
    list
        Column labels, in order, without repeats.

    Raises
    ------
    KeyError
        If an explicitly requested column is not present in `df`.
    ValueError
        If fewer than two columns remain.
    """
    if columns is None:
        cols = df.select_dtypes(include=np.number).columns.tolist()
    else:
        # dict.fromkeys drops repeated labels while keeping first-seen order;
        # a repeated label would otherwise end up paired with itself.
        cols = list(dict.fromkeys(columns))
        missing = [c for c in cols if c not in df.columns]
        if missing:
            # Fail early with the offending names instead of an indexing error
            # raised from inside the pairwise loop.
            raise KeyError(f"Columns not found in DataFrame: {missing}")
    if len(cols) < 2:
        raise ValueError("Need at least 2 numeric columns to correlate.")
    return cols
38
+
39
+
40
def _apply_correction(pvals: np.ndarray, method: str, alpha: float):
    """Correct p-values with statsmodels, leaving NaN entries untouched.

    Returns a tuple ``(corrected, rejected)`` with the same shape as `pvals`.
    NaN inputs stay NaN in ``corrected`` and are False in ``rejected``; only
    the non-NaN entries are handed to ``multipletests``.

    Raises ImportError when statsmodels is not installed.
    """
    p = np.asarray(pvals, dtype=float)

    if not _HAS_STATSMODELS:
        raise ImportError(
            "statsmodels is required for correction methods other than "
            "'bonferroni' and 'none'. Install it with `pip install statsmodels`, "
            "or use method='bonferroni'."
        )

    valid = ~np.isnan(p)
    adjusted = np.full_like(p, np.nan)
    reject = np.zeros_like(p, dtype=bool)

    # Nothing to correct when every entry is NaN (or the input is empty).
    if valid.any():
        reject_valid, adjusted_valid = multipletests(
            p[valid], alpha=alpha, method=method
        )[:2]
        adjusted[valid] = adjusted_valid
        reject[valid] = reject_valid

    return adjusted, reject
61
+
62
+
63
def _bonferroni(pvals: np.ndarray, alpha: float):
    """Bonferroni correction implemented with numpy only.

    Each non-NaN p-value is multiplied by the number of non-NaN p-values and
    capped at 1.0. Returns ``(corrected, rejected)``; NaN inputs stay NaN in
    ``corrected`` and are never rejected.
    """
    p = np.asarray(pvals, dtype=float)
    valid = ~np.isnan(p)
    n_tests = int(np.count_nonzero(valid))

    adjusted = np.full(p.shape, np.nan, dtype=float)
    adjusted[valid] = np.clip(p[valid] * n_tests, None, 1.0)

    reject = np.zeros(p.shape, dtype=bool)
    reject[valid] = adjusted[valid] < alpha
    return adjusted, reject
73
+
74
+
75
def corr_pairwise(
    df: pd.DataFrame,
    columns: Optional[Iterable[str]] = None,
    corr: str = "pearson",
    method: str = "none",
    alpha: float = 0.05,
    min_n: int = 3,
    round_to: Optional[int] = 4,
) -> pd.DataFrame:
    """Compute pairwise correlations in long (tidy) format.

    The p-value columns adapt to `method`:
      * method="none"  -> a single column `p_value` (uncorrected).
      * any correction -> columns `p_corrected` then `p_value` (raw).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Pairwise NaN deletion is applied per column pair.
    columns : iterable of str, optional
        Columns to correlate. Defaults to all numeric columns.
    corr : {"pearson", "spearman"}, default "pearson"
        Correlation coefficient to compute (case-insensitive).
    method : str, default "none"
        Multiple-comparison correction (case-insensitive). "none" reports raw
        p-values only. "bonferroni" works without statsmodels; the other
        methods (e.g. "fdr_bh", "holm", "sidak") require statsmodels.
    alpha : float, default 0.05
        Significance threshold. A pair is flagged significant when its
        reported p-value < alpha. Set 0.10 / 0.05 / 0.01 for 90/95/99%.
    min_n : int, default 3
        Minimum valid (non-NaN) observations per pair; smaller pairs are
        skipped and do not appear in the result.
    round_to : int or None, default 4
        Decimal places for r/p columns. None disables rounding.

    Returns
    -------
    pandas.DataFrame
        Columns:
          * method="none": var1, var2, n, r, p_value, significant.
          * otherwise:     var1, var2, n, r, p_corrected, p_value, significant.
        Sorted by p_corrected when a correction is used, otherwise by p_value,
        with NaN p-values last. `attrs` carries "corr", "correction_method"
        and "alpha", including when the result has no rows.

    Raises
    ------
    ValueError
        If `corr` or `method` is not recognised.
    """
    corr = corr.lower()
    if corr not in _CORR_FUNCS:
        raise ValueError(f"Unknown corr '{corr}'. Use 'pearson' or 'spearman'.")
    corr_func = _CORR_FUNCS[corr]

    method = method.lower()
    if method not in _SUPPORTED_METHODS:
        raise ValueError(
            f"Unknown method '{method}'. Supported: {sorted(_SUPPORTED_METHODS)}"
        )

    cols = _select_columns(df, columns)
    corrected_mode = method != "none"
    if corrected_mode:
        out_cols = ["var1", "var2", "n", "r", "p_corrected", "p_value", "significant"]
        sort_col = "p_corrected"
    else:
        out_cols = ["var1", "var2", "n", "r", "p_value", "significant"]
        sort_col = "p_value"

    records = []
    for a, b in combinations(cols, 2):
        pair = df[[a, b]].dropna()
        n = len(pair)
        if n < min_n:
            continue
        if pair[a].nunique() < 2 or pair[b].nunique() < 2:
            # A column with a single distinct value has no usable correlation;
            # report NaN rather than calling the scipy routine.
            r, p = np.nan, np.nan
        else:
            r, p = corr_func(pair[a], pair[b])
        records.append({"var1": a, "var2": b, "n": n, "r": r, "p_value": p})

    if records:
        out = pd.DataFrame.from_records(records)
        raw = out["p_value"].to_numpy(dtype=float)
        out["p_value"] = raw

        if corrected_mode:
            # Bonferroni has a pure-numpy path so it works without statsmodels.
            if method == "bonferroni" and not _HAS_STATSMODELS:
                corrected, _ = _bonferroni(raw, alpha)
            else:
                corrected, _ = _apply_correction(raw, method, alpha)
            out["p_corrected"] = corrected

        # Significance is decided on unrounded values, before any rounding.
        out["significant"] = out[sort_col] < alpha
        out = out.sort_values(sort_col, na_position="last").reset_index(drop=True)

        if round_to is not None:
            for c in ("r", "p_corrected", "p_value"):
                if c in out.columns:
                    out[c] = out[c].round(round_to)

        out = out[out_cols]
    else:
        # No pair met min_n: return an empty frame with the expected columns.
        out = pd.DataFrame(columns=out_cols)

    # Attach the metadata on every path so consumers can rely on it.
    out.attrs["corr"] = corr
    out.attrs["correction_method"] = method
    out.attrs["alpha"] = alpha
    return out
183
+
184
+
185
def corr_table(
    df: pd.DataFrame,
    columns: Optional[Iterable[str]] = None,
    corr: str = "pearson",
    method: str = "none",
    alpha: float = 0.05,
    min_n: int = 3,
    round_to: Optional[int] = 4,
    stars: bool = True,
):
    """Pairwise correlations as two square tables: coefficients and p-values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data (pairwise NaN deletion per pair).
    columns : iterable of str, optional
        Columns to use. Defaults to all numeric columns.
    corr : {"pearson", "spearman"}, default "pearson"
        Correlation coefficient.
    method : str, default "none"
        Multiple-comparison correction. "none" -> the p-table holds raw
        p-values; any other method -> it holds corrected p-values.
    alpha : float, default 0.05
        Significance threshold. A coefficient is starred when its reported
        p-value < alpha. Use 0.10 / 0.05 / 0.01 for 90 / 95 / 99%.
    min_n : int, default 3
        Minimum valid observations per pair.
    round_to : int or None, default 4
        Decimal places used when formatting both tables. None formats the
        values with `str()` instead of a fixed number of decimals.
    stars : bool, default True
        If True, a single "*" is appended to significant coefficients
        (reported p < alpha). If False, no marker is appended. The tables
        hold strings either way.

    Returns
    -------
    dict of pandas.DataFrame with keys:
        'r' : coefficient table. Lower triangle filled, upper triangle blank,
              diagonal = "1".
        'p' : p-value table in the same lower-triangle layout, diagonal = "—".
              Holds raw p-values (method="none") or corrected ones otherwise.

    Both are string tables sharing index/columns, so cells line up. A pair
    with an undefined coefficient, or one that was skipped because it had
    fewer than `min_n` observations, is shown as "—" in both tables. Each
    table's `attrs` carries "corr", "correction_method" (both lower-cased)
    and "alpha".
    """
    cols = _select_columns(df, columns)
    long = corr_pairwise(df, columns=cols, corr=corr, method=method,
                         alpha=alpha, min_n=min_n, round_to=None)
    method_name = method.lower()
    p_col = "p_corrected" if method_name != "none" else "p_value"

    # Look-up keyed by (row label, column label) in both orders. Pairs absent
    # from `long` (skipped for n < min_n) simply have no entry, so they are
    # rendered as missing instead of a misleading 0.0000.
    stats = {}
    for a, b, r, p, sig in zip(long["var1"], long["var2"], long["r"],
                               long[p_col], long["significant"]):
        stats[(a, b)] = stats[(b, a)] = (r, p, bool(sig))

    def _fmt(value):
        # round_to=None means "no rounding": fall back to the plain repr.
        return str(value) if round_to is None else f"{value:.{round_to}f}"

    cols_list = list(cols)
    size = len(cols_list)
    r_cells = [[""] * size for _ in range(size)]
    p_cells = [[""] * size for _ in range(size)]

    for i, ri in enumerate(cols_list):
        r_cells[i][i] = "1"
        p_cells[i][i] = "—"
        for j in range(i):  # lower triangle only; upper triangle stays blank
            r, p, sig = stats.get((ri, cols_list[j]), (np.nan, np.nan, False))
            if pd.isna(r):
                r_cells[i][j] = "—"
                p_cells[i][j] = "—"
            else:
                mark = "*" if (stars and sig) else ""
                r_cells[i][j] = f"{_fmt(r)}{mark}"
                p_cells[i][j] = _fmt(p)

    r_out = pd.DataFrame(r_cells, index=cols_list, columns=cols_list)
    p_out = pd.DataFrame(p_cells, index=cols_list, columns=cols_list)

    for d in (r_out, p_out):
        # Store normalized names so the metadata matches corr_pairwise.
        d.attrs["corr"] = corr.lower()
        d.attrs["correction_method"] = method_name
        d.attrs["alpha"] = alpha

    return {"r": r_out, "p": p_out}


# Alias of corr_table kept under a second name.
corr_matrices = corr_table
277
+
278
+
279
def show_table(df, columns=None, corr="pearson", method="none", alpha=0.05,
               min_n=3, round_to=4, stars=True, tables=None):
    """Display both tables (coefficients and p-values) with a plain pandas render.

    Inside IPython/Jupyter the two tables are shown with `display`, each
    preceded by a short bold caption, and the function returns None (so the
    notebook does not also echo the raw dict below them). Elsewhere the
    captions and tables are printed and the dict {'r', 'p'} is returned.

    If you need the tables as objects, call `corr_table(...)` directly.

    Takes the same arguments as `corr_table`. Pass a precomputed
    `tables=corr_table(...)` to skip recomputation; in that case `df` and the
    computation arguments are not used, and the caption details are read from
    the tables' `attrs` when present.
    """
    if tables is None:
        tables = corr_table(df, columns=columns, corr=corr, method=method,
                            alpha=alpha, min_n=min_n, round_to=round_to,
                            stars=stars)
    r_tab, p_tab = tables["r"], tables["p"]

    meta = r_tab.attrs
    corr_name = meta.get("corr", corr).capitalize()
    # Normalize case so that e.g. method="None" is still captioned as raw.
    meth = str(meta.get("correction_method", method)).lower()
    a = meta.get("alpha", alpha)
    star_note = f" (* p < {a})" if stars else ""
    p_kind = "raw" if meth == "none" else f"{meth}-corrected"

    r_caption = f"{corr_name} correlation coefficients{star_note}"
    p_caption = f"p-values ({p_kind})"

    try:
        from IPython.display import display, HTML
        get_ipython  # noqa: F821 — exists only inside IPython/Jupyter
        display(HTML(f"<b>{r_caption}</b>"))
        display(r_tab)  # plain pandas HTML render
        display(HTML(f"<b>{p_caption}</b>"))
        display(p_tab)
        # Return nothing: Jupyter would otherwise echo the dict as raw text
        # below the rendered tables. Use corr_table(...) if you need the object.
        return None
    except (ImportError, NameError):
        # IPython is not installed, or we are not running inside it.
        print(r_caption)
        print(r_tab.to_string())
        print()
        print(p_caption)
        print(p_tab.to_string())
        return tables
@@ -0,0 +1,139 @@
1
+ Metadata-Version: 2.4
2
+ Name: pcorr
3
+ Version: 0.5.1
4
+ Summary: Pairwise Pearson correlations with p-values and multiple-comparison correction
5
+ Author: Mark
6
+ License-Expression: MIT
7
+ Project-URL: Repository, https://github.com/Cyber200potato/pcorr
8
+ Project-URL: Issues, https://github.com/Cyber200potato/pcorr/issues
9
+ Keywords: correlation,pearson,p-value,statistics,multiple-comparisons
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering
21
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
22
+ Requires-Python: >=3.8
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: numpy>=1.20
26
+ Requires-Dist: pandas>=1.2
27
+ Requires-Dist: scipy>=1.6
28
+ Provides-Extra: full
29
+ Requires-Dist: statsmodels>=0.13; extra == "full"
30
+ Provides-Extra: test
31
+ Requires-Dist: pytest>=7.0; extra == "test"
32
+ Requires-Dist: statsmodels>=0.13; extra == "test"
33
+ Dynamic: license-file
34
+
35
+ # pcorr
36
+
37
+ Compute all pairwise Pearson/Spearman correlations between numeric columns in a `pandas.DataFrame` — like `pandas.DataFrame.corr()`, but with **p-values for each pair** and **multiple-comparison correction** in one call.
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ pip install -e . # core: numpy, pandas, scipy
43
+ pip install -e ".[full]" # + statsmodels (fdr_bh, holm, sidak, ...)
44
+ ```
45
+
46
+ Without statsmodels only `method="bonferroni"` and `method="none"` are available.
47
+
48
+ If the package is published, you can install it as:
49
+
50
+ ```bash
51
+ pip install pcorr
52
+ pip install "pcorr[full]"
53
+ ```
54
+
55
+ ## Usage
56
+
57
+ ```python
58
+ import pandas as pd
59
+ from pcorr import corr_pairwise, corr_table, show_table
60
+
61
+ df = pd.read_csv("data.csv")
62
+
63
+ # Long (tidy) format: one row per pair
64
+ table = corr_pairwise(df, method="fdr_bh", alpha=0.05)
65
+ print(table)
66
+ # var1 var2 n r p_corrected p_value significant
67
+ # 0 x y 200 0.967 0.0000 0.0000 True
68
+ # 1 x z 200 -0.894 0.0000 0.0000 True
69
+ # ...
70
+
71
+ # Two square tables (r and p) rendered as lower-triangle output
72
+ tables = corr_table(df, method="bonferroni")
73
+ tables["r"] # coefficients
74
+ tables["p"] # p-values (raw when method="none", otherwise corrected)
75
+
76
+ # Convenience display (renders nicely in Jupyter, prints in console)
77
+ show_table(df, method="bonferroni")
78
+ ```
79
+
80
+ ## API
81
+
82
+ - `corr_pairwise(df, ...)` — tidy table: one row per column pair.
83
+ - `corr_table(df, ...)` — two square tables (coefficients and p-values) in a lower-triangle style.
84
+ - `show_table(df, ...)` — displays/prints `corr_table(...)`.
85
+ - `corr_matrices(df, ...)` — alias for `corr_table(...)` (compatibility).
86
+
87
+ ## Output format
88
+
89
+ ### corr_pairwise
90
+
91
+ Always returns:
92
+
93
+ - `var1`, `var2` — column names
94
+ - `n` — number of valid observations in the pair (after pairwise NaN deletion)
95
+ - `r` — correlation coefficient
96
+ - `significant` — `p < alpha` for the p-value used for significance
97
+
98
+ P-value columns depend on `method`:
99
+
100
+ - `method="none"` returns `p_value` (raw p-value)
101
+ - any correction (e.g. `bonferroni`, `fdr_bh`, `holm`) returns `p_corrected` and `p_value` (raw)
102
+
103
+ Raw p-values example:
104
+
105
+ ```python
106
+ from pcorr import corr_pairwise
107
+ out = corr_pairwise(df, method="none")
108
+ ```
109
+
110
+ ### corr_table / corr_matrices
111
+
112
+ Returns a dict with two `DataFrame`s:
113
+
114
+ - `tables["r"]` — coefficients (lower triangle filled), diagonal `"1"`, upper triangle blank
115
+ - `tables["p"]` — p-values in the same layout, diagonal `"—"`
116
+ - raw p-values when `method="none"`
117
+ - corrected p-values for any correction method
118
+
119
+ ## Parameters
120
+
121
+ | parameter | meaning |
122
+ |-----------|------------------------------------------------------------------|
123
+ | `columns` | which columns to use (default: all numeric) |
124
+ | `corr` | `"pearson"` or `"spearman"` |
125
+ | `method` | `bonferroni`, `fdr_bh`, `holm`, `sidak`, `none`, ... |
126
+ | `alpha` | threshold for `significant` |
127
+ | `min_n` | minimum valid observations per pair (pairwise NaN deletion) |
128
+ | `round_to`| rounding for r/p columns (None disables rounding) |
129
+
130
+ ## Notes
131
+
132
+ - Pairwise NaN deletion: each pair has its own `n`.
133
+ - Constant columns and pairs with `n < min_n` are handled without crashing.
134
+ - `corr_pairwise` is sorted by the p-value used for ranking (corrected when applicable).
135
+
136
+ ## P-value correction methods
137
+
138
+ - `method="none"` and `method="bonferroni"` work without statsmodels.
139
+ - Other methods use `statsmodels.stats.multitest.multipletests` and require statsmodels (install via the `full` extra).
@@ -0,0 +1,11 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ pcorr/__init__.py
5
+ pcorr/core.py
6
+ pcorr.egg-info/PKG-INFO
7
+ pcorr.egg-info/SOURCES.txt
8
+ pcorr.egg-info/dependency_links.txt
9
+ pcorr.egg-info/requires.txt
10
+ pcorr.egg-info/top_level.txt
11
+ tests/test_core.py
@@ -0,0 +1,10 @@
1
+ numpy>=1.20
2
+ pandas>=1.2
3
+ scipy>=1.6
4
+
5
+ [full]
6
+ statsmodels>=0.13
7
+
8
+ [test]
9
+ pytest>=7.0
10
+ statsmodels>=0.13
@@ -0,0 +1 @@
1
+ pcorr
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pcorr"
7
+ version = "0.5.1"
8
+ description = "Pairwise Pearson correlations with p-values and multiple-comparison correction"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = "MIT"
12
+ license-files = ["LICENSE"]
13
+ authors = [{ name = "Mark" }]
14
+ keywords = ["correlation", "pearson", "p-value", "statistics", "multiple-comparisons"]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Science/Research",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3 :: Only",
21
+ "Programming Language :: Python :: 3.8",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Topic :: Scientific/Engineering",
27
+ "Topic :: Scientific/Engineering :: Mathematics",
28
+ ]
29
+
30
+ dependencies = [
31
+ "numpy>=1.20",
32
+ "pandas>=1.2",
33
+ "scipy>=1.6",
34
+ ]
35
+
36
+ [project.urls]
37
+ Repository = "https://github.com/Cyber200potato/pcorr"
38
+ Issues = "https://github.com/Cyber200potato/pcorr/issues"
39
+
40
+
41
+ [project.optional-dependencies]
42
+ full = ["statsmodels>=0.13"] # enables fdr_bh, holm, sidak, etc.
43
+ test = ["pytest>=7.0", "statsmodels>=0.13"]
44
+
45
+ [tool.setuptools.packages.find]
46
+ where = ["."]
47
+ include = ["pcorr*"]
pcorr-0.5.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,211 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import pytest
4
+
5
+ from pcorr import corr_pairwise, corr_table
6
+ from pcorr import corr_matrices
7
+
8
+
9
@pytest.fixture
def df():
    """Return a seeded 200-row frame with columns x, y, z and noise.

    ``y`` is built as ``2 * x`` plus noise and ``z`` as ``-x`` plus noise,
    while ``noise`` is drawn independently of ``x``.
    """
    rng = np.random.default_rng(42)
    n_rows = 200
    base = rng.normal(size=n_rows)
    # The draws below must stay in this order (y, z, noise) so the seeded
    # generator reproduces exactly the same data.
    y_jitter = rng.normal(scale=0.5, size=n_rows)
    z_jitter = rng.normal(scale=0.5, size=n_rows)
    unrelated = rng.normal(size=n_rows)
    columns = {
        "x": base,
        "y": 2 * base + y_jitter,  # strong positive
        "z": -base + z_jitter,  # strong negative
        "noise": unrelated,  # ~uncorrelated
    }
    return pd.DataFrame(columns)
19
+
20
+
21
+ # ---- corr_pairwise ----
22
+
23
def test_default_is_raw_pvalue(df):
    """With default arguments only the raw ``p_value`` column is reported."""
    result = corr_pairwise(df)
    expected_columns = ["var1", "var2", "n", "r", "p_value", "significant"]
    assert "p_value" in result.columns
    assert "p_corrected" not in result.columns
    assert list(result.columns) == expected_columns
    # The fixture has 4 columns, i.e. 6 unordered pairs.
    assert len(result) == 6
29
+
30
+
31
def test_corrected_includes_raw(df):
    """A correction method adds ``p_corrected`` and keeps ``p_value``."""
    result = corr_pairwise(df, method="bonferroni")
    expected_columns = [
        "var1", "var2", "n", "r", "p_corrected", "p_value", "significant",
    ]
    assert "p_corrected" in result.columns
    assert "p_value" in result.columns
    assert list(result.columns) == expected_columns
36
+
37
+
38
def test_pearson_matches_scipy(df):
    """Unrounded Pearson r and p for (x, y) agree with ``scipy.stats.pearsonr``."""
    from scipy.stats import pearsonr

    table = corr_pairwise(df, round_to=None)
    is_xy = (table.var1 == "x") & (table.var2 == "y")
    xy_row = table[is_xy].iloc[0]
    expected_r, expected_p = pearsonr(df["x"], df["y"])
    assert np.isclose(xy_row["r"], expected_r)
    assert np.isclose(xy_row["p_value"], expected_p)
45
+
46
+
47
def test_spearman_matches_scipy(df):
    """Unrounded Spearman r and p for (x, y) agree with ``scipy.stats.spearmanr``."""
    from scipy.stats import spearmanr

    table = corr_pairwise(df, corr="spearman", round_to=None)
    is_xy = (table.var1 == "x") & (table.var2 == "y")
    xy_row = table[is_xy].iloc[0]
    expected_r, expected_p = spearmanr(df["x"], df["y"])
    assert np.isclose(xy_row["r"], expected_r)
    assert np.isclose(xy_row["p_value"], expected_p)
54
+
55
+
56
def test_significant_flag_respects_alpha(df):
    """For every pair whose ``var2`` is noise, the flag equals ``p_value < alpha``."""
    result = corr_pairwise(df, alpha=0.05, round_to=None)
    noise_rows = result[result.var2 == "noise"]
    flags = noise_rows["significant"]
    p_values = noise_rows["p_value"]
    for flag, p_value in zip(flags, p_values):
        assert flag == (p_value < 0.05)
62
+
63
+
64
def test_alpha_changes_significance():
    """Loosening ``alpha`` may add significance but must never remove it.

    A seeded two-column frame with a weak linear relationship is evaluated
    at ``alpha=0.01`` and ``alpha=0.10``; whatever the p-value turns out to
    be, a pair flagged at the strict level must also be flagged at the
    loose one.
    """
    rng = np.random.default_rng(0)
    n = 80
    x = rng.normal(size=n)
    y = 0.25 * x + rng.normal(size=n)  # weak-ish correlation
    df = pd.DataFrame({"x": x, "y": y})
    strict = corr_pairwise(df, alpha=0.01).iloc[0]["significant"]
    loose = corr_pairwise(df, alpha=0.10).iloc[0]["significant"]
    # Booleans compare as 0/1, so this fails only when strict is True
    # while loose is False.
    assert bool(loose) >= bool(strict)
75
+
76
+
77
def test_correction_increases_pvalues(df):
    """Bonferroni-corrected p-values are never below the raw ones, pair by pair."""
    pair_key = ["var1", "var2"]
    uncorrected = corr_pairwise(df, method="none", round_to=None)
    corrected = corr_pairwise(df, method="bonferroni", round_to=None)
    # Overlapping column names get the _r / _c suffixes; p_corrected exists
    # only on the right-hand frame and therefore keeps its name.
    merged = uncorrected.set_index(pair_key).join(
        corrected.set_index(pair_key), lsuffix="_r", rsuffix="_c"
    )
    not_smaller = merged["p_corrected"] >= merged["p_value_r"] - 1e-12
    assert not_smaller.all()
82
+
83
+
84
def test_invalid_corr(df):
    """Passing ``corr="kendall"`` raises ``ValueError``."""
    rejected_kind = "kendall"
    with pytest.raises(ValueError):
        corr_pairwise(df, corr=rejected_kind)
87
+
88
+
89
def test_invalid_method(df):
    """An unknown correction method name raises ``ValueError``."""
    bogus_method = "not_a_method"
    with pytest.raises(ValueError):
        corr_pairwise(df, method=bogus_method)
92
+
93
+
94
def test_pairwise_nan_deletion():
    """Missing values are dropped per pair: (a, b) keeps its 4 complete rows."""
    nan = np.nan
    frame = pd.DataFrame({
        "a": [1, 2, 3, 4, nan],
        "b": [2, 4, 6, 8, 10],
        "c": [nan, nan, 1, 2, 3],
    })
    result = corr_pairwise(frame, min_n=2)
    is_ab = (result.var1 == "a") & (result.var2 == "b")
    ab_row = result[is_ab].iloc[0]
    assert ab_row["n"] == 4
    # On the four complete rows b is exactly 2 * a.
    assert np.isclose(ab_row["r"], 1.0)
104
+
105
+
106
def test_min_n_skips_pairs():
    """A pair with fewer complete rows than ``min_n`` is left out of the result."""
    # Only one row has both a and b present, which is below min_n=3.
    sparse = pd.DataFrame({"a": [1, np.nan, np.nan], "b": [1, 2, 3]})
    result = corr_pairwise(sparse, min_n=3)
    assert len(result) == 0
110
+
111
+
112
def test_constant_column_yields_nan():
    """A pair that involves a constant column reports ``r`` as NaN."""
    frame = pd.DataFrame({"a": [1, 1, 1, 1], "b": [1, 2, 3, 4]})
    first_row = corr_pairwise(frame).iloc[0]
    assert np.isnan(first_row["r"])
116
+
117
+
118
+ # ---- corr_table ----
119
+
120
def test_table_returns_two_matrices(df):
    """``corr_table`` returns "r" and "p" matrices that keep the input column order."""
    tables = corr_table(df)
    assert set(tables.keys()) == {"r", "p"}
    input_columns = list(df.columns)
    for key in ("r", "p"):
        assert list(tables[key].columns) == input_columns
125
+
126
+
127
def test_table_p_lower_triangle(df):
    """The p matrix has an em-dash diagonal, a blank upper and a filled lower triangle."""
    p_matrix = corr_table(df)["p"]
    labels = list(p_matrix.columns)
    first = labels[0]
    second = labels[1]
    assert p_matrix.loc[first, first] == "—"  # diagonal
    assert p_matrix.loc[first, second] == ""  # upper blank
    assert p_matrix.loc[second, first] != ""  # lower filled
135
+
136
+
137
def test_table_stars_only_significant(df):
    """Stars mark the strong x-y cell, but not the diagonal or the noise pairs."""
    r_matrix = corr_table(df, alpha=0.05)["r"]
    order = list(r_matrix.columns)

    # The lower-triangle cell of a pair has the later column as its row label.
    if order.index("x") < order.index("y"):
        row_label, col_label = "y", "x"
    else:
        row_label, col_label = "x", "y"
    assert r_matrix.loc[row_label, col_label].endswith("*")  # strong pair starred

    # The diagonal is never starred.
    assert r_matrix.loc["x", "x"] == "1"

    # No lower-triangle cell that pairs noise with another column is starred.
    noise_pos = order.index("noise")
    for pos, name in enumerate(order):
        if name == "noise":
            continue
        if noise_pos > pos:
            cell = r_matrix.loc["noise", name]
        else:
            cell = r_matrix.loc[name, "noise"]
        if cell in ("", "—"):
            continue
        assert not cell.endswith("*")
155
+
156
+
157
def test_table_no_stars(df):
    """With ``stars=False`` even the strong x-y cell carries no star."""
    r_matrix = corr_table(df, stars=False)["r"]
    order = list(r_matrix.columns)
    # The lower-triangle cell of a pair has the later column as its row label.
    if order.index("x") < order.index("y"):
        row_label, col_label = "y", "x"
    else:
        row_label, col_label = "x", "y"
    assert not r_matrix.loc[row_label, col_label].endswith("*")
    assert r_matrix.loc["x", "x"] == "1"
165
+
166
+
167
def test_table_corrected_pmatrix(df):
    """Each lower-triangle Bonferroni p is at least the matching raw p."""
    raw_p = corr_table(df, method="none", round_to=6)["p"]
    corrected_p = corr_table(df, method="bonferroni", round_to=6)["p"]
    labels = list(raw_p.columns)
    for row_pos, row_label in enumerate(labels):
        # Columns before row_pos are exactly the lower-triangle cells.
        for col_label in labels[:row_pos]:
            raw_cell = raw_p.loc[row_label, col_label]
            if raw_cell in ("", "—"):
                continue
            corrected_cell = corrected_p.loc[row_label, col_label]
            assert float(corrected_cell) >= float(raw_cell) - 1e-9
176
+
177
+
178
def test_table_alpha_controls_stars():
    """A looser ``alpha`` yields at least as many stars as a stricter one."""
    rng = np.random.default_rng(3)
    size = 60
    x_vals = rng.normal(size=size)
    y_vals = 0.3 * x_vals + rng.normal(size=size)
    frame = pd.DataFrame({"x": x_vals, "y": y_vals})

    labels = list(corr_table(frame)["r"].columns)
    row_label = labels[1]  # lower-triangle cell: second column as row,
    col_label = labels[0]  # first column as column
    loose_cell = corr_table(frame, alpha=0.10)["r"].loc[row_label, col_label]
    strict_cell = corr_table(frame, alpha=0.001)["r"].loc[row_label, col_label]
    assert loose_cell.count("*") >= strict_cell.count("*")
189
+
190
+
191
def test_corr_matrices_alias(df):
    """``corr_matrices`` returns a mapping with exactly the keys "r" and "p"."""
    tables = corr_matrices(df)
    expected_keys = {"r", "p"}
    assert set(tables.keys()) == expected_keys
194
+
195
+
196
def test_fdr_bh_outputs_corrected(df):
    """``fdr_bh`` p-values are never below the raw ones and stay within [0, 1].

    The test is skipped when ``statsmodels`` is not installed, because
    ``fdr_bh`` is one of the correction methods that the optional ``full``
    extra enables.
    """
    # Skip rather than fail on a core-only install.
    pytest.importorskip("statsmodels")
    raw = corr_pairwise(df, method="none", round_to=None).set_index(["var1", "var2"])
    cor = corr_pairwise(df, method="fdr_bh", round_to=None).set_index(["var1", "var2"])
    # Overlapping column names get the _r / _c suffixes; p_corrected exists
    # only on the corrected frame and therefore keeps its name.
    j = raw.join(cor, lsuffix="_r", rsuffix="_c")
    assert (j["p_corrected"] >= j["p_value_r"] - 1e-12).all()
    assert j["p_corrected"].dropna().between(0.0, 1.0).all()
202
+
203
+
204
def test_missing_statsmodels_blocks_noncore_methods(df, monkeypatch):
    """With statsmodels flagged as absent, "holm" raises while "bonferroni" still works."""
    import pcorr.core as core

    # Force the module-level availability flag off for this test only.
    monkeypatch.setattr(core, "_HAS_STATSMODELS", False)

    with pytest.raises(ImportError):
        corr_pairwise(df, method="holm")

    result = corr_pairwise(df, method="bonferroni")
    for column in ("p_corrected", "p_value"):
        assert column in result.columns