pcorr 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pcorr-0.5.1/LICENSE +21 -0
- pcorr-0.5.1/PKG-INFO +139 -0
- pcorr-0.5.1/README.md +105 -0
- pcorr-0.5.1/pcorr/__init__.py +19 -0
- pcorr-0.5.1/pcorr/core.py +324 -0
- pcorr-0.5.1/pcorr.egg-info/PKG-INFO +139 -0
- pcorr-0.5.1/pcorr.egg-info/SOURCES.txt +11 -0
- pcorr-0.5.1/pcorr.egg-info/dependency_links.txt +1 -0
- pcorr-0.5.1/pcorr.egg-info/requires.txt +10 -0
- pcorr-0.5.1/pcorr.egg-info/top_level.txt +1 -0
- pcorr-0.5.1/pyproject.toml +47 -0
- pcorr-0.5.1/setup.cfg +4 -0
- pcorr-0.5.1/tests/test_core.py +211 -0
pcorr-0.5.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mark
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pcorr-0.5.1/PKG-INFO
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pcorr
|
|
3
|
+
Version: 0.5.1
|
|
4
|
+
Summary: Pairwise Pearson correlations with p-values and multiple-comparison correction
|
|
5
|
+
Author: Mark
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/Cyber200potato/pcorr
|
|
8
|
+
Project-URL: Issues, https://github.com/Cyber200potato/pcorr/issues
|
|
9
|
+
Keywords: correlation,pearson,p-value,statistics,multiple-comparisons
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: numpy>=1.20
|
|
26
|
+
Requires-Dist: pandas>=1.2
|
|
27
|
+
Requires-Dist: scipy>=1.6
|
|
28
|
+
Provides-Extra: full
|
|
29
|
+
Requires-Dist: statsmodels>=0.13; extra == "full"
|
|
30
|
+
Provides-Extra: test
|
|
31
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
32
|
+
Requires-Dist: statsmodels>=0.13; extra == "test"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# pcorr
|
|
36
|
+
|
|
37
|
+
Compute all pairwise Pearson/Spearman correlations between numeric columns in a `pandas.DataFrame` — like `pandas.DataFrame.corr()`, but with **p-values for each pair** and **multiple-comparison correction** in one call.
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install -e . # core: numpy, pandas, scipy
|
|
43
|
+
pip install -e ".[full]" # + statsmodels (fdr_bh, holm, sidak, ...)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Without statsmodels only `method="bonferroni"` and `method="none"` are available.
|
|
47
|
+
|
|
48
|
+
If the package is published, you can install it as:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install pcorr
|
|
52
|
+
pip install "pcorr[full]"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
import pandas as pd
|
|
59
|
+
from pcorr import corr_pairwise, corr_table, show_table
|
|
60
|
+
|
|
61
|
+
df = pd.read_csv("data.csv")
|
|
62
|
+
|
|
63
|
+
# Long (tidy) format: one row per pair
|
|
64
|
+
table = corr_pairwise(df, method="fdr_bh", alpha=0.05)
|
|
65
|
+
print(table)
|
|
66
|
+
# var1 var2 n r p_corrected p_value significant
|
|
67
|
+
# 0 x y 200 0.967 0.0000 0.0000 True
|
|
68
|
+
# 1 x z 200 -0.894 0.0000 0.0000 True
|
|
69
|
+
# ...
|
|
70
|
+
|
|
71
|
+
# Two square tables (r and p) rendered as lower-triangle output
|
|
72
|
+
tables = corr_table(df, method="bonferroni")
|
|
73
|
+
tables["r"] # coefficients
|
|
74
|
+
tables["p"] # p-values (raw when method="none", otherwise corrected)
|
|
75
|
+
|
|
76
|
+
# Convenience display (renders nicely in Jupyter, prints in console)
|
|
77
|
+
show_table(df, method="bonferroni")
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## API
|
|
81
|
+
|
|
82
|
+
- `corr_pairwise(df, ...)` — tidy table: one row per column pair.
|
|
83
|
+
- `corr_table(df, ...)` — two square tables (coefficients and p-values) in a lower-triangle style.
|
|
84
|
+
- `show_table(df, ...)` — displays/prints `corr_table(...)`.
|
|
85
|
+
- `corr_matrices(df, ...)` — alias for `corr_table(...)` (compatibility).
|
|
86
|
+
|
|
87
|
+
## Output format
|
|
88
|
+
|
|
89
|
+
### corr_pairwise
|
|
90
|
+
|
|
91
|
+
Always returns:
|
|
92
|
+
|
|
93
|
+
- `var1`, `var2` — column names
|
|
94
|
+
- `n` — number of valid observations in the pair (after pairwise NaN deletion)
|
|
95
|
+
- `r` — correlation coefficient
|
|
96
|
+
- `significant` — `True` when the reported p-value (`p_corrected` when a correction is applied, otherwise the raw `p_value`) is below `alpha`
|
|
97
|
+
|
|
98
|
+
P-value columns depend on `method`:
|
|
99
|
+
|
|
100
|
+
- `method="none"` returns `p_value` (raw p-value)
|
|
101
|
+
- any correction (e.g. `bonferroni`, `fdr_bh`, `holm`) returns `p_corrected` and `p_value` (raw)
|
|
102
|
+
|
|
103
|
+
Raw p-values example:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from pcorr import corr_pairwise
|
|
107
|
+
out = corr_pairwise(df, method="none")
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### corr_table / corr_matrices
|
|
111
|
+
|
|
112
|
+
Returns a dict with two `DataFrame`s:
|
|
113
|
+
|
|
114
|
+
- `tables["r"]` — coefficients (lower triangle filled), diagonal `"1"`, upper triangle blank
|
|
115
|
+
- `tables["p"]` — p-values in the same layout, diagonal `"—"`
|
|
116
|
+
- raw p-values when `method="none"`
|
|
117
|
+
- corrected p-values for any correction method
|
|
118
|
+
|
|
119
|
+
## Parameters
|
|
120
|
+
|
|
121
|
+
| parameter | meaning |
|
|
122
|
+
|-----------|------------------------------------------------------------------|
|
|
123
|
+
| `columns` | which columns to use (default: all numeric) |
|
|
124
|
+
| `corr` | `"pearson"` or `"spearman"` |
|
|
125
|
+
| `method` | `bonferroni`, `fdr_bh`, `holm`, `sidak`, `none`, ... |
|
|
126
|
+
| `alpha` | threshold for `significant` |
|
|
127
|
+
| `min_n` | minimum valid observations per pair (pairwise NaN deletion) |
|
|
128
|
+
| `round_to`| rounding for r/p columns (None disables rounding) |
|
|
129
|
+
|
|
130
|
+
## Notes
|
|
131
|
+
|
|
132
|
+
- Pairwise NaN deletion: each pair has its own `n`.
|
|
133
|
+
- Constant columns do not raise: their pairs are reported with `NaN` for `r` and the p-values. Pairs with `n < min_n` are skipped and do not appear in `corr_pairwise` output.
|
|
134
|
+
- `corr_pairwise` is sorted by the p-value used for ranking (corrected when applicable).
|
|
135
|
+
|
|
136
|
+
## P-value correction methods
|
|
137
|
+
|
|
138
|
+
- `method="none"` and `method="bonferroni"` work without statsmodels.
|
|
139
|
+
- Other methods use `statsmodels.stats.multitest.multipletests` and require statsmodels (install via the `full` extra).
|
pcorr-0.5.1/README.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# pcorr
|
|
2
|
+
|
|
3
|
+
Compute all pairwise Pearson/Spearman correlations between numeric columns in a `pandas.DataFrame` — like `pandas.DataFrame.corr()`, but with **p-values for each pair** and **multiple-comparison correction** in one call.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install -e . # core: numpy, pandas, scipy
|
|
9
|
+
pip install -e ".[full]" # + statsmodels (fdr_bh, holm, sidak, ...)
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
Without statsmodels only `method="bonferroni"` and `method="none"` are available.
|
|
13
|
+
|
|
14
|
+
If the package is published, you can install it as:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install pcorr
|
|
18
|
+
pip install "pcorr[full]"
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
import pandas as pd
|
|
25
|
+
from pcorr import corr_pairwise, corr_table, show_table
|
|
26
|
+
|
|
27
|
+
df = pd.read_csv("data.csv")
|
|
28
|
+
|
|
29
|
+
# Long (tidy) format: one row per pair
|
|
30
|
+
table = corr_pairwise(df, method="fdr_bh", alpha=0.05)
|
|
31
|
+
print(table)
|
|
32
|
+
# var1 var2 n r p_corrected p_value significant
|
|
33
|
+
# 0 x y 200 0.967 0.0000 0.0000 True
|
|
34
|
+
# 1 x z 200 -0.894 0.0000 0.0000 True
|
|
35
|
+
# ...
|
|
36
|
+
|
|
37
|
+
# Two square tables (r and p) rendered as lower-triangle output
|
|
38
|
+
tables = corr_table(df, method="bonferroni")
|
|
39
|
+
tables["r"] # coefficients
|
|
40
|
+
tables["p"] # p-values (raw when method="none", otherwise corrected)
|
|
41
|
+
|
|
42
|
+
# Convenience display (renders nicely in Jupyter, prints in console)
|
|
43
|
+
show_table(df, method="bonferroni")
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## API
|
|
47
|
+
|
|
48
|
+
- `corr_pairwise(df, ...)` — tidy table: one row per column pair.
|
|
49
|
+
- `corr_table(df, ...)` — two square tables (coefficients and p-values) in a lower-triangle style.
|
|
50
|
+
- `show_table(df, ...)` — displays/prints `corr_table(...)`.
|
|
51
|
+
- `corr_matrices(df, ...)` — alias for `corr_table(...)` (compatibility).
|
|
52
|
+
|
|
53
|
+
## Output format
|
|
54
|
+
|
|
55
|
+
### corr_pairwise
|
|
56
|
+
|
|
57
|
+
Always returns:
|
|
58
|
+
|
|
59
|
+
- `var1`, `var2` — column names
|
|
60
|
+
- `n` — number of valid observations in the pair (after pairwise NaN deletion)
|
|
61
|
+
- `r` — correlation coefficient
|
|
62
|
+
- `significant` — `True` when the reported p-value (`p_corrected` when a correction is applied, otherwise the raw `p_value`) is below `alpha`
|
|
63
|
+
|
|
64
|
+
P-value columns depend on `method`:
|
|
65
|
+
|
|
66
|
+
- `method="none"` returns `p_value` (raw p-value)
|
|
67
|
+
- any correction (e.g. `bonferroni`, `fdr_bh`, `holm`) returns `p_corrected` and `p_value` (raw)
|
|
68
|
+
|
|
69
|
+
Raw p-values example:
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from pcorr import corr_pairwise
|
|
73
|
+
out = corr_pairwise(df, method="none")
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### corr_table / corr_matrices
|
|
77
|
+
|
|
78
|
+
Returns a dict with two `DataFrame`s:
|
|
79
|
+
|
|
80
|
+
- `tables["r"]` — coefficients (lower triangle filled), diagonal `"1"`, upper triangle blank
|
|
81
|
+
- `tables["p"]` — p-values in the same layout, diagonal `"—"`
|
|
82
|
+
- raw p-values when `method="none"`
|
|
83
|
+
- corrected p-values for any correction method
|
|
84
|
+
|
|
85
|
+
## Parameters
|
|
86
|
+
|
|
87
|
+
| parameter | meaning |
|
|
88
|
+
|-----------|------------------------------------------------------------------|
|
|
89
|
+
| `columns` | which columns to use (default: all numeric) |
|
|
90
|
+
| `corr` | `"pearson"` or `"spearman"` |
|
|
91
|
+
| `method` | `bonferroni`, `fdr_bh`, `holm`, `sidak`, `none`, ... |
|
|
92
|
+
| `alpha` | threshold for `significant` |
|
|
93
|
+
| `min_n` | minimum valid observations per pair (pairwise NaN deletion) |
|
|
94
|
+
| `round_to`| rounding for r/p columns (None disables rounding) |
|
|
95
|
+
|
|
96
|
+
## Notes
|
|
97
|
+
|
|
98
|
+
- Pairwise NaN deletion: each pair has its own `n`.
|
|
99
|
+
- Constant columns do not raise: their pairs are reported with `NaN` for `r` and the p-values. Pairs with `n < min_n` are skipped and do not appear in `corr_pairwise` output.
|
|
100
|
+
- `corr_pairwise` is sorted by the p-value used for ranking (corrected when applicable).
|
|
101
|
+
|
|
102
|
+
## P-value correction methods
|
|
103
|
+
|
|
104
|
+
- `method="none"` and `method="bonferroni"` work without statsmodels.
|
|
105
|
+
- Other methods use `statsmodels.stats.multitest.multipletests` and require statsmodels (install via the `full` extra).
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""pcorr — pairwise Pearson/Spearman correlations with p-values and correction.
|
|
2
|
+
|
|
3
|
+
Combines the convenience of a pandas-style "all pairs" correlation matrix
|
|
4
|
+
with scipy's per-pair p-values, plus optional multiple-comparison correction.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .core import corr_pairwise, corr_table, corr_matrices, show_table
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
11
|
+
except ImportError:
|
|
12
|
+
from importlib_metadata import PackageNotFoundError, version
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
__version__ = version("pcorr")
|
|
16
|
+
except PackageNotFoundError:
|
|
17
|
+
__version__ = "0.0.0"
|
|
18
|
+
|
|
19
|
+
__all__ = ["corr_pairwise", "corr_table", "corr_matrices", "show_table"]
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
"""Pairwise Pearson/Spearman correlations with p-values and multiple-comparison correction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from itertools import combinations
|
|
6
|
+
from typing import Iterable, Optional
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from scipy.stats import pearsonr, spearmanr
|
|
11
|
+
|
|
12
|
+
try:
|
|
13
|
+
from statsmodels.stats.multitest import multipletests
|
|
14
|
+
_HAS_STATSMODELS = True
|
|
15
|
+
except ImportError: # pragma: no cover
|
|
16
|
+
_HAS_STATSMODELS = False
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Names accepted for the `method` argument of the public functions.
# "none" means "report raw p-values"; the remaining names are forwarded
# unchanged as the `method` argument of statsmodels' `multipletests`.
_SUPPORTED_METHODS = {
    "none",
    "bonferroni",
    "sidak",
    "holm",
    "holm-sidak",
    "simes-hochberg",
    "hommel",
    "fdr_bh",
    "fdr_by",
    "fdr_tsbh",
    "fdr_tsbky",
}

# Maps the lower-cased `corr` argument to the scipy routine that returns
# an (r, p) pair for two samples.
_CORR_FUNCS = dict(pearson=pearsonr, spearman=spearmanr)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _select_columns(df: pd.DataFrame, columns: Optional[Iterable[str]]) -> list:
    """Resolve which columns of `df` take part in the correlation.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    columns : iterable of str, optional
        Explicit column names. When None, every numeric column of `df`
        (as reported by ``select_dtypes(include=np.number)``) is used.

    Returns
    -------
    list
        Column labels in the order they will be paired. Names repeated in
        an explicit `columns` argument are kept only once (first occurrence).

    Raises
    ------
    ValueError
        If fewer than two distinct columns remain.
    KeyError
        If an explicitly requested column is not present in `df`.
    """
    if columns is None:
        cols = df.select_dtypes(include=np.number).columns.tolist()
    else:
        # A repeated name would produce a (col, col) self-pair, which
        # selects a two-column frame with duplicate labels and breaks the
        # per-pair computation. dict.fromkeys drops repeats, keeps order.
        cols = list(dict.fromkeys(columns))

    if len(cols) < 2:
        raise ValueError("Need at least 2 numeric columns to correlate.")

    if columns is not None:
        # Fail early with the full list of unknown names instead of letting
        # pandas raise from inside the pair loop.
        missing = [c for c in cols if c not in df.columns]
        if missing:
            raise KeyError(f"Columns not found in DataFrame: {missing}")
    return cols
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _apply_correction(pvals: np.ndarray, method: str, alpha: float):
    """Adjust p-values with statsmodels' `multipletests`, skipping NaNs.

    NaN entries are excluded from the correction (so they do not count as
    tests) and come back as NaN / not rejected.

    Parameters
    ----------
    pvals : array-like of float
        Raw p-values; NaN marks a pair with no usable p-value.
    method : str
        Passed through as the `method` argument of `multipletests`.
    alpha : float
        Passed through as the `alpha` argument of `multipletests`.

    Returns
    -------
    tuple of numpy.ndarray
        ``(corrected_pvals, rejected)``, both shaped like `pvals`.

    Raises
    ------
    ImportError
        If statsmodels could not be imported when the module was loaded.
    """
    values = np.asarray(pvals, dtype=float)

    if not _HAS_STATSMODELS:
        raise ImportError(
            "statsmodels is required for correction methods other than "
            "'bonferroni' and 'none'. Install it with `pip install statsmodels`, "
            "or use method='bonferroni'."
        )

    adjusted = np.full(values.shape, np.nan, dtype=float)
    flags = np.zeros(values.shape, dtype=bool)

    valid = np.logical_not(np.isnan(values))
    if not valid.any():
        # Nothing to correct: every entry is NaN (or the input is empty).
        return adjusted, flags

    reject_valid, adjusted_valid, _, _ = multipletests(
        values[valid], alpha=alpha, method=method
    )
    adjusted[valid] = adjusted_valid
    flags[valid] = reject_valid
    return adjusted, flags
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _bonferroni(pvals: np.ndarray, alpha: float):
    """Bonferroni adjustment in plain numpy (no statsmodels needed).

    Each non-NaN p-value is multiplied by the number of non-NaN p-values
    and capped at 1.0. NaN entries stay NaN and are never rejected.

    Parameters
    ----------
    pvals : array-like of float
        Raw p-values; NaN entries are not counted as tests.
    alpha : float
        A test is rejected when its adjusted p-value is strictly below this.

    Returns
    -------
    tuple of numpy.ndarray
        ``(corrected_pvals, rejected)``, both shaped like `pvals`.
    """
    values = np.asarray(pvals, dtype=float)
    valid = np.logical_not(np.isnan(values))
    n_tests = int(np.count_nonzero(valid))

    adjusted = np.full(values.shape, np.nan, dtype=float)
    flags = np.zeros(values.shape, dtype=bool)

    scaled = np.minimum(values[valid] * n_tests, 1.0)
    adjusted[valid] = scaled
    flags[valid] = scaled < alpha
    return adjusted, flags
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def corr_pairwise(
    df: pd.DataFrame,
    columns: Optional[Iterable[str]] = None,
    corr: str = "pearson",
    method: str = "none",
    alpha: float = 0.05,
    min_n: int = 3,
    round_to: Optional[int] = 4,
) -> pd.DataFrame:
    """Compute pairwise correlations in long (tidy) format.

    The p-value columns adapt to `method`:
      * method="none"   -> a single column `p_value` (uncorrected).
      * any correction  -> columns `p_corrected` then `p_value` (raw).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Pairwise NaN deletion is applied per column pair.
    columns : iterable of str, optional
        Columns to correlate. Defaults to all numeric columns.
    corr : {"pearson", "spearman"}, default "pearson"
        Correlation coefficient to compute (case-insensitive).
    method : str, default "none"
        Multiple-comparison correction (case-insensitive). "none" reports
        raw p-values only. "bonferroni" works without statsmodels; the
        other methods (e.g. "fdr_bh", "holm", "sidak") require statsmodels.
    alpha : float, default 0.05
        Significance threshold. A pair is flagged significant when its
        reported p-value < alpha. Set 0.10 / 0.05 / 0.01 for 90/95/99%.
    min_n : int, default 3
        Minimum valid (non-NaN) observations per pair; pairs with fewer
        observations are left out of the result.
    round_to : int or None, default 4
        Decimal places for r/p columns. None disables rounding.

    Returns
    -------
    pandas.DataFrame
        Columns:
          * method="none": var1, var2, n, r, p_value, significant.
          * otherwise:     var1, var2, n, r, p_corrected, p_value, significant.
        Sorted by p_corrected when a correction is used, otherwise by
        p_value, with NaN rows last. Pairs where either column has fewer
        than two distinct values get NaN for r and the p-values.
        ``attrs`` carries "corr", "correction_method" and "alpha", also
        when no pair qualified and the frame is empty.

    Raises
    ------
    ValueError
        If `corr` or `method` is not recognised, or fewer than two columns
        are available.
    ImportError
        If a correction other than "bonferroni" is requested and
        statsmodels is not installed.
    """
    corr = corr.lower()
    if corr not in _CORR_FUNCS:
        raise ValueError(f"Unknown corr '{corr}'. Use 'pearson' or 'spearman'.")
    corr_func = _CORR_FUNCS[corr]

    method = method.lower()
    if method not in _SUPPORTED_METHODS:
        raise ValueError(
            f"Unknown method '{method}'. Supported: {sorted(_SUPPORTED_METHODS)}"
        )

    cols = _select_columns(df, columns)
    corrected_mode = method != "none"
    if corrected_mode:
        out_cols = ["var1", "var2", "n", "r", "p_corrected", "p_value", "significant"]
        sort_col = "p_corrected"
    else:
        out_cols = ["var1", "var2", "n", "r", "p_value", "significant"]
        sort_col = "p_value"

    records = []
    for a, b in combinations(cols, 2):
        pair = df[[a, b]].dropna()
        n = len(pair)
        if n < min_n:
            continue
        if pair[a].nunique() < 2 or pair[b].nunique() < 2:
            # A constant column has no defined correlation; keep the pair
            # in the output but mark it as missing.
            r, p = np.nan, np.nan
        else:
            r, p = corr_func(pair[a], pair[b])
        records.append({"var1": a, "var2": b, "n": n, "r": r, "_p_raw": p})

    if not records:
        out = pd.DataFrame(columns=out_cols)
    else:
        out = pd.DataFrame.from_records(records)
        raw = out.pop("_p_raw").to_numpy(dtype=float)

        if corrected_mode:
            if method == "bonferroni" and not _HAS_STATSMODELS:
                corrected, _ = _bonferroni(raw, alpha)
            else:
                corrected, _ = _apply_correction(raw, method, alpha)
            out["p_corrected"] = corrected
        out["p_value"] = raw

        # Significance is decided on the unrounded reported p-value;
        # NaN compares False, so undefined pairs are never significant.
        out["significant"] = out[sort_col] < alpha
        out = out.sort_values(sort_col, na_position="last").reset_index(drop=True)

        if round_to is not None:
            for c in ("r", "p_corrected", "p_value"):
                if c in out.columns:
                    out[c] = out[c].round(round_to)

        out = out[out_cols]

    # Attach the metadata on every return path so callers can rely on it
    # even when no pair had enough observations.
    out.attrs["corr"] = corr
    out.attrs["correction_method"] = method
    out.attrs["alpha"] = alpha
    return out
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def corr_table(
    df: pd.DataFrame,
    columns: Optional[Iterable[str]] = None,
    corr: str = "pearson",
    method: str = "none",
    alpha: float = 0.05,
    min_n: int = 3,
    round_to: Optional[int] = 4,
    stars: bool = True,
):
    """Pairwise correlations as two square matrices: coefficients and p-values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data (pairwise NaN deletion per pair).
    columns : iterable of str, optional
        Columns to use. Defaults to all numeric columns.
    corr : {"pearson", "spearman"}, default "pearson"
        Correlation coefficient.
    method : str, default "none"
        Multiple-comparison correction. "none" -> the p-matrix holds raw
        p-values; any other method -> it holds corrected p-values.
    alpha : float, default 0.05
        Significance threshold. A coefficient is starred when its reported
        p-value < alpha. Use 0.10 / 0.05 / 0.01 for 90 / 95 / 99%.
    min_n : int, default 3
        Minimum valid observations per pair.
    round_to : int or None, default 4
        Decimal places for both matrices. None writes the values
        unrounded.
    stars : bool, default True
        If True, a single "*" is appended to significant coefficients
        (reported p < alpha). If False, no marker is added.

    Returns
    -------
    dict of pandas.DataFrame with keys:
      'r' : coefficient matrix. Lower triangle filled, upper triangle blank,
            diagonal = "1".
      'p' : p-value matrix in the same lower-triangle style, diagonal = "—".
            Holds raw p-values (method="none") or corrected ones otherwise.

    Both are string matrices sharing index/columns, so cells line up.
    A lower-triangle cell is "—" in both matrices when the pair has no
    defined coefficient: either a column was constant, or the pair had
    fewer than `min_n` observations and was skipped.
    ``attrs`` on both frames carries "corr", "correction_method" (both
    lower-cased) and "alpha".
    """
    cols = _select_columns(df, columns)
    long = corr_pairwise(df, columns=cols, corr=corr, method=method,
                         alpha=alpha, min_n=min_n, round_to=None)
    # Normalise once so the stored metadata matches what corr_pairwise used.
    corr = corr.lower()
    method = method.lower()
    p_col = "p_corrected" if method != "none" else "p_value"

    # (row, col) -> (r, p, significant), stored under both orderings.
    # Pairs that corr_pairwise skipped are simply absent, so they can be
    # told apart from a genuine r == 0 / p == 0.
    cells = {}
    for a, b, r, p, sig in zip(long["var1"], long["var2"], long["r"],
                               long[p_col], long["significant"]):
        cells[(a, b)] = cells[(b, a)] = (r, p, bool(sig))

    def _fmt(value):
        # round_to=None means "no rounding": write the plain float.
        if round_to is None:
            return f"{float(value)}"
        return f"{value:.{round_to}f}"

    labels = list(cols)
    r_out = pd.DataFrame("", index=labels, columns=labels)
    p_out = pd.DataFrame("", index=labels, columns=labels)

    for i, row_label in enumerate(labels):
        r_out.iloc[i, i] = "1"
        p_out.iloc[i, i] = "—"
        for j in range(i):  # lower triangle only; upper triangle stays blank
            entry = cells.get((row_label, labels[j]))
            if entry is None or pd.isna(entry[0]):
                r_out.iloc[i, j] = "—"
                p_out.iloc[i, j] = "—"
                continue
            r, p, sig = entry
            mark = "*" if (stars and sig) else ""
            r_out.iloc[i, j] = f"{_fmt(r)}{mark}"
            p_out.iloc[i, j] = _fmt(p)

    for d in (r_out, p_out):
        d.attrs["corr"] = corr
        d.attrs["correction_method"] = method
        d.attrs["alpha"] = alpha

    return {"r": r_out, "p": p_out}


# Compatibility alias: same function under its alternative name.
corr_matrices = corr_table
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def show_table(df, columns=None, corr="pearson", method="none", alpha=0.05,
               min_n=3, round_to=4, stars=True, tables=None):
    """Display both tables (coefficients and p-values) with a plain pandas render.

    Inside IPython/Jupyter the two tables are shown with `display`, each
    preceded by a short bold caption, and None is returned (so Jupyter
    does not also echo the raw dict as text below them). Elsewhere the
    captions and tables are printed and the dict {'r', 'p'} is returned.

    If you need the tables as objects, call `corr_table(...)` directly.

    Parameters
    ----------
    df, columns, corr, method, alpha, min_n, round_to, stars
        Same meaning as in `corr_table`; used only when `tables` is None.
    tables : dict, optional
        A precomputed ``corr_table(...)`` result. When given, nothing is
        recomputed and `df` is not touched. Caption details (coefficient
        name, correction method, alpha) are read from
        ``tables["r"].attrs``, falling back to the arguments of this call.

    Returns
    -------
    None or dict
        None inside IPython/Jupyter, otherwise the `tables` dict.
    """
    if tables is None:
        tables = corr_table(df, columns=columns, corr=corr, method=method,
                            alpha=alpha, min_n=min_n, round_to=round_to,
                            stars=stars)
    r_tab, p_tab = tables["r"], tables["p"]

    meta = r_tab.attrs
    corr_name = str(meta.get("corr", corr)).capitalize()
    # Lower-case before comparing: the method is accepted case-insensitively,
    # so "None" must still be captioned as raw p-values.
    meth = str(meta.get("correction_method", method)).lower()
    level = meta.get("alpha", alpha)
    star_note = f" (* p < {level})" if stars else ""
    p_kind = "raw" if meth == "none" else f"{meth}-corrected"

    r_caption = f"{corr_name} correlation coefficients{star_note}"
    p_caption = f"p-values ({p_kind})"

    # Only the environment probe is guarded, so an error raised while
    # rendering is not mistaken for "not running under IPython".
    try:
        from IPython.display import display, HTML
        get_ipython  # noqa: F821 — exists only inside IPython/Jupyter
    except (ImportError, NameError):
        print(r_caption)
        print(r_tab.to_string())
        print()
        print(p_caption)
        print(p_tab.to_string())
        return tables

    display(HTML(f"<b>{r_caption}</b>"))
    display(r_tab)  # plain pandas HTML render
    display(HTML(f"<b>{p_caption}</b>"))
    display(p_tab)
    # Return nothing: Jupyter would otherwise echo the dict as raw text
    # below the rendered tables. Use corr_table(...) if you need the object.
    return None
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pcorr
|
|
3
|
+
Version: 0.5.1
|
|
4
|
+
Summary: Pairwise Pearson correlations with p-values and multiple-comparison correction
|
|
5
|
+
Author: Mark
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/Cyber200potato/pcorr
|
|
8
|
+
Project-URL: Issues, https://github.com/Cyber200potato/pcorr/issues
|
|
9
|
+
Keywords: correlation,pearson,p-value,statistics,multiple-comparisons
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: numpy>=1.20
|
|
26
|
+
Requires-Dist: pandas>=1.2
|
|
27
|
+
Requires-Dist: scipy>=1.6
|
|
28
|
+
Provides-Extra: full
|
|
29
|
+
Requires-Dist: statsmodels>=0.13; extra == "full"
|
|
30
|
+
Provides-Extra: test
|
|
31
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
32
|
+
Requires-Dist: statsmodels>=0.13; extra == "test"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# pcorr
|
|
36
|
+
|
|
37
|
+
Compute all pairwise Pearson/Spearman correlations between numeric columns in a `pandas.DataFrame` — like `pandas.DataFrame.corr()`, but with **p-values for each pair** and **multiple-comparison correction** in one call.
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install -e . # core: numpy, pandas, scipy
|
|
43
|
+
pip install -e ".[full]" # + statsmodels (fdr_bh, holm, sidak, ...)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Without statsmodels only `method="bonferroni"` and `method="none"` are available.
|
|
47
|
+
|
|
48
|
+
If the package is published, you can install it as:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install pcorr
|
|
52
|
+
pip install "pcorr[full]"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
import pandas as pd
|
|
59
|
+
from pcorr import corr_pairwise, corr_table, show_table
|
|
60
|
+
|
|
61
|
+
df = pd.read_csv("data.csv")
|
|
62
|
+
|
|
63
|
+
# Long (tidy) format: one row per pair
|
|
64
|
+
table = corr_pairwise(df, method="fdr_bh", alpha=0.05)
|
|
65
|
+
print(table)
|
|
66
|
+
# var1 var2 n r p_corrected p_value significant
|
|
67
|
+
# 0 x y 200 0.967 0.0000 0.0000 True
|
|
68
|
+
# 1 x z 200 -0.894 0.0000 0.0000 True
|
|
69
|
+
# ...
|
|
70
|
+
|
|
71
|
+
# Two square tables (r and p) rendered as lower-triangle output
|
|
72
|
+
tables = corr_table(df, method="bonferroni")
|
|
73
|
+
tables["r"] # coefficients
|
|
74
|
+
tables["p"] # p-values (raw when method="none", otherwise corrected)
|
|
75
|
+
|
|
76
|
+
# Convenience display (renders nicely in Jupyter, prints in console)
|
|
77
|
+
show_table(df, method="bonferroni")
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## API
|
|
81
|
+
|
|
82
|
+
- `corr_pairwise(df, ...)` — tidy table: one row per column pair.
|
|
83
|
+
- `corr_table(df, ...)` — two square tables (coefficients and p-values) in a lower-triangle style.
|
|
84
|
+
- `show_table(df, ...)` — displays/prints `corr_table(...)`.
|
|
85
|
+
- `corr_matrices(df, ...)` — alias for `corr_table(...)` (compatibility).
|
|
86
|
+
|
|
87
|
+
## Output format
|
|
88
|
+
|
|
89
|
+
### corr_pairwise
|
|
90
|
+
|
|
91
|
+
Always returns:
|
|
92
|
+
|
|
93
|
+
- `var1`, `var2` — column names
|
|
94
|
+
- `n` — number of valid observations in the pair (after pairwise NaN deletion)
|
|
95
|
+
- `r` — correlation coefficient
|
|
96
|
+
- `significant` — `True` when the reported p-value (`p_corrected` when a correction is applied, otherwise the raw `p_value`) is below `alpha`
|
|
97
|
+
|
|
98
|
+
P-value columns depend on `method`:
|
|
99
|
+
|
|
100
|
+
- `method="none"` returns `p_value` (raw p-value)
|
|
101
|
+
- any correction (e.g. `bonferroni`, `fdr_bh`, `holm`) returns `p_corrected` and `p_value` (raw)
|
|
102
|
+
|
|
103
|
+
Raw p-values example:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from pcorr import corr_pairwise
|
|
107
|
+
out = corr_pairwise(df, method="none")
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### corr_table / corr_matrices
|
|
111
|
+
|
|
112
|
+
Returns a dict with two `DataFrame`s:
|
|
113
|
+
|
|
114
|
+
- `tables["r"]` — coefficients (lower triangle filled), diagonal `"1"`, upper triangle blank; significant coefficients carry a trailing `*` unless `stars=False`
|
|
115
|
+
- `tables["p"]` — p-values in the same layout, diagonal `"—"`
|
|
116
|
+
- raw p-values when `method="none"`
|
|
117
|
+
- corrected p-values for any correction method
|
|
118
|
+
|
|
119
|
+
## Parameters
|
|
120
|
+
|
|
121
|
+
| parameter | meaning |
|
|
122
|
+
|-----------|------------------------------------------------------------------|
|
|
123
|
+
| `columns` | which columns to use (default: all numeric) |
|
|
124
|
+
| `corr` | `"pearson"` or `"spearman"` |
|
|
125
|
+
| `method` | `bonferroni`, `fdr_bh`, `holm`, `sidak`, `none`, ... |
|
|
126
|
+
| `alpha` | threshold for `significant` |
|
|
127
|
+
| `min_n` | minimum valid observations per pair (pairwise NaN deletion) |
|
|
128
|
+
| `round_to`| rounding for r/p columns (None disables rounding) |
|
|
129
|
+
|
|
130
|
+
## Notes
|
|
131
|
+
|
|
132
|
+
- Pairwise NaN deletion: each pair has its own `n`.
|
|
133
|
+
- Constant columns and pairs with `n < min_n` never raise: a constant column yields `NaN` for `r`, and `corr_pairwise` omits pairs with fewer than `min_n` valid observations.
|
|
134
|
+
- `corr_pairwise` is sorted by the p-value used for ranking (corrected when applicable).
|
|
135
|
+
|
|
136
|
+
## P-value correction methods
|
|
137
|
+
|
|
138
|
+
- `method="none"` and `method="bonferroni"` work without statsmodels.
|
|
139
|
+
- Other methods use `statsmodels.stats.multitest.multipletests` and require statsmodels (install via the `full` extra: `pip install "pcorr[full]"`); without it they raise `ImportError`.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pcorr
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77.0.3", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pcorr"
|
|
7
|
+
version = "0.5.1"
|
|
8
|
+
description = "Pairwise Pearson and Spearman correlations with p-values and multiple-comparison correction"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [{ name = "Mark" }]
|
|
14
|
+
keywords = ["correlation", "pearson", "p-value", "statistics", "multiple-comparisons"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
21
|
+
"Programming Language :: Python :: 3.8",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Topic :: Scientific/Engineering",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Mathematics",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
dependencies = [
|
|
31
|
+
"numpy>=1.20",
|
|
32
|
+
"pandas>=1.2",
|
|
33
|
+
"scipy>=1.6",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Repository = "https://github.com/Cyber200potato/pcorr"
|
|
38
|
+
Issues = "https://github.com/Cyber200potato/pcorr/issues"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
[project.optional-dependencies]
|
|
42
|
+
full = ["statsmodels>=0.13"] # enables fdr_bh, holm, sidak, etc.
|
|
43
|
+
test = ["pytest>=7.0", "statsmodels>=0.13"]
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.packages.find]
|
|
46
|
+
where = ["."]
|
|
47
|
+
include = ["pcorr*"]
|
pcorr-0.5.1/setup.cfg
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from pcorr import corr_pairwise, corr_table
|
|
6
|
+
from pcorr import corr_matrices
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@pytest.fixture
|
|
10
|
+
def df():
|
|
11
|
+
rng = np.random.default_rng(42)
|
|
12
|
+
x = rng.normal(size=200)
|
|
13
|
+
return pd.DataFrame({
|
|
14
|
+
"x": x,
|
|
15
|
+
"y": 2 * x + rng.normal(scale=0.5, size=200), # strong positive
|
|
16
|
+
"z": -x + rng.normal(scale=0.5, size=200), # strong negative
|
|
17
|
+
"noise": rng.normal(size=200), # ~uncorrelated
|
|
18
|
+
})
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ---- corr_pairwise ----
|
|
22
|
+
|
|
23
|
+
def test_default_is_raw_pvalue(df):
|
|
24
|
+
out = corr_pairwise(df)
|
|
25
|
+
assert "p_value" in out.columns
|
|
26
|
+
assert "p_corrected" not in out.columns
|
|
27
|
+
assert list(out.columns) == ["var1", "var2", "n", "r", "p_value", "significant"]
|
|
28
|
+
assert len(out) == 6
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_corrected_includes_raw(df):
|
|
32
|
+
out = corr_pairwise(df, method="bonferroni")
|
|
33
|
+
assert "p_corrected" in out.columns
|
|
34
|
+
assert "p_value" in out.columns
|
|
35
|
+
assert list(out.columns) == ["var1", "var2", "n", "r", "p_corrected", "p_value", "significant"]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_pearson_matches_scipy(df):
|
|
39
|
+
from scipy.stats import pearsonr
|
|
40
|
+
out = corr_pairwise(df, round_to=None)
|
|
41
|
+
row = out[(out.var1 == "x") & (out.var2 == "y")].iloc[0]
|
|
42
|
+
r, p = pearsonr(df["x"], df["y"])
|
|
43
|
+
assert np.isclose(row["r"], r)
|
|
44
|
+
assert np.isclose(row["p_value"], p)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_spearman_matches_scipy(df):
|
|
48
|
+
from scipy.stats import spearmanr
|
|
49
|
+
out = corr_pairwise(df, corr="spearman", round_to=None)
|
|
50
|
+
row = out[(out.var1 == "x") & (out.var2 == "y")].iloc[0]
|
|
51
|
+
r, p = spearmanr(df["x"], df["y"])
|
|
52
|
+
assert np.isclose(row["r"], r)
|
|
53
|
+
assert np.isclose(row["p_value"], p)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_significant_flag_respects_alpha(df):
|
|
57
|
+
# for every pair involving noise, the flag must equal (p_value < alpha), whatever p turns out to be
|
|
58
|
+
out = corr_pairwise(df, alpha=0.05, round_to=None)
|
|
59
|
+
noise = out[out.var2 == "noise"]
|
|
60
|
+
for _, row in noise.iterrows():
|
|
61
|
+
assert row["significant"] == (row["p_value"] < 0.05)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_alpha_changes_significance():
|
|
65
|
+
rng = np.random.default_rng(0)
|
|
66
|
+
n = 80
|
|
67
|
+
x = rng.normal(size=n)
|
|
68
|
+
y = 0.25 * x + rng.normal(size=n) # weak-ish correlation
|
|
69
|
+
df = pd.DataFrame({"x": x, "y": y})
|
|
70
|
+
p = corr_pairwise(df, round_to=None).iloc[0]["p_value"]
|
|
71
|
+
strict = corr_pairwise(df, alpha=0.01).iloc[0]["significant"]
|
|
72
|
+
loose = corr_pairwise(df, alpha=0.10).iloc[0]["significant"]
|
|
73
|
+
# significance is monotone in alpha: a pair significant at 0.01 must also be significant at 0.10
|
|
74
|
+
assert bool(loose) >= bool(strict)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def test_correction_increases_pvalues(df):
|
|
78
|
+
raw = corr_pairwise(df, method="none", round_to=None).set_index(["var1", "var2"])
|
|
79
|
+
corr = corr_pairwise(df, method="bonferroni", round_to=None).set_index(["var1", "var2"])
|
|
80
|
+
j = raw.join(corr, lsuffix="_r", rsuffix="_c")
|
|
81
|
+
assert (j["p_corrected"] >= j["p_value_r"] - 1e-12).all()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def test_invalid_corr(df):
|
|
85
|
+
with pytest.raises(ValueError):
|
|
86
|
+
corr_pairwise(df, corr="kendall")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def test_invalid_method(df):
|
|
90
|
+
with pytest.raises(ValueError):
|
|
91
|
+
corr_pairwise(df, method="not_a_method")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_pairwise_nan_deletion():
|
|
95
|
+
df = pd.DataFrame({
|
|
96
|
+
"a": [1, 2, 3, 4, np.nan],
|
|
97
|
+
"b": [2, 4, 6, 8, 10],
|
|
98
|
+
"c": [np.nan, np.nan, 1, 2, 3],
|
|
99
|
+
})
|
|
100
|
+
out = corr_pairwise(df, min_n=2)
|
|
101
|
+
ab = out[(out.var1 == "a") & (out.var2 == "b")].iloc[0]
|
|
102
|
+
assert ab["n"] == 4
|
|
103
|
+
assert np.isclose(ab["r"], 1.0)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def test_min_n_skips_pairs():
|
|
107
|
+
df = pd.DataFrame({"a": [1, np.nan, np.nan], "b": [1, 2, 3]})
|
|
108
|
+
out = corr_pairwise(df, min_n=3)
|
|
109
|
+
assert len(out) == 0
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def test_constant_column_yields_nan():
|
|
113
|
+
df = pd.DataFrame({"a": [1, 1, 1, 1], "b": [1, 2, 3, 4]})
|
|
114
|
+
out = corr_pairwise(df)
|
|
115
|
+
assert np.isnan(out.iloc[0]["r"])
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ---- corr_table ----
|
|
119
|
+
|
|
120
|
+
def test_table_returns_two_matrices(df):
|
|
121
|
+
t = corr_table(df)
|
|
122
|
+
assert set(t.keys()) == {"r", "p"}
|
|
123
|
+
assert list(t["r"].columns) == list(df.columns)
|
|
124
|
+
assert list(t["p"].columns) == list(df.columns)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_table_p_lower_triangle(df):
|
|
128
|
+
t = corr_table(df)
|
|
129
|
+
p = t["p"]
|
|
130
|
+
cols = list(p.columns)
|
|
131
|
+
# diagonal is em-dash, upper triangle blank, lower filled
|
|
132
|
+
assert p.loc[cols[0], cols[0]] == "—"
|
|
133
|
+
assert p.loc[cols[0], cols[1]] == "" # upper blank
|
|
134
|
+
assert p.loc[cols[1], cols[0]] != "" # lower filled
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def test_table_stars_only_significant(df):
|
|
138
|
+
t = corr_table(df, alpha=0.05)
|
|
139
|
+
cols = list(t["r"].columns)
|
|
140
|
+
# find lower-triangle cell for the strong x-y pair
|
|
141
|
+
xi, yi = cols.index("x"), cols.index("y")
|
|
142
|
+
lo, hi = (("y", "x") if xi < yi else ("x", "y"))
|
|
143
|
+
assert t["r"].loc[lo, hi].endswith("*") # strong pair starred
|
|
144
|
+
# diagonal never starred
|
|
145
|
+
assert t["r"].loc["x", "x"] == "1"
|
|
146
|
+
# a noise pair in the lower triangle should not be starred
|
|
147
|
+
ni = cols.index("noise")
|
|
148
|
+
others = [c for c in cols if c != "noise"]
|
|
149
|
+
for o in others:
|
|
150
|
+
oi = cols.index(o)
|
|
151
|
+
cell = t["r"].loc["noise", o] if ni > oi else t["r"].loc[o, "noise"]
|
|
152
|
+
if cell not in ("", "—"):
|
|
153
|
+
# if it's the lower-triangle filled cell, noise shouldn't be sig
|
|
154
|
+
assert not cell.endswith("*")
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_table_no_stars(df):
|
|
158
|
+
t = corr_table(df, stars=False)
|
|
159
|
+
cols = list(t["r"].columns)
|
|
160
|
+
xi, yi = cols.index("x"), cols.index("y")
|
|
161
|
+
lo, hi = (("y", "x") if xi < yi else ("x", "y"))
|
|
162
|
+
# strong pair present but never carries a star when stars=False
|
|
163
|
+
assert not t["r"].loc[lo, hi].endswith("*")
|
|
164
|
+
assert t["r"].loc["x", "x"] == "1"
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def test_table_corrected_pmatrix(df):
|
|
168
|
+
# corrected lower-triangle p >= raw lower-triangle p, cell by cell
|
|
169
|
+
raw = corr_table(df, method="none", round_to=6)["p"]
|
|
170
|
+
cor = corr_table(df, method="bonferroni", round_to=6)["p"]
|
|
171
|
+
cols = list(raw.columns)
|
|
172
|
+
for i, ri in enumerate(cols):
|
|
173
|
+
for j, cj in enumerate(cols):
|
|
174
|
+
if i > j and raw.loc[ri, cj] not in ("", "—"):
|
|
175
|
+
assert float(cor.loc[ri, cj]) >= float(raw.loc[ri, cj]) - 1e-9
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def test_table_alpha_controls_stars():
|
|
179
|
+
rng = np.random.default_rng(3)
|
|
180
|
+
n = 60
|
|
181
|
+
x = rng.normal(size=n)
|
|
182
|
+
y = 0.3 * x + rng.normal(size=n)
|
|
183
|
+
df = pd.DataFrame({"x": x, "y": y})
|
|
184
|
+
cols = list(corr_table(df)["r"].columns)
|
|
185
|
+
lo, hi = (cols[1], cols[0]) # lower-triangle cell
|
|
186
|
+
loose = corr_table(df, alpha=0.10)["r"].loc[lo, hi]
|
|
187
|
+
strict = corr_table(df, alpha=0.001)["r"].loc[lo, hi]
|
|
188
|
+
assert loose.count("*") >= strict.count("*")
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def test_corr_matrices_alias(df):
|
|
192
|
+
t = corr_matrices(df)
|
|
193
|
+
assert set(t.keys()) == {"r", "p"}
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def test_fdr_bh_outputs_corrected(df):
|
|
197
|
+
raw = corr_pairwise(df, method="none", round_to=None).set_index(["var1", "var2"])
|
|
198
|
+
cor = corr_pairwise(df, method="fdr_bh", round_to=None).set_index(["var1", "var2"])
|
|
199
|
+
j = raw.join(cor, lsuffix="_r", rsuffix="_c")
|
|
200
|
+
assert (j["p_corrected"] >= j["p_value_r"] - 1e-12).all()
|
|
201
|
+
assert j["p_corrected"].dropna().between(0.0, 1.0).all()
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def test_missing_statsmodels_blocks_noncore_methods(df, monkeypatch):
|
|
205
|
+
import pcorr.core as core
|
|
206
|
+
monkeypatch.setattr(core, "_HAS_STATSMODELS", False)
|
|
207
|
+
with pytest.raises(ImportError):
|
|
208
|
+
corr_pairwise(df, method="holm")
|
|
209
|
+
out = corr_pairwise(df, method="bonferroni")
|
|
210
|
+
assert "p_corrected" in out.columns
|
|
211
|
+
assert "p_value" in out.columns
|