acore 0.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
acore/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,194 @@
1
+ import itertools
2
+ import numpy as np
3
+ import pandas as pd
4
+ from scipy import stats
5
+ import pingouin as pg
6
+ from scipy.special import betainc
7
+ import acore.utils as utils
8
+ from acore.multiple_testing import apply_pvalue_correction
9
+
10
+
11
def calculate_correlations(x, y, method='pearson'):
    """
    Calculates a Spearman (nonparametric) or a Pearson (parametric) correlation coefficient and p-value to test for non-correlation.

    :param ndarray x: array 1
    :param ndarray y: array 2
    :param str method: chooses which kind of correlation method to run ('pearson' or 'spearman').
    :return: Tuple with two floats, correlation coefficient and two-tailed p-value.
    :raises ValueError: if method is neither 'pearson' nor 'spearman'.

    Example::

        result = calculate_correlations(x, y, method='pearson')
    """
    if method == "pearson":
        coefficient, pvalue = stats.pearsonr(x, y)
    elif method == "spearman":
        coefficient, pvalue = stats.spearmanr(x, y)
    else:
        # Fail with an explicit message; without this branch the return
        # statement below would hit unbound local variables.
        raise ValueError("Unsupported correlation method '{}'. Use 'pearson' or 'spearman'.".format(method))

    return (coefficient, pvalue)
30
+
31
+
32
def run_correlation(df, alpha=0.05, subject='subject', group='group', method='pearson', correction='fdr_bh'):
    """
    This function calculates pairwise correlations for columns in dataframe, and returns it in the shape of an edge list with 'weight' as correlation score, and the adjusted p-values.

    Paired designs (as reported by utils.check_is_paired) are delegated to run_rm_correlation, but only when
    there are more than two distinct subjects and fewer than 200 columns; otherwise an empty dataframe is returned.
    Non-paired designs use run_efficient_correlation on the numeric columns without missing values, and only the
    rows whose null hypothesis is rejected after correction are kept.

    :param df: pandas dataframe with samples as rows and features as columns.
    :param str subject: name of column containing subject identifiers.
    :param str group: name of column containing group identifiers.
    :param str method: method to use for correlation calculation ('pearson', 'spearman').
    :param float alpha: error rate. Values below alpha are considered significant.
    :param string correction: type of correction see apply_pvalue_correction for methods
    :return: Pandas dataframe with columns: 'node1', 'node2', 'weight', 'pvalue', 'padj' and 'rejected'
        (non-paired branch; 'pvalue' and 'padj' are strings rounded to 5 decimals).

    Example::

        result = run_correlation(df, alpha=0.05, subject='subject', group='group', method='pearson', correction='fdr_bh')
    """
    correlation = pd.DataFrame()
    # ToDo
    # The Repeated measurements correlation calculation is too time consuming so it only runs if
    # the number of features is less than 200
    if utils.check_is_paired(df, subject, group):
        if len(df[subject].unique()) > 2:
            if len(df.columns) < 200:
                correlation = run_rm_correlation(df, alpha=alpha, subject=subject, correction=correction)
    else:
        df = df.dropna(axis=1)._get_numeric_data()
        if not df.empty:
            r, p = run_efficient_correlation(df, method=method)
            rdf = pd.DataFrame(r, index=df.columns, columns=df.columns)
            pdf = pd.DataFrame(p, index=df.columns, columns=df.columns)
            correlation = utils.convertToEdgeList(rdf, ["node1", "node2", "weight"])
            pvalues = utils.convertToEdgeList(pdf, ["node1", "node2", "pvalue"])
            correlation = pd.merge(correlation, pvalues, on=['node1', 'node2'])

            rejected, padj = apply_pvalue_correction(correlation["pvalue"].tolist(), alpha=alpha, method=correction)
            correlation["padj"] = padj
            correlation["rejected"] = rejected
            # Take an explicit copy of the filtered rows so the column assignments
            # below operate on an independent frame (avoids SettingWithCopyWarning).
            correlation = correlation[correlation.rejected].copy()
            correlation["pvalue"] = correlation["pvalue"].apply(lambda x: str(round(x, 5)))
            correlation["padj"] = correlation["padj"].apply(lambda x: str(round(x, 5)))

    return correlation
74
+
75
+
76
def run_multi_correlation(df_dict, alpha=0.05, subject='subject', on=['subject', 'biological_sample'], group='group', method='pearson', correction='fdr_bh'):
    """
    Inner-joins every dataframe found in the input dictionary and computes pairwise correlations over the merged columns.

    Dictionary values that are not pandas dataframes are skipped. While the accumulated dataframe is still empty,
    the next dataframe replaces it instead of being merged into it.

    :param dict df_dict: dictionary of pandas dataframes with samples as rows and features as columns.
    :param str subject: name of the column containing subject identifiers.
    :param str group: name of the column containing group identifiers.
    :param list on: column names to join dataframes on (must be found in all dataframes).
    :param str method: method to use for correlation calculation ('pearson', 'spearman').
    :param float alpha: error rate. Values below alpha are considered significant.
    :param string correction: type of correction see apply_pvalue_correction for methods
    :return: Result of run_correlation on the merged dataframe, or None when the merged dataframe is empty.

    Example::

        result = run_multi_correlation(df_dict, alpha=0.05, subject='subject', on=['subject', 'biological_sample'] , group='group', method='pearson', correction='fdr_bh')
    """
    merged = pd.DataFrame()
    for frame in df_dict.values():
        if not isinstance(frame, pd.DataFrame):
            continue
        if merged.empty:
            # Nothing accumulated yet: start from this dataframe.
            merged = frame
        else:
            merged = pd.merge(merged, frame, how='inner', on=on)

    if merged.empty:
        return None

    return run_correlation(merged, alpha=alpha, subject=subject, group=group, method=method, correction=correction)
106
+
107
+
108
def calculate_rm_correlation(df, x, y, subject):
    """
    Runs a repeated measurements correlation (pingouin rm_corr) between columns x and y of df.

    :param df: pandas dataframe with subjects as rows and two features as columns.
    :param str x: feature a name.
    :param str y: feature b name.
    :param subject: column name containing the covariate variable.
    :return: Tuple with values for: feature a, feature b, correlation, p-value and degrees of freedom.

    Example::

        result = calculate_rm_correlation(df, x='feature a', y='feature b', subject='subject')
    """
    rm_stats = pg.rm_corr(data=df, x=x, y=y, subject=subject)
    # Only the first row of each reported column is used.
    coefficient = rm_stats["r"].values[0]
    pvalue = rm_stats["pval"].values[0]
    dof = rm_stats["dof"].values[0]

    return (x, y, coefficient, pvalue, dof)
125
+
126
+
127
def run_rm_correlation(df, alpha=0.05, subject='subject', correction='fdr_bh'):
    """
    Computes pairwise repeated measurements correlations for all columns in dataframe, and returns results as an edge list with 'weight' as correlation score, p-values, degrees of freedom and adjusted p-values.

    Only numeric columns without missing values are considered. Only rows whose null hypothesis is rejected
    after multiple-testing correction are returned. When there is no pair of features to correlate (empty
    input or fewer than two usable columns) an empty dataframe with the expected columns is returned.

    :param df: pandas dataframe with samples as rows and features as columns.
    :param str subject: name of column containing subject identifiers.
    :param float alpha: error rate. Values below alpha are considered significant.
    :param string correction: type of correction see apply_pvalue_correction for methods
    :return: Pandas dataframe with columns: 'node1', 'node2', 'weight', 'dof', 'pvalue', 'CI95%', 'power', 'padj' and 'rejected'
        ('padj' is a string rounded to 5 decimals).

    Example::

        result = run_rm_correlation(df, alpha=0.05, subject='subject', correction='fdr_bh')
    """
    # NOTE(review): the first row of pg.rm_corr output is labelled positionally, so this
    # assumes pingouin reports r, dof, pval, CI95% and power in that order -- confirm
    # against the installed pingouin version.
    columns = ["node1", "node2", "weight", "dof", "pvalue", "CI95%", "power"]
    rows = []
    if not df.empty:
        df = df.set_index(subject)._get_numeric_data().dropna(axis=1)
        df.columns = df.columns.astype(str)
        combinations = itertools.combinations(df.columns, 2)
        # Bring the subject column back so each pair can be subset together with it.
        df = df.reset_index()
        for x, y in combinations:
            row = [x, y]
            subset = df[[x, y, subject]]
            row.extend(pg.rm_corr(subset, x, y, subject).values.tolist()[0])
            rows.append(row)

    if not rows:
        # No feature pairs: skip the correction step and return an empty
        # edge list with the same columns as the non-empty result.
        return pd.DataFrame(columns=columns + ["padj", "rejected"])

    correlation = pd.DataFrame(rows, columns=columns)
    rejected, padj = apply_pvalue_correction(correlation["pvalue"].tolist(), alpha=alpha, method=correction)
    correlation["padj"] = padj
    correlation["rejected"] = rejected
    # Explicit copy so the assignment below does not target a filtered slice.
    correlation = correlation[correlation.rejected].copy()
    correlation["padj"] = correlation["padj"].apply(lambda x: str(round(x, 5)))

    return correlation
161
+
162
+
163
def run_efficient_correlation(data, method='pearson'):
    """
    Calculates pairwise correlations and returns lower triangle of the matrix with correlation values and p-values.

    Columns of data are treated as variables and rows as observations. P-values are derived from the
    correlation coefficients with n_samples - 2 degrees of freedom using the regularized incomplete
    beta function. In both returned matrices the strict upper triangle is set to NaN; the diagonal of
    the p-value matrix is set to 1.

    :param data: pandas dataframe with samples as index and features as columns (numeric data only).
    :param str method: method to use for correlation calculation ('pearson', 'spearman').
    :return: Two numpy arrays: correlation and p-values (square, n_features x n_features).
    :raises ValueError: if method is neither 'pearson' nor 'spearman'.

    Example::

        result = run_efficient_correlation(data, method='pearson')
    """
    matrix = data.values
    if method == 'pearson':
        r = np.corrcoef(matrix, rowvar=False)
    elif method == 'spearman':
        # The p-values from spearmanr are discarded; they are recomputed below
        # from the coefficients so both methods share the same code path.
        r, _ = stats.spearmanr(matrix, axis=0)
    else:
        raise ValueError("Unsupported correlation method '{}'. Use 'pearson' or 'spearman'.".format(method))

    # Own, writable float copy: NaN is written into it further down.
    r = np.array(r, dtype=float)
    if r.ndim == 0:
        # A scalar coefficient is returned instead of a matrix when there are
        # only two variables (spearmanr) or a single one (corrcoef).
        if matrix.shape[1] == 2:
            r = np.array([[1.0, float(r)], [float(r), 1.0]])
        else:
            r = r.reshape(1, 1)

    upper = np.triu_indices(r.shape[0], 1)
    rf = r[upper]
    # With variables in columns, the number of observations is the number of rows.
    dof = matrix.shape[0] - 2
    ts = rf * rf * (dof / (1 - rf * rf))
    pf = betainc(0.5 * dof, 0.5, dof / (dof + ts))
    p = np.zeros(shape=r.shape)
    p[upper] = pf
    # Mirror each (i, j) value into (j, i). Filling from tril_indices would pair
    # values with the wrong cells because its ordering differs from triu_indices.
    p[upper[1], upper[0]] = pf
    p[np.diag_indices(p.shape[0])] = np.ones(p.shape[0])

    # Keep only the lower triangle (and diagonal) in the returned matrices.
    r[upper] = np.nan
    p[upper] = np.nan

    return r, p