SIMPApy 0.3.2__tar.gz → 1.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {simpapy-0.3.2 → simpapy-1.1.2}/PKG-INFO +3 -3
- {simpapy-0.3.2 → simpapy-1.1.2}/README.md +1 -1
- {simpapy-0.3.2 → simpapy-1.1.2}/SIMPApy/__init__.py +5 -2
- simpapy-1.1.2/SIMPApy/ranking.py +237 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/SIMPApy.egg-info/PKG-INFO +3 -3
- {simpapy-0.3.2 → simpapy-1.1.2}/SIMPApy.egg-info/requires.txt +1 -1
- {simpapy-0.3.2 → simpapy-1.1.2}/pyproject.toml +1 -1
- {simpapy-0.3.2 → simpapy-1.1.2}/tests/test_ranking.py +1 -3
- simpapy-0.3.2/SIMPApy/ranking.py +0 -171
- {simpapy-0.3.2 → simpapy-1.1.2}/LICENSE.txt +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/SIMPApy/SIMPA.py +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/SIMPApy/analyze.py +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/SIMPApy/core.py +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/SIMPApy/preprocess.py +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/SIMPApy/visualize.py +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/SIMPApy.egg-info/SOURCES.txt +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/SIMPApy.egg-info/dependency_links.txt +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/SIMPApy.egg-info/top_level.txt +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/setup.cfg +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/tests/test_SIMPA.py +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/tests/test_analyze.py +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/tests/test_core.py +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/tests/test_preprocess.py +0 -0
- {simpapy-0.3.2 → simpapy-1.1.2}/tests/test_visualize.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: SIMPApy
|
|
3
|
-
Version: 0.3.2
|
|
3
|
+
Version: 1.1.2
|
|
4
4
|
Summary: Normalized Single Sample Integrated Multiomics Pathway Analysis
|
|
5
5
|
Author-email: Hasan Alsharoh <hasanalsharoh@gmail.com>
|
|
6
6
|
License: Apache License (2.0)
|
|
@@ -25,7 +25,7 @@ Requires-Dist: scikit-learn==1.5.1
|
|
|
25
25
|
Requires-Dist: seaborn==0.13.2
|
|
26
26
|
Requires-Dist: statsmodels==0.14.1
|
|
27
27
|
Requires-Dist: ipywidgets==8.1.5
|
|
28
|
-
Requires-Dist: pillow==
|
|
28
|
+
Requires-Dist: pillow==12.2.0
|
|
29
29
|
Requires-Dist: kaleido==0.1.0.post1
|
|
30
30
|
Dynamic: license-file
|
|
31
31
|
|
|
@@ -231,7 +231,7 @@ Please consult the module's documentation for further instructions.
|
|
|
231
231
|
- seaborn==0.13.2
|
|
232
232
|
- statsmodels==0.14.1
|
|
233
233
|
- ipywidgets==8.1.5
|
|
234
|
-
- pillow==
|
|
234
|
+
- pillow==12.2.0
|
|
235
235
|
- kaleido==0.1.0.post1
|
|
236
236
|
|
|
237
237
|
|
|
@@ -10,15 +10,18 @@ The package includes the following modules:
|
|
|
10
10
|
- ranking: Contains functions for calculating ranking and mean signed deviation.
|
|
11
11
|
- simpa: Contains the main functions for running SIMPA.
|
|
12
12
|
- visualize: Contains functions for creating interactive plots from SIMPA results.
|
|
13
|
+
- preprocess: Contains functions for preprocessing multi-omics data and creating aggregated dataframes for SIMPA module.
|
|
14
|
+
- analyze: Contains functions for analyzing SIMPA results, including calculating group differences, plotting volcano plots,
|
|
15
|
+
and calculating and plotting correlations between omics layers.
|
|
13
16
|
"""
|
|
14
17
|
from .core import _sopa, sopa, load_sopa
|
|
15
|
-
from .ranking import calculate_ranking, _calculate_msd
|
|
18
|
+
from .ranking import calculate_ranking, _calculate_msd, _calculate_msd_robust
|
|
16
19
|
from .SIMPA import _simpa, simpa, load_simpa
|
|
17
20
|
from .preprocess import _extract_tag_genes, _create_aggregated_dataframes, process_multiomics_data
|
|
18
21
|
from .visualize import _create_traces, create_interactive_plot
|
|
19
22
|
from .analyze import group_diffs, plot_volcano, calculate_correlation, plot_correlation_scatterplot
|
|
20
23
|
|
|
21
|
-
__version__ = "
|
|
24
|
+
__version__ = "1.1.2"
|
|
22
25
|
__all__ = [
|
|
23
26
|
"calculate_ranking",
|
|
24
27
|
"sopa",
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Gene ranking functions for different omics data types.
|
|
3
|
+
|
|
4
|
+
This module contains functions to calculate rankings for RNA-seq, DNA methylation,
|
|
5
|
+
and copy number variation data.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import numpy as np
|
|
10
|
+
from scipy.stats import norm
|
|
11
|
+
from typing import Dict, List, Union, Optional, Tuple
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _calculate_msd(df: pd.DataFrame, alpha: float = 0.05) -> pd.Series:
    """
    Calculates the Minimum Significant Difference (MSD) for each gene in the dataframe.

    The MSD of a gene is the two-sided standard-normal critical value for
    ``alpha`` multiplied by the spread of that gene across the TWA (control)
    columns, i.e. the columns whose label starts with ``'tw'``. The spread is
    ``sqrt(SSW / (n - 1))`` where ``SSW`` is the sum of squared deviations
    from the control mean and ``n`` is the number of control columns.

    Args:
        df: pandas DataFrame with gene expression data. Each row is a gene;
            control samples are the columns whose label starts with 'tw'.
        alpha: Significance level for the Z-score. Default is 0.05.

    Returns:
        pandas Series with MSD for each gene, indexed like ``df``.

    Raises:
        ValueError: If ``alpha`` is not strictly between 0 and 1, or if fewer
            than two control columns are present (the ``n - 1`` denominator
            would be zero or negative).
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1.")

    # Separate TWA (control) group columns; str() keeps non-string labels
    # (e.g. integer column names) from raising AttributeError.
    twa_cols = [col for col in df.columns if str(col).startswith('tw')]
    n = len(twa_cols)  # Number of samples in the TWA group
    if n < 2:
        raise ValueError("MSD requires >= 2 TWA samples.")
    twa_df = df[twa_cols]

    # Mean expression for each gene across the TWA group
    gene_means = twa_df.mean(axis=1)

    # Sum of Squares Within (SSW) for each gene
    ssw = (twa_df.subtract(gene_means, axis=0) ** 2).sum(axis=1)

    # Per-gene spread of the control group
    se = np.sqrt(ssw / (n - 1))

    # Two-sided critical value for the given alpha level
    z_alpha = norm.ppf(1 - alpha / 2)

    return z_alpha * se
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _calculate_msd_robust(
    df: pd.DataFrame,
    alpha: float = 0.05,
    asymmetric: bool = True,
    kappa: float = 1.4826,
) -> pd.DataFrame:
    """
    Robust Minimum Significant Deviation via median and (optionally directional) MAD.

    Control samples are the columns of ``df`` whose label starts with ``'tw'``.

    Args:
        df: pandas DataFrame; each row is a gene, each column a sample.
        alpha: Two-sided significance level used for the normal critical value.
        asymmetric: If True, estimate separate scales from the deviations above
            and below the control median; if False, use one symmetric MAD.
        kappa: Multiplier applied to each MAD to obtain the scale estimate.

    Returns a DataFrame indexed by gene with columns:
      - 'center' : gene-wise median of the TWA group
      - 'msd_up' : MSD applied when D_{x,s} >= 0
      - 'msd_dn' : MSD applied when D_{x,s} <  0

    When asymmetric=False, msd_up == msd_dn (classical symmetric robust MSD).

    Fallback hierarchy for zero-scale genes:
      1. IQR / 1.349 (Gaussian-consistent, nonzero unless >=25% ties at median)
      2. global median of non-zero robust scales across genes

    Raises:
        ValueError: If ``alpha`` is not strictly between 0 and 1, or if fewer
            than three TWA columns are present.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1.")

    # str() keeps non-string column labels from raising AttributeError.
    twa_cols = [c for c in df.columns if str(c).startswith('tw')]
    if len(twa_cols) < 3:
        raise ValueError("Robust MSD requires >= 3 TWA samples.")
    twa = df[twa_cols]

    # Robust center and deviations
    center = twa.median(axis=1)
    devs = twa.subtract(center, axis=0)

    # Symmetric MAD (used as a sanity fallback inside asymmetric branch)
    mad_sym = devs.abs().median(axis=1)
    sigma_sym = kappa * mad_sym

    # IQR-based and global fallbacks
    iqr = twa.quantile(0.75, axis=1) - twa.quantile(0.25, axis=1)
    sigma_iqr = iqr / 1.349
    pool = sigma_sym[sigma_sym > 0]
    global_sigma = float(pool.median()) if not pool.empty else 1e-6
    global_sigma = max(global_sigma, 1e-6)  # never let the last resort be ~0

    def _apply_fallbacks(sigma: pd.Series) -> pd.Series:
        # Replace non-positive / non-finite scales first by the IQR-based
        # scale, then (if that is unusable too) by the global scale.
        s = sigma.copy().astype(float)
        bad = (s <= 0) | ~np.isfinite(s)
        s.loc[bad] = sigma_iqr.loc[bad]
        bad = (s <= 0) | ~np.isfinite(s)
        s.loc[bad] = global_sigma
        return s

    if asymmetric:
        pos = devs.where(devs > 0)       # NaN where dev <= 0
        neg = (-devs).where(devs < 0)    # positive magnitudes, NaN elsewhere
        mad_up = pos.median(axis=1)
        mad_dn = neg.median(axis=1)
        sigma_up = kappa * mad_up
        sigma_dn = kappa * mad_dn
        # If a tail is empty/zero, fall back to symmetric MAD *before* IQR.
        # A NaN scale fails the `> 0` comparison and is therefore replaced too.
        sigma_up = sigma_up.where(sigma_up > 0, sigma_sym)
        sigma_dn = sigma_dn.where(sigma_dn > 0, sigma_sym)
    else:
        sigma_up = sigma_sym.copy()
        sigma_dn = sigma_sym.copy()

    sigma_up = _apply_fallbacks(sigma_up)
    sigma_dn = _apply_fallbacks(sigma_dn)

    z = norm.ppf(1 - alpha / 2)
    return pd.DataFrame({
        'center': center,
        'msd_up': z * sigma_up,
        'msd_dn': z * sigma_dn,
    })
|
|
119
|
+
|
|
120
|
+
def calculate_ranking(
    df: pd.DataFrame,
    omic: str = "RNA",
    alpha: float = 0.05,
    robust: bool = True,
    asymmetric: bool = True,
) -> Dict[str, pd.DataFrame]:
    """
    Calculate per-sample gene rankings for different types of omics data.

    Control (TWA) samples are the columns of ``df`` whose label starts with
    ``'tw'``. Every column of ``df`` (controls included) gets its own result
    table.

    Args:
        df: pandas DataFrame with omics data. Rows are genes/features,
            columns are samples.
        omic: Type of omic data: "RNA", "DNAm", or "CNV" (case-insensitive).
        alpha: Significance level for ranking (used in MSD calculation).
            Only used for RNA and DNAm.
        robust: If True, use median + MAD-based MSD (non-parametric).
            If False, use the mean and the classical parametric MSD.
            Only used for RNA and DNAm.
        asymmetric: If True (and robust=True), use directional (double) MAD
            to handle skewed baseline distributions.
            Asymmetric and robust together are the default and are the
            recommended, more conservative choice for non-parametric data.
            Having both False is the classical parametric approach, which may
            be more powerful if its assumptions are met but is more sensitive
            to outliers and non-normality.

    Returns:
        A dictionary keyed by sample (column) name. For RNA/DNAm each value is
        a DataFrame indexed like ``df`` with columns 'D_xs' (deviation from the
        control center), 'MSD' (signed threshold), 'weighted' (D_xs divided by
        the directional MSD, NaN where that MSD is not positive),
        'Significant' and 'Rank' (rank of D_xs, largest first). For CNV each
        value is a DataFrame with the single column 'adjusted_weight'.

    Raises:
        ValueError: If ``omic`` is not one of the supported types, if ``alpha``
            is not strictly between 0 and 1 (RNA/DNAm only), or if there are
            too few TWA columns (3 for robust RNA/DNAm, 2 otherwise).
    """
    omic_key = omic.upper()

    if omic_key in ("RNA", "DNAM"):
        if not 0 < alpha < 1:
            raise ValueError("alpha must be strictly between 0 and 1.")

        twa_cols = [c for c in df.columns if str(c).startswith('tw')]

        # Fail early and explicitly: with too few controls the parametric
        # path would otherwise return NaN/zero thresholds without any error.
        if robust and len(twa_cols) < 3:
            raise ValueError("Robust MSD requires >= 3 TWA samples.")
        if not robust and len(twa_cols) < 2:
            raise ValueError("Parametric MSD requires >= 2 TWA samples.")

        if robust:
            tbl = _calculate_msd_robust(df, alpha=alpha, asymmetric=asymmetric)
            center = tbl['center']
            msd_up, msd_dn = tbl['msd_up'], tbl['msd_dn']
        else:
            center = df[twa_cols].mean(axis=1)
            msd_par = _calculate_msd(df, alpha)
            msd_up = msd_par
            msd_dn = msd_par

        # Loop-invariant arrays, extracted once.
        up = msd_up.values
        dn = msd_dn.values

        ranked_dfs = {}
        for sample in df.columns:
            d_xs = (df[sample] - center).values
            is_up = d_xs >= 0
            # Directional threshold: msd_up for positive deviations, msd_dn for negative
            threshold = np.where(is_up, up, dn)
            # Avoid zero division from any edge case
            scale = np.where(threshold > 0, threshold, np.nan)

            ranked_dfs[sample] = pd.DataFrame({
                'D_xs': d_xs,
                'MSD': np.where(is_up, up, -dn),
                'weighted': d_xs / scale,
                'Significant': np.abs(d_xs) > threshold,
                'Rank': pd.Series(d_xs, index=df.index).rank(ascending=False).values,
            }, index=df.index)
        return ranked_dfs

    elif omic_key == "CNV":
        control_data = df.filter(regex='^tw')
        N = len(control_data.columns)
        if N < 2:
            # The control standard deviation is undefined below two samples,
            # which would turn every score into NaN.
            raise ValueError("CNV ranking requires >= 2 TWA samples.")
        epsilon = 0.01  # Small constant to prevent division by zero

        # Pre-compute all necessary stats for the control group
        control_counts_df = control_data.apply(pd.Series.value_counts, axis=1).fillna(0).astype(int)
        mu_controls = control_data.mean(axis=1)
        sigma_controls = control_data.std(axis=1)

        ranked_dfs = {}

        for sample_name in df.columns:
            scores = []

            for gene, cn_value in df[sample_name].items():
                sigma_for_gene = sigma_controls.loc[gene]

                if cn_value != 2:
                    # k: number of controls with the same CN value
                    k = control_counts_df.loc[gene, cn_value] if cn_value in control_counts_df.columns else 0

                    # 2x2 table cells with a 0.5 correction for a stable odds
                    # ratio. All four cells are strictly positive (0 <= k <= N),
                    # so the ratio is positive and its log is always defined.
                    a, b = 1.5, 0.5
                    c, d = k + 0.5, (N - k) + 0.5
                    or_corrected = (a * d) / (b * c)

                    score = (np.sign(cn_value - 2) * np.log10(or_corrected)) / (sigma_for_gene + epsilon)
                else:  # cn_value == 2
                    # Z-score of the diploid state relative to the control mean
                    score = (2 - mu_controls.loc[gene]) / (sigma_for_gene + epsilon)

                scores.append(score)

            ranked_dfs[sample_name] = pd.DataFrame(
                {'adjusted_weight': scores},
                index=df.index
            )

        return ranked_dfs

    else:
        raise ValueError("Omic type must be 'RNA', 'DNAm', or 'CNV'")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: SIMPApy
|
|
3
|
-
Version: 0.3.2
|
|
3
|
+
Version: 1.1.2
|
|
4
4
|
Summary: Normalized Single Sample Integrated Multiomics Pathway Analysis
|
|
5
5
|
Author-email: Hasan Alsharoh <hasanalsharoh@gmail.com>
|
|
6
6
|
License: Apache License (2.0)
|
|
@@ -25,7 +25,7 @@ Requires-Dist: scikit-learn==1.5.1
|
|
|
25
25
|
Requires-Dist: seaborn==0.13.2
|
|
26
26
|
Requires-Dist: statsmodels==0.14.1
|
|
27
27
|
Requires-Dist: ipywidgets==8.1.5
|
|
28
|
-
Requires-Dist: pillow==
|
|
28
|
+
Requires-Dist: pillow==12.2.0
|
|
29
29
|
Requires-Dist: kaleido==0.1.0.post1
|
|
30
30
|
Dynamic: license-file
|
|
31
31
|
|
|
@@ -231,7 +231,7 @@ Please consult the module's documentation for further instructions.
|
|
|
231
231
|
- seaborn==0.13.2
|
|
232
232
|
- statsmodels==0.14.1
|
|
233
233
|
- ipywidgets==8.1.5
|
|
234
|
-
- pillow==
|
|
234
|
+
- pillow==12.2.0
|
|
235
235
|
- kaleido==0.1.0.post1
|
|
236
236
|
|
|
237
237
|
|
|
@@ -93,16 +93,14 @@ class TestCalculateRanking(unittest.TestCase):
|
|
|
93
93
|
with self.assertRaises(ValueError):
|
|
94
94
|
calculate_ranking(self.rna_data, omic="invalid_type")
|
|
95
95
|
|
|
96
|
-
@unittest.expectedFailure
|
|
97
96
|
def test_empty_dataframe(self):
|
|
98
97
|
"""Test with empty dataframe"""
|
|
99
98
|
empty_df = pd.DataFrame()
|
|
100
99
|
with self.assertRaises(Exception): # Some exception should be raised
|
|
101
100
|
calculate_ranking(empty_df)
|
|
102
101
|
|
|
103
|
-
@unittest.expectedFailure
|
|
104
102
|
def test_single_sample(self):
|
|
105
|
-
"""Test with single sample (
|
|
103
|
+
"""Test with single sample (less than 3 is not enough for ranking)"""
|
|
106
104
|
single_sample = pd.DataFrame({'case1': [10, 20, 30]},
|
|
107
105
|
index=['gene1', 'gene2', 'gene3'])
|
|
108
106
|
with self.assertRaises(Exception): # Should fail without control samples
|
simpapy-0.3.2/SIMPApy/ranking.py
DELETED
|
@@ -1,171 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Gene ranking functions for different omics data types.
|
|
3
|
-
|
|
4
|
-
This module contains functions to calculate rankings for RNA-seq, DNA methylation,
|
|
5
|
-
and copy number variation data.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import pandas as pd
|
|
9
|
-
import numpy as np
|
|
10
|
-
from scipy.stats import norm
|
|
11
|
-
from typing import Dict, List, Union, Optional, Tuple
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def _calculate_msd(df: pd.DataFrame, alpha: float = 0.05) -> pd.Series:
    """
    Calculates the Minimum Significant Difference (MSD) for each gene in the dataframe.

    The MSD of a gene is the two-sided standard-normal critical value for
    ``alpha`` multiplied by ``sqrt(SSW / (n - 1))``, where ``SSW`` is the sum
    of squared deviations from the control mean and ``n`` is the number of
    TWA (control) columns, i.e. columns whose label starts with ``'tw'``.

    Args:
        df: pandas DataFrame with gene expression data. Each row is a gene;
            control samples are the columns whose label starts with 'tw'.
        alpha: Significance level for the Z-score. Default is 0.05.

    Returns:
        pandas Series with MSD for each gene, indexed like ``df``.

    Raises:
        ValueError: If ``alpha`` is not strictly between 0 and 1, or if fewer
            than two control columns are present (the ``n - 1`` denominator
            would be zero or negative).
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1.")

    # Separate TWA (control) group columns; str() tolerates non-string labels.
    twa_cols = [col for col in df.columns if str(col).startswith('tw')]
    n = len(twa_cols)  # Number of samples in the TWA group
    if n < 2:
        raise ValueError("MSD requires >= 2 TWA samples.")
    twa_df = df[twa_cols]

    # Mean expression for each gene across the TWA group
    gene_means = twa_df.mean(axis=1)

    # Sum of Squares Within (SSW) for each gene
    ssw = (twa_df.subtract(gene_means, axis=0) ** 2).sum(axis=1)

    # Per-gene spread of the control group
    se = np.sqrt(ssw / (n - 1))

    # Two-sided critical value for the given alpha level
    z_alpha = norm.ppf(1 - alpha / 2)

    return z_alpha * se
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def calculate_ranking(
    df: pd.DataFrame,
    omic: str = "RNA",
    alpha: float = 0.05
) -> Dict[str, pd.DataFrame]:
    """
    Calculate rankings for different types of omics data.

    Control (TWA) samples are the columns of ``df`` whose label starts with
    ``'tw'``. Every column of ``df`` (controls included) gets its own result
    table.

    Args:
        df: pandas DataFrame with omics data. Rows are genes/features, columns are samples.
        omic: Type of omics data. Must be "RNA", "DNAm", or "CNV" (case-insensitive).
            Default is "RNA".
        alpha: Significance level for RNA and DNAm rankings. Default is 0.05.

    Returns:
        A dictionary of DataFrames, where each key is a sample name containing a
        DataFrame with gene rankings. For RNA/DNAm the columns are 'D_xs',
        'MSD', 'weighted', 'Significant' and 'Rank'; for CNV the single column
        is 'adjusted_weight'.

    Raises:
        ValueError: If ``omic`` is not a supported type or fewer than two TWA
            columns are present.
    """
    omic_key = omic.upper()

    if omic_key in ("RNA", "DNAM"):
        # Separate TWA (control) group columns
        twa_cols = [col for col in df.columns if str(col).startswith('tw')]
        if len(twa_cols) < 2:
            # Without at least two controls the MSD is NaN/zero and every
            # downstream column would be meaningless.
            raise ValueError("MSD requires >= 2 TWA samples.")

        msd = _calculate_msd(df, alpha)

        # Mean expression for each gene across the TWA group
        gene_means = df[twa_cols].mean(axis=1)

        ranked_dfs = {}

        # Iterate over each sample (column) including TWA samples
        for sample in df.columns:
            # Difference from the control mean, D_(x,s)
            d_xs = df[sample] - gene_means

            sample_df = pd.DataFrame({
                'D_xs': d_xs,
                'MSD': msd * np.sign(d_xs),   # MSD carrying the sign of D_(x,s)
                'weighted': d_xs / msd,
                'Significant': abs(d_xs) > msd
            })

            # Rank genes based on D_(x,s), largest deviation first
            sample_df['Rank'] = sample_df['D_xs'].rank(ascending=False)

            ranked_dfs[sample] = sample_df

        return ranked_dfs

    elif omic_key == "CNV":
        control_data = df.filter(regex='^tw')
        N = len(control_data.columns)
        if N < 2:
            # The control standard deviation is undefined below two samples.
            raise ValueError("CNV ranking requires >= 2 TWA samples.")
        epsilon = 0.01  # Small constant to prevent division by zero

        # Pre-compute all necessary stats for the control group
        control_counts_df = control_data.apply(pd.Series.value_counts, axis=1).fillna(0).astype(int)
        mu_controls = control_data.mean(axis=1)
        sigma_controls = control_data.std(axis=1)

        ranked_dfs = {}

        for sample_name in df.columns:
            scores = []

            for gene, cn_value in df[sample_name].items():
                sigma_for_gene = sigma_controls.loc[gene]

                if cn_value != 2:
                    # k: number of controls with the same CN value
                    k = control_counts_df.loc[gene, cn_value] if cn_value in control_counts_df.columns else 0

                    # 2x2 table cells with a 0.5 correction; all cells are
                    # strictly positive (0 <= k <= N), so the odds ratio is
                    # positive and its log is always defined.
                    a, b = 1.5, 0.5
                    c, d = k + 0.5, (N - k) + 0.5
                    or_corrected = (a * d) / (b * c)

                    score = (np.sign(cn_value - 2) * np.log10(or_corrected)) / (sigma_for_gene + epsilon)
                else:  # cn_value == 2
                    # Z-score of the diploid state relative to the control mean
                    score = (2 - mu_controls.loc[gene]) / (sigma_for_gene + epsilon)

                scores.append(score)

            ranked_dfs[sample_name] = pd.DataFrame(
                {'adjusted_weight': scores},
                index=df.index
            )

        return ranked_dfs

    else:
        raise ValueError("Omic type must be 'RNA', 'DNAm', or 'CNV'")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|