SIMPApy 0.3.2__tar.gz → 1.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: SIMPApy
3
- Version: 0.3.2
3
+ Version: 1.1.2
4
4
  Summary: Normalized Single Sample Integrated Multiomics Pathway Analysis
5
5
  Author-email: Hasan Alsharoh <hasanalsharoh@gmail.com>
6
6
  License: Apache License (2.0)
@@ -25,7 +25,7 @@ Requires-Dist: scikit-learn==1.5.1
25
25
  Requires-Dist: seaborn==0.13.2
26
26
  Requires-Dist: statsmodels==0.14.1
27
27
  Requires-Dist: ipywidgets==8.1.5
28
- Requires-Dist: pillow==10.4.0
28
+ Requires-Dist: pillow==12.2.0
29
29
  Requires-Dist: kaleido==0.1.0.post1
30
30
  Dynamic: license-file
31
31
 
@@ -231,7 +231,7 @@ Please consult the module's documentation for further instructions.
231
231
  - seaborn==0.13.2
232
232
  - statsmodels==0.14.1
233
233
  - ipywidgets==8.1.5
234
- - pillow==10.4.0
234
+ - pillow==12.2.0
235
235
  - kaleido==0.1.0.post1
236
236
 
237
237
 
@@ -200,7 +200,7 @@ Please consult the module's documentation for further instructions.
200
200
  - seaborn==0.13.2
201
201
  - statsmodels==0.14.1
202
202
  - ipywidgets==8.1.5
203
- - pillow==10.4.0
203
+ - pillow==12.2.0
204
204
  - kaleido==0.1.0.post1
205
205
 
206
206
 
@@ -10,15 +10,18 @@ The package includes the following modules:
10
10
  - ranking: Contains functions for calculating ranking and mean signed deviation.
11
11
  - simpa: Contains the main functions for running SIMPA.
12
12
  - visualize: Contains functions for creating interactive plots from SIMPA results.
13
+ - preprocess: Contains functions for preprocessing multi-omics data and creating aggregated dataframes for SIMPA module.
14
+ - analyze: Contains functions for analyzing SIMPA results, including calculating group differences, plotting volcano plots,
15
+ and calculating and plotting correlations between omics layers.
13
16
  """
14
17
  from .core import _sopa, sopa, load_sopa
15
- from .ranking import calculate_ranking, _calculate_msd
18
+ from .ranking import calculate_ranking, _calculate_msd, _calculate_msd_robust
16
19
  from .SIMPA import _simpa, simpa, load_simpa
17
20
  from .preprocess import _extract_tag_genes, _create_aggregated_dataframes, process_multiomics_data
18
21
  from .visualize import _create_traces, create_interactive_plot
19
22
  from .analyze import group_diffs, plot_volcano, calculate_correlation, plot_correlation_scatterplot
20
23
 
21
- __version__ = "0.3.2"
24
+ __version__ = "1.1.2"
22
25
  __all__ = [
23
26
  "calculate_ranking",
24
27
  "sopa",
@@ -0,0 +1,237 @@
1
+ """
2
+ Gene ranking functions for different omics data types.
3
+
4
+ This module contains functions to calculate rankings for RNA-seq, DNA methylation,
5
+ and copy number variation data.
6
+ """
7
+
8
+ import pandas as pd
9
+ import numpy as np
10
+ from scipy.stats import norm
11
+ from typing import Dict, List, Union, Optional, Tuple
12
+
13
+
14
def _calculate_msd(df: pd.DataFrame, alpha: float = 0.05) -> pd.Series:
    """
    Calculate the parametric Minimum Significant Difference (MSD) for each gene.

    The control ("TWA") group consists of every column whose label starts with
    ``'tw'``. For each row, the sample standard deviation of the control values
    (``n - 1`` denominator) is multiplied by the two-sided standard-normal
    quantile for ``alpha``.

    Args:
        df: pandas DataFrame with one row per gene and one column per sample.
        alpha: Two-sided significance level for the Z-score. Default is 0.05.

    Returns:
        pandas Series, indexed like ``df``, with the MSD for each gene.

    Raises:
        ValueError: If ``alpha`` is not strictly between 0 and 1, or if fewer
            than two control columns are present (the ``n - 1`` denominator
            would be zero or negative and the result meaningless).
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1.")

    # Separate TWA (control) group columns; str() keeps non-string labels
    # from crashing the prefix test.
    twa_cols = [col for col in df.columns if str(col).startswith('tw')]
    n = len(twa_cols)  # Number of samples in the TWA group
    if n < 2:
        raise ValueError(
            "Parametric MSD requires >= 2 TWA samples (columns starting with 'tw')."
        )
    twa_df = df[twa_cols]

    # Mean of each gene across the TWA group
    gene_means = twa_df.mean(axis=1)

    # Sum of squared deviations from the control mean, per gene
    ssw = (twa_df.subtract(gene_means, axis=0) ** 2).sum(axis=1)

    # Sample standard deviation of the control group (not a standard error:
    # there is no division by sqrt(n)).
    sd = np.sqrt(ssw / (n - 1))

    # Two-sided critical value of the standard normal distribution
    z_alpha = norm.ppf(1 - alpha / 2)

    return z_alpha * sd
46
+
47
+
48
def _calculate_msd_robust(
    df: pd.DataFrame,
    alpha: float = 0.05,
    asymmetric: bool = True,
    kappa: float = 1.4826,
) -> pd.DataFrame:
    """
    Robust Minimum Significant Deviation via median and (optionally directional) MAD.

    The control ("TWA") group consists of every column whose label starts with
    ``'tw'``. Each per-gene scale estimate is multiplied by the two-sided
    standard-normal quantile for ``alpha``.

    Args:
        df: pandas DataFrame with one row per gene and one column per sample.
        alpha: Two-sided significance level for the Z-score. Default is 0.05.
        asymmetric: If True, estimate separate scales from the control
            deviations above and below the median; if False, use the
            symmetric MAD for both directions.
        kappa: Multiplier applied to the MAD to obtain the scale estimate.

    Returns:
        DataFrame indexed by gene with columns:
            - 'center' : gene-wise median of the TWA group
            - 'msd_up' : MSD applied when D_{x,s} >= 0
            - 'msd_dn' : MSD applied when D_{x,s} < 0

        When asymmetric=False, msd_up == msd_dn.

    Fallback hierarchy for genes whose scale is zero, negative or non-finite:
        0. (asymmetric only) the symmetric MAD-based scale
        1. IQR / 1.349 of the TWA group
        2. global median of the non-zero symmetric scales across genes
           (floored at 1e-6)

    Raises:
        ValueError: If ``alpha`` is not strictly between 0 and 1, if ``kappa``
            is not positive, or if fewer than three control columns exist.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1.")
    if not kappa > 0:
        raise ValueError("kappa must be positive.")

    # str() keeps non-string column labels from crashing the prefix test.
    twa_cols = [c for c in df.columns if str(c).startswith('tw')]
    if len(twa_cols) < 3:
        raise ValueError("Robust MSD requires >= 3 TWA samples.")
    twa = df[twa_cols]

    # Robust center and signed deviations from it
    center = twa.median(axis=1)
    devs = twa.subtract(center, axis=0)

    # Symmetric MAD-based scale (also the first fallback in the asymmetric branch)
    sigma_sym = kappa * devs.abs().median(axis=1)

    # IQR-based and global fallbacks
    sigma_iqr = (twa.quantile(0.75, axis=1) - twa.quantile(0.25, axis=1)) / 1.349
    pool = sigma_sym[sigma_sym > 0]
    global_sigma = float(pool.median()) if not pool.empty else 1e-6
    global_sigma = max(global_sigma, 1e-6)

    def _apply_fallbacks(sigma: pd.Series) -> pd.Series:
        """Replace unusable scales by the IQR scale, then by the global scale."""
        s = sigma.astype(float)
        s = s.where((s > 0) & np.isfinite(s), sigma_iqr)
        s = s.where((s > 0) & np.isfinite(s), global_sigma)
        return s

    if asymmetric:
        # Magnitudes of the deviations on each side of the median; values on
        # the other side (and ties at the median) become NaN and are skipped
        # by the row-wise median.
        mad_up = devs.where(devs > 0).median(axis=1)
        mad_dn = (-devs).where(devs < 0).median(axis=1)
        sigma_up = kappa * mad_up
        sigma_dn = kappa * mad_dn
        # If a tail is empty/zero, fall back to symmetric MAD *before* IQR
        sigma_up = sigma_up.where(sigma_up > 0, sigma_sym)
        sigma_dn = sigma_dn.where(sigma_dn > 0, sigma_sym)
    else:
        sigma_up = sigma_sym.copy()
        sigma_dn = sigma_sym.copy()

    sigma_up = _apply_fallbacks(sigma_up)
    sigma_dn = _apply_fallbacks(sigma_dn)

    z = norm.ppf(1 - alpha / 2)
    return pd.DataFrame({
        'center': center,
        'msd_up': z * sigma_up,
        'msd_dn': z * sigma_dn,
    })
119
+
120
def _rank_by_msd(df, control_cols, alpha, robust, asymmetric):
    """Rank every sample by its deviation from the control center, scaled by the MSD."""
    if robust:
        tbl = _calculate_msd_robust(df, alpha=alpha, asymmetric=asymmetric)
        center = tbl['center']
        msd_up = tbl['msd_up'].to_numpy()
        msd_dn = tbl['msd_dn'].to_numpy()
    else:
        if len(control_cols) < 2:
            raise ValueError(
                "Parametric MSD requires >= 2 TWA samples (columns starting with 'tw')."
            )
        center = df[control_cols].mean(axis=1)
        # The parametric MSD is symmetric: same threshold in both directions.
        msd_up = msd_dn = _calculate_msd(df, alpha).to_numpy()

    ranked_dfs = {}
    for sample in df.columns:
        d_xs = df[sample] - center
        d = d_xs.to_numpy()
        non_negative = d >= 0
        # Directional scale: msd_up for non-negative deviations, msd_dn otherwise
        msd = np.where(non_negative, msd_up, msd_dn)
        # A non-positive scale yields NaN instead of a division by zero
        scale = np.where(msd > 0, msd, np.nan)

        ranked_dfs[sample] = pd.DataFrame({
            'D_xs': d,
            'MSD': np.where(non_negative, msd, -msd),
            'weighted': d / scale,
            'Significant': np.abs(d) > msd,
            'Rank': d_xs.rank(ascending=False).to_numpy(),
        }, index=df.index)
    return ranked_dfs


def _rank_cnv(df, control_cols):
    """Score every sample's copy-number calls against the control group."""
    n_controls = len(control_cols)
    if n_controls < 2:
        # With fewer than two controls the control standard deviation is NaN
        # and every score would be NaN.
        raise ValueError(
            "CNV ranking requires >= 2 TWA samples (columns starting with 'tw')."
        )
    epsilon = 0.01  # Small constant to prevent division by zero

    # Pre-compute all necessary stats for the control group
    control_data = df[control_cols]
    control_values = control_data.to_numpy()
    denom = control_data.std(axis=1).to_numpy() + epsilon
    # Score used where the sample's copy number is exactly 2: a Z-like
    # distance of the value 2 from the control mean.
    neutral_score = (2 - control_data.mean(axis=1).to_numpy()) / denom

    ranked_dfs = {}
    for sample_name in df.columns:
        cn = df[sample_name].to_numpy()

        # k: number of controls sharing the sample's CN value, per gene
        k = (control_values == cn[:, None]).sum(axis=1)

        # 2x2 table with a 0.5 continuity correction; every cell is positive,
        # so the odds ratio is always positive and its log is defined.
        odds_ratio = (1.5 * ((n_controls - k) + 0.5)) / (0.5 * (k + 0.5))
        altered_score = np.sign(cn - 2) * np.log10(odds_ratio) / denom

        scores = np.where(cn != 2, altered_score, neutral_score)
        ranked_dfs[sample_name] = pd.DataFrame(
            {'adjusted_weight': scores},
            index=df.index
        )
    return ranked_dfs


def calculate_ranking(
    df: pd.DataFrame,
    omic: str = "RNA",
    alpha: float = 0.05,
    robust: bool = True,
    asymmetric: bool = True,
) -> Dict[str, pd.DataFrame]:
    """
    Calculate per-sample gene rankings for different types of omics data.

    Control ("TWA") samples are the columns whose label starts with ``'tw'``.

    Parameters
        df : pandas.DataFrame
            Omics data; rows are genes/features, columns are samples.
        omic : str
            Type of omic data: "RNA", "DNAm", or "CNV" (case-insensitive).
        alpha : float
            Significance level for ranking (used in MSD calculation;
            ignored for "CNV").
        robust : bool
            If True, use median + MAD-based MSD (non-parametric).
            If False, use mean + standard-deviation-based MSD.
            Ignored for "CNV".
        asymmetric : bool
            If True (and robust=True), use directional (double) MAD
            to handle skewed baseline distributions.
            Asymmetric and robust are highly recommended together and are
            the default: this is preferred for non-parametric data and is
            the more conservative approach. Having both False is the
            classical parametric approach, which may be more powerful if its
            assumptions are met but is more sensitive to outliers and
            non-normality.

    Returns
        dict mapping each sample (column) name to a DataFrame indexed like
        ``df``. For "RNA"/"DNAm" the columns are 'D_xs', 'MSD', 'weighted',
        'Significant' and 'Rank'; for "CNV" the single column is
        'adjusted_weight'.

    Raises
        ValueError
            If ``omic`` is not one of the supported types, or if there are
            too few control columns (fewer than 2 for the parametric and CNV
            paths; the robust path enforces its own minimum).
    """
    omic_key = omic.upper()
    if omic_key not in ("RNA", "DNAM", "CNV"):
        raise ValueError("Omic type must be 'RNA', 'DNAm', or 'CNV'")

    control_cols = [c for c in df.columns if str(c).startswith('tw')]
    if omic_key == "CNV":
        return _rank_cnv(df, control_cols)
    return _rank_by_msd(df, control_cols, alpha, robust, asymmetric)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: SIMPApy
3
- Version: 0.3.2
3
+ Version: 1.1.2
4
4
  Summary: Normalized Single Sample Integrated Multiomics Pathway Analysis
5
5
  Author-email: Hasan Alsharoh <hasanalsharoh@gmail.com>
6
6
  License: Apache License (2.0)
@@ -25,7 +25,7 @@ Requires-Dist: scikit-learn==1.5.1
25
25
  Requires-Dist: seaborn==0.13.2
26
26
  Requires-Dist: statsmodels==0.14.1
27
27
  Requires-Dist: ipywidgets==8.1.5
28
- Requires-Dist: pillow==10.4.0
28
+ Requires-Dist: pillow==12.2.0
29
29
  Requires-Dist: kaleido==0.1.0.post1
30
30
  Dynamic: license-file
31
31
 
@@ -231,7 +231,7 @@ Please consult the module's documentation for further instructions.
231
231
  - seaborn==0.13.2
232
232
  - statsmodels==0.14.1
233
233
  - ipywidgets==8.1.5
234
- - pillow==10.4.0
234
+ - pillow==12.2.0
235
235
  - kaleido==0.1.0.post1
236
236
 
237
237
 
@@ -9,5 +9,5 @@ scikit-learn==1.5.1
9
9
  seaborn==0.13.2
10
10
  statsmodels==0.14.1
11
11
  ipywidgets==8.1.5
12
- pillow==10.4.0
12
+ pillow==12.2.0
13
13
  kaleido==0.1.0.post1
@@ -31,7 +31,7 @@ dependencies = [
31
31
  "seaborn==0.13.2",
32
32
  "statsmodels==0.14.1",
33
33
  "ipywidgets==8.1.5",
34
- "pillow==10.4.0",
34
+ "pillow==12.2.0",
35
35
  "kaleido==0.1.0.post1"
36
36
  ]
37
37
 
@@ -93,16 +93,14 @@ class TestCalculateRanking(unittest.TestCase):
93
93
  with self.assertRaises(ValueError):
94
94
  calculate_ranking(self.rna_data, omic="invalid_type")
95
95
 
96
- @unittest.expectedFailure
97
96
  def test_empty_dataframe(self):
98
97
  """Test with empty dataframe"""
99
98
  empty_df = pd.DataFrame()
100
99
  with self.assertRaises(Exception): # Some exception should be raised
101
100
  calculate_ranking(empty_df)
102
101
 
103
- @unittest.expectedFailure
104
102
  def test_single_sample(self):
105
- """Test with single sample (should fail as control samples are needed)"""
103
+ """Test with single sample (less than 3 is not enough for ranking)"""
106
104
  single_sample = pd.DataFrame({'case1': [10, 20, 30]},
107
105
  index=['gene1', 'gene2', 'gene3'])
108
106
  with self.assertRaises(Exception): # Should fail without control samples
@@ -1,171 +0,0 @@
1
- """
2
- Gene ranking functions for different omics data types.
3
-
4
- This module contains functions to calculate rankings for RNA-seq, DNA methylation,
5
- and copy number variation data.
6
- """
7
-
8
- import pandas as pd
9
- import numpy as np
10
- from scipy.stats import norm
11
- from typing import Dict, List, Union, Optional, Tuple
12
-
13
-
14
- def _calculate_msd(df: pd.DataFrame, alpha: float = 0.05) -> pd.Series:
15
- """
16
- Calculates the Minimum Significant Difference (MSD) for each gene in the dataframe.
17
-
18
- Args:
19
- df: pandas DataFrame with gene expression data.
20
- alpha: Significance level for the Z-score. Default is 0.05.
21
-
22
- Returns:
23
- pandas Series with MSD for each gene.
24
- """
25
- # Separate TWA (control) group columns
26
- twa_cols = [col for col in df.columns if col.startswith('tw')]
27
- twa_df = df[twa_cols]
28
-
29
- # Calculate the mean expression for each gene across the TWA group
30
- gene_means = twa_df.mean(axis=1)
31
-
32
- # Calculate the Sum of Squares Within (SSW) for each gene
33
- ssw = ((twa_df.subtract(gene_means, axis=0))**2).sum(axis=1)
34
-
35
- # Calculate the standard error (SE)
36
- n = len(twa_cols) # Number of samples in the TWA group
37
- se = np.sqrt(ssw / (n - 1))
38
-
39
- # Calculate the Z-score for the given alpha level
40
- z_alpha = norm.ppf(1 - alpha/2)
41
-
42
- # Calculate MSD
43
- msd = z_alpha * se
44
-
45
- return msd
46
-
47
-
48
- def calculate_ranking(
49
- df: pd.DataFrame,
50
- omic: str = "RNA",
51
- alpha: float = 0.05
52
- ) -> Dict[str, pd.DataFrame]:
53
- """
54
- Calculate rankings for different types of omics data.
55
-
56
- Args:
57
- df: pandas DataFrame with omics data. Rows are genes/features, columns are samples.
58
- omic: Type of omics data. Must be "RNA", "DNAm", or "CNV". Default is "RNA".
59
- alpha: Significance level for RNA and DNAm rankings. Default is 0.05.
60
-
61
- Returns:
62
- A dictionary of DataFrames, where each key is a sample name containing a
63
- DataFrame with gene rankings.
64
- """
65
- if omic.upper() in ["RNA", "DNAM"]:
66
- # Calculate MSD first for RNA and DNAm
67
- msd = _calculate_msd(df, alpha)
68
-
69
- # Separate TWA (control) group columns
70
- twa_cols = [col for col in df.columns if col.startswith('tw')]
71
- twa_df = df[twa_cols]
72
-
73
- # Calculate the mean expression for each gene across the TWA group
74
- gene_means = twa_df.mean(axis=1)
75
-
76
- # Dictionary to store the ranked DataFrames
77
- ranked_dfs = {}
78
-
79
- # Iterate over each sample (column) including TWA samples
80
- for sample in df.columns:
81
- # Calculate the difference (D_(x,s))
82
- d_xs = df[sample] - gene_means
83
-
84
- # Adjust sign of MSD based on D_(x,s)
85
- msd_signed = msd * np.sign(d_xs)
86
-
87
- # Calculate weighted score
88
- weighted_score = d_xs / msd
89
-
90
- # Create a DataFrame for the current sample
91
- sample_df = pd.DataFrame({
92
- 'D_xs': d_xs,
93
- 'MSD': msd_signed,
94
- 'weighted': weighted_score,
95
- 'Significant': abs(d_xs) > msd
96
- })
97
-
98
- # Rank genes based on D_(x,s)
99
- sample_df['Rank'] = sample_df['D_xs'].rank(ascending=False)
100
-
101
- # Store the DataFrame in the dictionary
102
- ranked_dfs[sample] = sample_df
103
-
104
- del sample_df
105
-
106
- return ranked_dfs
107
-
108
- elif omic.upper() == "CNV":
109
-
110
- control_data = df.filter(regex='^tw')
111
- N = len(control_data.columns)
112
- epsilon = 0.01 # Small constant to prevent division by zero
113
-
114
- # Pre-compute all necessary stats for the control group
115
- control_counts_df = control_data.apply(pd.Series.value_counts, axis=1).fillna(0).astype(int)
116
- mu_controls = control_data.mean(axis=1)
117
- sigma_controls = control_data.std(axis=1)
118
-
119
- ranked_dfs = {}
120
-
121
- # 2. Loop through all samples
122
- for sample_name in df.columns:
123
- sample_series = df[sample_name]
124
- scores = []
125
-
126
- # Loop through each gene in the current sample
127
- for gene, cn_value in sample_series.items():
128
-
129
- if cn_value != 2:
130
-
131
- # Look up k: number of controls with the same CN value
132
- k = control_counts_df.loc[gene, cn_value] if cn_value in control_counts_df.columns else 0
133
-
134
- # Construct 2x2 table cells for a stable Odds Ratio calculation
135
- a, b = 1.5, 0.5
136
- c, d = k + 0.5, (N - k) + 0.5
137
-
138
- # Calculate the corrected odds ratio
139
- or_corrected = (a * d) / (b * c)
140
-
141
- # Handle edge case for log transform if OR is somehow non-positive
142
- if or_corrected <= 0:
143
- or_corrected = epsilon
144
-
145
- # Look up the pre-computed standard deviation for the gene
146
- sigma_for_gene = sigma_controls.loc[gene]
147
-
148
- # Calculate the final score using the enhanced formula
149
- score = (np.sign(cn_value - 2) * np.log10(or_corrected)) / (sigma_for_gene + epsilon)
150
-
151
- else: # cn_value == 2
152
-
153
- # Look up pre-computed mean and std dev for the gene
154
- mu_for_gene = mu_controls.loc[gene]
155
- sigma_for_gene = sigma_controls.loc[gene]
156
-
157
- # Calculate the Z-score relative to the control mean to capture nuance
158
- score = (2 - mu_for_gene) / (sigma_for_gene + epsilon)
159
-
160
- scores.append(score)
161
-
162
- df_sample = pd.DataFrame(
163
- {'adjusted_weight': scores},
164
- index=df.index
165
- )
166
- ranked_dfs[sample_name] = df_sample
167
-
168
- return ranked_dfs
169
-
170
- else:
171
- raise ValueError("Omic type must be 'RNA', 'DNAm', or 'CNV'")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes