SIMPApy 0.0.4a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
SIMPApy/SIMPA.py ADDED
@@ -0,0 +1,280 @@
1
+ """
2
+ Integration module for normalized single sample integrated multiomics pathway analysis.
3
+
4
+ This module provides functions to integrate SOPA results from multiple
5
+ omics data types (RNA-seq, CNV, and DNA methylation) to identify
6
+ consistently enriched pathways.
7
+ """
8
+
9
+ import os
10
+ import pandas as pd
11
+ import numpy as np
12
+ import glob
13
+ from scipy.stats import norm
14
+ from statsmodels.stats.multitest import multipletests
15
+ from typing import Dict, List, Union, Optional, Tuple, Any
16
+
17
+
18
def calculate_wcos_mpes(row: pd.Series) -> pd.Series:
    """
    Compute the per-omic Weighted Combined Omics Score (WCOS) and the
    Multiomics Pathway Enrichment Score (MPES) for one pathway.

    For each omic in ('rna', 'cnv', 'dna') the score is
    ``(1 - fdr) * nes * ln(1 + lead_fraction)``, where ``lead_fraction`` is the
    number of leading-edge genes divided by the number of matched genes.

    Args:
        row: A pandas Series holding, for each omic prefix, the entries
            ``<omic>_fdr``, ``<omic>_nes`` and ``<omic>_lead_genes``
            (semicolon-separated string), plus a shared ``matched_genes``
            semicolon-separated string.

    Returns:
        A pandas Series with ``rna_wcos``, ``cnv_wcos``, ``dna_wcos`` and
        ``mpes``. A missing (NaN) FDR or NES yields NaN for that WCOS and for
        MPES. When the three WCOS values are identical the MPES denominator is
        zero and the result is inf or NaN (NumPy division semantics).
    """
    def _count_genes(value) -> int:
        # Non-string cells (e.g. NaN from an empty CSV field) count as zero genes.
        return len(value.split(';')) if isinstance(value, str) else 0

    # matched_genes is shared by all three omics, so count it once.
    matched_genes = _count_genes(row['matched_genes'])

    wcos_values = []
    for omic in ('rna', 'cnv', 'dna'):
        nes = row[f'{omic}_nes']

        # Keep FDR strictly inside (0, 1). np.clip is used instead of the
        # builtin min/max because those silently turn NaN into the lower
        # bound, which would score a missing FDR as maximally significant.
        fdr = np.clip(row[f'{omic}_fdr'], 1e-16, 1 - 1e-16)

        leading_edge_genes = _count_genes(row[f'{omic}_lead_genes'])
        lead_fraction = leading_edge_genes / matched_genes if matched_genes > 0 else 0

        wcos_values.append((1 - fdr) * nes * np.log(1 + lead_fraction))

    # Normalise the summed WCOS by the mean and standard error of the three values.
    wcos_mean = np.mean(wcos_values)
    wcos_std = np.std(wcos_values)
    mpes = (np.sum(wcos_values) - wcos_mean) / np.sqrt((wcos_std ** 2) / 3)

    return pd.Series({
        'rna_wcos': wcos_values[0],
        'cnv_wcos': wcos_values[1],
        'dna_wcos': wcos_values[2],
        'mpes': mpes,
    })
59
+
60
+
61
def sort_gene_list(gene_str: str) -> str:
    """
    Sort a semicolon-separated string of genes alphabetically.

    Args:
        gene_str: String of semicolon-separated gene names. Any non-string
            value (NaN, None, lists, numbers) is returned unchanged.

    Returns:
        The gene names sorted and re-joined with ';', or the input itself
        when it is not a string.
    """
    # A plain isinstance check already covers NaN/None and, unlike pd.isna,
    # does not raise an ambiguous-truth error on array-like input.
    if not isinstance(gene_str, str):
        return gene_str
    return ';'.join(sorted(gene_str.split(';')))
75
+
76
+
77
def simpa(
    sample_id: str,
    rna_dir: str,
    cnv_dir: str,
    dna_dir: str,
    output_dir: Optional[str] = None
) -> Optional[pd.DataFrame]:
    """
    Integrates GSEA results from RNA, CNV, and DNA methylation for a single sample.

    Each directory must contain ``<sample_id>_gsea_results.csv`` with at least
    the columns ``Term``, ``matched_genes``, ``pval``, ``nes``, ``fdr`` and
    ``lead_genes``. The three tables are inner-joined on ``Term`` and
    ``matched_genes``; per-omic p-values are converted to signed z-scores,
    combined as ``sum(z) / sqrt(3)``, turned into a two-tailed p-value and
    corrected with Benjamini-Hochberg. WCOS/MPES scores are then added per row.

    Args:
        sample_id: Sample identifier used in filenames.
        rna_dir: Directory containing RNA-seq GSEA results.
        cnv_dir: Directory containing CNV GSEA results.
        dna_dir: Directory containing DNA methylation GSEA results.
        output_dir: Optional directory to save integrated results. If None, results are not saved.

    Returns:
        A pandas DataFrame with integrated GSEA results sorted by ``fdr_bh``,
        or None if any of the input files cannot be read.
    """
    omic_dirs = {'rna': rna_dir, 'cnv': cnv_dir, 'dna': dna_dir}

    try:
        # Read files and ensure Term is not the index
        frames = {
            prefix: pd.read_csv(os.path.join(directory, f'{sample_id}_gsea_results.csv'))
            for prefix, directory in omic_dirs.items()
        }
    except FileNotFoundError:
        print(f"GSEA results file not found for sample: {sample_id}")
        return None
    except pd.errors.EmptyDataError:
        print(f"GSEA results file is empty for sample: {sample_id}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while reading GSEA results for sample {sample_id}: {e}")
        return None

    # Columns shared by all omics; they are the merge keys and keep their names.
    join_keys = ['Term', 'matched_genes']

    for prefix, df in frames.items():
        # Sort matched_genes so that identical gene sets compare equal when merging.
        df['matched_genes'] = df['matched_genes'].apply(sort_gene_list)

        # Prefix every omic-specific column with its omic name.
        df.columns = [col if col in join_keys else f'{prefix}_{col}' for col in df.columns]

        # p-values of exactly 0 or 1 would map to +/-inf z-scores; nudge them inward.
        df[f'{prefix}_pval'] = df[f'{prefix}_pval'].replace({0.0: 1e-16, 1.0: 1 - 1e-16})

    # Merge dataframes on Term and matched_genes
    combined = frames['rna'].merge(frames['cnv'], on=join_keys, how='inner')
    combined = combined.merge(frames['dna'], on=join_keys, how='inner')

    # Signed z-scores: magnitude from the p-value, direction from the NES sign.
    for prefix in omic_dirs:
        z_from_pval = norm.ppf(1 - combined[f'{prefix}_pval'])
        combined[f'{prefix}_z'] = z_from_pval * np.sign(combined[f'{prefix}_nes'])

    # Combined z-score across the three omics
    z_scores = combined[['rna_z', 'cnv_z', 'dna_z']]
    combined['combined_z'] = z_scores.sum(axis=1) / np.sqrt(3)

    # Combined p-value (two-tailed test)
    combined['combined_pval'] = norm.sf(abs(combined['combined_z'])) * 2

    # Multiple testing correction; rows without a usable p-value keep NaN.
    combined['fdr_bh'] = np.nan
    try:
        valid_pvals = combined['combined_pval'].replace([np.inf, -np.inf], np.nan).dropna()

        if len(valid_pvals) > 0:
            _, pvals_corrected, _, _ = multipletests(valid_pvals, method='fdr_bh')
            combined.loc[valid_pvals.index, 'fdr_bh'] = pvals_corrected
        else:
            print(f"Warning: No valid p-values for multiple testing correction in sample {sample_id}")
    except Exception as e:
        print(f"Warning: Error in multiple testing correction for sample {sample_id}: {e}")
        combined['fdr_bh'] = np.nan

    # Sort by FDR (putting NaN values at the end)
    combined = combined.sort_values(by='fdr_bh', na_position='last')

    # Apply WCOS and MPES calculations
    score_columns = ['rna_wcos', 'cnv_wcos', 'dna_wcos', 'mpes']
    if combined.empty:
        # With zero rows there is nothing for a row-wise apply to build the
        # score columns from, so create them explicitly to keep the output
        # schema stable when the three omics share no pathway.
        for column in score_columns:
            combined[column] = np.nan
    else:
        wcos_mpes_results = combined.apply(calculate_wcos_mpes, axis=1)
        combined = pd.concat([combined, wcos_mpes_results], axis=1)

    # Keep relevant columns
    try:
        combined = combined[['Term', 'combined_pval', 'combined_z', 'fdr_bh', 'matched_genes',
                             'rna_lead_genes', 'cnv_lead_genes', 'dna_lead_genes',
                             'rna_nes', 'cnv_nes', 'dna_nes',
                             *score_columns]]
    except KeyError as e:
        # Best effort: if the inputs used different column names, report it
        # and return every column rather than failing.
        print(f"Warning: Some expected columns are missing: {e}")

    # Save results if output_dir is provided
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, f'{sample_id}_integrated_gsea_results.csv')
        combined.to_csv(output_file)
        print(f"Saved integrated results for {sample_id} to {output_file}")

    return combined
194
+
195
+
196
def run_simpa_batch(
    sample_ids: List[str],
    rna_dir: str,
    cnv_dir: str,
    dna_dir: str,
    output_dir: str
) -> None:
    """
    Run SIMPA integration for multiple samples.

    Each sample is integrated with simpa() and, when integration succeeds,
    written to ``<output_dir>/<sample_id>_integrated_gsea_results.csv``.
    Samples for which simpa() returns None are skipped.

    Args:
        sample_ids: List of sample identifiers.
        rna_dir: Directory containing RNA-seq GSEA results.
        cnv_dir: Directory containing CNV GSEA results.
        dna_dir: Directory containing DNA methylation GSEA results.
        output_dir: Directory to save integrated results.

    Returns:
        None. Results are saved to files in the output directory.
    """
    os.makedirs(output_dir, exist_ok=True)

    for sample_id in sample_ids:
        combined_df = simpa(sample_id, rna_dir, cnv_dir, dna_dir)

        if combined_df is None:
            # simpa() has already printed why this sample could not be read.
            continue

        output_file = os.path.join(output_dir, f'{sample_id}_integrated_gsea_results.csv')
        combined_df.to_csv(output_file)
        # combined_df is rebound on the next iteration, so no explicit
        # cleanup is needed here.

    print('Integration done! Results saved in:', output_dir)
235
+
236
def load_simpa(directory, prefixes=("tm", "tw")):
    """
    Loads and processes SIMPA results from a directory of CSV files.

    Files are matched as ``<prefix>*_integrated_gsea_results.csv``; the part of
    the filename before ``_integrated_gsea_results`` becomes the sample name.
    Files that cannot be parsed (or are empty) are reported and skipped.

    Args:
        directory: The path to the directory containing the SIMPA results files.
        prefixes: Filename prefixes of the samples to load. A single string is
            treated as one prefix. Defaults to ("tm", "tw").

    Returns:
        A pandas DataFrame with columns: Term, combined_pval, combined_z,
        fdr_bh, matched_genes, rna_lead_genes, cnv_lead_genes, dna_lead_genes,
        rna_nes, cnv_nes, dna_nes, mpes, sample_name. The frame is empty (but
        has these columns) when no file could be loaded.
    """
    columns = ['Term', 'combined_pval', 'combined_z', 'fdr_bh', 'matched_genes',
               'rna_lead_genes', 'cnv_lead_genes', 'dna_lead_genes',
               'rna_nes', 'cnv_nes', 'dna_nes', 'mpes']
    suffix = "_integrated_gsea_results"

    if isinstance(prefixes, str):
        prefixes = (prefixes,)

    # Collect matching files prefix by prefix, skipping duplicates that
    # overlapping prefixes (e.g. "t" and "tm") would otherwise produce.
    file_paths = []
    for prefix in prefixes:
        for path in glob.glob(os.path.join(directory, f"{prefix}*{suffix}.csv")):
            if path not in file_paths:
                file_paths.append(path)

    all_results = []
    for file_path in file_paths:
        # Extract sample name from filename, e.g. tm(n) or tw(n)
        sample_name = os.path.basename(file_path).split(suffix)[0]

        try:
            df = pd.read_csv(file_path)
        except (pd.errors.ParserError, pd.errors.EmptyDataError):
            print(f"Error: Could not parse {file_path} as a CSV file. Skipping.")
            continue

        # assign() returns a new frame, so no column is written into a slice.
        all_results.append(df[columns].assign(sample_name=sample_name))

    if not all_results:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=columns + ['sample_name'])

    return pd.concat(all_results, ignore_index=True)
SIMPApy/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ """
2
+ SIMPApy: A package for normalized single sample Integrated Multiomics Pathway Analysis.
3
+
4
+ This package provides tools for running Gene Set Enrichment Analysis (GSEA) on
5
+ multi-omics data (RNA-seq, DNA methylation, and copy number variation) in single samples and
6
+ integrating the results to identify consistently enriched pathways.
7
+
8
+ The package includes the following modules:
9
+ - core: Contains the main functions for running sopa and sopa_population.
10
+ - ranking: Contains functions for calculating ranking and mean signed deviation.
11
+ - simpa: Contains the main functions for running SIMPA.
12
+ - visualize: Contains functions for creating interactive plots from SIMPA results.
13
+ """
14
+ from .core import sopa, sopa_population, load_sopa
15
+ from .ranking import calculate_ranking, _calculate_msd
16
+ from .SIMPA import simpa, run_simpa_batch, load_simpa
17
+ from .preprocess import _extract_tag_genes, _create_aggregated_dataframes, process_multiomics_data
18
+ from .visualize import _create_traces, create_interactive_plot
19
+
20
# Version string of the package.
__version__ = "0.0.4-alpha"
# Public API: the names exported by a star-import of this package.
# Underscore-prefixed helpers are intentionally left out of this list.
__all__ = [
    "calculate_ranking",
    "sopa",
    "sopa_population",
    "load_sopa",
    "simpa",
    "run_simpa_batch",
    "load_simpa",
    "process_multiomics_data",
    "create_interactive_plot"
]
SIMPApy/core.py ADDED
@@ -0,0 +1,162 @@
1
+ """
2
+ Core functions for the SIMPApy package.
3
+
4
+ This module contains the main functions for running SOPA on ranked gene data.
5
+ """
6
+
7
+ import gseapy as gp
8
+ import pandas as pd
9
+ import numpy as np
10
+ import os
11
+ import glob
12
+ from typing import Dict, List, Union, Optional, Tuple
13
+
14
+
15
def sopa(
    ranking: pd.Series,
    gene_set: Union[Dict, str],
    minisz: int = 3,
    seeder: int = 7,
    threads: int = 8,
    permutation_num: int = 1000,
    **kwargs
) -> pd.DataFrame:
    """
    Run SOPA on a ranked gene list.

    Args:
        ranking: A pandas Series with gene names as index and ranking values.
        gene_set: Gene set database in GMT format or a dictionary.
        minisz: Minimum size of gene sets to consider. Default is 3.
        seeder: Random seed for reproducibility. Default is 7.
        threads: Number of threads to use. Default is 8.
        permutation_num: Number of permutations for calculating FDR. Default is 1000.
        **kwargs: Additional arguments passed to gp.prerank().

    Returns:
        A pandas DataFrame with one row per gene set and the columns Term,
        fdr, es, nes, pval, matched_genes, gene %, lead_genes and tag %,
        sorted by FDR.
    """
    pre_res = gp.prerank(
        rnk=ranking,
        gene_sets=gene_set,
        min_size=minisz,
        seed=seeder,
        threads=threads,
        # Forward the caller's value; previously this parameter was accepted
        # but never handed to prerank, so it had no effect.
        permutation_num=permutation_num,
        **kwargs
    )

    # Fields copied from each per-term result, in output column order.
    result_fields = ['fdr', 'es', 'nes', 'pval', 'matched_genes', 'gene %', 'lead_genes', 'tag %']

    rows = [
        [term, *(result[field] for field in result_fields)]
        for term, result in pre_res.results.items()
    ]

    out_df = pd.DataFrame(
        rows,
        columns=['Term', *result_fields]
    ).sort_values('fdr').reset_index(drop=True)

    return out_df
69
+
70
+
71
def sopa_population(
    ranks: pd.DataFrame,
    gene_set: Union[Dict, str],
    output_dir: str,
    minisz: int = 3,
    seeder: int = 7,
    **kwargs
) -> None:
    """
    Run SOPA for every sample column of a ranking table and write one CSV per sample.

    Args:
        ranks: DataFrame with genes as index and samples as columns.
        gene_set: Gene set database in GMT format or a dictionary.
        output_dir: Directory where results will be saved; created if missing.
        minisz: Minimum size of gene sets to consider. Default is 3.
        seeder: Random seed for reproducibility. Default is 7.
        **kwargs: Additional arguments passed to sopa().

    Returns:
        None. Each sample's results are written to
        ``<output_dir>/<sample>_gsea_results.csv``.
    """
    os.makedirs(output_dir, exist_ok=True)

    for sample_name in ranks.columns:
        # Infinite scores are treated as missing and dropped, then the
        # remaining genes are ordered from highest to lowest score.
        finite_scores = ranks[sample_name].replace([np.inf, -np.inf], np.nan).dropna()
        ordered_scores = finite_scores.sort_values(ascending=False)

        sample_result = sopa(
            ranking=ordered_scores,
            gene_set=gene_set,
            minisz=minisz,
            seeder=seeder,
            **kwargs
        )

        destination = os.path.join(output_dir, f"{sample_name}_gsea_results.csv")
        sample_result.to_csv(destination, sep=',')
118
+
119
def load_sopa(directory, prefixes=("tm", "tw")):
    """
    Loads and processes sopa results from a directory of CSV files.

    Files are matched as ``<prefix>*_gsea_results.csv``; the part of the
    filename before ``_gsea_results`` becomes the sample name. Files that
    cannot be parsed (or are empty) are reported and skipped.

    Args:
        directory: The path to the directory containing the SOPA results files.
        prefixes: Filename prefixes of the samples to load. A single string is
            treated as one prefix. Defaults to ("tm", "tw").

    Returns:
        A pandas DataFrame with columns: Term, fdr, es, nes, matched_genes,
        gene %, tag %, lead_genes, sample_name. The frame is empty (but has
        these columns) when no file could be loaded.
    """
    columns = ['Term', 'fdr', 'es', 'nes', 'matched_genes', 'gene %', 'tag %', 'lead_genes']
    suffix = "_gsea_results"

    if isinstance(prefixes, str):
        prefixes = (prefixes,)

    # Collect matching files prefix by prefix, skipping duplicates that
    # overlapping prefixes (e.g. "t" and "tm") would otherwise produce.
    file_paths = []
    for prefix in prefixes:
        for path in glob.glob(os.path.join(directory, f"{prefix}*{suffix}.csv")):
            if path not in file_paths:
                file_paths.append(path)

    all_results = []
    for file_path in file_paths:
        # Extract sample name from filename, e.g. tm(n) or tw(n)
        sample_name = os.path.basename(file_path).split(suffix)[0]

        try:
            df = pd.read_csv(file_path)
        except (pd.errors.ParserError, pd.errors.EmptyDataError):
            print(f"Error: Could not parse {file_path} as a CSV file. Skipping.")
            continue

        # assign() returns a new frame, so no column is written into a slice.
        all_results.append(df[columns].assign(sample_name=sample_name))

    if not all_results:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=columns + ['sample_name'])

    return pd.concat(all_results, ignore_index=True)