proteomics-toolkit 26.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proteomics_toolkit-26.1.0/LICENSE +21 -0
- proteomics_toolkit-26.1.0/PKG-INFO +351 -0
- proteomics_toolkit-26.1.0/README.md +313 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit/__init__.py +324 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit/classification.py +578 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit/data_import.py +847 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit/enrichment.py +794 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit/export.py +725 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit/normalization.py +1271 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit/preprocessing.py +934 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit/statistical_analysis.py +1577 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit/temporal_clustering.py +1590 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit/validation.py +375 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit/visualization.py +2707 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit.egg-info/PKG-INFO +351 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit.egg-info/SOURCES.txt +27 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit.egg-info/dependency_links.txt +1 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit.egg-info/requires.txt +19 -0
- proteomics_toolkit-26.1.0/proteomics_toolkit.egg-info/top_level.txt +1 -0
- proteomics_toolkit-26.1.0/pyproject.toml +77 -0
- proteomics_toolkit-26.1.0/setup.cfg +4 -0
- proteomics_toolkit-26.1.0/tests/test_classification.py +68 -0
- proteomics_toolkit-26.1.0/tests/test_data_import.py +141 -0
- proteomics_toolkit-26.1.0/tests/test_enrichment.py +87 -0
- proteomics_toolkit-26.1.0/tests/test_export.py +84 -0
- proteomics_toolkit-26.1.0/tests/test_normalization.py +222 -0
- proteomics_toolkit-26.1.0/tests/test_preprocessing.py +169 -0
- proteomics_toolkit-26.1.0/tests/test_statistical_analysis.py +223 -0
- proteomics_toolkit-26.1.0/tests/test_validation.py +64 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 MacCoss Lab
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: proteomics-toolkit
|
|
3
|
+
Version: 26.1.0
|
|
4
|
+
Summary: Proteomics analysis toolkit for mass spectrometry data
|
|
5
|
+
Author: Michael MacCoss Lab, University of Washington
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/uw-maccosslab/proteomics-toolkit
|
|
8
|
+
Project-URL: Repository, https://github.com/uw-maccosslab/proteomics-toolkit
|
|
9
|
+
Project-URL: Issues, https://github.com/uw-maccosslab/proteomics-toolkit/issues
|
|
10
|
+
Keywords: proteomics,mass-spectrometry,DIA,bioinformatics
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: pandas>=2.0
|
|
22
|
+
Requires-Dist: numpy>=1.24
|
|
23
|
+
Requires-Dist: scipy>=1.10
|
|
24
|
+
Requires-Dist: statsmodels>=0.14
|
|
25
|
+
Requires-Dist: scikit-learn>=1.3
|
|
26
|
+
Requires-Dist: matplotlib>=3.7
|
|
27
|
+
Requires-Dist: seaborn>=0.12
|
|
28
|
+
Requires-Dist: pyarrow>=12.0
|
|
29
|
+
Requires-Dist: requests>=2.25
|
|
30
|
+
Provides-Extra: xgboost
|
|
31
|
+
Requires-Dist: xgboost>=1.7; extra == "xgboost"
|
|
32
|
+
Provides-Extra: all
|
|
33
|
+
Requires-Dist: xgboost>=1.7; extra == "all"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# Proteomics Analysis Toolkit
|
|
40
|
+
|
|
41
|
+
[](https://github.com/uw-maccosslab/proteomics-toolkit/actions/workflows/ci.yml)
|
|
42
|
+
[](https://pypi.org/project/proteomics-toolkit/)
|
|
43
|
+
[](https://pypi.org/project/proteomics-toolkit/)
|
|
44
|
+
[](https://github.com/uw-maccosslab/proteomics-toolkit/blob/main/LICENSE)
|
|
45
|
+
|
|
46
|
+
A Python toolkit for analyzing mass spectrometry-based proteomics data, supporting both Skyline CSV and PRISM parquet workflows.
|
|
47
|
+
|
|
48
|
+
## Features
|
|
49
|
+
|
|
50
|
+
### Core Analysis Modules
|
|
51
|
+
- **data_import**: Load Skyline CSV or PRISM parquet data, handle batch suffixes, manage sample metadata
|
|
52
|
+
- **preprocessing**: Protein identifier parsing, sample classification, data quality assessment
|
|
53
|
+
- **normalization**: Seven normalization methods (median, VSN, quantile, MAD, z-score, RLR, LOESS)
|
|
54
|
+
- **statistical_analysis**: Differential protein analysis — t-tests, Wilcoxon, Mann-Whitney, mixed-effects models
|
|
55
|
+
- **visualization**: Publication-ready plots — volcano, PCA, box plots, heatmaps, correlation, trajectories
|
|
56
|
+
- **enrichment**: Gene set enrichment via Enrichr API
|
|
57
|
+
- **temporal_clustering**: K-means clustering of temporal protein trends
|
|
58
|
+
- **validation**: Metadata/data consistency checking with diagnostic reports
|
|
59
|
+
- **export**: Standardized result export with timestamped configs
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Install from PyPI
|
|
65
|
+
pip install proteomics-toolkit
|
|
66
|
+
|
|
67
|
+
# With XGBoost support (for classification module)
|
|
68
|
+
pip install proteomics-toolkit[xgboost]
|
|
69
|
+
|
|
70
|
+
# Install from GitHub (latest development version)
|
|
71
|
+
pip install git+https://github.com/uw-maccosslab/proteomics-toolkit.git
|
|
72
|
+
|
|
73
|
+
# For development (editable install from local clone)
|
|
74
|
+
git clone https://github.com/uw-maccosslab/proteomics-toolkit.git
|
|
75
|
+
cd proteomics-toolkit
|
|
76
|
+
pip install -e .
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Quick Start
|
|
80
|
+
|
|
81
|
+
### PRISM Workflow (recommended for batch-corrected data)
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
import proteomics_toolkit as ptk
|
|
85
|
+
import pandas as pd
|
|
86
|
+
|
|
87
|
+
# 1. Load PRISM data
|
|
88
|
+
protein_data, metadata, sample_cols = ptk.load_prism_data(
|
|
89
|
+
'PRISM-Output/corrected_proteins.parquet',
|
|
90
|
+
'PRISM-Output/sample_metadata.csv',
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# 2. Map batch-suffixed column names to short replicate IDs
|
|
94
|
+
col_map = ptk.strip_batch_suffix(sample_cols) # {full_col: short_name}
|
|
95
|
+
short_to_col = {v: k for k, v in col_map.items()}
|
|
96
|
+
|
|
97
|
+
# 3. Build sample metadata dict (keys = full PRISM column names)
|
|
98
|
+
meta_dict = {}
|
|
99
|
+
for _, row in metadata.iterrows():
|
|
100
|
+
full_col = short_to_col.get(row['Replicate'])
|
|
101
|
+
if full_col:
|
|
102
|
+
meta_dict[full_col] = row.to_dict()
|
|
103
|
+
|
|
104
|
+
# 4. Filter low-confidence proteins
|
|
105
|
+
protein_data_filtered = protein_data[~protein_data['low_confidence']].copy()
|
|
106
|
+
|
|
107
|
+
# 5. Build annotation + sample data for stats
|
|
108
|
+
annot = protein_data_filtered[[
|
|
109
|
+
'leading_protein', 'leading_description', 'leading_gene_name',
|
|
110
|
+
'leading_uniprot_id', 'leading_name'
|
|
111
|
+
]].copy()
|
|
112
|
+
annot.columns = ['Protein', 'Description', 'Protein Gene', 'UniProt_Accession', 'UniProt_Entry_Name']
|
|
113
|
+
data = pd.concat([annot.reset_index(drop=True),
|
|
114
|
+
protein_data_filtered[sample_cols].reset_index(drop=True)], axis=1)
|
|
115
|
+
data.index = data['Protein'] # accession as index
|
|
116
|
+
|
|
117
|
+
# 6. Statistical analysis
|
|
118
|
+
config = ptk.StatisticalConfig()
|
|
119
|
+
config.analysis_type = 'unpaired'
|
|
120
|
+
config.statistical_test_method = 'welch_t'
|
|
121
|
+
config.group_column = 'Group'
|
|
122
|
+
config.group_labels = ['Control', 'Treatment'] # [reference, study]
|
|
123
|
+
config.correction_method = 'fdr_bh'
|
|
124
|
+
config.p_value_threshold = 0.05
|
|
125
|
+
config.fold_change_threshold = 1.0
|
|
126
|
+
config.log_transform_before_stats = True
|
|
127
|
+
config.validate()
|
|
128
|
+
|
|
129
|
+
results = ptk.run_comprehensive_statistical_analysis(
|
|
130
|
+
data, meta_dict, config, protein_annotations=annot
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
# 7. Visualization
|
|
134
|
+
ptk.plot_volcano(results, fc_threshold=1.0, gene_column='Protein Gene', label_top_n=15)
|
|
135
|
+
ptk.display_analysis_summary(results, config)
|
|
136
|
+
|
|
137
|
+
# 8. Enrichment
|
|
138
|
+
enrich_config = ptk.EnrichmentConfig(
|
|
139
|
+
enrichr_libraries=['GO_Biological_Process_2023', 'KEGG_2021_Human'],
|
|
140
|
+
pvalue_cutoff=0.05,
|
|
141
|
+
)
|
|
142
|
+
enrich = ptk.run_differential_enrichment(
|
|
143
|
+
results, gene_column='Protein Gene', logfc_column='logFC',
|
|
144
|
+
pvalue_column='adj.P.Val', config=enrich_config,
|
|
145
|
+
)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Skyline CSV Workflow
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
# 1. Load data
|
|
152
|
+
protein_data, metadata, peptide_data = ptk.load_skyline_data(
|
|
153
|
+
protein_file='protein_quant.csv',
|
|
154
|
+
metadata_file='metadata.csv',
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
# 2. Process sample names
|
|
158
|
+
sample_columns = ptk.data_import.identify_sample_columns(protein_data, metadata)
|
|
159
|
+
cleaned_names = ptk.clean_sample_names(sample_columns)
|
|
160
|
+
|
|
161
|
+
# 3. Parse annotations and filter
|
|
162
|
+
processed_data = ptk.parse_protein_identifiers(protein_data)
|
|
163
|
+
|
|
164
|
+
# 4. Normalize (skip for PRISM — already normalized)
|
|
165
|
+
normalized = ptk.median_normalize(processed_data, sample_columns=list(cleaned_names.values()))
|
|
166
|
+
|
|
167
|
+
# 5. QC plots
|
|
168
|
+
ptk.plot_box_plot(normalized, list(cleaned_names.values()), sample_metadata)
|
|
169
|
+
ptk.plot_pca(normalized, list(cleaned_names.values()), sample_metadata)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Statistical Analysis
|
|
173
|
+
|
|
174
|
+
All statistical analyses use `StatisticalConfig` + `run_comprehensive_statistical_analysis()`.
|
|
175
|
+
|
|
176
|
+
### Unpaired comparison (two independent groups)
|
|
177
|
+
```python
|
|
178
|
+
config = ptk.StatisticalConfig()
|
|
179
|
+
config.analysis_type = 'unpaired'
|
|
180
|
+
config.statistical_test_method = 'welch_t' # or 'mann_whitney'
|
|
181
|
+
config.group_column = 'Group'
|
|
182
|
+
config.group_labels = ['Control', 'Treatment']
|
|
183
|
+
config.log_transform_before_stats = 'auto'
|
|
184
|
+
config.validate()
|
|
185
|
+
|
|
186
|
+
results = ptk.run_comprehensive_statistical_analysis(
|
|
187
|
+
data, sample_metadata, config, protein_annotations=annot
|
|
188
|
+
)
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Paired comparison (before/after per subject)
|
|
192
|
+
```python
|
|
193
|
+
config = ptk.StatisticalConfig()
|
|
194
|
+
config.analysis_type = 'paired'
|
|
195
|
+
config.statistical_test_method = 'paired_t'
|
|
196
|
+
config.subject_column = 'Subject'
|
|
197
|
+
config.paired_column = 'Condition'
|
|
198
|
+
config.paired_label1 = 'Before'
|
|
199
|
+
config.paired_label2 = 'After'
|
|
200
|
+
config.group_column = 'Condition'
|
|
201
|
+
config.group_labels = ['Before', 'After']
|
|
202
|
+
config.validate()
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Mixed-effects model (repeated measures)
|
|
206
|
+
```python
|
|
207
|
+
config = ptk.StatisticalConfig()
|
|
208
|
+
config.analysis_type = 'paired'
|
|
209
|
+
config.statistical_test_method = 'mixed_effects'
|
|
210
|
+
config.subject_column = 'Subject'
|
|
211
|
+
config.paired_column = 'Visit'
|
|
212
|
+
config.paired_label1 = 'Baseline'
|
|
213
|
+
config.paired_label2 = 'Follow-up'
|
|
214
|
+
config.group_column = 'Treatment'
|
|
215
|
+
config.group_labels = ['Placebo', 'Drug']
|
|
216
|
+
config.interaction_terms = ['Treatment', 'Visit']
|
|
217
|
+
config.validate()
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
**Output columns:** `Protein`, `logFC`, `P.Value`, `adj.P.Val`, `AveExpr`, `t`, `Protein Gene`, `Description`, `UniProt_Accession`, `Gene`
|
|
221
|
+
|
|
222
|
+
## Enrichment
|
|
223
|
+
|
|
224
|
+
Enrichment results use these column names (not the Enrichr web-UI names):
|
|
225
|
+
|
|
226
|
+
| Column | Description |
|
|
227
|
+
|---|---|
|
|
228
|
+
| `Term` | Pathway / GO term name |
|
|
229
|
+
| `P_Value` | Unadjusted p-value |
|
|
230
|
+
| `Adj_P_Value` | BH-adjusted p-value |
|
|
231
|
+
| `Z_Score` | Enrichr z-score |
|
|
232
|
+
| `Combined_Score` | log(p) × z — used for ranking |
|
|
233
|
+
| `Genes` | Semicolon-separated gene list |
|
|
234
|
+
| `N_Genes` | Number of overlapping genes |
|
|
235
|
+
| `Library` | Source Enrichr library |
|
|
236
|
+
|
|
237
|
+
## Dependencies
|
|
238
|
+
|
|
239
|
+
- pandas >= 2.0
|
|
240
|
+
- numpy >= 1.24
|
|
241
|
+
- scipy >= 1.10
|
|
242
|
+
- matplotlib >= 3.7
|
|
243
|
+
- seaborn >= 0.12
|
|
244
|
+
- scikit-learn >= 1.3
|
|
245
|
+
- statsmodels >= 0.14
|
|
246
|
+
- requests >= 2.25 (for Enrichr API)
|
|
247
|
+
- pyarrow >= 12.0 (for PRISM parquet files)
|
|
248
|
+
|
|
249
|
+
## Module Reference
|
|
250
|
+
|
|
251
|
+
### data_import.py
|
|
252
|
+
- `load_skyline_data()` — Load Skyline protein/peptide CSVs + metadata
|
|
253
|
+
- `load_prism_data()` — Load PRISM parquet + metadata
|
|
254
|
+
- `identify_sample_columns()` — Auto-detect sample columns
|
|
255
|
+
- `clean_sample_names()` — Remove common prefixes/suffixes
|
|
256
|
+
- `detect_batch_suffix()` — Detect PRISM `__@__` batch suffix
|
|
257
|
+
- `strip_batch_suffix()` — Map batch-suffixed names → short names
|
|
258
|
+
- `create_sample_column_mapping()` — Map data columns to metadata sample names
|
|
259
|
+
- `match_samples_to_metadata()` — Link samples to metadata rows
|
|
260
|
+
- `BATCH_SUFFIX_DELIMITER` — Constant: `"__@__"`
|
|
261
|
+
|
|
262
|
+
### preprocessing.py
|
|
263
|
+
- `parse_protein_identifiers()` — Extract UniProt accessions and databases
|
|
264
|
+
- `parse_gene_and_description()` — Parse gene names from descriptions
|
|
265
|
+
- `classify_samples()` — Classify samples into groups / controls with color assignment
|
|
266
|
+
- `apply_systematic_color_scheme()` — Generate consistent group colors
|
|
267
|
+
- `create_standard_data_structure()` — Build standard 5-column annotation + sample layout
|
|
268
|
+
- `assess_data_completeness()` — Evaluate missing data patterns
|
|
269
|
+
- `filter_proteins_by_completeness()` — Remove proteins below detection threshold
|
|
270
|
+
- `calculate_group_colors()` — Generate group color mapping
|
|
271
|
+
- `identify_annotation_columns()` — Auto-detect annotation vs sample columns
|
|
272
|
+
|
|
273
|
+
### normalization.py
|
|
274
|
+
- `median_normalize()` — Median-based normalization (preserves original scale)
|
|
275
|
+
- `vsn_normalize()` — Variance Stabilizing Normalization (arcsinh-transformed)
|
|
276
|
+
- `quantile_normalize()` — Force identical distributions
|
|
277
|
+
- `mad_normalize()` — Median absolute deviation normalization
|
|
278
|
+
- `z_score_normalize()` — Standardize to mean=0, sd=1
|
|
279
|
+
- `rlr_normalize()` — Robust linear regression (log2-transformed)
|
|
280
|
+
- `loess_normalize()` — LOESS intensity-dependent (log2-transformed)
|
|
281
|
+
- `handle_negative_values()` — Handle negative values from VSN
|
|
282
|
+
- `analyze_negative_values()` — Analyze negative value patterns
|
|
283
|
+
- `calculate_normalization_stats()` — Evaluate normalization effectiveness
|
|
284
|
+
|
|
285
|
+
### statistical_analysis.py
|
|
286
|
+
- `StatisticalConfig` — Configuration class (zero-arg constructor, set attributes individually)
|
|
287
|
+
- `run_comprehensive_statistical_analysis()` — Main analysis entry point
|
|
288
|
+
- `display_analysis_summary()` — Print/return summary of results
|
|
289
|
+
- `run_statistical_analysis()` — Backward-compatible wrapper
|
|
290
|
+
|
|
291
|
+
### visualization.py
|
|
292
|
+
- `plot_box_plot()` — Sample intensity distributions by group
|
|
293
|
+
- `plot_volcano()` — Volcano plot with labeled top hits
|
|
294
|
+
- `plot_pca()` — PCA with group coloring, optional log-transform
|
|
295
|
+
- `plot_comparative_pca()` — Compare PCA across normalization methods
|
|
296
|
+
- `plot_normalization_comparison()` — Before/after normalization QC
|
|
297
|
+
- `plot_sample_correlation_heatmap()` — Full correlation matrix
|
|
298
|
+
- `plot_sample_correlation_triangular_heatmap()` — Lower-triangle correlation
|
|
299
|
+
- `plot_control_correlation()` — Control sample correlation with optional clustering
|
|
300
|
+
- `plot_control_correlation_analysis()` — Multi-panel control QC
|
|
301
|
+
- `plot_control_group_correlation_analysis()` — Group-wise control QC
|
|
302
|
+
- `plot_individual_control_pool_analysis()` — Individual control analysis
|
|
303
|
+
- `plot_control_cv_distribution()` — CV distribution for control samples
|
|
304
|
+
- `plot_grouped_heatmap()` — Heatmap for any grouped data
|
|
305
|
+
- `plot_grouped_trajectories()` — Line plots for temporal/dose-response data
|
|
306
|
+
- `plot_protein_profile()` — Single protein expression profile
|
|
307
|
+
|
|
308
|
+
### enrichment.py
|
|
309
|
+
- `EnrichmentConfig` — Configuration dataclass (libraries, thresholds, API settings)
|
|
310
|
+
- `query_enrichr()` — Query Enrichr API with a gene list
|
|
311
|
+
- `parse_enrichr_results()` — Parse raw results into a tidy DataFrame
|
|
312
|
+
- `run_enrichment_analysis()` — Complete enrichment on a gene list
|
|
313
|
+
- `run_enrichment_by_group()` — Enrichment for each group in a DataFrame
|
|
314
|
+
- `run_differential_enrichment()` — Split by up/down-regulated, run enrichment on each
|
|
315
|
+
- `plot_enrichment_barplot()` — Horizontal bar plot by Combined Score
|
|
316
|
+
- `plot_enrichment_comparison()` — Dot plot comparing enrichment across groups
|
|
317
|
+
- `get_available_libraries()` — List common Enrichr libraries
|
|
318
|
+
- `merge_enrichment_results()` — Merge multiple enrichment DataFrames
|
|
319
|
+
|
|
320
|
+
### temporal_clustering.py
|
|
321
|
+
- `TemporalClusteringConfig` — Configuration dataclass
|
|
322
|
+
- `run_temporal_analysis()` — Complete pipeline: clustering → visualization → enrichment
|
|
323
|
+
- `calculate_temporal_means()` — Mean abundance per timepoint across subjects
|
|
324
|
+
- `cluster_temporal_trends()` — K-means or hierarchical clustering
|
|
325
|
+
- `name_clusters_by_pattern()` — Assign descriptive cluster names
|
|
326
|
+
- `classify_trend_pattern()` — Classify individual protein trends
|
|
327
|
+
- `merge_with_statistics()` — Merge temporal data with statistical results
|
|
328
|
+
- `filter_significant_proteins()` — Filter to significant proteins
|
|
329
|
+
- `run_enrichment_by_cluster()` — Enrichment per cluster
|
|
330
|
+
- `plot_cluster_heatmap()` — Cluster-organized heatmap
|
|
331
|
+
- `plot_cluster_parallel_coordinates()` — Parallel coordinate plots
|
|
332
|
+
|
|
333
|
+
### validation.py
|
|
334
|
+
- `validate_metadata_data_consistency()` — Check metadata matches data columns
|
|
335
|
+
- `enhanced_sample_processing()` — Sample processing with validation
|
|
336
|
+
- `generate_sample_matching_diagnostic_report()` — Detailed mismatch diagnostics
|
|
337
|
+
- `SampleMatchingError` — Exception for sample matching failures
|
|
338
|
+
- `ControlSampleError` — Exception for control sample configuration issues
|
|
339
|
+
|
|
340
|
+
### export.py
|
|
341
|
+
- `export_complete_analysis()` — Full export: data + config + results
|
|
342
|
+
- `export_analysis_results()` — Export normalized data + differential results
|
|
343
|
+
- `export_timestamped_config()` — Save analysis config with timestamp
|
|
344
|
+
- `create_config_dict_from_notebook_vars()` — Build config dict from notebook variables
|
|
345
|
+
- `export_significant_proteins_summary()` — Export significant results summary
|
|
346
|
+
- `export_results()` — General-purpose result export
|
|
347
|
+
|
|
348
|
+
## See Also
|
|
349
|
+
|
|
350
|
+
- [Usage Guide](docs/guide.md) — Detailed recipe book with usage patterns
|
|
351
|
+
- [CLAUDE.md](../CLAUDE.md) — Project conventions and data prep patterns
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
# Proteomics Analysis Toolkit
|
|
2
|
+
|
|
3
|
+
[](https://github.com/uw-maccosslab/proteomics-toolkit/actions/workflows/ci.yml)
|
|
4
|
+
[](https://pypi.org/project/proteomics-toolkit/)
|
|
5
|
+
[](https://pypi.org/project/proteomics-toolkit/)
|
|
6
|
+
[](https://github.com/uw-maccosslab/proteomics-toolkit/blob/main/LICENSE)
|
|
7
|
+
|
|
8
|
+
A Python toolkit for analyzing mass spectrometry-based proteomics data, supporting both Skyline CSV and PRISM parquet workflows.
|
|
9
|
+
|
|
10
|
+
## Features
|
|
11
|
+
|
|
12
|
+
### Core Analysis Modules
|
|
13
|
+
- **data_import**: Load Skyline CSV or PRISM parquet data, handle batch suffixes, manage sample metadata
|
|
14
|
+
- **preprocessing**: Protein identifier parsing, sample classification, data quality assessment
|
|
15
|
+
- **normalization**: Seven normalization methods (median, VSN, quantile, MAD, z-score, RLR, LOESS)
|
|
16
|
+
- **statistical_analysis**: Differential protein analysis — t-tests, Wilcoxon, Mann-Whitney, mixed-effects models
|
|
17
|
+
- **visualization**: Publication-ready plots — volcano, PCA, box plots, heatmaps, correlation, trajectories
|
|
18
|
+
- **enrichment**: Gene set enrichment via Enrichr API
|
|
19
|
+
- **temporal_clustering**: K-means clustering of temporal protein trends
|
|
20
|
+
- **validation**: Metadata/data consistency checking with diagnostic reports
|
|
21
|
+
- **export**: Standardized result export with timestamped configs
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
# Install from PyPI
|
|
27
|
+
pip install proteomics-toolkit
|
|
28
|
+
|
|
29
|
+
# With XGBoost support (for classification module)
|
|
30
|
+
pip install proteomics-toolkit[xgboost]
|
|
31
|
+
|
|
32
|
+
# Install from GitHub (latest development version)
|
|
33
|
+
pip install git+https://github.com/uw-maccosslab/proteomics-toolkit.git
|
|
34
|
+
|
|
35
|
+
# For development (editable install from local clone)
|
|
36
|
+
git clone https://github.com/uw-maccosslab/proteomics-toolkit.git
|
|
37
|
+
cd proteomics-toolkit
|
|
38
|
+
pip install -e .
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
### PRISM Workflow (recommended for batch-corrected data)
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import proteomics_toolkit as ptk
|
|
47
|
+
import pandas as pd
|
|
48
|
+
|
|
49
|
+
# 1. Load PRISM data
|
|
50
|
+
protein_data, metadata, sample_cols = ptk.load_prism_data(
|
|
51
|
+
'PRISM-Output/corrected_proteins.parquet',
|
|
52
|
+
'PRISM-Output/sample_metadata.csv',
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# 2. Map batch-suffixed column names to short replicate IDs
|
|
56
|
+
col_map = ptk.strip_batch_suffix(sample_cols) # {full_col: short_name}
|
|
57
|
+
short_to_col = {v: k for k, v in col_map.items()}
|
|
58
|
+
|
|
59
|
+
# 3. Build sample metadata dict (keys = full PRISM column names)
|
|
60
|
+
meta_dict = {}
|
|
61
|
+
for _, row in metadata.iterrows():
|
|
62
|
+
full_col = short_to_col.get(row['Replicate'])
|
|
63
|
+
if full_col:
|
|
64
|
+
meta_dict[full_col] = row.to_dict()
|
|
65
|
+
|
|
66
|
+
# 4. Filter low-confidence proteins
|
|
67
|
+
protein_data_filtered = protein_data[~protein_data['low_confidence']].copy()
|
|
68
|
+
|
|
69
|
+
# 5. Build annotation + sample data for stats
|
|
70
|
+
annot = protein_data_filtered[[
|
|
71
|
+
'leading_protein', 'leading_description', 'leading_gene_name',
|
|
72
|
+
'leading_uniprot_id', 'leading_name'
|
|
73
|
+
]].copy()
|
|
74
|
+
annot.columns = ['Protein', 'Description', 'Protein Gene', 'UniProt_Accession', 'UniProt_Entry_Name']
|
|
75
|
+
data = pd.concat([annot.reset_index(drop=True),
|
|
76
|
+
protein_data_filtered[sample_cols].reset_index(drop=True)], axis=1)
|
|
77
|
+
data.index = data['Protein'] # accession as index
|
|
78
|
+
|
|
79
|
+
# 6. Statistical analysis
|
|
80
|
+
config = ptk.StatisticalConfig()
|
|
81
|
+
config.analysis_type = 'unpaired'
|
|
82
|
+
config.statistical_test_method = 'welch_t'
|
|
83
|
+
config.group_column = 'Group'
|
|
84
|
+
config.group_labels = ['Control', 'Treatment'] # [reference, study]
|
|
85
|
+
config.correction_method = 'fdr_bh'
|
|
86
|
+
config.p_value_threshold = 0.05
|
|
87
|
+
config.fold_change_threshold = 1.0
|
|
88
|
+
config.log_transform_before_stats = True
|
|
89
|
+
config.validate()
|
|
90
|
+
|
|
91
|
+
results = ptk.run_comprehensive_statistical_analysis(
|
|
92
|
+
data, meta_dict, config, protein_annotations=annot
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
# 7. Visualization
|
|
96
|
+
ptk.plot_volcano(results, fc_threshold=1.0, gene_column='Protein Gene', label_top_n=15)
|
|
97
|
+
ptk.display_analysis_summary(results, config)
|
|
98
|
+
|
|
99
|
+
# 8. Enrichment
|
|
100
|
+
enrich_config = ptk.EnrichmentConfig(
|
|
101
|
+
enrichr_libraries=['GO_Biological_Process_2023', 'KEGG_2021_Human'],
|
|
102
|
+
pvalue_cutoff=0.05,
|
|
103
|
+
)
|
|
104
|
+
enrich = ptk.run_differential_enrichment(
|
|
105
|
+
results, gene_column='Protein Gene', logfc_column='logFC',
|
|
106
|
+
pvalue_column='adj.P.Val', config=enrich_config,
|
|
107
|
+
)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Skyline CSV Workflow
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
# 1. Load data
|
|
114
|
+
protein_data, metadata, peptide_data = ptk.load_skyline_data(
|
|
115
|
+
protein_file='protein_quant.csv',
|
|
116
|
+
metadata_file='metadata.csv',
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
# 2. Process sample names
|
|
120
|
+
sample_columns = ptk.data_import.identify_sample_columns(protein_data, metadata)
|
|
121
|
+
cleaned_names = ptk.clean_sample_names(sample_columns)
|
|
122
|
+
|
|
123
|
+
# 3. Parse annotations and filter
|
|
124
|
+
processed_data = ptk.parse_protein_identifiers(protein_data)
|
|
125
|
+
|
|
126
|
+
# 4. Normalize (skip for PRISM — already normalized)
|
|
127
|
+
normalized = ptk.median_normalize(processed_data, sample_columns=list(cleaned_names.values()))
|
|
128
|
+
|
|
129
|
+
# 5. QC plots
|
|
130
|
+
ptk.plot_box_plot(normalized, list(cleaned_names.values()), sample_metadata)
|
|
131
|
+
ptk.plot_pca(normalized, list(cleaned_names.values()), sample_metadata)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Statistical Analysis
|
|
135
|
+
|
|
136
|
+
All statistical analyses use `StatisticalConfig` + `run_comprehensive_statistical_analysis()`.
|
|
137
|
+
|
|
138
|
+
### Unpaired comparison (two independent groups)
|
|
139
|
+
```python
|
|
140
|
+
config = ptk.StatisticalConfig()
|
|
141
|
+
config.analysis_type = 'unpaired'
|
|
142
|
+
config.statistical_test_method = 'welch_t' # or 'mann_whitney'
|
|
143
|
+
config.group_column = 'Group'
|
|
144
|
+
config.group_labels = ['Control', 'Treatment']
|
|
145
|
+
config.log_transform_before_stats = 'auto'
|
|
146
|
+
config.validate()
|
|
147
|
+
|
|
148
|
+
results = ptk.run_comprehensive_statistical_analysis(
|
|
149
|
+
data, sample_metadata, config, protein_annotations=annot
|
|
150
|
+
)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Paired comparison (before/after per subject)
|
|
154
|
+
```python
|
|
155
|
+
config = ptk.StatisticalConfig()
|
|
156
|
+
config.analysis_type = 'paired'
|
|
157
|
+
config.statistical_test_method = 'paired_t'
|
|
158
|
+
config.subject_column = 'Subject'
|
|
159
|
+
config.paired_column = 'Condition'
|
|
160
|
+
config.paired_label1 = 'Before'
|
|
161
|
+
config.paired_label2 = 'After'
|
|
162
|
+
config.group_column = 'Condition'
|
|
163
|
+
config.group_labels = ['Before', 'After']
|
|
164
|
+
config.validate()
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Mixed-effects model (repeated measures)
|
|
168
|
+
```python
|
|
169
|
+
config = ptk.StatisticalConfig()
|
|
170
|
+
config.analysis_type = 'paired'
|
|
171
|
+
config.statistical_test_method = 'mixed_effects'
|
|
172
|
+
config.subject_column = 'Subject'
|
|
173
|
+
config.paired_column = 'Visit'
|
|
174
|
+
config.paired_label1 = 'Baseline'
|
|
175
|
+
config.paired_label2 = 'Follow-up'
|
|
176
|
+
config.group_column = 'Treatment'
|
|
177
|
+
config.group_labels = ['Placebo', 'Drug']
|
|
178
|
+
config.interaction_terms = ['Treatment', 'Visit']
|
|
179
|
+
config.validate()
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
**Output columns:** `Protein`, `logFC`, `P.Value`, `adj.P.Val`, `AveExpr`, `t`, `Protein Gene`, `Description`, `UniProt_Accession`, `Gene`
|
|
183
|
+
|
|
184
|
+
## Enrichment
|
|
185
|
+
|
|
186
|
+
Enrichment results use these column names (not the Enrichr web-UI names):
|
|
187
|
+
|
|
188
|
+
| Column | Description |
|
|
189
|
+
|---|---|
|
|
190
|
+
| `Term` | Pathway / GO term name |
|
|
191
|
+
| `P_Value` | Unadjusted p-value |
|
|
192
|
+
| `Adj_P_Value` | BH-adjusted p-value |
|
|
193
|
+
| `Z_Score` | Enrichr z-score |
|
|
194
|
+
| `Combined_Score` | log(p) × z — used for ranking |
|
|
195
|
+
| `Genes` | Semicolon-separated gene list |
|
|
196
|
+
| `N_Genes` | Number of overlapping genes |
|
|
197
|
+
| `Library` | Source Enrichr library |
|
|
198
|
+
|
|
199
|
+
## Dependencies
|
|
200
|
+
|
|
201
|
+
- pandas >= 2.0
|
|
202
|
+
- numpy >= 1.24
|
|
203
|
+
- scipy >= 1.10
|
|
204
|
+
- matplotlib >= 3.7
|
|
205
|
+
- seaborn >= 0.12
|
|
206
|
+
- scikit-learn >= 1.3
|
|
207
|
+
- statsmodels >= 0.14
|
|
208
|
+
- requests >= 2.25 (for Enrichr API)
|
|
209
|
+
- pyarrow >= 12.0 (for PRISM parquet files)
|
|
210
|
+
|
|
211
|
+
## Module Reference
|
|
212
|
+
|
|
213
|
+
### data_import.py
|
|
214
|
+
- `load_skyline_data()` — Load Skyline protein/peptide CSVs + metadata
|
|
215
|
+
- `load_prism_data()` — Load PRISM parquet + metadata
|
|
216
|
+
- `identify_sample_columns()` — Auto-detect sample columns
|
|
217
|
+
- `clean_sample_names()` — Remove common prefixes/suffixes
|
|
218
|
+
- `detect_batch_suffix()` — Detect PRISM `__@__` batch suffix
|
|
219
|
+
- `strip_batch_suffix()` — Map batch-suffixed names → short names
|
|
220
|
+
- `create_sample_column_mapping()` — Map data columns to metadata sample names
|
|
221
|
+
- `match_samples_to_metadata()` — Link samples to metadata rows
|
|
222
|
+
- `BATCH_SUFFIX_DELIMITER` — Constant: `"__@__"`
|
|
223
|
+
|
|
224
|
+
### preprocessing.py
|
|
225
|
+
- `parse_protein_identifiers()` — Extract UniProt accessions and databases
|
|
226
|
+
- `parse_gene_and_description()` — Parse gene names from descriptions
|
|
227
|
+
- `classify_samples()` — Classify samples into groups / controls with color assignment
|
|
228
|
+
- `apply_systematic_color_scheme()` — Generate consistent group colors
|
|
229
|
+
- `create_standard_data_structure()` — Build standard 5-column annotation + sample layout
|
|
230
|
+
- `assess_data_completeness()` — Evaluate missing data patterns
|
|
231
|
+
- `filter_proteins_by_completeness()` — Remove proteins below detection threshold
|
|
232
|
+
- `calculate_group_colors()` — Generate group color mapping
|
|
233
|
+
- `identify_annotation_columns()` — Auto-detect annotation vs sample columns
|
|
234
|
+
|
|
235
|
+
### normalization.py
|
|
236
|
+
- `median_normalize()` — Median-based normalization (preserves original scale)
|
|
237
|
+
- `vsn_normalize()` — Variance Stabilizing Normalization (arcsinh-transformed)
|
|
238
|
+
- `quantile_normalize()` — Force identical distributions
|
|
239
|
+
- `mad_normalize()` — Median absolute deviation normalization
|
|
240
|
+
- `z_score_normalize()` — Standardize to mean=0, sd=1
|
|
241
|
+
- `rlr_normalize()` — Robust linear regression (log2-transformed)
|
|
242
|
+
- `loess_normalize()` — LOESS intensity-dependent (log2-transformed)
|
|
243
|
+
- `handle_negative_values()` — Handle negative values from VSN
|
|
244
|
+
- `analyze_negative_values()` — Analyze negative value patterns
|
|
245
|
+
- `calculate_normalization_stats()` — Evaluate normalization effectiveness
|
|
246
|
+
|
|
247
|
+
### statistical_analysis.py
|
|
248
|
+
- `StatisticalConfig` — Configuration class (zero-arg constructor, set attributes individually)
|
|
249
|
+
- `run_comprehensive_statistical_analysis()` — Main analysis entry point
|
|
250
|
+
- `display_analysis_summary()` — Print/return summary of results
|
|
251
|
+
- `run_statistical_analysis()` — Backward-compatible wrapper
|
|
252
|
+
|
|
253
|
+
### visualization.py
|
|
254
|
+
- `plot_box_plot()` — Sample intensity distributions by group
|
|
255
|
+
- `plot_volcano()` — Volcano plot with labeled top hits
|
|
256
|
+
- `plot_pca()` — PCA with group coloring, optional log-transform
|
|
257
|
+
- `plot_comparative_pca()` — Compare PCA across normalization methods
|
|
258
|
+
- `plot_normalization_comparison()` — Before/after normalization QC
|
|
259
|
+
- `plot_sample_correlation_heatmap()` — Full correlation matrix
|
|
260
|
+
- `plot_sample_correlation_triangular_heatmap()` — Lower-triangle correlation
|
|
261
|
+
- `plot_control_correlation()` — Control sample correlation with optional clustering
|
|
262
|
+
- `plot_control_correlation_analysis()` — Multi-panel control QC
|
|
263
|
+
- `plot_control_group_correlation_analysis()` — Group-wise control QC
|
|
264
|
+
- `plot_individual_control_pool_analysis()` — Individual control analysis
|
|
265
|
+
- `plot_control_cv_distribution()` — CV distribution for control samples
|
|
266
|
+
- `plot_grouped_heatmap()` — Heatmap for any grouped data
|
|
267
|
+
- `plot_grouped_trajectories()` — Line plots for temporal/dose-response data
|
|
268
|
+
- `plot_protein_profile()` — Single protein expression profile
|
|
269
|
+
|
|
270
|
+
### enrichment.py
|
|
271
|
+
- `EnrichmentConfig` — Configuration dataclass (libraries, thresholds, API settings)
|
|
272
|
+
- `query_enrichr()` — Query Enrichr API with a gene list
|
|
273
|
+
- `parse_enrichr_results()` — Parse raw results into a tidy DataFrame
|
|
274
|
+
- `run_enrichment_analysis()` — Complete enrichment on a gene list
|
|
275
|
+
- `run_enrichment_by_group()` — Enrichment for each group in a DataFrame
|
|
276
|
+
- `run_differential_enrichment()` — Split by up/down-regulated, run enrichment on each
|
|
277
|
+
- `plot_enrichment_barplot()` — Horizontal bar plot by Combined Score
|
|
278
|
+
- `plot_enrichment_comparison()` — Dot plot comparing enrichment across groups
|
|
279
|
+
- `get_available_libraries()` — List common Enrichr libraries
|
|
280
|
+
- `merge_enrichment_results()` — Merge multiple enrichment DataFrames
|
|
281
|
+
|
|
282
|
+
### temporal_clustering.py
|
|
283
|
+
- `TemporalClusteringConfig` — Configuration dataclass
|
|
284
|
+
- `run_temporal_analysis()` — Complete pipeline: clustering → visualization → enrichment
|
|
285
|
+
- `calculate_temporal_means()` — Mean abundance per timepoint across subjects
|
|
286
|
+
- `cluster_temporal_trends()` — K-means or hierarchical clustering
|
|
287
|
+
- `name_clusters_by_pattern()` — Assign descriptive cluster names
|
|
288
|
+
- `classify_trend_pattern()` — Classify individual protein trends
|
|
289
|
+
- `merge_with_statistics()` — Merge temporal data with statistical results
|
|
290
|
+
- `filter_significant_proteins()` — Filter to significant proteins
|
|
291
|
+
- `run_enrichment_by_cluster()` — Enrichment per cluster
|
|
292
|
+
- `plot_cluster_heatmap()` — Cluster-organized heatmap
|
|
293
|
+
- `plot_cluster_parallel_coordinates()` — Parallel coordinate plots
|
|
294
|
+
|
|
295
|
+
### validation.py
|
|
296
|
+
- `validate_metadata_data_consistency()` — Check metadata matches data columns
|
|
297
|
+
- `enhanced_sample_processing()` — Sample processing with validation
|
|
298
|
+
- `generate_sample_matching_diagnostic_report()` — Detailed mismatch diagnostics
|
|
299
|
+
- `SampleMatchingError` — Exception for sample matching failures
|
|
300
|
+
- `ControlSampleError` — Exception for control sample configuration issues
|
|
301
|
+
|
|
302
|
+
### export.py
|
|
303
|
+
- `export_complete_analysis()` — Full export: data + config + results
|
|
304
|
+
- `export_analysis_results()` — Export normalized data + differential results
|
|
305
|
+
- `export_timestamped_config()` — Save analysis config with timestamp
|
|
306
|
+
- `create_config_dict_from_notebook_vars()` — Build config dict from notebook variables
|
|
307
|
+
- `export_significant_proteins_summary()` — Export significant results summary
|
|
308
|
+
- `export_results()` — General-purpose result export
|
|
309
|
+
|
|
310
|
+
## See Also
|
|
311
|
+
|
|
312
|
+
- [Usage Guide](docs/guide.md) — Detailed recipe book with usage patterns
|
|
313
|
+
- [CLAUDE.md](../CLAUDE.md) — Project conventions and data prep patterns
|