masster 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +2 -0
- masster/_version.py +1 -1
- masster/study/__init__.py +1 -0
- masster/study/defaults/find_consensus_def.py +1 -1
- masster/study/defaults/merge_def.py +69 -25
- masster/study/h5.py +65 -106
- masster/study/id.py +1 -1
- masster/study/load.py +11 -6
- masster/study/merge.py +1607 -0
- masster/study/processing.py +0 -902
- masster/study/save.py +1 -1
- masster/study/study.py +28 -31
- masster/wizard/README.md +373 -0
- masster/wizard/__init__.py +11 -0
- masster/wizard/example.py +223 -0
- masster/wizard/test_structure.py +49 -0
- masster/wizard/test_wizard.py +285 -0
- masster/wizard/wizard.py +1175 -0
- masster/wizard.py +1175 -0
- {masster-0.4.14.dist-info → masster-0.4.16.dist-info}/METADATA +1 -1
- {masster-0.4.14.dist-info → masster-0.4.16.dist-info}/RECORD +24 -16
- {masster-0.4.14.dist-info → masster-0.4.16.dist-info}/WHEEL +0 -0
- {masster-0.4.14.dist-info → masster-0.4.16.dist-info}/entry_points.txt +0 -0
- {masster-0.4.14.dist-info → masster-0.4.16.dist-info}/licenses/LICENSE +0 -0
masster/study/save.py
CHANGED
|
@@ -53,7 +53,7 @@ def save(self, filename=None, add_timestamp=True, compress=False):
|
|
|
53
53
|
if hasattr(self, "samples_df") and not self.samples_df.is_empty()
|
|
54
54
|
else 0
|
|
55
55
|
)
|
|
56
|
-
self.logger.
|
|
56
|
+
self.logger.debug(
|
|
57
57
|
f"Saving study with {sample_count} samples and {feature_count} features to {filename}",
|
|
58
58
|
)
|
|
59
59
|
|
masster/study/study.py
CHANGED
|
@@ -116,16 +116,13 @@ from masster.study.plot import plot_tic
|
|
|
116
116
|
from masster.study.plot import plot_eic
|
|
117
117
|
from masster.study.plot import plot_rt_correction
|
|
118
118
|
from masster.study.processing import align
|
|
119
|
-
from masster.study.
|
|
119
|
+
from masster.study.merge import merge
|
|
120
|
+
from masster.study.merge import _reset_consensus_data
|
|
121
|
+
from masster.study.merge import _extract_consensus_features
|
|
122
|
+
from masster.study.merge import _perform_adduct_grouping
|
|
123
|
+
from masster.study.merge import _finalize_merge
|
|
120
124
|
from masster.study.processing import integrate
|
|
121
125
|
from masster.study.processing import find_ms2
|
|
122
|
-
from masster.study.processing import _reset_consensus_data
|
|
123
|
-
from masster.study.processing import _process_merge_parameters
|
|
124
|
-
from masster.study.processing import _validate_merge_inputs
|
|
125
|
-
from masster.study.processing import _perform_feature_grouping
|
|
126
|
-
from masster.study.processing import _extract_consensus_features
|
|
127
|
-
from masster.study.processing import _perform_adduct_grouping
|
|
128
|
-
from masster.study.processing import _finalize_merge
|
|
129
126
|
from masster.study.parameters import store_history
|
|
130
127
|
from masster.study.parameters import get_parameters
|
|
131
128
|
from masster.study.parameters import update_parameters
|
|
@@ -153,7 +150,6 @@ from masster.study.defaults.align_def import align_defaults
|
|
|
153
150
|
from masster.study.defaults.export_def import export_mgf_defaults
|
|
154
151
|
from masster.study.defaults.fill_chrom_def import fill_chrom_defaults
|
|
155
152
|
from masster.study.defaults.fill_def import fill_defaults
|
|
156
|
-
from masster.study.defaults.find_consensus_def import find_consensus_defaults
|
|
157
153
|
from masster.study.defaults.find_ms2_def import find_ms2_defaults
|
|
158
154
|
from masster.study.defaults.integrate_chrom_def import integrate_chrom_defaults
|
|
159
155
|
from masster.study.defaults.integrate_def import integrate_defaults
|
|
@@ -392,7 +388,6 @@ class Study:
|
|
|
392
388
|
fill_chrom = fill # Backward compatibility alias
|
|
393
389
|
fill_single = fill_single
|
|
394
390
|
fill_chrom_single = fill_single # Backward compatibility alias
|
|
395
|
-
|
|
396
391
|
# === Data Retrieval and Access ===
|
|
397
392
|
get_consensus = get_consensus
|
|
398
393
|
get_chrom = get_chrom
|
|
@@ -499,9 +494,6 @@ class Study:
|
|
|
499
494
|
|
|
500
495
|
# === Merge Helper Methods ===
|
|
501
496
|
_reset_consensus_data = _reset_consensus_data
|
|
502
|
-
_process_merge_parameters = _process_merge_parameters
|
|
503
|
-
_validate_merge_inputs = _validate_merge_inputs
|
|
504
|
-
_perform_feature_grouping = _perform_feature_grouping
|
|
505
497
|
_extract_consensus_features = _extract_consensus_features
|
|
506
498
|
_perform_adduct_grouping = _perform_adduct_grouping
|
|
507
499
|
_finalize_merge = _finalize_merge
|
|
@@ -512,7 +504,6 @@ class Study:
|
|
|
512
504
|
export_mgf_defaults = export_mgf_defaults
|
|
513
505
|
fill_chrom_defaults = fill_chrom_defaults
|
|
514
506
|
fill_defaults = fill_defaults
|
|
515
|
-
find_consensus_defaults = find_consensus_defaults
|
|
516
507
|
find_ms2_defaults = find_ms2_defaults
|
|
517
508
|
integrate_chrom_defaults = integrate_chrom_defaults
|
|
518
509
|
integrate_defaults = integrate_defaults
|
|
@@ -693,11 +684,13 @@ class Study:
|
|
|
693
684
|
samples count, and various statistics. Shows warning symbols for values
|
|
694
685
|
that are out of normal range.
|
|
695
686
|
"""
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
687
|
+
if self.consensus_df is None or self.consensus_df.is_empty():
|
|
688
|
+
self.consensus_df = pl.DataFrame()
|
|
689
|
+
consensus_df_len = 0
|
|
690
|
+
else:
|
|
691
|
+
consensus_df_len = len(self.consensus_df)
|
|
692
|
+
|
|
693
|
+
samples_df_len = len(self.samples_df) if (self.samples_df is not None and not self.samples_df.is_empty()) else 0
|
|
701
694
|
|
|
702
695
|
# Calculate consensus statistics only if consensus_df exists and has data
|
|
703
696
|
if consensus_df_len > 0:
|
|
@@ -719,7 +712,8 @@ class Study:
|
|
|
719
712
|
max_samples = 0
|
|
720
713
|
|
|
721
714
|
# Count only features where 'filled' == False
|
|
722
|
-
if not self.features_df.is_empty() and
|
|
715
|
+
if (self.features_df is not None and not self.features_df.is_empty() and
|
|
716
|
+
"filled" in self.features_df.columns):
|
|
723
717
|
unfilled_features_count = self.features_df.filter(
|
|
724
718
|
~self.features_df["filled"],
|
|
725
719
|
).height
|
|
@@ -727,7 +721,8 @@ class Study:
|
|
|
727
721
|
unfilled_features_count = 0
|
|
728
722
|
|
|
729
723
|
# Calculate features in consensus vs not in consensus (only for unfilled features)
|
|
730
|
-
if
|
|
724
|
+
if (self.features_df is not None and not self.features_df.is_empty() and
|
|
725
|
+
self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty()):
|
|
731
726
|
# Get unfilled features only
|
|
732
727
|
unfilled_features = (
|
|
733
728
|
self.features_df.filter(~self.features_df["filled"])
|
|
@@ -781,6 +776,7 @@ class Study:
|
|
|
781
776
|
if (
|
|
782
777
|
consensus_df_len > 0
|
|
783
778
|
and samples_df_len > 0
|
|
779
|
+
and self.features_df is not None
|
|
784
780
|
and not self.features_df.is_empty()
|
|
785
781
|
):
|
|
786
782
|
# Ensure matching data types for join keys
|
|
@@ -806,7 +802,8 @@ class Study:
|
|
|
806
802
|
)
|
|
807
803
|
|
|
808
804
|
# Use more efficient counting - count non-null chroms only for features in consensus mapping
|
|
809
|
-
if
|
|
805
|
+
if (self.consensus_mapping_df is not None and
|
|
806
|
+
not self.consensus_mapping_df.is_empty()):
|
|
810
807
|
non_null_chroms = (
|
|
811
808
|
self.features_df.join(
|
|
812
809
|
self.consensus_mapping_df.select("feature_uid"),
|
|
@@ -828,16 +825,16 @@ class Study:
|
|
|
828
825
|
chrom_completeness = 0
|
|
829
826
|
|
|
830
827
|
# Calculate consensus features with MS2 (count unique consensus_uids with MS2)
|
|
831
|
-
if not self.consensus_ms2.is_empty():
|
|
828
|
+
if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
|
|
832
829
|
consensus_with_ms2_count = self.consensus_ms2["consensus_uid"].n_unique()
|
|
833
830
|
else:
|
|
834
831
|
consensus_with_ms2_count = 0
|
|
835
832
|
|
|
836
|
-
if not self.consensus_df.is_empty():
|
|
833
|
+
if self.consensus_df is not None and not self.consensus_df.is_empty():
|
|
837
834
|
# Compute RT spread using only consensus rows with number_samples >= half the number of samples
|
|
838
835
|
threshold = (
|
|
839
836
|
self.consensus_df.select(pl.col("number_samples").max()).item() / 2
|
|
840
|
-
if not self.samples_df.is_empty()
|
|
837
|
+
if (self.samples_df is not None and not self.samples_df.is_empty())
|
|
841
838
|
else 0
|
|
842
839
|
)
|
|
843
840
|
filtered = self.consensus_df.filter(pl.col("number_samples") >= threshold)
|
|
@@ -864,16 +861,16 @@ class Study:
|
|
|
864
861
|
|
|
865
862
|
# Total MS2 spectra count
|
|
866
863
|
total_ms2_count = (
|
|
867
|
-
len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
|
|
864
|
+
len(self.consensus_ms2) if (self.consensus_ms2 is not None and not self.consensus_ms2.is_empty()) else 0
|
|
868
865
|
)
|
|
869
866
|
|
|
870
867
|
# Estimate memory usage
|
|
871
868
|
memory_usage = (
|
|
872
|
-
self.samples_df.estimated_size()
|
|
873
|
-
+ self.features_df.estimated_size()
|
|
874
|
-
+ self.consensus_df.estimated_size()
|
|
875
|
-
+ self.consensus_ms2.estimated_size()
|
|
876
|
-
+ self.consensus_mapping_df.estimated_size()
|
|
869
|
+
(self.samples_df.estimated_size() if self.samples_df is not None else 0)
|
|
870
|
+
+ (self.features_df.estimated_size() if self.features_df is not None else 0)
|
|
871
|
+
+ (self.consensus_df.estimated_size() if self.consensus_df is not None else 0)
|
|
872
|
+
+ (self.consensus_ms2.estimated_size() if self.consensus_ms2 is not None else 0)
|
|
873
|
+
+ (self.consensus_mapping_df.estimated_size() if self.consensus_mapping_df is not None else 0)
|
|
877
874
|
)
|
|
878
875
|
|
|
879
876
|
# Add warning symbols for out-of-range values
|
masster/wizard/README.md
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
# Wizard Class Documentation
|
|
2
|
+
|
|
3
|
+
The `Wizard` class provides comprehensive automation for mass spectrometry data processing, from raw files to final study results. It handles the complete workflow with minimal user intervention while providing intelligent resume capabilities, parallel processing optimization, and adaptive output formats.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
### Basic Usage
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from masster import Wizard
|
|
11
|
+
|
|
12
|
+
# Create wizard with minimal configuration
|
|
13
|
+
wizard = Wizard(
|
|
14
|
+
data_source="./raw_data", # Directory with raw files
|
|
15
|
+
study_folder="./processed", # Output directory
|
|
16
|
+
polarity="positive", # or "negative"
|
|
17
|
+
num_cores=4 # CPU cores to use
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Run complete pipeline
|
|
21
|
+
success = wizard.run_full_pipeline()
|
|
22
|
+
|
|
23
|
+
if success:
|
|
24
|
+
wizard.info() # Print summary
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### Advanced Configuration
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from masster import Wizard, wizard_def
|
|
31
|
+
|
|
32
|
+
# Create custom parameters
|
|
33
|
+
params = wizard_def(
|
|
34
|
+
data_source="./raw_data",
|
|
35
|
+
study_folder="./processed_advanced",
|
|
36
|
+
polarity="negative",
|
|
37
|
+
num_cores=8,
|
|
38
|
+
|
|
39
|
+
# File discovery
|
|
40
|
+
file_extensions=[".wiff", ".raw", ".mzML"],
|
|
41
|
+
search_subfolders=True,
|
|
42
|
+
skip_patterns=["blank", "QC", "test"],
|
|
43
|
+
|
|
44
|
+
# Processing parameters
|
|
45
|
+
adducts=["H-1:-:0.95", "Cl:-:0.05", "CH2O2:0:0.2"],
|
|
46
|
+
chrom_fwhm=0.15,
|
|
47
|
+
noise_threshold=5e4,
|
|
48
|
+
|
|
49
|
+
# Study assembly
|
|
50
|
+
rt_tolerance=1.0,
|
|
51
|
+
mz_tolerance=0.008,
|
|
52
|
+
min_samples_for_merge=30,
|
|
53
|
+
|
|
54
|
+
# Output options
|
|
55
|
+
export_formats=["csv", "xlsx", "mgf", "parquet"],
|
|
56
|
+
generate_plots=True,
|
|
57
|
+
compress_output=True,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
wizard = Wizard(params=params)
|
|
61
|
+
wizard.run_full_pipeline()
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Key Features
|
|
65
|
+
|
|
66
|
+
### 🔄 Automated Pipeline
|
|
67
|
+
- **Raw Data Discovery**: Automatically finds and validates raw MS files
|
|
68
|
+
- **Batch Conversion**: Parallel conversion to sample5 format with optimized parameters
|
|
69
|
+
- **Study Assembly**: Creates study from processed samples with quality filtering
|
|
70
|
+
- **Feature Alignment**: Cross-sample alignment using configurable algorithms
|
|
71
|
+
- **Consensus Generation**: Merges aligned features with statistical validation
|
|
72
|
+
- **Results Export**: Multiple output formats for downstream analysis
|
|
73
|
+
|
|
74
|
+
### 💾 Intelligent Resume
|
|
75
|
+
- **Checkpoint System**: Automatically saves progress at key points
|
|
76
|
+
- **File Tracking**: Remembers which files have been processed successfully
|
|
77
|
+
- **Smart Recovery**: Resumes from last successful step after interruption
|
|
78
|
+
- **Validation**: Verifies existing outputs before skipping
|
|
79
|
+
|
|
80
|
+
### ⚡ Performance Optimization
|
|
81
|
+
- **Parallel Processing**: Utilizes multiple CPU cores efficiently
|
|
82
|
+
- **Memory Management**: Adaptive batch sizing based on available memory
|
|
83
|
+
- **Process Isolation**: Prevents memory leaks in long-running jobs
|
|
84
|
+
- **Adaptive Compression**: Optimizes output format based on study size
|
|
85
|
+
|
|
86
|
+
### 📊 Comprehensive Logging
|
|
87
|
+
- **Progress Tracking**: Real-time status updates with time estimates
|
|
88
|
+
- **Detailed Logs**: Complete processing history saved to files
|
|
89
|
+
- **Error Reporting**: Clear error messages with recovery suggestions
|
|
90
|
+
- **Performance Metrics**: Processing times and resource usage statistics
|
|
91
|
+
|
|
92
|
+
## Pipeline Steps
|
|
93
|
+
|
|
94
|
+
### 1. File Discovery
|
|
95
|
+
- Searches for raw MS files (`.wiff`, `.raw`, `.mzML`, `.d`)
|
|
96
|
+
- Applies skip patterns to exclude unwanted files
|
|
97
|
+
- Validates file integrity and accessibility
|
|
98
|
+
- Reports file sizes and estimates processing time
|
|
99
|
+
|
|
100
|
+
### 2. Sample5 Conversion
|
|
101
|
+
- **Feature Detection**: Two-pass algorithm with configurable parameters
|
|
102
|
+
- **Adduct Detection**: Automated adduct grouping based on polarity
|
|
103
|
+
- **MS2 Linking**: Associates fragmentation spectra with features
|
|
104
|
+
- **Quality Control**: Validates outputs and reports statistics
|
|
105
|
+
- **Parallel Processing**: Utilizes multiple CPU cores with batch optimization
|
|
106
|
+
|
|
107
|
+
### 3. Study Assembly
|
|
108
|
+
- **Sample Loading**: Imports all processed sample5 files
|
|
109
|
+
- **Quality Filtering**: Removes low-quality features based on coherence/prominence
|
|
110
|
+
- **Metadata Organization**: Organizes sample information and experimental design
|
|
111
|
+
- **Memory Optimization**: Efficient data structures for large studies
|
|
112
|
+
|
|
113
|
+
### 4. Feature Alignment
|
|
114
|
+
- **RT Alignment**: Corrects retention time shifts between samples
|
|
115
|
+
- **Mass Alignment**: Accounts for mass calibration differences
|
|
116
|
+
- **Algorithm Selection**: Supports KD-tree, QT-clustering, and chunked methods
|
|
117
|
+
- **Validation**: Reports alignment statistics and quality metrics
|
|
118
|
+
|
|
119
|
+
### 5. Consensus Generation
|
|
120
|
+
- **Feature Merging**: Groups aligned features into consensus features
|
|
121
|
+
- **Statistical Validation**: Applies minimum sample requirements
|
|
122
|
+
- **Gap Filling**: Extracts chromatograms for missing values
|
|
123
|
+
- **MS2 Integration**: Links consensus features to MS2 spectra
|
|
124
|
+
|
|
125
|
+
### 6. Visualization & Export
|
|
126
|
+
- **Interactive Plots**: 2D feature maps, PCA plots, alignment visualizations
|
|
127
|
+
- **Multiple Formats**: CSV, Excel, MGF, Parquet exports
|
|
128
|
+
- **Study Archival**: Compressed study5 format for long-term storage
|
|
129
|
+
- **Metadata Export**: Complete processing parameters and statistics
|
|
130
|
+
|
|
131
|
+
## Configuration Options
|
|
132
|
+
|
|
133
|
+
### Core Parameters
|
|
134
|
+
|
|
135
|
+
| Parameter | Type | Default | Description |
|
|
136
|
+
|-----------|------|---------|-------------|
|
|
137
|
+
| `data_source` | str | **required** | Directory containing raw data files |
|
|
138
|
+
| `study_folder` | str | **required** | Output directory for processed study |
|
|
139
|
+
| `polarity` | str | `"positive"` | Ion polarity mode (`"positive"` or `"negative"`) |
|
|
140
|
+
| `num_cores` | int | `4` | Number of CPU cores for parallel processing |
|
|
141
|
+
| `adducts` | List[str] | auto-set | Adduct specifications (set based on polarity) |
|
|
142
|
+
|
|
143
|
+
### File Discovery
|
|
144
|
+
|
|
145
|
+
| Parameter | Type | Default | Description |
|
|
146
|
+
|-----------|------|---------|-------------|
|
|
147
|
+
| `file_extensions` | List[str] | `[".wiff", ".raw", ".mzML", ".d"]` | File types to search for |
|
|
148
|
+
| `search_subfolders` | bool | `True` | Search subdirectories recursively |
|
|
149
|
+
| `skip_patterns` | List[str] | `["blank", "QC", "test"]` | Filename patterns to skip |
|
|
150
|
+
| `max_file_size_gb` | float | `4.0` | Maximum file size warning threshold |
|
|
151
|
+
|
|
152
|
+
### Processing Parameters
|
|
153
|
+
|
|
154
|
+
| Parameter | Type | Default | Description |
|
|
155
|
+
|-----------|------|---------|-------------|
|
|
156
|
+
| `batch_size` | int | `8` | Files processed per batch |
|
|
157
|
+
| `memory_limit_gb` | float | `16.0` | Memory usage limit |
|
|
158
|
+
| `chrom_fwhm` | float | `0.2` | Expected chromatographic peak width (s) |
|
|
159
|
+
| `noise_threshold` | float | `1e5` | Intensity threshold for peak detection |
|
|
160
|
+
| `chrom_peak_snr` | float | `5.0` | Signal-to-noise ratio requirement |
|
|
161
|
+
| `tol_ppm` | float | `10.0` | Mass tolerance (ppm) |
|
|
162
|
+
|
|
163
|
+
### Study Assembly
|
|
164
|
+
|
|
165
|
+
| Parameter | Type | Default | Description |
|
|
166
|
+
|-----------|------|---------|-------------|
|
|
167
|
+
| `rt_tolerance` | float | `1.5` | RT tolerance for alignment (seconds) |
|
|
168
|
+
| `mz_tolerance` | float | `0.01` | m/z tolerance for alignment (Da) |
|
|
169
|
+
| `alignment_algorithm` | str | `"kd"` | Alignment algorithm (`"kd"`, `"qt"`, `"chunked"`) |
|
|
170
|
+
| `merge_method` | str | `"chunked"` | Merge algorithm for consensus generation |
|
|
171
|
+
| `min_samples_for_merge` | int | `50` | Minimum samples required for consensus |
|
|
172
|
+
|
|
173
|
+
### Output & Logging
|
|
174
|
+
|
|
175
|
+
| Parameter | Type | Default | Description |
|
|
176
|
+
|-----------|------|---------|-------------|
|
|
177
|
+
| `generate_plots` | bool | `True` | Generate visualization plots |
|
|
178
|
+
| `export_formats` | List[str] | `["csv", "mgf", "xlsx"]` | Output formats to generate |
|
|
179
|
+
| `compress_output` | bool | `True` | Compress final study file |
|
|
180
|
+
| `adaptive_compression` | bool | `True` | Adapt compression based on study size |
|
|
181
|
+
| `log_level` | str | `"INFO"` | Logging detail level |
|
|
182
|
+
| `log_to_file` | bool | `True` | Save logs to file |
|
|
183
|
+
|
|
184
|
+
### Resume & Recovery
|
|
185
|
+
|
|
186
|
+
| Parameter | Type | Default | Description |
|
|
187
|
+
|-----------|------|---------|-------------|
|
|
188
|
+
| `resume_enabled` | bool | `True` | Enable automatic resume capability |
|
|
189
|
+
| `force_reprocess` | bool | `False` | Force reprocessing of existing files |
|
|
190
|
+
| `backup_enabled` | bool | `True` | Create backups of intermediate results |
|
|
191
|
+
| `checkpoint_interval` | int | `10` | Save progress every N files |
|
|
192
|
+
| `cleanup_temp_files` | bool | `True` | Remove temporary files after completion |
|
|
193
|
+
|
|
194
|
+
## Methods
|
|
195
|
+
|
|
196
|
+
### Pipeline Control
|
|
197
|
+
|
|
198
|
+
#### `run_full_pipeline() -> bool`
|
|
199
|
+
Executes the complete processing pipeline in sequence. Returns `True` if successful.
|
|
200
|
+
|
|
201
|
+
#### Individual Steps
|
|
202
|
+
- `discover_files() -> List[Path]` - Find raw data files
|
|
203
|
+
- `convert_to_sample5(file_list=None) -> bool` - Convert to sample5 format
|
|
204
|
+
- `assemble_study() -> bool` - Create study from sample5 files
|
|
205
|
+
- `align_and_merge() -> bool` - Perform feature alignment and merging
|
|
206
|
+
- `generate_plots() -> bool` - Create visualization plots
|
|
207
|
+
- `export_results() -> bool` - Export in requested formats
|
|
208
|
+
- `save_study() -> bool` - Save final study file
|
|
209
|
+
- `cleanup_temp_files() -> bool` - Remove temporary files
|
|
210
|
+
|
|
211
|
+
### Status & Information
|
|
212
|
+
|
|
213
|
+
#### `info()`
|
|
214
|
+
Prints comprehensive wizard status including progress, timings, and results.
|
|
215
|
+
|
|
216
|
+
#### `get_status() -> Dict[str, Any]`
|
|
217
|
+
Returns detailed status dictionary with current step, processed files, timing, and parameters.
|
|
218
|
+
|
|
219
|
+
## Error Handling & Recovery
|
|
220
|
+
|
|
221
|
+
### Common Issues and Solutions
|
|
222
|
+
|
|
223
|
+
**Memory Errors**
|
|
224
|
+
- Reduce `batch_size` parameter
|
|
225
|
+
- Increase `memory_limit_gb` if available
|
|
226
|
+
- Use `merge_method="chunked"` for large studies
|
|
227
|
+
- Enable `cleanup_temp_files=True`
|
|
228
|
+
|
|
229
|
+
**File Access Errors**
|
|
230
|
+
- Check file permissions on source and destination folders
|
|
231
|
+
- Verify network connectivity for remote file systems
|
|
232
|
+
- Ensure sufficient disk space in output directory
|
|
233
|
+
- Close any applications that might lock files
|
|
234
|
+
|
|
235
|
+
**Processing Failures**
|
|
236
|
+
- Check individual file integrity
|
|
237
|
+
- Review `skip_patterns` to exclude problematic files
|
|
238
|
+
- Examine detailed logs in `wizard.log` and `processing.log`
|
|
239
|
+
- Try processing failed files individually for debugging
|
|
240
|
+
|
|
241
|
+
**Resume Issues**
|
|
242
|
+
- Delete `wizard_checkpoint.json` to force fresh start
|
|
243
|
+
- Verify output directory permissions
|
|
244
|
+
- Check for corrupted intermediate files
|
|
245
|
+
|
|
246
|
+
### Validation and Quality Control
|
|
247
|
+
|
|
248
|
+
The Wizard includes built-in validation at each step:
|
|
249
|
+
|
|
250
|
+
- **File Validation**: Checks file accessibility and format compatibility
|
|
251
|
+
- **Processing Validation**: Verifies sample5 outputs can be loaded
|
|
252
|
+
- **Study Validation**: Ensures study assembly completed successfully
|
|
253
|
+
- **Alignment Validation**: Reports alignment statistics and warnings
|
|
254
|
+
- **Export Validation**: Confirms all requested outputs were created
|
|
255
|
+
|
|
256
|
+
## Performance Guidelines
|
|
257
|
+
|
|
258
|
+
### System Requirements
|
|
259
|
+
- **Minimum**: 4 CPU cores, 8 GB RAM
|
|
260
|
+
- **Recommended**: 8+ CPU cores, 16+ GB RAM
|
|
261
|
+
- **Large Studies**: 16+ CPU cores, 32+ GB RAM
|
|
262
|
+
- **Storage**: SSD recommended, ~2-3x raw data size free space
|
|
263
|
+
|
|
264
|
+
### Optimization Tips
|
|
265
|
+
|
|
266
|
+
**For Small Studies (< 50 samples)**
|
|
267
|
+
- Use `num_cores = 4-6`
|
|
268
|
+
- Set `batch_size = 4-8`
|
|
269
|
+
- Use `merge_method = "kd"`
|
|
270
|
+
- Enable all export formats
|
|
271
|
+
|
|
272
|
+
**For Large Studies (100+ samples)**
|
|
273
|
+
- Use `num_cores = 8-16`
|
|
274
|
+
- Set `batch_size = 16-32`
|
|
275
|
+
- Use `merge_method = "chunked"`
|
|
276
|
+
- Enable `adaptive_compression = True`
|
|
277
|
+
- Consider processing in polarity-specific batches
|
|
278
|
+
|
|
279
|
+
**For Very Large Studies (500+ samples)**
|
|
280
|
+
- Process positive/negative modes separately
|
|
281
|
+
- Use `memory_limit_gb = 64+`
|
|
282
|
+
- Set `checkpoint_interval = 50`
|
|
283
|
+
- Enable `cleanup_temp_files = True`
|
|
284
|
+
- Consider cluster/cloud processing
|
|
285
|
+
|
|
286
|
+
## Integration Examples
|
|
287
|
+
|
|
288
|
+
### With Existing Workflows
|
|
289
|
+
|
|
290
|
+
```python
|
|
291
|
+
# Integration with custom preprocessing
|
|
292
|
+
wizard = Wizard(data_source="./preprocessed", ...)
|
|
293
|
+
|
|
294
|
+
# Skip conversion if already done
|
|
295
|
+
if not any(wizard.study_folder_path.glob("*.sample5")):
|
|
296
|
+
wizard.convert_to_sample5()
|
|
297
|
+
|
|
298
|
+
# Continue with study-level processing
|
|
299
|
+
wizard.assemble_study()
|
|
300
|
+
wizard.align_and_merge()
|
|
301
|
+
wizard.export_results()
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
### Batch Processing Multiple Studies
|
|
305
|
+
|
|
306
|
+
```python
|
|
307
|
+
studies = [
|
|
308
|
+
{"source": "./batch1", "output": "./results/batch1", "polarity": "positive"},
|
|
309
|
+
{"source": "./batch2", "output": "./results/batch2", "polarity": "negative"},
|
|
310
|
+
]
|
|
311
|
+
|
|
312
|
+
for study_config in studies:
|
|
313
|
+
wizard = Wizard(data_source=study_config["source"], study_folder=study_config["output"], polarity=study_config["polarity"], num_cores=8)
|
|
314
|
+
success = wizard.run_full_pipeline()
|
|
315
|
+
|
|
316
|
+
if success:
|
|
317
|
+
print(f"✅ {study_config['output']} completed")
|
|
318
|
+
else:
|
|
319
|
+
print(f"❌ {study_config['output']} failed")
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
### Custom Processing Steps
|
|
323
|
+
|
|
324
|
+
```python
|
|
325
|
+
wizard = Wizard(...)
|
|
326
|
+
|
|
327
|
+
# Standard conversion
|
|
328
|
+
wizard.convert_to_sample5()
|
|
329
|
+
|
|
330
|
+
# Custom study assembly with specific parameters
|
|
331
|
+
wizard.assemble_study()
|
|
332
|
+
|
|
333
|
+
# Custom filtering before alignment
|
|
334
|
+
if hasattr(wizard.study, 'features_filter'):
|
|
335
|
+
selection = wizard.study.features_select(
|
|
336
|
+
chrom_coherence=0.5, # Higher quality threshold
|
|
337
|
+
chrom_prominence_scaled=2.0
|
|
338
|
+
)
|
|
339
|
+
wizard.study.features_filter(selection)
|
|
340
|
+
|
|
341
|
+
# Continue with standard pipeline
|
|
342
|
+
wizard.align_and_merge()
|
|
343
|
+
wizard.generate_plots()
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
## Output Files
|
|
347
|
+
|
|
348
|
+
The Wizard generates several types of output files:
|
|
349
|
+
|
|
350
|
+
### Primary Results
|
|
351
|
+
- `final_study.study5` - Complete study in masster native format
|
|
352
|
+
- `consensus_features.csv` - Feature table with RT, m/z, intensity data
|
|
353
|
+
- `study_results.xlsx` - Multi-sheet Excel workbook with results and metadata
|
|
354
|
+
- `consensus_ms2.mgf` - MS2 spectra for database searching
|
|
355
|
+
|
|
356
|
+
### Visualizations
|
|
357
|
+
- `alignment_plot.html` - Interactive alignment visualization
|
|
358
|
+
- `consensus_2d.html` - 2D feature map of consensus features
|
|
359
|
+
- `pca_plot.html` - Principal component analysis plot
|
|
360
|
+
- `consensus_stats.html` - Study statistics and quality metrics
|
|
361
|
+
|
|
362
|
+
### Processing Logs
|
|
363
|
+
- `wizard.log` - Detailed processing log with debug information
|
|
364
|
+
- `processing.log` - Simple progress log with timestamps
|
|
365
|
+
- `study_metadata.txt` - Study summary with parameters and statistics
|
|
366
|
+
|
|
367
|
+
### Individual Sample Outputs (if enabled)
|
|
368
|
+
- `sample_name.sample5` - Processed sample in masster format
|
|
369
|
+
- `sample_name.features.csv` - Individual sample feature table
|
|
370
|
+
- `sample_name.mgf` - Individual sample MS2 spectra
|
|
371
|
+
- `sample_name_2d.html` - Individual sample 2D plot
|
|
372
|
+
|
|
373
|
+
The Wizard provides a complete, automated solution for mass spectrometry data processing while maintaining flexibility for custom workflows and providing robust error handling and recovery capabilities.
|
|
masster/wizard/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Wizard module for automated processing of mass spectrometry studies.
|
|
3
|
+
|
|
4
|
+
This module provides the Wizard class for fully automated processing of MS data
|
|
5
|
+
from raw files to final study results, including batch conversion, assembly,
|
|
6
|
+
alignment, merging, plotting, and export.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .wizard import Wizard, wizard_def
|
|
10
|
+
|
|
11
|
+
__all__ = ["Wizard", "wizard_def"]
|