masster 0.4.13__py3-none-any.whl → 0.4.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +2 -0
- masster/_version.py +1 -1
- masster/sample/sample.py +41 -0
- masster/study/__init__.py +1 -0
- masster/study/defaults/find_consensus_def.py +1 -1
- masster/study/defaults/merge_def.py +69 -25
- masster/study/h5.py +65 -106
- masster/study/id.py +1 -1
- masster/study/load.py +11 -6
- masster/study/merge.py +1607 -0
- masster/study/processing.py +0 -874
- masster/study/save.py +1 -1
- masster/study/study.py +79 -21
- masster/wizard/README.md +373 -0
- masster/wizard/__init__.py +11 -0
- masster/wizard/example.py +223 -0
- masster/wizard/test_structure.py +49 -0
- masster/wizard/test_wizard.py +285 -0
- masster/wizard/wizard.py +1175 -0
- masster/wizard.py +1175 -0
- {masster-0.4.13.dist-info → masster-0.4.16.dist-info}/METADATA +1 -1
- {masster-0.4.13.dist-info → masster-0.4.16.dist-info}/RECORD +25 -17
- {masster-0.4.13.dist-info → masster-0.4.16.dist-info}/WHEEL +0 -0
- {masster-0.4.13.dist-info → masster-0.4.16.dist-info}/entry_points.txt +0 -0
- {masster-0.4.13.dist-info → masster-0.4.16.dist-info}/licenses/LICENSE +0 -0
masster/study/save.py
CHANGED
|
@@ -53,7 +53,7 @@ def save(self, filename=None, add_timestamp=True, compress=False):
|
|
|
53
53
|
if hasattr(self, "samples_df") and not self.samples_df.is_empty()
|
|
54
54
|
else 0
|
|
55
55
|
)
|
|
56
|
-
self.logger.
|
|
56
|
+
self.logger.debug(
|
|
57
57
|
f"Saving study with {sample_count} samples and {feature_count} features to {filename}",
|
|
58
58
|
)
|
|
59
59
|
|
masster/study/study.py
CHANGED
|
@@ -116,7 +116,11 @@ from masster.study.plot import plot_tic
|
|
|
116
116
|
from masster.study.plot import plot_eic
|
|
117
117
|
from masster.study.plot import plot_rt_correction
|
|
118
118
|
from masster.study.processing import align
|
|
119
|
-
from masster.study.
|
|
119
|
+
from masster.study.merge import merge
|
|
120
|
+
from masster.study.merge import _reset_consensus_data
|
|
121
|
+
from masster.study.merge import _extract_consensus_features
|
|
122
|
+
from masster.study.merge import _perform_adduct_grouping
|
|
123
|
+
from masster.study.merge import _finalize_merge
|
|
120
124
|
from masster.study.processing import integrate
|
|
121
125
|
from masster.study.processing import find_ms2
|
|
122
126
|
from masster.study.parameters import store_history
|
|
@@ -146,7 +150,6 @@ from masster.study.defaults.align_def import align_defaults
|
|
|
146
150
|
from masster.study.defaults.export_def import export_mgf_defaults
|
|
147
151
|
from masster.study.defaults.fill_chrom_def import fill_chrom_defaults
|
|
148
152
|
from masster.study.defaults.fill_def import fill_defaults
|
|
149
|
-
from masster.study.defaults.find_consensus_def import find_consensus_defaults
|
|
150
153
|
from masster.study.defaults.find_ms2_def import find_ms2_defaults
|
|
151
154
|
from masster.study.defaults.integrate_chrom_def import integrate_chrom_defaults
|
|
152
155
|
from masster.study.defaults.integrate_def import integrate_defaults
|
|
@@ -385,7 +388,6 @@ class Study:
|
|
|
385
388
|
fill_chrom = fill # Backward compatibility alias
|
|
386
389
|
fill_single = fill_single
|
|
387
390
|
fill_chrom_single = fill_single # Backward compatibility alias
|
|
388
|
-
|
|
389
391
|
# === Data Retrieval and Access ===
|
|
390
392
|
get_consensus = get_consensus
|
|
391
393
|
get_chrom = get_chrom
|
|
@@ -490,13 +492,18 @@ class Study:
|
|
|
490
492
|
_format_adduct_name = _format_adduct_name
|
|
491
493
|
_parse_element_counts = _parse_element_counts
|
|
492
494
|
|
|
495
|
+
# === Merge Helper Methods ===
|
|
496
|
+
_reset_consensus_data = _reset_consensus_data
|
|
497
|
+
_extract_consensus_features = _extract_consensus_features
|
|
498
|
+
_perform_adduct_grouping = _perform_adduct_grouping
|
|
499
|
+
_finalize_merge = _finalize_merge
|
|
500
|
+
|
|
493
501
|
# === Default Parameters ===
|
|
494
502
|
study_defaults = study_defaults
|
|
495
503
|
align_defaults = align_defaults
|
|
496
504
|
export_mgf_defaults = export_mgf_defaults
|
|
497
505
|
fill_chrom_defaults = fill_chrom_defaults
|
|
498
506
|
fill_defaults = fill_defaults
|
|
499
|
-
find_consensus_defaults = find_consensus_defaults
|
|
500
507
|
find_ms2_defaults = find_ms2_defaults
|
|
501
508
|
integrate_chrom_defaults = integrate_chrom_defaults
|
|
502
509
|
integrate_defaults = integrate_defaults
|
|
@@ -587,6 +594,51 @@ class Study:
|
|
|
587
594
|
except Exception as e:
|
|
588
595
|
self.logger.error(f"Failed to reload current module {current_module}: {e}")
|
|
589
596
|
|
|
597
|
+
def __dir__(self):
|
|
598
|
+
"""
|
|
599
|
+
Custom __dir__ implementation to hide internal methods starting with '_'
|
|
600
|
+
and backward compatibility aliases from tab completion and dir() calls,
|
|
601
|
+
while keeping them accessible to class methods.
|
|
602
|
+
|
|
603
|
+
Returns:
|
|
604
|
+
list: List of public attribute and method names (excluding internal and deprecated methods)
|
|
605
|
+
"""
|
|
606
|
+
# Define backward compatibility aliases to hide
|
|
607
|
+
backward_compatibility_aliases = {
|
|
608
|
+
'add_folder', # alias for add
|
|
609
|
+
'find_consensus', # alias for merge
|
|
610
|
+
'integrate_chrom', # alias for integrate
|
|
611
|
+
'fill_chrom', # alias for fill
|
|
612
|
+
'fill_chrom_single', # alias for fill_single
|
|
613
|
+
'filter_consensus', # alias for consensus_filter
|
|
614
|
+
'select_consensus', # alias for consensus_select
|
|
615
|
+
'filter_features', # alias for features_filter
|
|
616
|
+
'select_features', # alias for features_select
|
|
617
|
+
'consensus_find', # alias for merge
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
# Get all attributes from the class
|
|
621
|
+
all_attrs = set()
|
|
622
|
+
|
|
623
|
+
# Add attributes from the class and all its bases
|
|
624
|
+
for cls in self.__class__.__mro__:
|
|
625
|
+
all_attrs.update(cls.__dict__.keys())
|
|
626
|
+
|
|
627
|
+
# Add instance attributes
|
|
628
|
+
all_attrs.update(self.__dict__.keys())
|
|
629
|
+
|
|
630
|
+
# Filter out attributes starting with '_' (but keep special methods like __init__, __str__, etc.)
|
|
631
|
+
# Also filter out backward compatibility aliases
|
|
632
|
+
public_attrs = [
|
|
633
|
+
attr for attr in all_attrs
|
|
634
|
+
if not attr.startswith('_') or attr.startswith('__') and attr.endswith('__')
|
|
635
|
+
]
|
|
636
|
+
|
|
637
|
+
# Remove backward compatibility aliases from the public attributes
|
|
638
|
+
public_attrs = [attr for attr in public_attrs if attr not in backward_compatibility_aliases]
|
|
639
|
+
|
|
640
|
+
return sorted(public_attrs)
|
|
641
|
+
|
|
590
642
|
def __str__(self):
|
|
591
643
|
"""
|
|
592
644
|
Returns a string representation of the study.
|
|
@@ -632,11 +684,13 @@ class Study:
|
|
|
632
684
|
samples count, and various statistics. Shows warning symbols for values
|
|
633
685
|
that are out of normal range.
|
|
634
686
|
"""
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
687
|
+
if self.consensus_df is None or self.consensus_df.is_empty():
|
|
688
|
+
self.consensus_df = pl.DataFrame()
|
|
689
|
+
consensus_df_len = 0
|
|
690
|
+
else:
|
|
691
|
+
consensus_df_len = len(self.consensus_df)
|
|
692
|
+
|
|
693
|
+
samples_df_len = len(self.samples_df) if (self.samples_df is not None and not self.samples_df.is_empty()) else 0
|
|
640
694
|
|
|
641
695
|
# Calculate consensus statistics only if consensus_df exists and has data
|
|
642
696
|
if consensus_df_len > 0:
|
|
@@ -658,7 +712,8 @@ class Study:
|
|
|
658
712
|
max_samples = 0
|
|
659
713
|
|
|
660
714
|
# Count only features where 'filled' == False
|
|
661
|
-
if not self.features_df.is_empty() and
|
|
715
|
+
if (self.features_df is not None and not self.features_df.is_empty() and
|
|
716
|
+
"filled" in self.features_df.columns):
|
|
662
717
|
unfilled_features_count = self.features_df.filter(
|
|
663
718
|
~self.features_df["filled"],
|
|
664
719
|
).height
|
|
@@ -666,7 +721,8 @@ class Study:
|
|
|
666
721
|
unfilled_features_count = 0
|
|
667
722
|
|
|
668
723
|
# Calculate features in consensus vs not in consensus (only for unfilled features)
|
|
669
|
-
if
|
|
724
|
+
if (self.features_df is not None and not self.features_df.is_empty() and
|
|
725
|
+
self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty()):
|
|
670
726
|
# Get unfilled features only
|
|
671
727
|
unfilled_features = (
|
|
672
728
|
self.features_df.filter(~self.features_df["filled"])
|
|
@@ -720,6 +776,7 @@ class Study:
|
|
|
720
776
|
if (
|
|
721
777
|
consensus_df_len > 0
|
|
722
778
|
and samples_df_len > 0
|
|
779
|
+
and self.features_df is not None
|
|
723
780
|
and not self.features_df.is_empty()
|
|
724
781
|
):
|
|
725
782
|
# Ensure matching data types for join keys
|
|
@@ -745,7 +802,8 @@ class Study:
|
|
|
745
802
|
)
|
|
746
803
|
|
|
747
804
|
# Use more efficient counting - count non-null chroms only for features in consensus mapping
|
|
748
|
-
if
|
|
805
|
+
if (self.consensus_mapping_df is not None and
|
|
806
|
+
not self.consensus_mapping_df.is_empty()):
|
|
749
807
|
non_null_chroms = (
|
|
750
808
|
self.features_df.join(
|
|
751
809
|
self.consensus_mapping_df.select("feature_uid"),
|
|
@@ -767,16 +825,16 @@ class Study:
|
|
|
767
825
|
chrom_completeness = 0
|
|
768
826
|
|
|
769
827
|
# Calculate consensus features with MS2 (count unique consensus_uids with MS2)
|
|
770
|
-
if not self.consensus_ms2.is_empty():
|
|
828
|
+
if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
|
|
771
829
|
consensus_with_ms2_count = self.consensus_ms2["consensus_uid"].n_unique()
|
|
772
830
|
else:
|
|
773
831
|
consensus_with_ms2_count = 0
|
|
774
832
|
|
|
775
|
-
if not self.consensus_df.is_empty():
|
|
833
|
+
if self.consensus_df is not None and not self.consensus_df.is_empty():
|
|
776
834
|
# Compute RT spread using only consensus rows whose number_samples is at least half of the maximum number_samples in consensus_df
|
|
777
835
|
threshold = (
|
|
778
836
|
self.consensus_df.select(pl.col("number_samples").max()).item() / 2
|
|
779
|
-
if not self.samples_df.is_empty()
|
|
837
|
+
if (self.samples_df is not None and not self.samples_df.is_empty())
|
|
780
838
|
else 0
|
|
781
839
|
)
|
|
782
840
|
filtered = self.consensus_df.filter(pl.col("number_samples") >= threshold)
|
|
@@ -803,16 +861,16 @@ class Study:
|
|
|
803
861
|
|
|
804
862
|
# Total MS2 spectra count
|
|
805
863
|
total_ms2_count = (
|
|
806
|
-
len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
|
|
864
|
+
len(self.consensus_ms2) if (self.consensus_ms2 is not None and not self.consensus_ms2.is_empty()) else 0
|
|
807
865
|
)
|
|
808
866
|
|
|
809
867
|
# Estimate memory usage
|
|
810
868
|
memory_usage = (
|
|
811
|
-
self.samples_df.estimated_size()
|
|
812
|
-
+ self.features_df.estimated_size()
|
|
813
|
-
+ self.consensus_df.estimated_size()
|
|
814
|
-
+ self.consensus_ms2.estimated_size()
|
|
815
|
-
+ self.consensus_mapping_df.estimated_size()
|
|
869
|
+
(self.samples_df.estimated_size() if self.samples_df is not None else 0)
|
|
870
|
+
+ (self.features_df.estimated_size() if self.features_df is not None else 0)
|
|
871
|
+
+ (self.consensus_df.estimated_size() if self.consensus_df is not None else 0)
|
|
872
|
+
+ (self.consensus_ms2.estimated_size() if self.consensus_ms2 is not None else 0)
|
|
873
|
+
+ (self.consensus_mapping_df.estimated_size() if self.consensus_mapping_df is not None else 0)
|
|
816
874
|
)
|
|
817
875
|
|
|
818
876
|
# Add warning symbols for out-of-range values
|
masster/wizard/README.md
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
# Wizard Class Documentation
|
|
2
|
+
|
|
3
|
+
The `Wizard` class provides comprehensive automation for mass spectrometry data processing, from raw files to final study results. It handles the complete workflow with minimal user intervention while providing intelligent resume capabilities, parallel processing optimization, and adaptive output formats.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
### Basic Usage
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from masster import Wizard
|
|
11
|
+
|
|
12
|
+
# Create wizard with minimal configuration
|
|
13
|
+
wizard = Wizard(
|
|
14
|
+
data_source="./raw_data", # Directory with raw files
|
|
15
|
+
study_folder="./processed", # Output directory
|
|
16
|
+
polarity="positive", # or "negative"
|
|
17
|
+
num_cores=4 # CPU cores to use
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Run complete pipeline
|
|
21
|
+
success = wizard.run_full_pipeline()
|
|
22
|
+
|
|
23
|
+
if success:
|
|
24
|
+
wizard.info() # Print summary
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### Advanced Configuration
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from masster import Wizard, wizard_def
|
|
31
|
+
|
|
32
|
+
# Create custom parameters
|
|
33
|
+
params = wizard_def(
|
|
34
|
+
data_source="./raw_data",
|
|
35
|
+
study_folder="./processed_advanced",
|
|
36
|
+
polarity="negative",
|
|
37
|
+
num_cores=8,
|
|
38
|
+
|
|
39
|
+
# File discovery
|
|
40
|
+
file_extensions=[".wiff", ".raw", ".mzML"],
|
|
41
|
+
search_subfolders=True,
|
|
42
|
+
skip_patterns=["blank", "QC", "test"],
|
|
43
|
+
|
|
44
|
+
# Processing parameters
|
|
45
|
+
adducts=["H-1:-:0.95", "Cl:-:0.05", "CH2O2:0:0.2"],
|
|
46
|
+
chrom_fwhm=0.15,
|
|
47
|
+
noise_threshold=5e4,
|
|
48
|
+
|
|
49
|
+
# Study assembly
|
|
50
|
+
rt_tolerance=1.0,
|
|
51
|
+
mz_tolerance=0.008,
|
|
52
|
+
min_samples_for_merge=30,
|
|
53
|
+
|
|
54
|
+
# Output options
|
|
55
|
+
export_formats=["csv", "xlsx", "mgf", "parquet"],
|
|
56
|
+
generate_plots=True,
|
|
57
|
+
compress_output=True,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
wizard = Wizard(params=params)
|
|
61
|
+
wizard.run_full_pipeline()
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Key Features
|
|
65
|
+
|
|
66
|
+
### 🔄 Automated Pipeline
|
|
67
|
+
- **Raw Data Discovery**: Automatically finds and validates raw MS files
|
|
68
|
+
- **Batch Conversion**: Parallel conversion to sample5 format with optimized parameters
|
|
69
|
+
- **Study Assembly**: Creates study from processed samples with quality filtering
|
|
70
|
+
- **Feature Alignment**: Cross-sample alignment using configurable algorithms
|
|
71
|
+
- **Consensus Generation**: Merges aligned features with statistical validation
|
|
72
|
+
- **Results Export**: Multiple output formats for downstream analysis
|
|
73
|
+
|
|
74
|
+
### 💾 Intelligent Resume
|
|
75
|
+
- **Checkpoint System**: Automatically saves progress at key points
|
|
76
|
+
- **File Tracking**: Remembers which files have been processed successfully
|
|
77
|
+
- **Smart Recovery**: Resumes from last successful step after interruption
|
|
78
|
+
- **Validation**: Verifies existing outputs before skipping
|
|
79
|
+
|
|
80
|
+
### ⚡ Performance Optimization
|
|
81
|
+
- **Parallel Processing**: Utilizes multiple CPU cores efficiently
|
|
82
|
+
- **Memory Management**: Adaptive batch sizing based on available memory
|
|
83
|
+
- **Process Isolation**: Prevents memory leaks in long-running jobs
|
|
84
|
+
- **Adaptive Compression**: Optimizes output format based on study size
|
|
85
|
+
|
|
86
|
+
### 📊 Comprehensive Logging
|
|
87
|
+
- **Progress Tracking**: Real-time status updates with time estimates
|
|
88
|
+
- **Detailed Logs**: Complete processing history saved to files
|
|
89
|
+
- **Error Reporting**: Clear error messages with recovery suggestions
|
|
90
|
+
- **Performance Metrics**: Processing times and resource usage statistics
|
|
91
|
+
|
|
92
|
+
## Pipeline Steps
|
|
93
|
+
|
|
94
|
+
### 1. File Discovery
|
|
95
|
+
- Searches for raw MS files (`.wiff`, `.raw`, `.mzML`, `.d`)
|
|
96
|
+
- Applies skip patterns to exclude unwanted files
|
|
97
|
+
- Validates file integrity and accessibility
|
|
98
|
+
- Reports file sizes and estimates processing time
|
|
99
|
+
|
|
100
|
+
### 2. Sample5 Conversion
|
|
101
|
+
- **Feature Detection**: Two-pass algorithm with configurable parameters
|
|
102
|
+
- **Adduct Detection**: Automated adduct grouping based on polarity
|
|
103
|
+
- **MS2 Linking**: Associates fragmentation spectra with features
|
|
104
|
+
- **Quality Control**: Validates outputs and reports statistics
|
|
105
|
+
- **Parallel Processing**: Utilizes multiple CPU cores with batch optimization
|
|
106
|
+
|
|
107
|
+
### 3. Study Assembly
|
|
108
|
+
- **Sample Loading**: Imports all processed sample5 files
|
|
109
|
+
- **Quality Filtering**: Removes low-quality features based on coherence/prominence
|
|
110
|
+
- **Metadata Organization**: Organizes sample information and experimental design
|
|
111
|
+
- **Memory Optimization**: Efficient data structures for large studies
|
|
112
|
+
|
|
113
|
+
### 4. Feature Alignment
|
|
114
|
+
- **RT Alignment**: Corrects retention time shifts between samples
|
|
115
|
+
- **Mass Alignment**: Accounts for mass calibration differences
|
|
116
|
+
- **Algorithm Selection**: Supports KD-tree, QT-clustering, and chunked methods
|
|
117
|
+
- **Validation**: Reports alignment statistics and quality metrics
|
|
118
|
+
|
|
119
|
+
### 5. Consensus Generation
|
|
120
|
+
- **Feature Merging**: Groups aligned features into consensus features
|
|
121
|
+
- **Statistical Validation**: Applies minimum sample requirements
|
|
122
|
+
- **Gap Filling**: Extracts chromatograms for missing values
|
|
123
|
+
- **MS2 Integration**: Links consensus features to MS2 spectra
|
|
124
|
+
|
|
125
|
+
### 6. Visualization & Export
|
|
126
|
+
- **Interactive Plots**: 2D feature maps, PCA plots, alignment visualizations
|
|
127
|
+
- **Multiple Formats**: CSV, Excel, MGF, Parquet exports
|
|
128
|
+
- **Study Archival**: Compressed study5 format for long-term storage
|
|
129
|
+
- **Metadata Export**: Complete processing parameters and statistics
|
|
130
|
+
|
|
131
|
+
## Configuration Options
|
|
132
|
+
|
|
133
|
+
### Core Parameters
|
|
134
|
+
|
|
135
|
+
| Parameter | Type | Default | Description |
|
|
136
|
+
|-----------|------|---------|-------------|
|
|
137
|
+
| `data_source` | str | **required** | Directory containing raw data files |
|
|
138
|
+
| `study_folder` | str | **required** | Output directory for processed study |
|
|
139
|
+
| `polarity` | str | `"positive"` | Ion polarity mode (`"positive"` or `"negative"`) |
|
|
140
|
+
| `num_cores` | int | `4` | Number of CPU cores for parallel processing |
|
|
141
|
+
| `adducts` | List[str] | auto-set | Adduct specifications (set based on polarity) |
|
|
142
|
+
|
|
143
|
+
### File Discovery
|
|
144
|
+
|
|
145
|
+
| Parameter | Type | Default | Description |
|
|
146
|
+
|-----------|------|---------|-------------|
|
|
147
|
+
| `file_extensions` | List[str] | `[".wiff", ".raw", ".mzML", ".d"]` | File types to search for |
|
|
148
|
+
| `search_subfolders` | bool | `True` | Search subdirectories recursively |
|
|
149
|
+
| `skip_patterns` | List[str] | `["blank", "QC", "test"]` | Filename patterns to skip |
|
|
150
|
+
| `max_file_size_gb` | float | `4.0` | Maximum file size warning threshold |
|
|
151
|
+
|
|
152
|
+
### Processing Parameters
|
|
153
|
+
|
|
154
|
+
| Parameter | Type | Default | Description |
|
|
155
|
+
|-----------|------|---------|-------------|
|
|
156
|
+
| `batch_size` | int | `8` | Files processed per batch |
|
|
157
|
+
| `memory_limit_gb` | float | `16.0` | Memory usage limit |
|
|
158
|
+
| `chrom_fwhm` | float | `0.2` | Expected chromatographic peak width (s) |
|
|
159
|
+
| `noise_threshold` | float | `1e5` | Intensity threshold for peak detection |
|
|
160
|
+
| `chrom_peak_snr` | float | `5.0` | Signal-to-noise ratio requirement |
|
|
161
|
+
| `tol_ppm` | float | `10.0` | Mass tolerance (ppm) |
|
|
162
|
+
|
|
163
|
+
### Study Assembly
|
|
164
|
+
|
|
165
|
+
| Parameter | Type | Default | Description |
|
|
166
|
+
|-----------|------|---------|-------------|
|
|
167
|
+
| `rt_tolerance` | float | `1.5` | RT tolerance for alignment (seconds) |
|
|
168
|
+
| `mz_tolerance` | float | `0.01` | m/z tolerance for alignment (Da) |
|
|
169
|
+
| `alignment_algorithm` | str | `"kd"` | Alignment algorithm (`"kd"`, `"qt"`, `"chunked"`) |
|
|
170
|
+
| `merge_method` | str | `"chunked"` | Merge algorithm for consensus generation |
|
|
171
|
+
| `min_samples_for_merge` | int | `50` | Minimum samples required for consensus |
|
|
172
|
+
|
|
173
|
+
### Output & Logging
|
|
174
|
+
|
|
175
|
+
| Parameter | Type | Default | Description |
|
|
176
|
+
|-----------|------|---------|-------------|
|
|
177
|
+
| `generate_plots` | bool | `True` | Generate visualization plots |
|
|
178
|
+
| `export_formats` | List[str] | `["csv", "mgf", "xlsx"]` | Output formats to generate |
|
|
179
|
+
| `compress_output` | bool | `True` | Compress final study file |
|
|
180
|
+
| `adaptive_compression` | bool | `True` | Adapt compression based on study size |
|
|
181
|
+
| `log_level` | str | `"INFO"` | Logging detail level |
|
|
182
|
+
| `log_to_file` | bool | `True` | Save logs to file |
|
|
183
|
+
|
|
184
|
+
### Resume & Recovery
|
|
185
|
+
|
|
186
|
+
| Parameter | Type | Default | Description |
|
|
187
|
+
|-----------|------|---------|-------------|
|
|
188
|
+
| `resume_enabled` | bool | `True` | Enable automatic resume capability |
|
|
189
|
+
| `force_reprocess` | bool | `False` | Force reprocessing of existing files |
|
|
190
|
+
| `backup_enabled` | bool | `True` | Create backups of intermediate results |
|
|
191
|
+
| `checkpoint_interval` | int | `10` | Save progress every N files |
|
|
192
|
+
| `cleanup_temp_files` | bool | `True` | Remove temporary files after completion |
|
|
193
|
+
|
|
194
|
+
## Methods
|
|
195
|
+
|
|
196
|
+
### Pipeline Control
|
|
197
|
+
|
|
198
|
+
#### `run_full_pipeline() -> bool`
|
|
199
|
+
Executes the complete processing pipeline in sequence. Returns `True` if successful.
|
|
200
|
+
|
|
201
|
+
#### Individual Steps
|
|
202
|
+
- `discover_files() -> List[Path]` - Find raw data files
|
|
203
|
+
- `convert_to_sample5(file_list=None) -> bool` - Convert to sample5 format
|
|
204
|
+
- `assemble_study() -> bool` - Create study from sample5 files
|
|
205
|
+
- `align_and_merge() -> bool` - Perform feature alignment and merging
|
|
206
|
+
- `generate_plots() -> bool` - Create visualization plots
|
|
207
|
+
- `export_results() -> bool` - Export in requested formats
|
|
208
|
+
- `save_study() -> bool` - Save final study file
|
|
209
|
+
- `cleanup_temp_files() -> bool` - Remove temporary files
|
|
210
|
+
|
|
211
|
+
### Status & Information
|
|
212
|
+
|
|
213
|
+
#### `info()`
|
|
214
|
+
Prints comprehensive wizard status including progress, timings, and results.
|
|
215
|
+
|
|
216
|
+
#### `get_status() -> Dict[str, Any]`
|
|
217
|
+
Returns detailed status dictionary with current step, processed files, timing, and parameters.
|
|
218
|
+
|
|
219
|
+
## Error Handling & Recovery
|
|
220
|
+
|
|
221
|
+
### Common Issues and Solutions
|
|
222
|
+
|
|
223
|
+
**Memory Errors**
|
|
224
|
+
- Reduce `batch_size` parameter
|
|
225
|
+
- Increase `memory_limit_gb` if available
|
|
226
|
+
- Use `merge_method="chunked"` for large studies
|
|
227
|
+
- Enable `cleanup_temp_files=True`
|
|
228
|
+
|
|
229
|
+
**File Access Errors**
|
|
230
|
+
- Check file permissions on source and destination folders
|
|
231
|
+
- Verify network connectivity for remote file systems
|
|
232
|
+
- Ensure sufficient disk space in output directory
|
|
233
|
+
- Close any applications that might lock files
|
|
234
|
+
|
|
235
|
+
**Processing Failures**
|
|
236
|
+
- Check individual file integrity
|
|
237
|
+
- Review `skip_patterns` to exclude problematic files
|
|
238
|
+
- Examine detailed logs in `wizard.log` and `processing.log`
|
|
239
|
+
- Try processing failed files individually for debugging
|
|
240
|
+
|
|
241
|
+
**Resume Issues**
|
|
242
|
+
- Delete `wizard_checkpoint.json` to force fresh start
|
|
243
|
+
- Verify output directory permissions
|
|
244
|
+
- Check for corrupted intermediate files
|
|
245
|
+
|
|
246
|
+
### Validation and Quality Control
|
|
247
|
+
|
|
248
|
+
The Wizard includes built-in validation at each step:
|
|
249
|
+
|
|
250
|
+
- **File Validation**: Checks file accessibility and format compatibility
|
|
251
|
+
- **Processing Validation**: Verifies sample5 outputs can be loaded
|
|
252
|
+
- **Study Validation**: Ensures study assembly completed successfully
|
|
253
|
+
- **Alignment Validation**: Reports alignment statistics and warnings
|
|
254
|
+
- **Export Validation**: Confirms all requested outputs were created
|
|
255
|
+
|
|
256
|
+
## Performance Guidelines
|
|
257
|
+
|
|
258
|
+
### System Requirements
|
|
259
|
+
- **Minimum**: 4 CPU cores, 8 GB RAM
|
|
260
|
+
- **Recommended**: 8+ CPU cores, 16+ GB RAM
|
|
261
|
+
- **Large Studies**: 16+ CPU cores, 32+ GB RAM
|
|
262
|
+
- **Storage**: SSD recommended, ~2-3x raw data size free space
|
|
263
|
+
|
|
264
|
+
### Optimization Tips
|
|
265
|
+
|
|
266
|
+
**For Small Studies (< 50 samples)**
|
|
267
|
+
- Use `num_cores = 4-6`
|
|
268
|
+
- Set `batch_size = 4-8`
|
|
269
|
+
- Use `merge_method = "kd"`
|
|
270
|
+
- Enable all export formats
|
|
271
|
+
|
|
272
|
+
**For Large Studies (100+ samples)**
|
|
273
|
+
- Use `num_cores = 8-16`
|
|
274
|
+
- Set `batch_size = 16-32`
|
|
275
|
+
- Use `merge_method = "chunked"`
|
|
276
|
+
- Enable `adaptive_compression = True`
|
|
277
|
+
- Consider processing in polarity-specific batches
|
|
278
|
+
|
|
279
|
+
**For Very Large Studies (500+ samples)**
|
|
280
|
+
- Process positive/negative modes separately
|
|
281
|
+
- Use `memory_limit_gb = 64+`
|
|
282
|
+
- Set `checkpoint_interval = 50`
|
|
283
|
+
- Enable `cleanup_temp_files = True`
|
|
284
|
+
- Consider cluster/cloud processing
|
|
285
|
+
|
|
286
|
+
## Integration Examples
|
|
287
|
+
|
|
288
|
+
### With Existing Workflows
|
|
289
|
+
|
|
290
|
+
```python
|
|
291
|
+
# Integration with custom preprocessing
|
|
292
|
+
wizard = Wizard(data_source="./preprocessed", ...)
|
|
293
|
+
|
|
294
|
+
# Skip conversion if already done
|
|
295
|
+
if not any(wizard.study_folder_path.glob("*.sample5")):
|
|
296
|
+
wizard.convert_to_sample5()
|
|
297
|
+
|
|
298
|
+
# Continue with study-level processing
|
|
299
|
+
wizard.assemble_study()
|
|
300
|
+
wizard.align_and_merge()
|
|
301
|
+
wizard.export_results()
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
### Batch Processing Multiple Studies
|
|
305
|
+
|
|
306
|
+
```python
|
|
307
|
+
studies = [
|
|
308
|
+
{"source": "./batch1", "output": "./results/batch1", "polarity": "positive"},
|
|
309
|
+
{"source": "./batch2", "output": "./results/batch2", "polarity": "negative"},
|
|
310
|
+
]
|
|
311
|
+
|
|
312
|
+
for study_config in studies:
|
|
313
|
+
wizard = Wizard(data_source=study_config["source"], study_folder=study_config["output"], polarity=study_config["polarity"], num_cores=8)
|
|
314
|
+
success = wizard.run_full_pipeline()
|
|
315
|
+
|
|
316
|
+
if success:
|
|
317
|
+
print(f"✅ {study_config['output']} completed")
|
|
318
|
+
else:
|
|
319
|
+
print(f"❌ {study_config['output']} failed")
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
### Custom Processing Steps
|
|
323
|
+
|
|
324
|
+
```python
|
|
325
|
+
wizard = Wizard(...)
|
|
326
|
+
|
|
327
|
+
# Standard conversion
|
|
328
|
+
wizard.convert_to_sample5()
|
|
329
|
+
|
|
330
|
+
# Custom study assembly with specific parameters
|
|
331
|
+
wizard.assemble_study()
|
|
332
|
+
|
|
333
|
+
# Custom filtering before alignment
|
|
334
|
+
if hasattr(wizard.study, 'features_filter'):
|
|
335
|
+
selection = wizard.study.features_select(
|
|
336
|
+
chrom_coherence=0.5, # Higher quality threshold
|
|
337
|
+
chrom_prominence_scaled=2.0
|
|
338
|
+
)
|
|
339
|
+
wizard.study.features_filter(selection)
|
|
340
|
+
|
|
341
|
+
# Continue with standard pipeline
|
|
342
|
+
wizard.align_and_merge()
|
|
343
|
+
wizard.generate_plots()
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
## Output Files
|
|
347
|
+
|
|
348
|
+
The Wizard generates several types of output files:
|
|
349
|
+
|
|
350
|
+
### Primary Results
|
|
351
|
+
- `final_study.study5` - Complete study in masster native format
|
|
352
|
+
- `consensus_features.csv` - Feature table with RT, m/z, intensity data
|
|
353
|
+
- `study_results.xlsx` - Multi-sheet Excel workbook with results and metadata
|
|
354
|
+
- `consensus_ms2.mgf` - MS2 spectra for database searching
|
|
355
|
+
|
|
356
|
+
### Visualizations
|
|
357
|
+
- `alignment_plot.html` - Interactive alignment visualization
|
|
358
|
+
- `consensus_2d.html` - 2D feature map of consensus features
|
|
359
|
+
- `pca_plot.html` - Principal component analysis plot
|
|
360
|
+
- `consensus_stats.html` - Study statistics and quality metrics
|
|
361
|
+
|
|
362
|
+
### Processing Logs
|
|
363
|
+
- `wizard.log` - Detailed processing log with debug information
|
|
364
|
+
- `processing.log` - Simple progress log with timestamps
|
|
365
|
+
- `study_metadata.txt` - Study summary with parameters and statistics
|
|
366
|
+
|
|
367
|
+
### Individual Sample Outputs (if enabled)
|
|
368
|
+
- `sample_name.sample5` - Processed sample in masster format
|
|
369
|
+
- `sample_name.features.csv` - Individual sample feature table
|
|
370
|
+
- `sample_name.mgf` - Individual sample MS2 spectra
|
|
371
|
+
- `sample_name_2d.html` - Individual sample 2D plot
|
|
372
|
+
|
|
373
|
+
The Wizard provides a complete, automated solution for mass spectrometry data processing while maintaining flexibility for custom workflows and providing robust error handling and recovery capabilities.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Wizard module for automated processing of mass spectrometry studies.
|
|
3
|
+
|
|
4
|
+
This module provides the Wizard class for fully automated processing of MS data
|
|
5
|
+
from raw files to final study results, including batch conversion, assembly,
|
|
6
|
+
alignment, merging, plotting, and export.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .wizard import Wizard, wizard_def
|
|
10
|
+
|
|
11
|
+
__all__ = ["Wizard", "wizard_def"]
|