masster 0.4.13__py3-none-any.whl → 0.4.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/__init__.py +2 -0
- masster/_version.py +1 -1
- masster/sample/sample.py +41 -0
- masster/study/__init__.py +1 -0
- masster/study/defaults/find_consensus_def.py +1 -1
- masster/study/defaults/merge_def.py +69 -25
- masster/study/h5.py +65 -106
- masster/study/id.py +1 -1
- masster/study/load.py +11 -6
- masster/study/merge.py +1607 -0
- masster/study/processing.py +0 -874
- masster/study/save.py +1 -1
- masster/study/study.py +79 -21
- masster/wizard/README.md +373 -0
- masster/wizard/__init__.py +11 -0
- masster/wizard/example.py +223 -0
- masster/wizard/test_structure.py +49 -0
- masster/wizard/test_wizard.py +285 -0
- masster/wizard/wizard.py +1175 -0
- masster/wizard.py +1175 -0
- {masster-0.4.13.dist-info → masster-0.4.16.dist-info}/METADATA +1 -1
- {masster-0.4.13.dist-info → masster-0.4.16.dist-info}/RECORD +25 -17
- {masster-0.4.13.dist-info → masster-0.4.16.dist-info}/WHEEL +0 -0
- {masster-0.4.13.dist-info → masster-0.4.16.dist-info}/entry_points.txt +0 -0
- {masster-0.4.13.dist-info → masster-0.4.16.dist-info}/licenses/LICENSE +0 -0
masster/wizard.py
ADDED
@@ -0,0 +1,1175 @@

"""
Wizard module for automated processing of mass spectrometry studies.

This module provides the Wizard class for fully automated processing of MS data
from raw files to final study results, including batch conversion, assembly,
alignment, merging, plotting, and export.

Key Features:
- Automated discovery and batch conversion of raw data files
- Intelligent resume capability for interrupted processes
- Parallel processing optimization for large datasets
- Adaptive study format based on study size
- Comprehensive logging and progress tracking
- Optimized memory management for large studies

Classes:
- Wizard: Main class for automated study processing
- wizard_def: Default parameters configuration class

Example Usage:
    ```python
    from masster import Wizard, wizard_def

    # Create wizard with default parameters
    wizard = Wizard(
        data_source="./raw_data",
        study_folder="./processed_study",
        polarity="positive",
        num_cores=4
    )

    # Run complete processing pipeline
    wizard.run_full_pipeline()

    # Or run individual steps
    wizard.convert_to_sample5()
    wizard.assemble_study()
    wizard.align_and_merge()
    wizard.generate_plots()
    wizard.export_results()
    ```
"""

from __future__ import annotations

import os
import time
import multiprocessing
from pathlib import Path
from typing import Optional, Any, Dict, List
from dataclasses import dataclass, field
import concurrent.futures
from datetime import datetime

# Import masster modules - use delayed import to avoid circular dependencies
from masster.logger import MassterLogger
from masster.study.defaults.study_def import study_defaults
from masster.study.defaults.align_def import align_defaults
from masster.study.defaults.merge_def import merge_defaults


@dataclass
class wizard_def:
    """
    Default parameters for the Wizard automated processing system.

    This class provides comprehensive configuration for all stages of automated
    mass spectrometry data processing from raw files to final results.

    Attributes:
        # Core Configuration
        data_source (str): Path to directory containing raw data files
        study_folder (str): Output directory for processed study
        polarity (str): Ion polarity mode ("positive" or "negative")
        num_cores (int): Number of CPU cores to use for parallel processing

        # File Discovery
        file_extensions (List[str]): File extensions to search for
        search_subfolders (bool): Whether to search subdirectories
        skip_patterns (List[str]): Filename patterns to skip

        # Processing Parameters
        adducts (List[str]): Adduct specifications for given polarity
        batch_size (int): Number of files to process per batch
        memory_limit_gb (float): Memory limit for processing (GB)

        # Resume & Recovery
        resume_enabled (bool): Enable automatic resume capability
        force_reprocess (bool): Force reprocessing of existing files
        backup_enabled (bool): Create backups of intermediate results

        # Output & Export
        generate_plots (bool): Generate visualization plots
        export_formats (List[str]): Output formats to generate
        compress_output (bool): Compress final study file

        # Logging
        log_level (str): Logging detail level
        log_to_file (bool): Save logs to file
        progress_interval (int): Progress update interval (seconds)
    """

    # === Core Configuration ===
    data_source: str = ""
    study_folder: str = ""
    polarity: str = "positive"
    num_cores: int = 4

    # === File Discovery ===
    file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML", ".d"])
    search_subfolders: bool = True
    skip_patterns: List[str] = field(default_factory=lambda: ["blank", "QC", "test"])

    # === Processing Parameters ===
    adducts: List[str] = field(default_factory=list)  # Will be set based on polarity
    batch_size: int = 8
    memory_limit_gb: float = 16.0
    max_file_size_gb: float = 4.0

    # === Resume & Recovery ===
    resume_enabled: bool = True
    force_reprocess: bool = False
    backup_enabled: bool = True
    checkpoint_interval: int = 10  # Save progress every N files

    # === Study Assembly ===
    min_samples_for_merge: int = 50
    rt_tolerance: float = 1.5
    mz_tolerance: float = 0.01
    alignment_algorithm: str = "kd"
    merge_method: str = "chunked"

    # === Feature Detection ===
    chrom_fwhm: float = 0.2
    noise_threshold: float = 1e5
    chrom_peak_snr: float = 5.0
    tol_ppm: float = 10.0

    # === Output & Export ===
    generate_plots: bool = True
    generate_interactive: bool = True
    export_formats: List[str] = field(default_factory=lambda: ["csv", "mgf", "xlsx"])
    compress_output: bool = True
    adaptive_compression: bool = True  # Adapt based on study size

    # === Logging ===
    log_level: str = "INFO"
    log_to_file: bool = True
    progress_interval: int = 30  # seconds
    verbose_progress: bool = True

    # === Advanced Options ===
    use_process_pool: bool = True  # vs ThreadPoolExecutor
    optimize_memory: bool = True
    cleanup_temp_files: bool = True
    validate_outputs: bool = True

    _param_metadata: dict[str, dict[str, Any]] = field(
        default_factory=lambda: {
            "data_source": {
                "dtype": str,
                "description": "Path to directory containing raw data files",
                "required": True,
            },
            "study_folder": {
                "dtype": str,
                "description": "Output directory for processed study",
                "required": True,
            },
            "polarity": {
                "dtype": str,
                "description": "Ion polarity mode",
                "default": "positive",
                "allowed_values": ["positive", "negative", "pos", "neg"],
            },
            "num_cores": {
                "dtype": int,
                "description": "Number of CPU cores to use",
                "default": 4,
                "min_value": 1,
                "max_value": multiprocessing.cpu_count(),
            },
            "batch_size": {
                "dtype": int,
                "description": "Number of files to process per batch",
                "default": 8,
                "min_value": 1,
                "max_value": 32,
            },
            "memory_limit_gb": {
                "dtype": float,
                "description": "Memory limit for processing (GB)",
                "default": 16.0,
                "min_value": 1.0,
                "max_value": 128.0,
            },
        },
        repr=False,
    )

    def __post_init__(self):
        """Set polarity-specific defaults after initialization."""
        # Set default adducts based on polarity if not provided
        if not self.adducts:
            if self.polarity.lower() in ["positive", "pos"]:
                self.adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]
            elif self.polarity.lower() in ["negative", "neg"]:
                self.adducts = ["H-1:-:1.0", "CH2O2:0:0.5"]
            else:
                # Default to positive
                self.adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]

        # Validate num_cores
        max_cores = multiprocessing.cpu_count()
        if self.num_cores <= 0:
            self.num_cores = max_cores
        elif self.num_cores > max_cores:
            self.num_cores = max_cores

        # Ensure paths are absolute
        if self.data_source:
            self.data_source = os.path.abspath(self.data_source)
        if self.study_folder:
            self.study_folder = os.path.abspath(self.study_folder)

class Wizard:
    """
    Automated processing wizard for mass spectrometry studies.

    The Wizard class provides end-to-end automation for processing collections
    of mass spectrometry files from raw data to final study results, including:

    1. Raw data discovery and batch conversion to sample5 format
    2. Study assembly with feature alignment and merging
    3. Automated plot generation and result export
    4. Intelligent resume capability for interrupted processes
    5. Adaptive optimization based on study size and system resources

    The wizard handles the complete workflow with minimal user intervention
    while providing comprehensive logging and progress tracking.
    """

    def __init__(
        self,
        data_source: str = "",
        study_folder: str = "",
        polarity: str = "positive",
        adducts: Optional[List[str]] = None,
        num_cores: int = 4,
        **kwargs
    ):
        """
        Initialize the Wizard for automated study processing.

        Parameters:
            data_source: Directory containing raw data files
            study_folder: Output directory for processed study
            polarity: Ion polarity mode ("positive" or "negative")
            adducts: List of adduct specifications (auto-set if None)
            num_cores: Number of CPU cores for parallel processing
            **kwargs: Additional parameters (see wizard_def for full list)
        """

        # Create parameters instance
        if "params" in kwargs and isinstance(kwargs["params"], wizard_def):
            self.params = kwargs.pop("params")
        else:
            # Create default parameters and update with provided values
            self.params = wizard_def(
                data_source=data_source,
                study_folder=study_folder,
                polarity=polarity,
                num_cores=num_cores
            )

        if adducts is not None:
            self.params.adducts = adducts

        # Update with any additional parameters
        for key, value in kwargs.items():
            if hasattr(self.params, key):
                setattr(self.params, key, value)

        # Validate required parameters
        if not self.params.data_source:
            raise ValueError("data_source is required")
        if not self.params.study_folder:
            raise ValueError("study_folder is required")

        # Create directories
        self.data_source_path = Path(self.params.data_source)
        self.study_folder_path = Path(self.params.study_folder)
        self.study_folder_path.mkdir(parents=True, exist_ok=True)

        # Setup logging
        self._setup_logging()

        # Initialize state tracking
        self.processed_files = []
        self.failed_files = []
        self.study = None
        self.start_time = None
        self.current_step = "initialized"

        # Create checkpoint file path
        self.checkpoint_file = self.study_folder_path / "wizard_checkpoint.json"

        self.logger.info(f"Wizard initialized for {self.polarity} mode")
        self.logger.info(f"Data source: {self.data_source_path}")
        self.logger.info(f"Study folder: {self.study_folder_path}")
        self.logger.info(f"Using {self.params.num_cores} CPU cores")

        # Load checkpoint if resuming
        if self.params.resume_enabled:
            self._load_checkpoint()

    @property
    def polarity(self) -> str:
        """Get the polarity setting."""
        return self.params.polarity

    @property
    def adducts(self) -> List[str]:
        """Get the adducts list."""
        return self.params.adducts

    def _setup_logging(self):
        """Setup comprehensive logging system."""
        # Create logger
        log_label = f"Wizard-{self.polarity}"

        if self.params.log_to_file:
            log_file = self.study_folder_path / "wizard.log"
            sink = str(log_file)
        else:
            sink = "sys.stdout"

        self.logger = MassterLogger(
            instance_type="wizard",
            level=self.params.log_level.upper(),
            label=log_label,
            sink=sink,
        )

        # Also create a simple file logger for critical info
        self.log_file = self.study_folder_path / "processing.log"

    def _log_progress(self, message: str, level: str = "INFO"):
        """Log progress message with timestamp."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        full_message = f"[{timestamp}] {message}"

        # Log to masster logger
        getattr(self.logger, level.lower())(message)

        # Also write to simple log file
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(f"{full_message}\n")

        if self.params.verbose_progress and level in ["INFO", "WARNING", "ERROR"]:
            print(full_message)

    def _save_checkpoint(self):
        """Save processing checkpoint for resume capability."""
        if not self.params.resume_enabled:
            return

        import json
        checkpoint_data = {
            "timestamp": datetime.now().isoformat(),
            "current_step": self.current_step,
            "processed_files": self.processed_files,
            "failed_files": self.failed_files,
            "params": {
                "data_source": self.params.data_source,
                "study_folder": self.params.study_folder,
                "polarity": self.params.polarity,
                "adducts": self.params.adducts,
                "num_cores": self.params.num_cores,
            }
        }

        try:
            with open(self.checkpoint_file, "w") as f:
                json.dump(checkpoint_data, f, indent=2)
            self.logger.debug(f"Checkpoint saved: {len(self.processed_files)} files processed")
        except Exception as e:
            self.logger.warning(f"Failed to save checkpoint: {e}")

    def _load_checkpoint(self):
        """Load processing checkpoint for resume capability."""
        if not self.checkpoint_file.exists():
            return

        import json
        try:
            with open(self.checkpoint_file, "r") as f:
                checkpoint_data = json.load(f)

            self.processed_files = checkpoint_data.get("processed_files", [])
            self.failed_files = checkpoint_data.get("failed_files", [])
            self.current_step = checkpoint_data.get("current_step", "initialized")

            self.logger.info(f"Resuming from checkpoint: {len(self.processed_files)} files already processed")
            self.logger.info(f"Previous step: {self.current_step}")

        except Exception as e:
            self.logger.warning(f"Failed to load checkpoint: {e}")
            self.processed_files = []
            self.failed_files = []

    def discover_files(self) -> List[Path]:
        """
        Discover raw data files in the source directory.

        Returns:
            List of file paths found for processing
        """
        self._log_progress("Discovering raw data files...")
        self.current_step = "discovering_files"

        found_files = []

        for extension in self.params.file_extensions:
            if self.params.search_subfolders:
                pattern = f"**/*{extension}"
                files = list(self.data_source_path.rglob(pattern))
            else:
                pattern = f"*{extension}"
                files = list(self.data_source_path.glob(pattern))

            # Filter out files matching skip patterns
            filtered_files = []
            for file_path in files:
                skip_file = False
                for pattern in self.params.skip_patterns:
                    if pattern.lower() in file_path.name.lower():
                        skip_file = True
                        self.logger.debug(f"Skipping file (matches pattern '{pattern}'): {file_path.name}")
                        break

                if not skip_file:
                    # Check file size
                    try:
                        file_size_gb = file_path.stat().st_size / (1024**3)
                        if file_size_gb > self.params.max_file_size_gb:
                            self.logger.warning(f"Large file ({file_size_gb:.1f}GB): {file_path.name}")
                        filtered_files.append(file_path)
                    except Exception as e:
                        self.logger.warning(f"Could not check file size for {file_path}: {e}")
                        filtered_files.append(file_path)

            found_files.extend(filtered_files)
            self.logger.info(f"Found {len(filtered_files)} {extension} files")

        # Remove duplicates and sort
        found_files = sorted(list(set(found_files)))

        self._log_progress(f"Total files discovered: {len(found_files)}")

        return found_files

    def _process_single_file(self, file_path: Path, reset: bool = False) -> Optional[str]:
        """
        Process a single file to sample5 format.

        This method replicates the core processing from parallel_sample_processing.py
        but with wizard-specific configuration and error handling.

        Parameters:
            file_path: Path to the raw data file
            reset: Force reprocessing even if output exists

        Returns:
            Base filename of output on success, None on failure
        """
        import gc

        # Generate output filename
        file_out = file_path.stem + '.sample5'
        output_file = self.study_folder_path / file_out

        # Initialize masster Sample with delayed import
        import masster
        sample = masster.Sample(
            log_label=file_path.name,
            log_level='ERROR'  # Reduce logging overhead in parallel processing
        )

        # Check if file should be skipped
        skip = False
        if not reset and not self.params.force_reprocess and output_file.exists():
            try:
                # Attempt to load existing processed file to verify it's valid
                sample.load(str(output_file))
                skip = True
            except Exception:
                # If loading fails, file needs to be reprocessed
                skip = False

        if skip:
            self.logger.debug(f"Skipping {file_path.name} (already processed)")
            return output_file.stem

        self.logger.info(f"Processing {file_path.name}")

        try:
            # STEP 1: Load raw data
            sample.load(str(file_path))

            # STEP 2: Feature detection - First pass (strict parameters)
            sample.find_features(
                chrom_fwhm=self.params.chrom_fwhm,
                noise=self.params.noise_threshold,
                tol_ppm=self.params.tol_ppm,
                chrom_peak_snr=self.params.chrom_peak_snr,
                min_trace_length_multiplier=0.5,
                chrom_fwhm_min=self.params.chrom_fwhm
            )

            # STEP 3: Feature detection - Second pass (relaxed parameters)
            sample.find_features(
                chrom_peak_snr=self.params.chrom_peak_snr,
                noise=self.params.noise_threshold / 10,  # Lower noise threshold
                chrom_fwhm=2.0  # Wider peaks
            )

            # STEP 4: Adduct detection
            sample.find_adducts(adducts=self.adducts)

            # STEP 5: MS2 spectrum identification
            sample.find_ms2()

            # STEP 6: Save processed data
            sample.save(filename=str(output_file))

            # STEP 7: Generate additional outputs if requested
            if "csv" in self.params.export_formats:
                csv_file = output_file.with_suffix('.features.csv')
                sample.export_features(filename=str(csv_file))

            if "mgf" in self.params.export_formats:
                mgf_file = output_file.with_suffix('.mgf')
                sample.export_mgf(filename=str(mgf_file), use_cache=False)

            if self.params.generate_plots:
                plot_file = output_file.with_suffix('_2d.html')
                sample.plot_2d(filename=str(plot_file), markersize=4)

            # Memory cleanup
            result = output_file.stem
            del sample
            gc.collect()

            return result

        except Exception as e:
            self.logger.error(f"Error processing {file_path.name}: {e}")
            # Cleanup on error
            gc.collect()
            return None

    def _process_batch(self, file_batch: List[Path]) -> List[str]:
        """Process a batch of files in a single worker."""
        results = []
        for file_path in file_batch:
            result = self._process_single_file(file_path)
            if result:
                results.append(result)
            else:
                results.append(None)
        return results

    def convert_to_sample5(self, file_list: Optional[List[Path]] = None) -> bool:
        """
        Convert raw data files to sample5 format in parallel.

        Parameters:
            file_list: List of files to process (None to discover automatically)

        Returns:
            True if conversion completed successfully
        """
        self._log_progress("=== Starting Sample5 Conversion ===")
        self.current_step = "converting_to_sample5"

        if file_list is None:
            file_list = self.discover_files()

        if not file_list:
            self.logger.warning("No files found for conversion")
            return False

        # Filter out already processed files if resuming
        if self.params.resume_enabled and self.processed_files:
            remaining_files = []
            for file_path in file_list:
                if str(file_path) not in self.processed_files:
                    remaining_files.append(file_path)
            file_list = remaining_files

        if not file_list:
            self._log_progress("All files already processed")
            return True

        self._log_progress(f"Converting {len(file_list)} files to sample5 format")

        conversion_start = time.time()
        successful_count = 0
        failed_count = 0

        if self.params.use_process_pool:
            # ProcessPoolExecutor approach - better for CPU-intensive work
            if len(file_list) <= self.params.batch_size:
                # Few files: process individually
                self.logger.info(f"Processing {len(file_list)} files individually with {self.params.num_cores} workers")

                with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
                    futures = [
                        executor.submit(self._process_single_file, file_path)
                        for file_path in file_list
                    ]

                    for i, future in enumerate(concurrent.futures.as_completed(futures)):
                        result = future.result()
                        if result:
                            successful_count += 1
                            self.processed_files.append(str(file_list[i]))
                        else:
                            failed_count += 1
                            self.failed_files.append(str(file_list[i]))

                        # Progress update and checkpoint
                        if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
                            progress = (successful_count + failed_count) / len(file_list) * 100
                            self._log_progress(f"Progress: {progress:.1f}% ({successful_count} successful, {failed_count} failed)")
                            self._save_checkpoint()

            else:
                # Many files: process in batches
                batches = [
                    file_list[i:i + self.params.batch_size]
                    for i in range(0, len(file_list), self.params.batch_size)
                ]

                self.logger.info(f"Processing {len(file_list)} files in {len(batches)} batches")

                with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
                    futures = [executor.submit(self._process_batch, batch) for batch in batches]

                    for batch_idx, future in enumerate(concurrent.futures.as_completed(futures)):
                        batch_results = future.result()
                        batch = batches[batch_idx]

                        for i, result in enumerate(batch_results):
                            if result:
                                successful_count += 1
                                self.processed_files.append(str(batch[i]))
                            else:
                                failed_count += 1
                                self.failed_files.append(str(batch[i]))

                        # Progress update
                        progress = (successful_count + failed_count) / len(file_list) * 100
                        self._log_progress(f"Batch {batch_idx + 1}/{len(batches)} complete. Progress: {progress:.1f}%")
                        self._save_checkpoint()

        else:
            # ThreadPoolExecutor approach
            self.logger.info(f"Processing {len(file_list)} files with {self.params.num_cores} threads")

            with concurrent.futures.ThreadPoolExecutor(max_workers=self.params.num_cores) as executor:
                futures = [
                    executor.submit(self._process_single_file, file_path)
                    for file_path in file_list
                ]

                for i, future in enumerate(concurrent.futures.as_completed(futures)):
                    result = future.result()
                    if result:
                        successful_count += 1
                        self.processed_files.append(str(file_list[i]))
                    else:
                        failed_count += 1
                        self.failed_files.append(str(file_list[i]))

                    if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
                        progress = (successful_count + failed_count) / len(file_list) * 100
                        self._log_progress(f"Progress: {progress:.1f}%")
                        self._save_checkpoint()

        conversion_time = time.time() - conversion_start

        self._log_progress("=== Sample5 Conversion Complete ===")
        self._log_progress(f"Successful: {successful_count}")
        self._log_progress(f"Failed: {failed_count}")
        self._log_progress(f"Total time: {conversion_time:.1f} seconds")

        if failed_count > 0:
            self.logger.warning(f"{failed_count} files failed to process")
            for failed_file in self.failed_files[-failed_count:]:
                self.logger.warning(f"Failed: {failed_file}")

        self._save_checkpoint()
        return successful_count > 0

    def assemble_study(self) -> bool:
        """
        Assemble processed sample5 files into a study.

        Returns:
            True if study assembly was successful
        """
        self._log_progress("=== Starting Study Assembly ===")
        self.current_step = "assembling_study"

        # Find all sample5 files
        sample5_files = list(self.study_folder_path.glob("*.sample5"))

        if not sample5_files:
            self.logger.error("No sample5 files found for study assembly")
            return False

        self._log_progress(f"Assembling study from {len(sample5_files)} sample5 files")

        try:
            # Create study with optimized settings
            import masster
            study_params = study_defaults(
                folder=str(self.study_folder_path),
                polarity=self.polarity,
                log_level="INFO",
                log_label=f"Study-{self.polarity}",
                adducts=self.adducts
            )

            self.study = masster.Study(params=study_params)

            # Add all sample5 files
            sample5_pattern = str(self.study_folder_path / "*.sample5")
            self.study.add(sample5_pattern)

            self._log_progress(f"Added {len(self.study.samples_df)} samples to study")

            # Filter features based on quality criteria
            if hasattr(self.study, 'features_filter'):
                initial_features = len(self.study.features_df) if hasattr(self.study, 'features_df') else 0

                # Apply feature filtering
                feature_selection = self.study.features_select(
                    chrom_coherence=0.3,
                    chrom_prominence_scaled=1
                )
                self.study.features_filter(feature_selection)

                final_features = len(self.study.features_df) if hasattr(self.study, 'features_df') else 0
                self._log_progress(f"Feature filtering: {initial_features} -> {final_features} features")

            self._save_checkpoint()
            return True

        except Exception as e:
            self.logger.error(f"Failed to assemble study: {e}")
            return False

    def align_and_merge(self) -> bool:
        """
        Perform feature alignment and merging.

        Returns:
            True if alignment and merging were successful
        """
        self._log_progress("=== Starting Feature Alignment and Merging ===")
        self.current_step = "aligning_and_merging"

        if self.study is None:
            self.logger.error("Study not assembled. Run assemble_study() first.")
            return False

        try:
            # Align features across samples
            align_params = align_defaults(
                rt_tol=self.params.rt_tolerance,
                mz_tol=self.params.mz_tolerance,
                algorithm=self.params.alignment_algorithm
            )

            self.logger.info(f"Aligning features with RT tolerance {self.params.rt_tolerance}s, m/z tolerance {self.params.mz_tolerance} Da")
            self.study.align(params=align_params)

            # Merge aligned features
            merge_params = merge_defaults(
                method=self.params.merge_method,
                rt_tol=self.params.rt_tolerance,
                mz_tol=self.params.mz_tolerance,
                min_samples=self.params.min_samples_for_merge
            )

            self.logger.info(f"Merging features using {self.params.merge_method} method")
            self.study.merge(params=merge_params)

            # Log results
            num_consensus = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0
            self._log_progress(f"Generated {num_consensus} consensus features")

            # Get study info
            if hasattr(self.study, 'info'):
                self.study.info()

            self._save_checkpoint()
            return True

        except Exception as e:
            self.logger.error(f"Failed to align and merge: {e}")
            return False

    def generate_plots(self) -> bool:
        """
        Generate visualization plots for the study.

        Returns:
            True if plot generation was successful
        """
        if not self.params.generate_plots:
            self._log_progress("Plot generation disabled, skipping...")
            return True

        self._log_progress("=== Generating Visualization Plots ===")
        self.current_step = "generating_plots"

        if self.study is None:
            self.logger.error("Study not available. Complete previous steps first.")
            return False

        try:
            plots_generated = 0

            # Alignment plot
            if hasattr(self.study, 'plot_alignment'):
                alignment_plot = self.study_folder_path / "alignment_plot.html"
                self.study.plot_alignment(filename=str(alignment_plot))
                plots_generated += 1
                self.logger.info(f"Generated alignment plot: {alignment_plot}")

            # Consensus 2D plot
            if hasattr(self.study, 'plot_consensus_2d'):
                consensus_2d_plot = self.study_folder_path / "consensus_2d.html"
                self.study.plot_consensus_2d(filename=str(consensus_2d_plot))
                plots_generated += 1
                self.logger.info(f"Generated consensus 2D plot: {consensus_2d_plot}")

            # PCA plot
            if hasattr(self.study, 'plot_pca'):
                pca_plot = self.study_folder_path / "pca_plot.html"
                self.study.plot_pca(filename=str(pca_plot))
                plots_generated += 1
                self.logger.info(f"Generated PCA plot: {pca_plot}")

            # Consensus statistics
            if hasattr(self.study, 'plot_consensus_stats'):
                stats_plot = self.study_folder_path / "consensus_stats.html"
                self.study.plot_consensus_stats(filename=str(stats_plot))
                plots_generated += 1
                self.logger.info(f"Generated statistics plot: {stats_plot}")

            self._log_progress(f"Generated {plots_generated} visualization plots")
            self._save_checkpoint()
            return True

        except Exception as e:
            self.logger.error(f"Failed to generate plots: {e}")
            return False

    def export_results(self) -> bool:
        """
        Export study results in requested formats.

        Returns:
            True if export was successful
        """
        self._log_progress("=== Exporting Study Results ===")
        self.current_step = "exporting_results"

        if self.study is None:
            self.logger.error("Study not available. Complete previous steps first.")
            return False

        try:
            exports_completed = 0

            # Export consensus features as CSV
            if "csv" in self.params.export_formats:
                csv_file = self.study_folder_path / "consensus_features.csv"
                if hasattr(self.study.consensus_df, 'write_csv'):
                    self.study.consensus_df.write_csv(str(csv_file))
                    exports_completed += 1
                    self.logger.info(f"Exported CSV: {csv_file}")

            # Export as Excel
            if "xlsx" in self.params.export_formats and hasattr(self.study, 'export_xlsx'):
                xlsx_file = self.study_folder_path / "study_results.xlsx"
                self.study.export_xlsx(filename=str(xlsx_file))
                exports_completed += 1
                self.logger.info(f"Exported Excel: {xlsx_file}")

            # Export MGF for MS2 spectra
            if "mgf" in self.params.export_formats and hasattr(self.study, 'export_mgf'):
                mgf_file = self.study_folder_path / "consensus_ms2.mgf"
                self.study.export_mgf(filename=str(mgf_file))
                exports_completed += 1
                self.logger.info(f"Exported MGF: {mgf_file}")

            # Export as Parquet for efficient storage
            if "parquet" in self.params.export_formats and hasattr(self.study, 'export_parquet'):
                parquet_file = self.study_folder_path / "study_data.parquet"
                self.study.export_parquet(filename=str(parquet_file))
                exports_completed += 1
                self.logger.info(f"Exported Parquet: {parquet_file}")

            self._log_progress(f"Completed {exports_completed} exports")
            self._save_checkpoint()
            return True

        except Exception as e:
            self.logger.error(f"Failed to export results: {e}")
            return False

    def save_study(self) -> bool:
        """
        Save the final study in optimized format.

        Returns:
            True if study was saved successfully
        """
        self._log_progress("=== Saving Final Study ===")
        self.current_step = "saving_study"

        if self.study is None:
            self.logger.error("Study not available. Complete previous steps first.")
            return False

        try:
            study_file = self.study_folder_path / "final_study.study5"

            # Determine optimal save format based on study size
            num_samples = len(self.study.samples_df)
            num_features = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0

            if self.params.adaptive_compression:
                # Use compressed format for large studies
                if num_samples > 50 or num_features > 10000:
                    self.logger.info(f"Large study detected ({num_samples} samples, {num_features} features) - using compressed format")
                    self.params.compress_output = True
                else:
                    self.logger.info(f"Small study ({num_samples} samples, {num_features} features) - using standard format")
                    self.params.compress_output = False

            # Save study
            if self.params.compress_output and hasattr(self.study, 'save_compressed'):
                self.study.save_compressed(filename=str(study_file))
                self.logger.info(f"Saved compressed study: {study_file}")
            else:
                self.study.save(filename=str(study_file))
                self.logger.info(f"Saved study: {study_file}")

            # Save metadata summary
            metadata_file = self.study_folder_path / "study_metadata.txt"
            with open(metadata_file, "w") as f:
                f.write("Study Processing Summary\n")
                f.write("========================\n")
                f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"Polarity: {self.polarity}\n")
                f.write(f"Adducts: {', '.join(self.adducts)}\n")
                f.write(f"Number of Samples: {num_samples}\n")
                f.write(f"Number of Consensus Features: {num_features}\n")
                f.write(f"Successful Files: {len(self.processed_files)}\n")
                f.write(f"Failed Files: {len(self.failed_files)}\n")
                f.write(f"RT Tolerance: {self.params.rt_tolerance}s\n")
                f.write(f"m/z Tolerance: {self.params.mz_tolerance} Da\n")
                f.write(f"Merge Method: {self.params.merge_method}\n")
                f.write(f"Processing Time: {self._get_total_processing_time()}\n")

            self._log_progress(f"Saved study metadata: {metadata_file}")
            self._save_checkpoint()
            return True

        except Exception as e:
            self.logger.error(f"Failed to save study: {e}")
            return False

    def cleanup_temp_files(self) -> bool:
        """
        Clean up temporary files if requested.

        Returns:
            True if cleanup was successful
        """
        if not self.params.cleanup_temp_files:
            return True

        self._log_progress("=== Cleaning Up Temporary Files ===")

        try:
            cleaned_count = 0

            # Remove individual sample plots if study plots were generated
            if self.params.generate_plots:
                temp_plots = list(self.study_folder_path.glob("*_2d.html"))
                for plot_file in temp_plots:
                    if plot_file.name not in ["alignment_plot.html", "consensus_2d.html", "pca_plot.html"]:
                        plot_file.unlink()
                        cleaned_count += 1

            # Remove checkpoint file
            if self.checkpoint_file.exists():
                self.checkpoint_file.unlink()
                cleaned_count += 1

            self._log_progress(f"Cleaned up {cleaned_count} temporary files")
            return True

        except Exception as e:
            self.logger.error(f"Failed to cleanup temp files: {e}")
            return False

    def run_full_pipeline(self) -> bool:
        """
        Run the complete automated processing pipeline.

        This method executes all processing steps in sequence:
        1. Convert raw files to sample5 format
        2. Assemble study from sample5 files
        3. Align and merge features
        4. Generate visualization plots
        5. Export results in requested formats
        6. Save final study
        7. Clean up temporary files

        Returns:
            True if the entire pipeline completed successfully
        """
        self._log_progress("=" * 60)
        self._log_progress("STARTING AUTOMATED STUDY PROCESSING PIPELINE")
        self._log_progress("=" * 60)

        self.start_time = time.time()
        pipeline_success = True

        try:
            # Step 1: Convert to sample5
            if not self.convert_to_sample5():
                self.logger.error("Sample5 conversion failed")
                return False

            # Step 2: Assemble study
            if not self.assemble_study():
                self.logger.error("Study assembly failed")
                return False

            # Step 3: Align and merge
            if not self.align_and_merge():
                self.logger.error("Feature alignment and merging failed")
                return False

            # Step 4: Generate plots
            if not self.generate_plots():
                self.logger.warning("Plot generation failed, continuing...")
                pipeline_success = False

            # Step 5: Export results
            if not self.export_results():
                self.logger.warning("Result export failed, continuing...")
                pipeline_success = False

            # Step 6: Save study
            if not self.save_study():
                self.logger.error("Study saving failed")
                return False

            # Step 7: Cleanup
            if not self.cleanup_temp_files():
                self.logger.warning("Cleanup failed, continuing...")

            # Final summary
            total_time = time.time() - self.start_time
            self._log_progress("=" * 60)
            self._log_progress("PIPELINE COMPLETED SUCCESSFULLY")
            self._log_progress(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
            self._log_progress(f"Files processed: {len(self.processed_files)}")
            self._log_progress(f"Files failed: {len(self.failed_files)}")
            if hasattr(self.study, 'consensus_df'):
                self._log_progress(f"Consensus features: {len(self.study.consensus_df)}")
            self._log_progress("=" * 60)

            return pipeline_success

        except KeyboardInterrupt:
            self.logger.info("Pipeline interrupted by user")
            self._save_checkpoint()
            return False
        except Exception as e:
            self.logger.error(f"Pipeline failed with unexpected error: {e}")
            self._save_checkpoint()
            return False

    def _get_total_processing_time(self) -> str:
        """Get formatted total processing time."""
        if self.start_time is None:
            return "Unknown"

        total_seconds = time.time() - self.start_time
        hours = int(total_seconds // 3600)
        minutes = int((total_seconds % 3600) // 60)
        seconds = int(total_seconds % 60)

        if hours > 0:
            return f"{hours}h {minutes}m {seconds}s"
        elif minutes > 0:
            return f"{minutes}m {seconds}s"
        else:
            return f"{seconds}s"

    def get_status(self) -> Dict[str, Any]:
        """
        Get current processing status.

        Returns:
            Dictionary with current status information
        """
        return {
            "current_step": self.current_step,
            "processed_files": len(self.processed_files),
            "failed_files": len(self.failed_files),
            "study_loaded": self.study is not None,
            "start_time": self.start_time.isoformat() if self.start_time else None,
            "processing_time": self._get_total_processing_time(),
            "parameters": {
                "data_source": self.params.data_source,
                "study_folder": self.params.study_folder,
                "polarity": self.params.polarity,
                "num_cores": self.params.num_cores,
                "adducts": self.params.adducts,
            }
        }

    def info(self):
        """Print comprehensive wizard status information."""
        status = self.get_status()

        print("\n" + "=" * 50)
        print("WIZARD STATUS")
        print("=" * 50)
        print(f"Current Step: {status['current_step']}")
        print(f"Data Source: {self.params.data_source}")
        print(f"Study Folder: {self.params.study_folder}")
        print(f"Polarity: {status['parameters']['polarity']}")
        print(f"CPU Cores: {status['parameters']['num_cores']}")
        print(f"Adducts: {', '.join(status['parameters']['adducts'])}")
        print(f"Processing Time: {status['processing_time']}")
        print(f"Files Processed: {status['processed_files']}")
        print(f"Files Failed: {status['failed_files']}")
        print(f"Study Loaded: {status['study_loaded']}")

        if self.study is not None and hasattr(self.study, 'samples_df'):
            print(f"Samples in Study: {len(self.study.samples_df)}")

        if self.study is not None and hasattr(self.study, 'consensus_df'):
            print(f"Consensus Features: {len(self.study.consensus_df)}")

        print("=" * 50)


# Export the main classes
__all__ = ["Wizard", "wizard_def"]
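For orientation, here is a minimal usage sketch assembled only from the module docstring and the signatures visible in the diff above; it is not taken from the package documentation. The folder paths and core count are placeholders, and the polarity-specific adduct defaults are filled in automatically by `wizard_def.__post_init__`.

```python
from masster import Wizard, wizard_def

# Optional explicit parameter object; negative-mode adducts are set automatically.
params = wizard_def(
    data_source="./raw_data",          # placeholder path
    study_folder="./processed_study",  # placeholder path
    polarity="negative",
    num_cores=8,
)

# Wizard accepts either individual keyword arguments or a wizard_def via params=.
wizard = Wizard(params=params)

# Run the whole pipeline in one call...
ok = wizard.run_full_pipeline()

# ...or inspect state and drive individual stages when something fails.
if not ok:
    wizard.info()                 # prints current step, file counts, timings
    status = wizard.get_status()  # same information as a dictionary
    print(status["current_step"], status["failed_files"])
```

With `resume_enabled` left at its default, a rerun of the same script picks up from the `wizard_checkpoint.json` written into the study folder rather than reprocessing files that already produced valid `.sample5` outputs.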