masster 0.4.19-py3-none-any.whl → 0.4.20-py3-none-any.whl

This diff compares the content of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.

Potentially problematic release: this version of masster might be problematic.

masster/wizard.py DELETED
@@ -1,1175 +0,0 @@
-"""
-Wizard module for automated processing of mass spectrometry studies.
-
-This module provides the Wizard class for fully automated processing of MS data
-from raw files to final study results, including batch conversion, assembly,
-alignment, merging, plotting, and export.
-
-Key Features:
-Automated discovery and batch conversion of raw data files
-Intelligent resume capability for interrupted processes
-Parallel processing optimization for large datasets
-Adaptive study format based on study size
-Comprehensive logging and progress tracking
-Optimized memory management for large studies
-
-Classes:
-Wizard: Main class for automated study processing
-wizard_def: Default parameters configuration class
-
-Example Usage:
-```python
-from masster import Wizard, wizard_def
-
-# Create wizard with default parameters
-wizard = Wizard(
-    data_source="./raw_data",
-    study_folder="./processed_study",
-    polarity="positive",
-    num_cores=4
-)
-
-# Run complete processing pipeline
-wizard.run_full_pipeline()
-
-# Or run individual steps
-wizard.convert_to_sample5()
-wizard.assemble_study()
-wizard.align_and_merge()
-wizard.generate_plots()
-wizard.export_results()
-```
-"""
-
-from __future__ import annotations
-
-import os
-import time
-import multiprocessing
-from pathlib import Path
-from typing import Optional, Any, Dict, List
-from dataclasses import dataclass, field
-import concurrent.futures
-from datetime import datetime
-
-# Import masster modules - use delayed import to avoid circular dependencies
-from masster.logger import MassterLogger
-from masster.study.defaults.study_def import study_defaults
-from masster.study.defaults.align_def import align_defaults
-from masster.study.defaults.merge_def import merge_defaults
-
-
-@dataclass
-class wizard_def:
-    """
-    Default parameters for the Wizard automated processing system.
-
-    This class provides comprehensive configuration for all stages of automated
-    mass spectrometry data processing from raw files to final results.
-
-    Attributes:
-        # Core Configuration
-        data_source (str): Path to directory containing raw data files
-        study_folder (str): Output directory for processed study
-        polarity (str): Ion polarity mode ("positive" or "negative")
-        num_cores (int): Number of CPU cores to use for parallel processing
-
-        # File Discovery
-        file_extensions (List[str]): File extensions to search for
-        search_subfolders (bool): Whether to search subdirectories
-        skip_patterns (List[str]): Filename patterns to skip
-
-        # Processing Parameters
-        adducts (List[str]): Adduct specifications for given polarity
-        batch_size (int): Number of files to process per batch
-        memory_limit_gb (float): Memory limit for processing (GB)
-
-        # Resume & Recovery
-        resume_enabled (bool): Enable automatic resume capability
-        force_reprocess (bool): Force reprocessing of existing files
-        backup_enabled (bool): Create backups of intermediate results
-
-        # Output & Export
-        generate_plots (bool): Generate visualization plots
-        export_formats (List[str]): Output formats to generate
-        compress_output (bool): Compress final study file
-
-        # Logging
-        log_level (str): Logging detail level
-        log_to_file (bool): Save logs to file
-        progress_interval (int): Progress update interval (seconds)
-    """
-
-    # === Core Configuration ===
-    data_source: str = ""
-    study_folder: str = ""
-    polarity: str = "positive"
-    num_cores: int = 4
-
-    # === File Discovery ===
-    file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML", ".d"])
-    search_subfolders: bool = True
-    skip_patterns: List[str] = field(default_factory=lambda: ["blank", "QC", "test"])
-
-    # === Processing Parameters ===
-    adducts: List[str] = field(default_factory=list) # Will be set based on polarity
-    batch_size: int = 8
-    memory_limit_gb: float = 16.0
-    max_file_size_gb: float = 4.0
-
-    # === Resume & Recovery ===
-    resume_enabled: bool = True
-    force_reprocess: bool = False
-    backup_enabled: bool = True
-    checkpoint_interval: int = 10 # Save progress every N files
-
-    # === Study Assembly ===
-    min_samples_for_merge: int = 50
-    rt_tolerance: float = 1.5
-    mz_tolerance: float = 0.01
-    alignment_algorithm: str = "kd"
-    merge_method: str = "chunked"
-
-    # === Feature Detection ===
-    chrom_fwhm: float = 0.2
-    noise_threshold: float = 1e5
-    chrom_peak_snr: float = 5.0
-    tol_ppm: float = 10.0
-
-    # === Output & Export ===
-    generate_plots: bool = True
-    generate_interactive: bool = True
-    export_formats: List[str] = field(default_factory=lambda: ["csv", "mgf", "xlsx"])
-    compress_output: bool = True
-    adaptive_compression: bool = True # Adapt based on study size
-
-    # === Logging ===
-    log_level: str = "INFO"
-    log_to_file: bool = True
-    progress_interval: int = 30 # seconds
-    verbose_progress: bool = True
-
-    # === Advanced Options ===
-    use_process_pool: bool = True # vs ThreadPoolExecutor
-    optimize_memory: bool = True
-    cleanup_temp_files: bool = True
-    validate_outputs: bool = True
-
-    _param_metadata: dict[str, dict[str, Any]] = field(
-        default_factory=lambda: {
-            "data_source": {
-                "dtype": str,
-                "description": "Path to directory containing raw data files",
-                "required": True,
-            },
-            "study_folder": {
-                "dtype": str,
-                "description": "Output directory for processed study",
-                "required": True,
-            },
-            "polarity": {
-                "dtype": str,
-                "description": "Ion polarity mode",
-                "default": "positive",
-                "allowed_values": ["positive", "negative", "pos", "neg"],
-            },
-            "num_cores": {
-                "dtype": int,
-                "description": "Number of CPU cores to use",
-                "default": 4,
-                "min_value": 1,
-                "max_value": multiprocessing.cpu_count(),
-            },
-            "batch_size": {
-                "dtype": int,
-                "description": "Number of files to process per batch",
-                "default": 8,
-                "min_value": 1,
-                "max_value": 32,
-            },
-            "memory_limit_gb": {
-                "dtype": float,
-                "description": "Memory limit for processing (GB)",
-                "default": 16.0,
-                "min_value": 1.0,
-                "max_value": 128.0,
-            },
-        },
-        repr=False,
-    )
-
-    def __post_init__(self):
-        """Set polarity-specific defaults after initialization."""
-        # Set default adducts based on polarity if not provided
-        if not self.adducts:
-            if self.polarity.lower() in ["positive", "pos"]:
-                self.adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]
-            elif self.polarity.lower() in ["negative", "neg"]:
-                self.adducts = ["H-1:-:1.0", "CH2O2:0:0.5"]
-            else:
-                # Default to positive
-                self.adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]
-
-        # Validate num_cores
-        max_cores = multiprocessing.cpu_count()
-        if self.num_cores <= 0:
-            self.num_cores = max_cores
-        elif self.num_cores > max_cores:
-            self.num_cores = max_cores
-
-        # Ensure paths are absolute
-        if self.data_source:
-            self.data_source = os.path.abspath(self.data_source)
-        if self.study_folder:
-            self.study_folder = os.path.abspath(self.study_folder)
-
-
-class Wizard:
-    """
-    Automated processing wizard for mass spectrometry studies.
-
-    The Wizard class provides end-to-end automation for processing collections
-    of mass spectrometry files from raw data to final study results, including:
-
-    1. Raw data discovery and batch conversion to sample5 format
-    2. Study assembly with feature alignment and merging
-    3. Automated plot generation and result export
-    4. Intelligent resume capability for interrupted processes
-    5. Adaptive optimization based on study size and system resources
-
-    The wizard handles the complete workflow with minimal user intervention
-    while providing comprehensive logging and progress tracking.
-    """
-
-    def __init__(
-        self,
-        data_source: str = "",
-        study_folder: str = "",
-        polarity: str = "positive",
-        adducts: Optional[List[str]] = None,
-        num_cores: int = 4,
-        **kwargs
-    ):
-        """
-        Initialize the Wizard for automated study processing.
-
-        Parameters:
-            data_source: Directory containing raw data files
-            study_folder: Output directory for processed study
-            polarity: Ion polarity mode ("positive" or "negative")
-            adducts: List of adduct specifications (auto-set if None)
-            num_cores: Number of CPU cores for parallel processing
-            **kwargs: Additional parameters (see wizard_def for full list)
-        """
-
-        # Create parameters instance
-        if "params" in kwargs and isinstance(kwargs["params"], wizard_def):
-            self.params = kwargs.pop("params")
-        else:
-            # Create default parameters and update with provided values
-            self.params = wizard_def(
-                data_source=data_source,
-                study_folder=study_folder,
-                polarity=polarity,
-                num_cores=num_cores
-            )
-
-        if adducts is not None:
-            self.params.adducts = adducts
-
-        # Update with any additional parameters
-        for key, value in kwargs.items():
-            if hasattr(self.params, key):
-                setattr(self.params, key, value)
-
-        # Validate required parameters
-        if not self.params.data_source:
-            raise ValueError("data_source is required")
-        if not self.params.study_folder:
-            raise ValueError("study_folder is required")
-
-        # Create directories
-        self.data_source_path = Path(self.params.data_source)
-        self.study_folder_path = Path(self.params.study_folder)
-        self.study_folder_path.mkdir(parents=True, exist_ok=True)
-
-        # Setup logging
-        self._setup_logging()
-
-        # Initialize state tracking
-        self.processed_files = []
-        self.failed_files = []
-        self.study = None
-        self.start_time = None
-        self.current_step = "initialized"
-
-        # Create checkpoint file path
-        self.checkpoint_file = self.study_folder_path / "wizard_checkpoint.json"
-
-        self.logger.info(f"Wizard initialized for {self.polarity} mode")
-        self.logger.info(f"Data source: {self.data_source_path}")
-        self.logger.info(f"Study folder: {self.study_folder_path}")
-        self.logger.info(f"Using {self.params.num_cores} CPU cores")
-
-        # Load checkpoint if resuming
-        if self.params.resume_enabled:
-            self._load_checkpoint()
-
-    @property
-    def polarity(self) -> str:
-        """Get the polarity setting."""
-        return self.params.polarity
-
-    @property
-    def adducts(self) -> List[str]:
-        """Get the adducts list."""
-        return self.params.adducts
-
-    def _setup_logging(self):
-        """Setup comprehensive logging system."""
-        # Create logger
-        log_label = f"Wizard-{self.polarity}"
-
-        if self.params.log_to_file:
-            log_file = self.study_folder_path / "wizard.log"
-            sink = str(log_file)
-        else:
-            sink = "sys.stdout"
-
-        self.logger = MassterLogger(
-            instance_type="wizard",
-            level=self.params.log_level.upper(),
-            label=log_label,
-            sink=sink,
-        )
-
-        # Also create a simple file logger for critical info
-        self.log_file = self.study_folder_path / "processing.log"
-
-    def _log_progress(self, message: str, level: str = "INFO"):
-        """Log progress message with timestamp."""
-        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        full_message = f"[{timestamp}] {message}"
-
-        # Log to masster logger
-        getattr(self.logger, level.lower())(message)
-
-        # Also write to simple log file
-        with open(self.log_file, "a", encoding="utf-8") as f:
-            f.write(f"{full_message}\n")
-
-        if self.params.verbose_progress and level in ["INFO", "WARNING", "ERROR"]:
-            print(full_message)
-
-    def _save_checkpoint(self):
-        """Save processing checkpoint for resume capability."""
-        if not self.params.resume_enabled:
-            return
-
-        import json
-        checkpoint_data = {
-            "timestamp": datetime.now().isoformat(),
-            "current_step": self.current_step,
-            "processed_files": self.processed_files,
-            "failed_files": self.failed_files,
-            "params": {
-                "data_source": self.params.data_source,
-                "study_folder": self.params.study_folder,
-                "polarity": self.params.polarity,
-                "adducts": self.params.adducts,
-                "num_cores": self.params.num_cores,
-            }
-        }
-
-        try:
-            with open(self.checkpoint_file, "w") as f:
-                json.dump(checkpoint_data, f, indent=2)
-            self.logger.debug(f"Checkpoint saved: {len(self.processed_files)} files processed")
-        except Exception as e:
-            self.logger.warning(f"Failed to save checkpoint: {e}")
-
-    def _load_checkpoint(self):
-        """Load processing checkpoint for resume capability."""
-        if not self.checkpoint_file.exists():
-            return
-
-        import json
-        try:
-            with open(self.checkpoint_file, "r") as f:
-                checkpoint_data = json.load(f)
-
-            self.processed_files = checkpoint_data.get("processed_files", [])
-            self.failed_files = checkpoint_data.get("failed_files", [])
-            self.current_step = checkpoint_data.get("current_step", "initialized")
-
-            self.logger.info(f"Resuming from checkpoint: {len(self.processed_files)} files already processed")
-            self.logger.info(f"Previous step: {self.current_step}")
-
-        except Exception as e:
-            self.logger.warning(f"Failed to load checkpoint: {e}")
-            self.processed_files = []
-            self.failed_files = []
-
-    def discover_files(self) -> List[Path]:
-        """
-        Discover raw data files in the source directory.
-
-        Returns:
-            List of file paths found for processing
-        """
-        self._log_progress("Discovering raw data files...")
-        self.current_step = "discovering_files"
-
-        found_files = []
-
-        for extension in self.params.file_extensions:
-            if self.params.search_subfolders:
-                pattern = f"**/*{extension}"
-                files = list(self.data_source_path.rglob(pattern))
-            else:
-                pattern = f"*{extension}"
-                files = list(self.data_source_path.glob(pattern))
-
-            # Filter out files matching skip patterns
-            filtered_files = []
-            for file_path in files:
-                skip_file = False
-                for pattern in self.params.skip_patterns:
-                    if pattern.lower() in file_path.name.lower():
-                        skip_file = True
-                        self.logger.debug(f"Skipping file (matches pattern '{pattern}'): {file_path.name}")
-                        break
-
-                if not skip_file:
-                    # Check file size
-                    try:
-                        file_size_gb = file_path.stat().st_size / (1024**3)
-                        if file_size_gb > self.params.max_file_size_gb:
-                            self.logger.warning(f"Large file ({file_size_gb:.1f}GB): {file_path.name}")
-                        filtered_files.append(file_path)
-                    except Exception as e:
-                        self.logger.warning(f"Could not check file size for {file_path}: {e}")
-                        filtered_files.append(file_path)
-
-            found_files.extend(filtered_files)
-            self.logger.info(f"Found {len(filtered_files)} {extension} files")
-
-        # Remove duplicates and sort
-        found_files = sorted(list(set(found_files)))
-
-        self._log_progress(f"Total files discovered: {len(found_files)}")
-
-        return found_files
-
-    def _process_single_file(self, file_path: Path, reset: bool = False) -> Optional[str]:
-        """
-        Process a single file to sample5 format.
-
-        This method replicates the core processing from parallel_sample_processing.py
-        but with wizard-specific configuration and error handling.
-
-        Parameters:
-            file_path: Path to the raw data file
-            reset: Force reprocessing even if output exists
-
-        Returns:
-            Base filename of output on success, None on failure
-        """
-        import gc
-
-        # Generate output filename
-        file_out = file_path.stem + '.sample5'
-        output_file = self.study_folder_path / file_out
-
-        # Initialize masster Sample with delayed import
-        import masster
-        sample = masster.Sample(
-            log_label=file_path.name,
-            log_level='ERROR' # Reduce logging overhead in parallel processing
-        )
-
-        # Check if file should be skipped
-        skip = False
-        if not reset and not self.params.force_reprocess and output_file.exists():
-            try:
-                # Attempt to load existing processed file to verify it's valid
-                sample.load(str(output_file))
-                skip = True
-            except Exception:
-                # If loading fails, file needs to be reprocessed
-                skip = False
-
-        if skip:
-            self.logger.debug(f"Skipping {file_path.name} (already processed)")
-            return output_file.stem
-
-        self.logger.info(f"Processing {file_path.name}")
-
-        try:
-            # STEP 1: Load raw data
-            sample.load(str(file_path))
-
-            # STEP 2: Feature detection - First pass (strict parameters)
-            sample.find_features(
-                chrom_fwhm=self.params.chrom_fwhm,
-                noise=self.params.noise_threshold,
-                tol_ppm=self.params.tol_ppm,
-                chrom_peak_snr=self.params.chrom_peak_snr,
-                min_trace_length_multiplier=0.5,
-                chrom_fwhm_min=self.params.chrom_fwhm
-            )
-
-            # STEP 3: Feature detection - Second pass (relaxed parameters)
-            sample.find_features(
-                chrom_peak_snr=self.params.chrom_peak_snr,
-                noise=self.params.noise_threshold / 10, # Lower noise threshold
-                chrom_fwhm=2.0 # Wider peaks
-            )
-
-            # STEP 4: Adduct detection
-            sample.find_adducts(adducts=self.adducts)
-
-            # STEP 5: MS2 spectrum identification
-            sample.find_ms2()
-
-            # STEP 6: Save processed data
-            sample.save(filename=str(output_file))
-
-            # STEP 7: Generate additional outputs if requested
-            if "csv" in self.params.export_formats:
-                csv_file = output_file.with_suffix('.features.csv')
-                sample.export_features(filename=str(csv_file))
-
-            if "mgf" in self.params.export_formats:
-                mgf_file = output_file.with_suffix('.mgf')
-                sample.export_mgf(filename=str(mgf_file), use_cache=False)
-
-            if self.params.generate_plots:
-                plot_file = output_file.with_suffix('_2d.html')
-                sample.plot_2d(filename=str(plot_file), markersize=4)
-
-            # Memory cleanup
-            result = output_file.stem
-            del sample
-            gc.collect()
-
-            return result
-
-        except Exception as e:
-            self.logger.error(f"Error processing {file_path.name}: {e}")
-            # Cleanup on error
-            gc.collect()
-            return None
-
-    def _process_batch(self, file_batch: List[Path]) -> List[str]:
-        """Process a batch of files in a single worker."""
-        results = []
-        for file_path in file_batch:
-            result = self._process_single_file(file_path)
-            if result:
-                results.append(result)
-            else:
-                results.append(None)
-        return results
-
-    def convert_to_sample5(self, file_list: Optional[List[Path]] = None) -> bool:
-        """
-        Convert raw data files to sample5 format in parallel.
-
-        Parameters:
-            file_list: List of files to process (None to discover automatically)
-
-        Returns:
-            True if conversion completed successfully
-        """
-        self._log_progress("=== Starting Sample5 Conversion ===")
-        self.current_step = "converting_to_sample5"
-
-        if file_list is None:
-            file_list = self.discover_files()
-
-        if not file_list:
-            self.logger.warning("No files found for conversion")
-            return False
-
-        # Filter out already processed files if resuming
-        if self.params.resume_enabled and self.processed_files:
-            remaining_files = []
-            for file_path in file_list:
-                if str(file_path) not in self.processed_files:
-                    remaining_files.append(file_path)
-            file_list = remaining_files
-
-            if not file_list:
-                self._log_progress("All files already processed")
-                return True
-
-        self._log_progress(f"Converting {len(file_list)} files to sample5 format")
-
-        conversion_start = time.time()
-        successful_count = 0
-        failed_count = 0
-
-        if self.params.use_process_pool:
-            # ProcessPoolExecutor approach - better for CPU-intensive work
-            if len(file_list) <= self.params.batch_size:
-                # Few files: process individually
-                self.logger.info(f"Processing {len(file_list)} files individually with {self.params.num_cores} workers")
-
-                with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
-                    futures = [
-                        executor.submit(self._process_single_file, file_path)
-                        for file_path in file_list
-                    ]
-
-                    for i, future in enumerate(concurrent.futures.as_completed(futures)):
-                        result = future.result()
-                        if result:
-                            successful_count += 1
-                            self.processed_files.append(str(file_list[i]))
-                        else:
-                            failed_count += 1
-                            self.failed_files.append(str(file_list[i]))
-
-                        # Progress update and checkpoint
-                        if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
-                            progress = (successful_count + failed_count) / len(file_list) * 100
-                            self._log_progress(f"Progress: {progress:.1f}% ({successful_count} successful, {failed_count} failed)")
-                            self._save_checkpoint()
-
-            else:
-                # Many files: process in batches
-                batches = [
-                    file_list[i:i + self.params.batch_size]
-                    for i in range(0, len(file_list), self.params.batch_size)
-                ]
-
-                self.logger.info(f"Processing {len(file_list)} files in {len(batches)} batches")
-
-                with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
-                    futures = [executor.submit(self._process_batch, batch) for batch in batches]
-
-                    for batch_idx, future in enumerate(concurrent.futures.as_completed(futures)):
-                        batch_results = future.result()
-                        batch = batches[batch_idx]
-
-                        for i, result in enumerate(batch_results):
-                            if result:
-                                successful_count += 1
-                                self.processed_files.append(str(batch[i]))
-                            else:
-                                failed_count += 1
-                                self.failed_files.append(str(batch[i]))
-
-                        # Progress update
-                        progress = (successful_count + failed_count) / len(file_list) * 100
-                        self._log_progress(f"Batch {batch_idx + 1}/{len(batches)} complete. Progress: {progress:.1f}%")
-                        self._save_checkpoint()
-
-        else:
-            # ThreadPoolExecutor approach
-            self.logger.info(f"Processing {len(file_list)} files with {self.params.num_cores} threads")
-
-            with concurrent.futures.ThreadPoolExecutor(max_workers=self.params.num_cores) as executor:
-                futures = [
-                    executor.submit(self._process_single_file, file_path)
-                    for file_path in file_list
-                ]
-
-                for i, future in enumerate(concurrent.futures.as_completed(futures)):
-                    result = future.result()
-                    if result:
-                        successful_count += 1
-                        self.processed_files.append(str(file_list[i]))
-                    else:
-                        failed_count += 1
-                        self.failed_files.append(str(file_list[i]))
-
-                    if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
-                        progress = (successful_count + failed_count) / len(file_list) * 100
-                        self._log_progress(f"Progress: {progress:.1f}%")
-                        self._save_checkpoint()
-
-        conversion_time = time.time() - conversion_start
-
-        self._log_progress("=== Sample5 Conversion Complete ===")
-        self._log_progress(f"Successful: {successful_count}")
-        self._log_progress(f"Failed: {failed_count}")
-        self._log_progress(f"Total time: {conversion_time:.1f} seconds")
-
-        if failed_count > 0:
-            self.logger.warning(f"{failed_count} files failed to process")
-            for failed_file in self.failed_files[-failed_count:]:
-                self.logger.warning(f"Failed: {failed_file}")
-
-        self._save_checkpoint()
-        return successful_count > 0
-
-    def assemble_study(self) -> bool:
-        """
-        Assemble processed sample5 files into a study.
-
-        Returns:
-            True if study assembly was successful
-        """
-        self._log_progress("=== Starting Study Assembly ===")
-        self.current_step = "assembling_study"
-
-        # Find all sample5 files
-        sample5_files = list(self.study_folder_path.glob("*.sample5"))
-
-        if not sample5_files:
-            self.logger.error("No sample5 files found for study assembly")
-            return False
-
-        self._log_progress(f"Assembling study from {len(sample5_files)} sample5 files")
-
-        try:
-            # Create study with optimized settings
-            import masster
-            study_params = study_defaults(
-                folder=str(self.study_folder_path),
-                polarity=self.polarity,
-                log_level="INFO",
-                log_label=f"Study-{self.polarity}",
-                adducts=self.adducts
-            )
-
-            self.study = masster.Study(params=study_params)
-
-            # Add all sample5 files
-            sample5_pattern = str(self.study_folder_path / "*.sample5")
-            self.study.add(sample5_pattern)
-
-            self._log_progress(f"Added {len(self.study.samples_df)} samples to study")
-
-            # Filter features based on quality criteria
-            if hasattr(self.study, 'features_filter'):
-                initial_features = len(self.study.features_df) if hasattr(self.study, 'features_df') else 0
-
-                # Apply feature filtering
-                feature_selection = self.study.features_select(
-                    chrom_coherence=0.3,
-                    chrom_prominence_scaled=1
-                )
-                self.study.features_filter(feature_selection)
-
-                final_features = len(self.study.features_df) if hasattr(self.study, 'features_df') else 0
-                self._log_progress(f"Feature filtering: {initial_features} -> {final_features} features")
-
-            self._save_checkpoint()
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to assemble study: {e}")
-            return False
-
-    def align_and_merge(self) -> bool:
-        """
-        Perform feature alignment and merging.
-
-        Returns:
-            True if alignment and merging were successful
-        """
-        self._log_progress("=== Starting Feature Alignment and Merging ===")
-        self.current_step = "aligning_and_merging"
-
-        if self.study is None:
-            self.logger.error("Study not assembled. Run assemble_study() first.")
-            return False
-
-        try:
-            # Align features across samples
-            align_params = align_defaults(
-                rt_tol=self.params.rt_tolerance,
-                mz_tol=self.params.mz_tolerance,
-                algorithm=self.params.alignment_algorithm
-            )
-
-            self.logger.info(f"Aligning features with RT tolerance {self.params.rt_tolerance}s, m/z tolerance {self.params.mz_tolerance} Da")
-            self.study.align(params=align_params)
-
-            # Merge aligned features
-            merge_params = merge_defaults(
-                method=self.params.merge_method,
-                rt_tol=self.params.rt_tolerance,
-                mz_tol=self.params.mz_tolerance,
-                min_samples=self.params.min_samples_for_merge
-            )
-
-            self.logger.info(f"Merging features using {self.params.merge_method} method")
-            self.study.merge(params=merge_params)
-
-            # Log results
-            num_consensus = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0
-            self._log_progress(f"Generated {num_consensus} consensus features")
-
-            # Get study info
-            if hasattr(self.study, 'info'):
-                self.study.info()
-
-            self._save_checkpoint()
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to align and merge: {e}")
-            return False
-
-    def generate_plots(self) -> bool:
-        """
-        Generate visualization plots for the study.
-
-        Returns:
-            True if plot generation was successful
-        """
-        if not self.params.generate_plots:
-            self._log_progress("Plot generation disabled, skipping...")
-            return True
-
-        self._log_progress("=== Generating Visualization Plots ===")
-        self.current_step = "generating_plots"
-
-        if self.study is None:
-            self.logger.error("Study not available. Complete previous steps first.")
-            return False
-
-        try:
-            plots_generated = 0
-
-            # Alignment plot
-            if hasattr(self.study, 'plot_alignment'):
-                alignment_plot = self.study_folder_path / "alignment_plot.html"
-                self.study.plot_alignment(filename=str(alignment_plot))
-                plots_generated += 1
-                self.logger.info(f"Generated alignment plot: {alignment_plot}")
-
-            # Consensus 2D plot
-            if hasattr(self.study, 'plot_consensus_2d'):
-                consensus_2d_plot = self.study_folder_path / "consensus_2d.html"
-                self.study.plot_consensus_2d(filename=str(consensus_2d_plot))
-                plots_generated += 1
-                self.logger.info(f"Generated consensus 2D plot: {consensus_2d_plot}")
-
-            # PCA plot
-            if hasattr(self.study, 'plot_pca'):
-                pca_plot = self.study_folder_path / "pca_plot.html"
-                self.study.plot_pca(filename=str(pca_plot))
-                plots_generated += 1
-                self.logger.info(f"Generated PCA plot: {pca_plot}")
-
-            # Consensus statistics
-            if hasattr(self.study, 'plot_consensus_stats'):
-                stats_plot = self.study_folder_path / "consensus_stats.html"
-                self.study.plot_consensus_stats(filename=str(stats_plot))
-                plots_generated += 1
-                self.logger.info(f"Generated statistics plot: {stats_plot}")
-
-            self._log_progress(f"Generated {plots_generated} visualization plots")
-            self._save_checkpoint()
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to generate plots: {e}")
-            return False
-
-    def export_results(self) -> bool:
-        """
-        Export study results in requested formats.
-
-        Returns:
-            True if export was successful
-        """
-        self._log_progress("=== Exporting Study Results ===")
-        self.current_step = "exporting_results"
-
-        if self.study is None:
-            self.logger.error("Study not available. Complete previous steps first.")
-            return False
-
-        try:
-            exports_completed = 0
-
-            # Export consensus features as CSV
-            if "csv" in self.params.export_formats:
-                csv_file = self.study_folder_path / "consensus_features.csv"
-                if hasattr(self.study.consensus_df, 'write_csv'):
-                    self.study.consensus_df.write_csv(str(csv_file))
-                    exports_completed += 1
-                    self.logger.info(f"Exported CSV: {csv_file}")
-
-            # Export as Excel
-            if "xlsx" in self.params.export_formats and hasattr(self.study, 'export_xlsx'):
-                xlsx_file = self.study_folder_path / "study_results.xlsx"
-                self.study.export_xlsx(filename=str(xlsx_file))
-                exports_completed += 1
-                self.logger.info(f"Exported Excel: {xlsx_file}")
-
-            # Export MGF for MS2 spectra
-            if "mgf" in self.params.export_formats and hasattr(self.study, 'export_mgf'):
-                mgf_file = self.study_folder_path / "consensus_ms2.mgf"
-                self.study.export_mgf(filename=str(mgf_file))
-                exports_completed += 1
-                self.logger.info(f"Exported MGF: {mgf_file}")
-
-            # Export as Parquet for efficient storage
-            if "parquet" in self.params.export_formats and hasattr(self.study, 'export_parquet'):
-                parquet_file = self.study_folder_path / "study_data.parquet"
-                self.study.export_parquet(filename=str(parquet_file))
-                exports_completed += 1
-                self.logger.info(f"Exported Parquet: {parquet_file}")
-
-            self._log_progress(f"Completed {exports_completed} exports")
-            self._save_checkpoint()
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to export results: {e}")
-            return False
-
-    def save_study(self) -> bool:
-        """
-        Save the final study in optimized format.
-
-        Returns:
-            True if study was saved successfully
-        """
-        self._log_progress("=== Saving Final Study ===")
-        self.current_step = "saving_study"
-
-        if self.study is None:
-            self.logger.error("Study not available. Complete previous steps first.")
-            return False
-
-        try:
-            study_file = self.study_folder_path / "final_study.study5"
-
-            # Determine optimal save format based on study size
-            num_samples = len(self.study.samples_df)
-            num_features = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0
-
-            if self.params.adaptive_compression:
-                # Use compressed format for large studies
-                if num_samples > 50 or num_features > 10000:
-                    self.logger.info(f"Large study detected ({num_samples} samples, {num_features} features) - using compressed format")
-                    self.params.compress_output = True
-                else:
-                    self.logger.info(f"Small study ({num_samples} samples, {num_features} features) - using standard format")
-                    self.params.compress_output = False
-
-            # Save study
-            if self.params.compress_output and hasattr(self.study, 'save_compressed'):
-                self.study.save_compressed(filename=str(study_file))
-                self.logger.info(f"Saved compressed study: {study_file}")
-            else:
-                self.study.save(filename=str(study_file))
-                self.logger.info(f"Saved study: {study_file}")
-
-            # Save metadata summary
-            metadata_file = self.study_folder_path / "study_metadata.txt"
-            with open(metadata_file, "w") as f:
-                f.write("Study Processing Summary\n")
-                f.write("========================\n")
-                f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
-                f.write(f"Polarity: {self.polarity}\n")
-                f.write(f"Adducts: {', '.join(self.adducts)}\n")
-                f.write(f"Number of Samples: {num_samples}\n")
-                f.write(f"Number of Consensus Features: {num_features}\n")
-                f.write(f"Successful Files: {len(self.processed_files)}\n")
-                f.write(f"Failed Files: {len(self.failed_files)}\n")
-                f.write(f"RT Tolerance: {self.params.rt_tolerance}s\n")
-                f.write(f"m/z Tolerance: {self.params.mz_tolerance} Da\n")
-                f.write(f"Merge Method: {self.params.merge_method}\n")
-                f.write(f"Processing Time: {self._get_total_processing_time()}\n")
-
-            self._log_progress(f"Saved study metadata: {metadata_file}")
-            self._save_checkpoint()
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to save study: {e}")
-            return False
-
-    def cleanup_temp_files(self) -> bool:
-        """
-        Clean up temporary files if requested.
-
-        Returns:
-            True if cleanup was successful
-        """
-        if not self.params.cleanup_temp_files:
-            return True
-
-        self._log_progress("=== Cleaning Up Temporary Files ===")
-
-        try:
-            cleaned_count = 0
-
-            # Remove individual sample plots if study plots were generated
-            if self.params.generate_plots:
-                temp_plots = list(self.study_folder_path.glob("*_2d.html"))
-                for plot_file in temp_plots:
-                    if plot_file.name not in ["alignment_plot.html", "consensus_2d.html", "pca_plot.html"]:
-                        plot_file.unlink()
-                        cleaned_count += 1
-
-            # Remove checkpoint file
-            if self.checkpoint_file.exists():
-                self.checkpoint_file.unlink()
-                cleaned_count += 1
-
-            self._log_progress(f"Cleaned up {cleaned_count} temporary files")
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to cleanup temp files: {e}")
-            return False
-
-    def run_full_pipeline(self) -> bool:
-        """
-        Run the complete automated processing pipeline.
-
-        This method executes all processing steps in sequence:
-        1. Convert raw files to sample5 format
-        2. Assemble study from sample5 files
-        3. Align and merge features
-        4. Generate visualization plots
-        5. Export results in requested formats
-        6. Save final study
-        7. Clean up temporary files
-
-        Returns:
-            True if the entire pipeline completed successfully
-        """
-        self._log_progress("=" * 60)
-        self._log_progress("STARTING AUTOMATED STUDY PROCESSING PIPELINE")
-        self._log_progress("=" * 60)
-
-        self.start_time = time.time()
-        pipeline_success = True
-
-        try:
-            # Step 1: Convert to sample5
-            if not self.convert_to_sample5():
-                self.logger.error("Sample5 conversion failed")
-                return False
-
-            # Step 2: Assemble study
-            if not self.assemble_study():
-                self.logger.error("Study assembly failed")
-                return False
-
-            # Step 3: Align and merge
-            if not self.align_and_merge():
-                self.logger.error("Feature alignment and merging failed")
-                return False
-
-            # Step 4: Generate plots
-            if not self.generate_plots():
-                self.logger.warning("Plot generation failed, continuing...")
-                pipeline_success = False
-
-            # Step 5: Export results
-            if not self.export_results():
-                self.logger.warning("Result export failed, continuing...")
-                pipeline_success = False
-
-            # Step 6: Save study
-            if not self.save_study():
-                self.logger.error("Study saving failed")
-                return False
-
-            # Step 7: Cleanup
-            if not self.cleanup_temp_files():
-                self.logger.warning("Cleanup failed, continuing...")
-
-            # Final summary
-            total_time = time.time() - self.start_time
-            self._log_progress("=" * 60)
-            self._log_progress("PIPELINE COMPLETED SUCCESSFULLY")
-            self._log_progress(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
-            self._log_progress(f"Files processed: {len(self.processed_files)}")
-            self._log_progress(f"Files failed: {len(self.failed_files)}")
-            if hasattr(self.study, 'consensus_df'):
-                self._log_progress(f"Consensus features: {len(self.study.consensus_df)}")
-            self._log_progress("=" * 60)
-
-            return pipeline_success
-
-        except KeyboardInterrupt:
-            self.logger.info("Pipeline interrupted by user")
-            self._save_checkpoint()
-            return False
-        except Exception as e:
-            self.logger.error(f"Pipeline failed with unexpected error: {e}")
-            self._save_checkpoint()
-            return False
-
-    def _get_total_processing_time(self) -> str:
-        """Get formatted total processing time."""
-        if self.start_time is None:
-            return "Unknown"
-
-        total_seconds = time.time() - self.start_time
-        hours = int(total_seconds // 3600)
-        minutes = int((total_seconds % 3600) // 60)
-        seconds = int(total_seconds % 60)
-
-        if hours > 0:
-            return f"{hours}h {minutes}m {seconds}s"
-        elif minutes > 0:
-            return f"{minutes}m {seconds}s"
-        else:
-            return f"{seconds}s"
-
-    def get_status(self) -> Dict[str, Any]:
-        """
-        Get current processing status.
-
-        Returns:
-            Dictionary with current status information
-        """
-        return {
-            "current_step": self.current_step,
-            "processed_files": len(self.processed_files),
-            "failed_files": len(self.failed_files),
-            "study_loaded": self.study is not None,
-            "start_time": self.start_time.isoformat() if self.start_time else None,
-            "processing_time": self._get_total_processing_time(),
-            "parameters": {
-                "data_source": self.params.data_source,
-                "study_folder": self.params.study_folder,
-                "polarity": self.params.polarity,
-                "num_cores": self.params.num_cores,
-                "adducts": self.params.adducts,
-            }
-        }
-
-    def info(self):
-        """Print comprehensive wizard status information."""
-        status = self.get_status()
-
-        print("\n" + "=" * 50)
-        print("WIZARD STATUS")
-        print("=" * 50)
-        print(f"Current Step: {status['current_step']}")
-        print(f"Data Source: {self.params.data_source}")
-        print(f"Study Folder: {self.params.study_folder}")
-        print(f"Polarity: {status['parameters']['polarity']}")
-        print(f"CPU Cores: {status['parameters']['num_cores']}")
-        print(f"Adducts: {', '.join(status['parameters']['adducts'])}")
-        print(f"Processing Time: {status['processing_time']}")
-        print(f"Files Processed: {status['processed_files']}")
-        print(f"Files Failed: {status['failed_files']}")
-        print(f"Study Loaded: {status['study_loaded']}")
-
-        if self.study is not None and hasattr(self.study, 'samples_df'):
-            print(f"Samples in Study: {len(self.study.samples_df)}")
-
-        if self.study is not None and hasattr(self.study, 'consensus_df'):
-            print(f"Consensus Features: {len(self.study.consensus_df)}")
-
-        print("=" * 50)
-
-
-# Export the main classes
-__all__ = ["Wizard", "wizard_def"]
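For reference, the workflow removed here is the one documented in the module's own docstring above. The sketch below is a minimal, hypothetical usage example taken from that docstring; it assumes an environment with masster 0.4.19 installed (the last version that ships masster/wizard.py) and that Wizard is importable from the package top level, as the docstring shows. Paths are placeholders.

```python
# Hypothetical sketch based on the deleted module's docstring; assumes masster==0.4.19.
from masster import Wizard

wizard = Wizard(
    data_source="./raw_data",          # directory containing raw MS files
    study_folder="./processed_study",  # output directory for the assembled study
    polarity="positive",
    num_cores=4,
)

# Run the end-to-end pipeline, or call the individual steps documented above
# (convert_to_sample5, assemble_study, align_and_merge, generate_plots, export_results).
wizard.run_full_pipeline()
```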