masster 0.4.20-py3-none-any.whl → 0.4.22-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

masster/wizard/wizard.py CHANGED
@@ -23,21 +23,12 @@ from masster import Wizard, wizard_def
 
  # Create wizard with default parameters
  wizard = Wizard(
- data_source="./raw_data",
- study_folder="./processed_study",
+ source="./raw_data",
+ folder="./processed_study",
  polarity="positive",
  num_cores=4
  )
 
- # Run complete processing pipeline
- wizard.run_full_pipeline()
-
- # Or run individual steps
- wizard.convert_to_sample5()
- wizard.assemble_study()
- wizard.align_and_merge()
- wizard.generate_plots()
- wizard.export_results()
  ```
  """
 
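Note on the hunk above: the constructor keywords in the docstring example were renamed (`data_source` → `source`, `study_folder` → `folder`), and the step-by-step pipeline calls were dropped from the example. A minimal sketch of the 0.4.22-style call, assuming the updated docstring reflects the released API:

```python
from masster import Wizard

# Sketch only: renamed constructor arguments as shown in the updated docstring.
wizard = Wizard(
    source="./raw_data",          # was data_source in 0.4.20
    folder="./processed_study",   # was study_folder in 0.4.20
    polarity="positive",
    num_cores=4,
)
```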
@@ -47,6 +38,7 @@ import os
  import sys
  import time
  import importlib
+ import glob
  import multiprocessing
  from pathlib import Path
  from typing import Optional, Any, Dict, List
@@ -59,6 +51,7 @@ from masster.logger import MassterLogger
  from masster.study.defaults.study_def import study_defaults
  from masster.study.defaults.align_def import align_defaults
  from masster.study.defaults.merge_def import merge_defaults
+ from masster._version import __version__ as version
 
 
  @dataclass
@@ -71,8 +64,8 @@ class wizard_def:
 
  Attributes:
  # Core Configuration
- data_source (str): Path to directory containing raw data files
- study_folder (str): Output directory for processed study
+ source (str): Path to directory containing raw data files
+ folder (str): Output directory for processed study
  polarity (str): Ion polarity mode ("positive" or "negative")
  num_cores (int): Number of CPU cores to use for parallel processing
 
@@ -103,15 +96,15 @@ class wizard_def:
  """
 
  # === Core Configuration ===
- data_source: str = ""
- study_folder: str = ""
+ source: str = ""
+ folder: str = ""
  polarity: str = "positive"
  num_cores: int = 4
 
  # === File Discovery ===
- file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML", ".d"])
+ file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML"])
  search_subfolders: bool = True
- skip_patterns: List[str] = field(default_factory=lambda: ["blank", "QC", "test"])
+ skip_patterns: List[str] = field(default_factory=lambda: ["blank", "test"])
 
  # === Processing Parameters ===
  adducts: List[str] = field(default_factory=list) # Will be set based on polarity
@@ -130,11 +123,11 @@ class wizard_def:
  rt_tolerance: float = 1.5
  mz_max_diff: float = 0.01
  alignment_algorithm: str = "kd"
- merge_method: str = "chunked"
+ merge_method: str = "qt"
 
  # === Feature Detection ===
  chrom_fwhm: float = 0.5
- noise_threshold: float = 200.0
+ noise: float = 50.0
  chrom_peak_snr: float = 5.0
  tol_ppm: float = 10.0
  detector_type: str = "unknown" # Detected detector type ("orbitrap", "quadrupole", "unknown")
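The field renames above change the names callers use to override defaults: `noise_threshold` (200.0) becomes `noise` (50.0), and the default `merge_method` moves from "chunked" to "qt". A hedged sketch of overriding these via keyword arguments, assuming extra kwargs are still forwarded onto `wizard_def` with `setattr` as shown later in this diff:

```python
# Sketch only: the 0.4.20 names (noise_threshold, merge_method="chunked")
# no longer correspond to the dataclass fields the pipeline reads.
wizard = Wizard(
    source="./raw_data",
    folder="./processed_study",
    noise=50.0,          # replaces noise_threshold=200.0
    merge_method="qt",   # replaces "chunked"
)
```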
@@ -157,15 +150,15 @@ class wizard_def:
  optimize_memory: bool = True
  cleanup_temp_files: bool = True
  validate_outputs: bool = True
-
+
  _param_metadata: dict[str, dict[str, Any]] = field(
  default_factory=lambda: {
- "data_source": {
+ "source": {
  "dtype": str,
  "description": "Path to directory containing raw data files",
  "required": True,
  },
- "study_folder": {
+ "folder": {
  "dtype": str,
  "description": "Output directory for processed study",
  "required": True,
@@ -221,57 +214,47 @@ class wizard_def:
  self.num_cores = max_cores
 
  # Ensure paths are absolute
- if self.data_source:
- self.data_source = os.path.abspath(self.data_source)
- if self.study_folder:
- self.study_folder = os.path.abspath(self.study_folder)
+ if self.source:
+ self.source = os.path.abspath(self.source)
+ if self.folder:
+ self.folder = os.path.abspath(self.folder)
 
 
  class Wizard:
  """
- Automated processing wizard for mass spectrometry studies.
-
- The Wizard class provides end-to-end automation for processing collections
- of mass spectrometry files from raw data to final study results, including:
+ Simplified Wizard for automated mass spectrometry data processing.
 
- 1. Raw data discovery and batch conversion to sample5 format
- 2. Automatic detector type detection and parameter optimization
- 3. Study assembly with feature alignment and merging
- 4. Automated plot generation and result export
- 5. Intelligent resume capability for interrupted processes
- 6. Adaptive optimization based on study size and system resources
+ The Wizard provides a clean interface for creating and executing analysis scripts
+ that process raw MS data through the complete pipeline: file discovery, feature
+ detection, sample processing, study assembly, alignment, merging, and export.
 
- The wizard automatically detects the type of MS detector using simplified rules:
- - .raw files: Assume Orbitrap (noise threshold = 1e5)
- - .wiff files: Assume Quadrupole (noise threshold = 200)
- - .mzML files: Check metadata for Orbitrap detection
-
- The wizard handles the complete workflow with minimal user intervention
- while providing comprehensive logging and progress tracking.
+ This simplified version focuses on two core functions:
+ - create_script(): Generate standalone analysis scripts
+ - execute(): Create and run analysis scripts
  """
 
  def __init__(
  self,
- data_source: str = "",
- study_folder: str = "",
+ source: str = "",
+ folder: str = "",
  polarity: str = "positive",
  adducts: Optional[List[str]] = None,
- num_cores: int = 4,
+ num_cores: int = 0,
  **kwargs
  ):
  """
- Initialize the Wizard for automated study processing.
+ Initialize the Wizard with analysis parameters.
 
  Parameters:
- data_source: Directory containing raw data files
- study_folder: Output directory for processed study
+ source: Directory containing raw data files
+ folder: Output directory for processed study
  polarity: Ion polarity mode ("positive" or "negative")
  adducts: List of adduct specifications (auto-set if None)
- num_cores: Number of CPU cores for parallel processing
+ num_cores: Number of CPU cores (0 = auto-detect 75% of available)
  **kwargs: Additional parameters (see wizard_def for full list)
  """
 
- # Auto-detect optimal number of cores (75% of total)
+ # Auto-detect optimal number of cores if not specified
  if num_cores <= 0:
  num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
 
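The rewritten class docstring above names `create_script()` and `execute()` as the two core entry points, and `num_cores=0` now means auto-detect (75% of available cores). Both method signatures appear further down in this diff with a default `filename="run_masster.py"`; a short usage sketch under that assumption:

```python
# Sketch only: the simplified 0.4.22 workflow.
wizard = Wizard(source="./raw_data", folder="./processed_study", polarity="positive")

wizard.create_script("run_masster.py")  # writes <folder>/run_masster.py
ok = wizard.execute("run_masster.py")   # regenerates the script, then runs it with sys.executable
```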
@@ -279,14 +262,15 @@ class Wizard:
  if "params" in kwargs and isinstance(kwargs["params"], wizard_def):
  self.params = kwargs.pop("params")
  else:
- # Create default parameters and update with provided values
+ # Create default parameters
  self.params = wizard_def(
- data_source=data_source,
- study_folder=study_folder,
+ source=source,
+ folder=folder,
  polarity=polarity,
  num_cores=num_cores
  )
 
+ # Set adducts if provided
  if adducts is not None:
  self.params.adducts = adducts
 
@@ -296,15 +280,15 @@ class Wizard:
  setattr(self.params, key, value)
 
  # Validate required parameters
- if not self.params.data_source:
- raise ValueError("data_source is required")
- if not self.params.study_folder:
- raise ValueError("study_folder is required")
+ if not self.params.source:
+ raise ValueError("source is required")
+ if not self.params.folder:
+ raise ValueError("folder is required")
 
- # Create directories
- self.data_source_path = Path(self.params.data_source)
- self.study_folder_path = Path(self.params.study_folder)
- self.study_folder_path.mkdir(parents=True, exist_ok=True)
+ # Create and validate paths
+ self.source_path = Path(self.params.source)
+ self.folder_path = Path(self.params.folder)
+ self.folder_path.mkdir(parents=True, exist_ok=True)
 
  # Auto-infer polarity from the first file if not explicitly set by user
  if polarity == "positive" and "polarity" not in kwargs:
@@ -313,29 +297,7 @@ class Wizard:
  self.params.polarity = inferred_polarity
  # Update adducts based on inferred polarity
  self.params.__post_init__()
-
- # Setup logging
- self._setup_logging()
-
- # Initialize state tracking
- self.processed_files = []
- self.failed_files = []
- self.study = None
- self.start_time = None
- self.current_step = "initialized"
-
- # Create checkpoint file path
- self.checkpoint_file = self.study_folder_path / "wizard_checkpoint.json"
-
- self.logger.info(f"Wizard initialized for {self.polarity} mode")
- self.logger.info(f"Data source: {self.data_source_path}")
- self.logger.info(f"Study folder: {self.study_folder_path}")
- self.logger.info(f"Using {self.params.num_cores} CPU cores")
-
- # Load checkpoint if resuming
- if self.params.resume_enabled:
- self._load_checkpoint()
-
+
  def _infer_polarity_from_first_file(self) -> str:
  """
  Infer polarity from the first available raw data file.
@@ -345,15 +307,15 @@ class Wizard:
  """
  try:
  # Find first file
- for extension in ['.wiff', '.raw', '.mzML', '.d']:
+ for extension in ['.wiff', '.raw', '.mzML']:
  pattern = f"**/*{extension}" if True else f"*{extension}" # search_subfolders=True
- files = list(self.data_source_path.rglob(pattern))
+ files = list(self.source_path.rglob(pattern))
  if files:
  first_file = files[0]
  break
  else:
- return None
-
+ return 'positive'
+
  # Only implement for .wiff files initially (most common format)
  if first_file.suffix.lower() == '.wiff':
  from masster.sample.load import _wiff_to_dict
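Behavioral note on the hunk above: when no raw file is found (and, per the matching change further down, when inference raises), `_infer_polarity_from_first_file()` now falls back to `'positive'` instead of returning `None`. A short sketch of how this plays out in the constructor, assuming the inference path shown earlier in this diff:

```python
# Sketch only: polarity handling in 0.4.22.
w1 = Wizard(source="./raw_data", folder="./out")
# polarity inferred from the first .wiff file; falls back to "positive"
# if nothing is found or inference fails.

w2 = Wizard(source="./raw_data", folder="./out", polarity="negative")
# an explicit non-default polarity skips inference entirely.
```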
@@ -375,1827 +337,361 @@ class Wizard:
375
337
  # Silently fall back to default if inference fails
376
338
  pass
377
339
 
378
- return None
379
-
340
+ return 'positive'
341
+
380
342
  @property
381
343
  def polarity(self) -> str:
382
- """Get the polarity setting."""
344
+ """Get the ion polarity mode."""
383
345
  return self.params.polarity
384
-
385
- @property
346
+
347
+ @property
386
348
  def adducts(self) -> List[str]:
387
- """Get the adducts list."""
349
+ """Get the adduct specifications."""
388
350
  return self.params.adducts
389
-
390
- def _reload(self):
391
- """
392
- Reloads all masster modules to pick up any changes to their source code,
393
- and updates the instance's class reference to the newly reloaded class version.
394
- This ensures that the instance uses the latest implementation without restarting the interpreter.
395
- """
396
- # Reset logger configuration flags to allow proper reconfiguration after reload
397
- try:
398
- import masster.logger as logger_module
399
-
400
- if hasattr(logger_module, "_WIZARD_LOGGER_CONFIGURED"):
401
- logger_module._WIZARD_LOGGER_CONFIGURED = False
402
- except Exception:
403
- pass
404
-
405
- # Get the base module name (masster)
406
- base_modname = self.__class__.__module__.split(".")[0]
407
- current_module = self.__class__.__module__
408
-
409
- # Dynamically find all wizard submodules
410
- wizard_modules = []
411
- wizard_module_prefix = f"{base_modname}.wizard."
412
-
413
- # Get all currently loaded modules that are part of the wizard package
414
- for module_name in sys.modules:
415
- if (
416
- module_name.startswith(wizard_module_prefix)
417
- and module_name != current_module
418
- ):
419
- wizard_modules.append(module_name)
420
-
421
- # Add core masster modules
422
- core_modules = [
423
- f"{base_modname}._version",
424
- f"{base_modname}.chromatogram",
425
- f"{base_modname}.spectrum",
426
- f"{base_modname}.logger",
427
- ]
428
-
429
- # Add sample submodules
430
- sample_modules = []
431
- sample_module_prefix = f"{base_modname}.sample."
432
- for module_name in sys.modules:
433
- if (
434
- module_name.startswith(sample_module_prefix)
435
- and module_name != current_module
436
- ):
437
- sample_modules.append(module_name)
438
351
 
439
- # Add study submodules
440
- study_modules = []
441
- study_module_prefix = f"{base_modname}.study."
442
- for module_name in sys.modules:
443
- if (
444
- module_name.startswith(study_module_prefix)
445
- and module_name != current_module
446
- ):
447
- study_modules.append(module_name)
448
-
449
- all_modules_to_reload = (
450
- core_modules + wizard_modules + sample_modules + study_modules
451
- )
452
-
453
- # Reload all discovered modules
454
- for full_module_name in all_modules_to_reload:
455
- try:
456
- if full_module_name in sys.modules:
457
- mod = sys.modules[full_module_name]
458
- importlib.reload(mod)
459
- self.logger.debug(f"Reloaded module: {full_module_name}")
460
- except Exception as e:
461
- self.logger.warning(f"Failed to reload module {full_module_name}: {e}")
462
-
463
- # Finally, reload the current module (wizard.py)
464
- try:
465
- mod = __import__(current_module, fromlist=[current_module.split(".")[0]])
466
- importlib.reload(mod)
467
-
468
- # Get the updated class reference from the reloaded module
469
- new = getattr(mod, self.__class__.__name__)
470
- # Update the class reference of the instance
471
- self.__class__ = new
472
-
473
- self.logger.debug("Module reload completed")
474
- except Exception as e:
475
- self.logger.error(f"Failed to reload current module {current_module}: {e}")
476
-
477
- def _setup_logging(self):
478
- """Setup comprehensive logging system."""
479
- # Create logger
480
- log_label = f"Wizard-{self.polarity}"
481
-
482
- if self.params.log_to_file:
483
- log_file = self.study_folder_path / "wizard.log"
484
- sink = str(log_file)
485
- else:
486
- sink = "sys.stdout"
487
-
488
- self.logger = MassterLogger(
489
- instance_type="wizard",
490
- level=self.params.log_level.upper(),
491
- label=log_label,
492
- sink=sink,
493
- )
494
-
495
- # Also create a simple file logger for critical info
496
- self.log_file = self.study_folder_path / "processing.log"
497
-
498
- def _log_progress(self, message: str, level: str = "INFO"):
499
- """Log progress message with timestamp."""
500
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
501
- full_message = f"[{timestamp}] {message}"
502
-
503
- # Log to masster logger
504
- getattr(self.logger, level.lower())(message)
505
-
506
- # Also write to simple log file
507
- with open(self.log_file, "a", encoding="utf-8") as f:
508
- f.write(f"{full_message}\n")
509
-
510
- if self.params.verbose_progress and level in ["INFO", "WARNING", "ERROR"]:
511
- print(full_message)
512
-
513
- def _save_checkpoint(self):
514
- """Save processing checkpoint for resume capability."""
515
- if not self.params.resume_enabled:
516
- return
517
-
518
- import json
519
- checkpoint_data = {
520
- "timestamp": datetime.now().isoformat(),
521
- "current_step": self.current_step,
522
- "processed_files": self.processed_files,
523
- "failed_files": self.failed_files,
524
- "params": {
525
- "data_source": self.params.data_source,
526
- "study_folder": self.params.study_folder,
527
- "polarity": self.params.polarity,
528
- "adducts": self.params.adducts,
529
- "num_cores": self.params.num_cores,
530
- }
531
- }
532
-
533
- try:
534
- with open(self.checkpoint_file, "w") as f:
535
- json.dump(checkpoint_data, f, indent=2)
536
- self.logger.debug(f"Checkpoint saved: {len(self.processed_files)} files processed")
537
- except Exception as e:
538
- self.logger.warning(f"Failed to save checkpoint: {e}")
539
-
540
- def _load_checkpoint(self):
541
- """Load processing checkpoint for resume capability."""
542
- if not self.checkpoint_file.exists():
543
- return
544
-
545
- import json
546
- try:
547
- with open(self.checkpoint_file, "r") as f:
548
- checkpoint_data = json.load(f)
549
-
550
- self.processed_files = checkpoint_data.get("processed_files", [])
551
- self.failed_files = checkpoint_data.get("failed_files", [])
552
- self.current_step = checkpoint_data.get("current_step", "initialized")
553
-
554
- self.logger.info(f"Resuming from checkpoint: {len(self.processed_files)} files already processed")
555
- self.logger.info(f"Previous step: {self.current_step}")
556
-
557
- except Exception as e:
558
- self.logger.warning(f"Failed to load checkpoint: {e}")
559
- self.processed_files = []
560
- self.failed_files = []
561
-
562
- def discover_files(self) -> List[Path]:
563
- """
564
- Discover raw data files in the source directory.
565
-
566
- Returns:
567
- List of file paths found for processing
568
- """
569
- self._log_progress("Discovering raw data files...")
570
- self.current_step = "discovering_files"
571
-
572
- found_files = []
573
-
574
- for extension in self.params.file_extensions:
575
- if self.params.search_subfolders:
576
- pattern = f"**/*{extension}"
577
- files = list(self.data_source_path.rglob(pattern))
578
- else:
579
- pattern = f"*{extension}"
580
- files = list(self.data_source_path.glob(pattern))
581
-
582
- # Filter out files matching skip patterns
583
- filtered_files = []
584
- for file_path in files:
585
- skip_file = False
586
- for pattern in self.params.skip_patterns:
587
- if pattern.lower() in file_path.name.lower():
588
- skip_file = True
589
- self.logger.debug(f"Skipping file (matches pattern '{pattern}'): {file_path.name}")
590
- break
591
-
592
- if not skip_file:
593
- # Check file size
594
- try:
595
- file_size_gb = file_path.stat().st_size / (1024**3)
596
- if file_size_gb > self.params.max_file_size_gb:
597
- self.logger.warning(f"Large file ({file_size_gb:.1f}GB): {file_path.name}")
598
- filtered_files.append(file_path)
599
- except Exception as e:
600
- self.logger.warning(f"Could not check file size for {file_path}: {e}")
601
- filtered_files.append(file_path)
602
-
603
- found_files.extend(filtered_files)
604
- self.logger.info(f"Found {len(filtered_files)} {extension} files")
605
-
606
- # Remove duplicates and sort
607
- found_files = sorted(list(set(found_files)))
608
-
609
- self._log_progress(f"Total files discovered: {len(found_files)}")
610
-
611
- return found_files
612
-
613
- def _process_single_file(self, file_path: Path, reset: bool = False) -> Optional[str]:
614
- """
615
- Process a single file to sample5 format.
616
-
617
- This method replicates the core processing from parallel_sample_processing.py
618
- but with wizard-specific configuration and error handling.
619
-
620
- Parameters:
621
- file_path: Path to the raw data file
622
- reset: Force reprocessing even if output exists
623
-
624
- Returns:
625
- Base filename of output on success, None on failure
626
- """
627
- import gc
628
-
629
- # Generate output filename
630
- file_out = file_path.stem + '.sample5'
631
- output_file = self.study_folder_path / file_out
632
-
633
- # Initialize masster Sample with delayed import
634
- import masster
635
- sample = masster.Sample(
636
- log_label=file_path.name,
637
- log_level='ERROR' # Reduce logging overhead in parallel processing
638
- )
639
-
640
- # Check if file should be skipped
641
- skip = False
642
- if not reset and not self.params.force_reprocess and output_file.exists():
643
- try:
644
- # Attempt to load existing processed file to verify it's valid
645
- sample.load(str(output_file))
646
- skip = True
647
- except Exception:
648
- # If loading fails, file needs to be reprocessed
649
- skip = False
650
-
651
- if skip:
652
- self.logger.debug(f"Skipping {file_path.name} (already processed)")
653
- return output_file.stem
654
-
655
- self.logger.info(f"Processing {file_path.name}")
656
-
657
- try:
658
- # STEP 1: Load raw data
659
- sample.load(str(file_path))
660
-
661
- # STEP 2: Feature detection - First pass (strict parameters)
662
- sample.find_features(
663
- chrom_fwhm=self.params.chrom_fwhm,
664
- noise=self.params.noise_threshold,
665
- tol_ppm=self.params.tol_ppm,
666
- chrom_peak_snr=self.params.chrom_peak_snr,
667
- min_trace_length_multiplier=0.5,
668
- chrom_fwhm_min=self.params.chrom_fwhm
669
- )
670
-
671
- # STEP 3: Feature detection - Second pass (relaxed parameters)
672
- sample.find_features(
673
- chrom_peak_snr=self.params.chrom_peak_snr,
674
- noise=self.params.noise_threshold / 10, # Lower noise threshold
675
- chrom_fwhm=2.0 # Wider peaks
676
- )
677
-
678
- # STEP 3.5: Validate feature detection results
679
- if not hasattr(sample, 'features_df') or sample.features_df is None or len(sample.features_df) == 0:
680
- self.logger.warning(f"No features detected in {file_path.name} - skipping additional processing")
681
- # Still save the sample5 file for record keeping
682
- sample.save(filename=str(output_file))
683
- return output_file.stem
684
-
685
- self.logger.info(f"Detected {len(sample.features_df)} features in {file_path.name}")
686
-
687
- # STEP 4: Adduct detection
688
- sample.find_adducts(adducts=self.adducts)
689
-
690
- # STEP 5: MS2 spectrum identification
691
- sample.find_ms2()
692
-
693
- # STEP 6: Save processed data
694
- sample.save(filename=str(output_file))
695
-
696
- # STEP 7: Generate additional outputs (only for samples with features)
697
- # Skip CSV export and individual MGF export as requested
698
-
699
- if self.params.generate_plots:
700
- plot_file = output_file.parent / (output_file.stem + "_2d.html")
701
- sample.plot_2d(filename=str(plot_file), markersize=4)
702
-
703
- # Memory cleanup
704
- result = output_file.stem
705
- del sample
706
- gc.collect()
707
-
708
- return result
709
-
710
- except Exception as e:
711
- self.logger.error(f"Error processing {file_path.name}: {e}")
712
- # Cleanup on error
713
- gc.collect()
714
- return None
715
-
716
- def _process_batch(self, file_batch: List[Path]) -> List[str]:
717
- """Process a batch of files in a single worker."""
718
- results = []
719
- for file_path in file_batch:
720
- result = self._process_single_file(file_path)
721
- if result:
722
- results.append(result)
723
- else:
724
- results.append(None)
725
- return results
726
-
727
- def convert_to_sample5(self, file_list: Optional[List[Path]] = None) -> bool:
352
+ def create_script(self, filename: str = "run_masster.py") -> bool:
728
353
  """
729
- Convert raw data files to sample5 format in parallel.
354
+ Generate a standalone Python script for the analysis pipeline.
730
355
 
731
356
  Parameters:
732
- file_list: List of files to process (None to discover automatically)
733
-
734
- Returns:
735
- True if conversion completed successfully
736
- """
737
- self._log_progress("=== Starting Sample5 Conversion ===")
738
- self.current_step = "converting_to_sample5"
739
-
740
- if file_list is None:
741
- file_list = self.discover_files()
742
-
743
- if not file_list:
744
- self.logger.warning("No files found for conversion")
745
- return False
746
-
747
- # Detect detector type and adjust parameters before processing
748
- detector_type = self._detect_detector_type()
749
- self._adjust_parameters_for_detector(detector_type)
750
-
751
- # Filter out already processed files if resuming
752
- if self.params.resume_enabled and self.processed_files:
753
- remaining_files = []
754
- for file_path in file_list:
755
- if str(file_path) not in self.processed_files:
756
- remaining_files.append(file_path)
757
- file_list = remaining_files
758
-
759
- if not file_list:
760
- self._log_progress("All files already processed")
761
- return True
762
-
763
- self._log_progress(f"Converting {len(file_list)} files to sample5 format")
764
-
765
- conversion_start = time.time()
766
- successful_count = 0
767
- failed_count = 0
768
-
769
- if self.params.use_process_pool:
770
- # ProcessPoolExecutor approach - better for CPU-intensive work
771
- if len(file_list) <= self.params.batch_size:
772
- # Few files: process individually
773
- self.logger.info(f"Processing {len(file_list)} files individually with {self.params.num_cores} workers")
774
-
775
- with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
776
- futures = [
777
- executor.submit(self._process_single_file, file_path)
778
- for file_path in file_list
779
- ]
780
-
781
- for i, future in enumerate(concurrent.futures.as_completed(futures)):
782
- result = future.result()
783
- if result:
784
- successful_count += 1
785
- self.processed_files.append(str(file_list[i]))
786
- else:
787
- failed_count += 1
788
- self.failed_files.append(str(file_list[i]))
789
-
790
- # Progress update and checkpoint
791
- if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
792
- progress = (successful_count + failed_count) / len(file_list) * 100
793
- self._log_progress(f"Progress: {progress:.1f}% ({successful_count} successful, {failed_count} failed)")
794
- self._save_checkpoint()
795
-
796
- else:
797
- # Many files: process in batches
798
- batches = [
799
- file_list[i:i + self.params.batch_size]
800
- for i in range(0, len(file_list), self.params.batch_size)
801
- ]
802
-
803
- self.logger.info(f"Processing {len(file_list)} files in {len(batches)} batches")
804
-
805
- with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
806
- futures = [executor.submit(self._process_batch, batch) for batch in batches]
807
-
808
- for batch_idx, future in enumerate(concurrent.futures.as_completed(futures)):
809
- batch_results = future.result()
810
- batch = batches[batch_idx]
811
-
812
- for i, result in enumerate(batch_results):
813
- if result:
814
- successful_count += 1
815
- self.processed_files.append(str(batch[i]))
816
- else:
817
- failed_count += 1
818
- self.failed_files.append(str(batch[i]))
819
-
820
- # Progress update
821
- progress = (successful_count + failed_count) / len(file_list) * 100
822
- self._log_progress(f"Batch {batch_idx + 1}/{len(batches)} complete. Progress: {progress:.1f}%")
823
- self._save_checkpoint()
824
-
825
- else:
826
- # ThreadPoolExecutor approach
827
- self.logger.info(f"Processing {len(file_list)} files with {self.params.num_cores} threads")
357
+ filename: Name for the generated script file
828
358
 
829
- with concurrent.futures.ThreadPoolExecutor(max_workers=self.params.num_cores) as executor:
830
- futures = [
831
- executor.submit(self._process_single_file, file_path)
832
- for file_path in file_list
833
- ]
834
-
835
- for i, future in enumerate(concurrent.futures.as_completed(futures)):
836
- result = future.result()
837
- if result:
838
- successful_count += 1
839
- self.processed_files.append(str(file_list[i]))
840
- else:
841
- failed_count += 1
842
- self.failed_files.append(str(file_list[i]))
843
-
844
- if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
845
- progress = (successful_count + failed_count) / len(file_list) * 100
846
- self._log_progress(f"Progress: {progress:.1f}%")
847
- self._save_checkpoint()
848
-
849
- conversion_time = time.time() - conversion_start
850
-
851
- self._log_progress("=== Sample5 Conversion Complete ===")
852
- self._log_progress(f"Successful: {successful_count}")
853
- self._log_progress(f"Failed: {failed_count}")
854
- self._log_progress(f"Total time: {conversion_time:.1f} seconds")
855
-
856
- if failed_count > 0:
857
- self.logger.warning(f"{failed_count} files failed to process")
858
- for failed_file in self.failed_files[-failed_count:]:
859
- self.logger.warning(f"Failed: {failed_file}")
860
-
861
- self._save_checkpoint()
862
- return successful_count > 0
863
-
864
- def _detect_detector_type(self) -> str:
865
- """
866
- Detect the type of MS detector from the first available file.
867
-
868
- Simplified detection rules:
869
- - .raw files: Assume Orbitrap (Thermo instruments)
870
- - .wiff files: Assume Quadrupole (SCIEX instruments)
871
- - .mzML files: Check metadata for Orbitrap detection
872
-
873
359
  Returns:
874
- String indicating detector type ("orbitrap", "quadrupole", "unknown")
360
+ True if script was generated successfully, False otherwise
875
361
  """
876
362
  try:
877
- # Find first raw file to analyze
878
- for extension in ['.raw', '.wiff', '.mzML', '.d']:
879
- if self.params.search_subfolders:
880
- pattern = f"**/*{extension}"
881
- files = list(self.data_source_path.rglob(pattern))
882
- else:
883
- pattern = f"*{extension}"
884
- files = list(self.data_source_path.glob(pattern))
885
- if files:
886
- first_file = files[0]
887
- break
888
- else:
889
- self.logger.warning("No raw files found for detector detection")
890
- return "unknown"
363
+ script_path = self.folder_path / filename
364
+ script_content = self._generate_script_content()
891
365
 
892
- self.logger.info(f"Detecting detector type from: {first_file.name}")
366
+ with open(script_path, 'w', encoding='utf-8') as f:
367
+ f.write(script_content)
893
368
 
894
- # Simplified detection rules
895
- if first_file.suffix.lower() == '.raw':
896
- # RAW files are Thermo -> assume Orbitrap
897
- detector_type = "orbitrap"
898
- self.logger.info("Detected .raw file -> Thermo Orbitrap detector")
899
- return detector_type
900
-
901
- elif first_file.suffix.lower() in ['.wiff', '.wiff2']:
902
- # WIFF files are SCIEX -> assume Quadrupole
903
- detector_type = "quadrupole"
904
- self.logger.info("Detected .wiff file -> SCIEX Quadrupole detector")
905
- return detector_type
369
+ print(f"Analysis script created: {script_path}")
370
+ return True
906
371
 
907
- elif first_file.suffix.lower() == '.mzml':
908
- # For mzML files, check metadata for Orbitrap detection
909
- try:
910
- import warnings
911
- with warnings.catch_warnings():
912
- warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH environment variable already exists.*", category=UserWarning)
913
- import pyopenms as oms
914
-
915
- exp = oms.MSExperiment()
916
- oms.MzMLFile().load(str(first_file), exp)
917
-
918
- # Check instrument metadata for Orbitrap keywords
919
- instrument_info = []
920
- if hasattr(exp, 'getExperimentalSettings'):
921
- settings = exp.getExperimentalSettings()
922
- if hasattr(settings, 'getInstrument'):
923
- instrument = settings.getInstrument()
924
- if hasattr(instrument, 'getName'):
925
- name = instrument.getName().decode() if hasattr(instrument.getName(), 'decode') else str(instrument.getName())
926
- instrument_info.append(name.lower())
927
- if hasattr(instrument, 'getModel'):
928
- model = instrument.getModel().decode() if hasattr(instrument.getModel(), 'decode') else str(instrument.getModel())
929
- instrument_info.append(model.lower())
930
-
931
- # Check for Orbitrap keywords in instrument info
932
- orbitrap_keywords = ['orbitrap', 'exactive', 'q-exactive', 'exploris', 'fusion', 'lumos', 'velos', 'elite']
933
- instrument_text = ' '.join(instrument_info)
934
-
935
- if any(keyword in instrument_text for keyword in orbitrap_keywords):
936
- detector_type = "orbitrap"
937
- self.logger.info(f"Detected mzML with Orbitrap instrument: {instrument_text}")
938
- else:
939
- detector_type = "unknown"
940
- self.logger.info(f"Detected mzML with unknown instrument: {instrument_text}")
941
-
942
- return detector_type
943
-
944
- except Exception as e:
945
- self.logger.warning(f"Failed to analyze mzML file for detector type: {e}")
946
- return "unknown"
947
-
948
372
  except Exception as e:
949
- self.logger.warning(f"Detector type detection failed: {e}")
950
-
951
- return "unknown"
952
-
953
- def _adjust_parameters_for_detector(self, detector_type: str):
373
+ print(f"Failed to create script: {e}")
374
+ return False
375
+
376
+ def execute(self, filename: str = "run_masster.py") -> bool:
954
377
  """
955
- Adjust processing parameters based on detected detector type.
956
-
957
- Simplified rules:
958
- - "orbitrap": Use 1e5 noise threshold (high background noise)
959
- - "quadrupole": Use 200 noise threshold (default, lower noise)
960
- - "unknown": Use 200 noise threshold (default)
378
+ Create and execute a standalone analysis script.
961
379
 
962
380
  Parameters:
963
- detector_type: Type of detector detected ("orbitrap", "quadrupole", "unknown")
964
- """
965
- original_noise = self.params.noise_threshold
966
- self.params.detector_type = detector_type # Store the detected type
967
-
968
- if detector_type == "orbitrap":
969
- # Orbitraps have much higher background noise, use 1e5 threshold
970
- self.params.noise_threshold = 1e5
971
- self._log_progress(f"Detector: Orbitrap detected - adjusted noise threshold: {original_noise} -> {self.params.noise_threshold}")
381
+ filename: Name for the generated script file
972
382
 
973
- elif detector_type == "quadrupole":
974
- # Quadrupole instruments have lower noise, use default threshold
975
- self.params.noise_threshold = 200.0
976
- self._log_progress(f"Detector: Quadrupole detected - noise threshold: {self.params.noise_threshold}")
977
-
978
- else:
979
- # Unknown detector type, keep default
980
- self.params.noise_threshold = 200.0
981
- self._log_progress(f"Detector: Unknown type detected - using default noise threshold: {self.params.noise_threshold}")
982
-
983
- def assemble_study(self) -> bool:
984
- """
985
- Assemble processed sample5 files into a study.
986
-
987
383
  Returns:
988
- True if study assembly was successful
384
+ True if execution completed successfully, False otherwise
989
385
  """
990
- self._log_progress("=== Starting Study Assembly ===")
991
- self.current_step = "assembling_study"
992
-
993
- # Find all sample5 files
994
- sample5_files = list(self.study_folder_path.glob("*.sample5"))
995
-
996
- if not sample5_files:
997
- self.logger.error("No sample5 files found for study assembly")
386
+ # First create the script
387
+ if not self.create_script(filename):
998
388
  return False
999
-
1000
- self._log_progress(f"Assembling study from {len(sample5_files)} sample5 files")
389
+
390
+ # Then execute it
391
+ script_path = self.folder_path / filename
1001
392
 
1002
393
  try:
1003
- # Detect detector type and adjust parameters if needed
1004
- detector_type = self._detect_detector_type()
1005
- self._adjust_parameters_for_detector(detector_type)
1006
-
1007
- # Create study with optimized settings
1008
- import masster
1009
- study_params = study_defaults(
1010
- folder=str(self.study_folder_path),
1011
- polarity=self.polarity,
1012
- log_level="INFO",
1013
- log_label=f"Study-{self.polarity}",
1014
- adducts=self.adducts
1015
- )
394
+ print("Executing...")
1016
395
 
1017
- self.study = masster.Study(params=study_params)
396
+ import subprocess
397
+ result = subprocess.run([
398
+ sys.executable, str(script_path)
399
+ ], cwd=str(self.folder_path), encoding='utf-8', errors='replace')
1018
400
 
1019
- # Add all sample5 files
1020
- sample5_pattern = str(self.study_folder_path / "*.sample5")
1021
- self.study.add(sample5_pattern)
401
+ success = result.returncode == 0
1022
402
 
1023
- self._log_progress(f"Added {len(self.study.samples_df)} samples to study")
1024
-
1025
- # Filter features based on quality criteria
1026
- if hasattr(self.study, 'features_filter'):
1027
- initial_features = len(self.study.features_df) if hasattr(self.study, 'features_df') else 0
1028
-
1029
- # Apply feature filtering
1030
- feature_selection = self.study.features_select(
1031
- chrom_coherence=0.3,
1032
- chrom_prominence_scaled=1
1033
- )
1034
- self.study.features_filter(feature_selection)
403
+ if success:
404
+ print("=" * 70)
405
+ print("Script execution completed successfully")
406
+ else:
407
+ print("=" * 70)
408
+ print(f"Script execution failed with return code: {result.returncode}")
1035
409
 
1036
- final_features = len(self.study.features_df) if hasattr(self.study, 'features_df') else 0
1037
- self._log_progress(f"Feature filtering: {initial_features} -> {final_features} features")
1038
-
1039
- self._save_checkpoint()
1040
- return True
410
+ return success
1041
411
 
1042
412
  except Exception as e:
1043
- self.logger.error(f"Failed to assemble study: {e}")
1044
- return False
1045
-
1046
- def align_and_merge(self) -> bool:
1047
- """
1048
- Perform feature alignment and merging.
1049
-
1050
- Returns:
1051
- True if alignment and merging were successful
1052
- """
1053
- self._log_progress("=== Starting Feature Alignment and Merging ===")
1054
- self.current_step = "aligning_and_merging"
1055
-
1056
- if self.study is None:
1057
- self.logger.error("Study not assembled. Run assemble_study() first.")
413
+ print(f"Error during script execution: {e}")
1058
414
  return False
1059
-
1060
- try:
1061
- # Determine optimal algorithms based on study size
1062
- num_samples = len(self.study.samples_df)
1063
-
1064
- if num_samples < 500:
1065
- # For smaller studies: use qt for both alignment and merge
1066
- alignment_algorithm = "qt"
1067
- merge_method = "qt"
1068
- self.logger.info(f"Small study ({num_samples} samples) - using qt algorithms")
415
+
416
+ def _generate_script_content(self) -> str:
417
+ """Generate the complete analysis script content."""
418
+
419
+ # Convert Path objects to strings for JSON serialization
420
+ params_dict = {}
421
+ for key, value in self.params.__dict__.items():
422
+ if key == '_param_metadata': # Skip metadata in generated script
423
+ continue
424
+ if isinstance(value, Path):
425
+ params_dict[key] = str(value)
1069
426
  else:
1070
- # For larger studies: use kd for alignment and qt-chunked for merge
1071
- alignment_algorithm = "kd"
1072
- merge_method = "qt-chunked"
1073
- self.logger.info(f"Large study ({num_samples} samples) - using kd alignment and qt-chunked merge")
1074
-
1075
- # Align features across samples
1076
- align_params = align_defaults(
1077
- rt_tol=self.params.rt_tolerance,
1078
- mz_max_diff=self.params.mz_max_diff,
1079
- algorithm=alignment_algorithm
1080
- )
427
+ params_dict[key] = value
1081
428
 
1082
- self.logger.info(f"Aligning features with RT tolerance {self.params.rt_tolerance}s, m/z max diff {self.params.mz_max_diff} Da, algorithm: {alignment_algorithm}")
1083
- self.study.align(params=align_params)
1084
-
1085
- # Merge aligned features
1086
- merge_params = merge_defaults(
1087
- method=merge_method,
1088
- rt_tol=self.params.rt_tolerance,
1089
- mz_tol=self.params.mz_max_diff,
1090
- min_samples=self.params.min_samples_for_merge
1091
- )
1092
-
1093
- self.logger.info(f"Merging features using {merge_method} method")
1094
- self.study.merge(params=merge_params)
1095
-
1096
- # Log results
1097
- num_consensus = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0
1098
- self._log_progress(f"Generated {num_consensus} consensus features")
1099
-
1100
- # Get study info
1101
- if hasattr(self.study, 'info'):
1102
- self.study.info()
1103
-
1104
- self._save_checkpoint()
1105
- return True
1106
-
1107
- except Exception as e:
1108
- self.logger.error(f"Failed to align and merge: {e}")
1109
- return False
1110
-
1111
- def generate_plots(self) -> bool:
1112
- """
1113
- Generate visualization plots for the study.
1114
-
1115
- Returns:
1116
- True if plot generation was successful
1117
- """
1118
- if not self.params.generate_plots:
1119
- self._log_progress("Plot generation disabled, skipping...")
1120
- return True
1121
-
1122
- self._log_progress("=== Generating Visualization Plots ===")
1123
- self.current_step = "generating_plots"
429
+ # Obtain list of files in source with extension wiff, .raw, .mzML
430
+ raw_files = []
431
+ for ext in params_dict.get('file_extensions', []):
432
+ raw_files.extend(glob.glob(f"{params_dict.get('source', '')}/**/*{ext}", recursive=True))
433
+
434
+ # Create readable PARAMS dict with comments
435
+ params_lines = []
436
+ params_lines.append('# Analysis parameters')
437
+ params_lines.append('PARAMS = {')
1124
438
 
1125
- if self.study is None:
1126
- self.logger.error("Study not available. Complete previous steps first.")
1127
- return False
439
+ # Core Configuration
440
+ params_lines.append(' # === Core Configuration ===')
441
+ params_lines.append(f' "source": {params_dict.get("source", "")!r}, # Directory containing raw data files')
442
+ params_lines.append(f' "folder": {params_dict.get("folder", "")!r}, # Output directory for processed study')
443
+ params_lines.append(f' "polarity": {params_dict.get("polarity", "positive")!r}, # Ion polarity mode ("positive" or "negative")')
444
+ params_lines.append(f' "num_cores": {params_dict.get("num_cores", 4)}, # Number of CPU cores for parallel processing')
445
+ params_lines.append('')
1128
446
 
1129
- try:
1130
- plots_generated = 0
1131
-
1132
- # Alignment plot
1133
- if hasattr(self.study, 'plot_alignment'):
1134
- alignment_plot = self.study_folder_path / "alignment_plot.html"
1135
- self.study.plot_alignment(filename=str(alignment_plot))
1136
- plots_generated += 1
1137
- self.logger.info(f"Generated alignment plot: {alignment_plot}")
1138
-
1139
- # Consensus 2D plot
1140
- if hasattr(self.study, 'plot_consensus_2d'):
1141
- consensus_2d_plot = self.study_folder_path / "consensus_2d.html"
1142
- self.study.plot_consensus_2d(filename=str(consensus_2d_plot))
1143
- plots_generated += 1
1144
- self.logger.info(f"Generated consensus 2D plot: {consensus_2d_plot}")
1145
-
1146
- # PCA plot
1147
- if hasattr(self.study, 'plot_pca'):
1148
- pca_plot = self.study_folder_path / "pca_plot.html"
1149
- self.study.plot_pca(filename=str(pca_plot))
1150
- plots_generated += 1
1151
- self.logger.info(f"Generated PCA plot: {pca_plot}")
1152
-
1153
- # Consensus statistics
1154
- if hasattr(self.study, 'plot_consensus_stats'):
1155
- stats_plot = self.study_folder_path / "consensus_stats.html"
1156
- self.study.plot_consensus_stats(filename=str(stats_plot))
1157
- plots_generated += 1
1158
- self.logger.info(f"Generated statistics plot: {stats_plot}")
1159
-
1160
- self._log_progress(f"Generated {plots_generated} visualization plots")
1161
- self._save_checkpoint()
1162
- return True
1163
-
1164
- except Exception as e:
1165
- self.logger.error(f"Failed to generate plots: {e}")
1166
- return False
1167
-
1168
- def export_results(self) -> bool:
1169
- """
1170
- Export study results in requested formats.
447
+ # File Discovery
448
+ params_lines.append(' # === File Discovery ===')
449
+ params_lines.append(f' "file_extensions": {params_dict.get("file_extensions", [".wiff", ".raw", ".mzML"])!r}, # File extensions to search for')
450
+ params_lines.append(f' "search_subfolders": {params_dict.get("search_subfolders", True)}, # Whether to search subdirectories recursively')
451
+ params_lines.append(f' "skip_patterns": {params_dict.get("skip_patterns", ["blank", "condition"])!r}, # Filename patterns to skip')
452
+ params_lines.append('')
1171
453
 
1172
- Returns:
1173
- True if export was successful
1174
- """
1175
- self._log_progress("=== Exporting Study Results ===")
1176
- self.current_step = "exporting_results"
454
+ # Processing Parameters
455
+ params_lines.append(' # === Processing Parameters ===')
456
+ params_lines.append(f' "adducts": {params_dict.get("adducts", [])!r}, # Adduct specifications for feature detection and annotation')
457
+ params_lines.append(f' "detector_type": {params_dict.get("detector_type", "unknown")!r}, # MS detector type ("orbitrap", "tof", "unknown")')
458
+ params_lines.append('')
459
+
460
+ # Alignment & Merging
461
+ params_lines.append(' # === Alignment & Merging ===')
462
+ params_lines.append(f' "rt_tol": {params_dict.get("rt_tol", 2.0)}, # Retention time tolerance for alignment (seconds)')
463
+ params_lines.append(f' "mz_tol": {params_dict.get("mz_tol", 0.01)}, # Mass-to-charge ratio tolerance for alignment (Da)')
464
+ params_lines.append(f' "alignment_method": {params_dict.get("alignment_method", "kd")!r}, # Algorithm for sample alignment')
465
+ params_lines.append(f' "min_samples_per_feature": {params_dict.get("min_samples_per_feature", 1)}, # Minimum samples required per consensus feature')
466
+ params_lines.append(f' "merge_method": {params_dict.get("merge_method", "qt")!r}, # Method for merging consensus features')
467
+ params_lines.append('')
468
+
469
+ # Sample Processing
470
+ params_lines.append(' # === Sample Processing (used in add_samples_from_folder) ===')
471
+ params_lines.append(f' "batch_size": {params_dict.get("batch_size", 8)}, # Number of files to process per batch')
472
+ params_lines.append(f' "memory_limit_gb": {params_dict.get("memory_limit_gb", 16.0)}, # Memory limit for processing (GB)')
473
+ params_lines.append('')
474
+
475
+ # Script Options
476
+ params_lines.append(' # === Script Options ===')
477
+ params_lines.append(f' "resume_enabled": {params_dict.get("resume_enabled", True)}, # Enable automatic resume capability')
478
+ params_lines.append(f' "force_reprocess": {params_dict.get("force_reprocess", False)}, # Force reprocessing of existing files')
479
+ params_lines.append(f' "cleanup_temp_files": {params_dict.get("cleanup_temp_files", True)}, # Clean up temporary files after processing')
480
+
481
+ params_lines.append('}')
482
+
483
+ # Create script lines
484
+ script_lines = [
485
+ '#!/usr/bin/env python3',
486
+ '"""',
487
+ 'Automated Mass Spectrometry Data Analysis Pipeline',
488
+ f'Generated by masster wizard v{version}',
489
+ '"""',
490
+ '',
491
+ 'import sys',
492
+ 'import time',
493
+ 'from pathlib import Path',
494
+ '',
495
+ '# Import masster modules',
496
+ 'from masster.study import Study',
497
+ 'from masster import __version__',
498
+ '',
499
+ ]
1177
500
 
1178
- if self.study is None:
1179
- self.logger.error("Study not available. Complete previous steps first.")
1180
- return False
501
+ # Add the formatted PARAMS
502
+ script_lines.extend(params_lines)
1181
503
 
1182
- try:
1183
- exports_completed = 0
1184
-
1185
- # Export consensus features as CSV
1186
- if "csv" in self.params.export_formats:
1187
- csv_file = self.study_folder_path / "consensus_features.csv"
1188
- if hasattr(self.study.consensus_df, 'write_csv'):
1189
- self.study.consensus_df.write_csv(str(csv_file))
1190
- exports_completed += 1
1191
- self.logger.info(f"Exported CSV: {csv_file}")
1192
-
1193
- # Export as Excel
1194
- if "xlsx" in self.params.export_formats and hasattr(self.study, 'export_xlsx'):
1195
- xlsx_file = self.study_folder_path / "study_results.xlsx"
1196
- self.study.export_xlsx(filename=str(xlsx_file))
1197
- exports_completed += 1
1198
- self.logger.info(f"Exported Excel: {xlsx_file}")
1199
-
1200
- # Export MGF for MS2 spectra
1201
- if "mgf" in self.params.export_formats and hasattr(self.study, 'export_mgf'):
1202
- mgf_file = self.study_folder_path / "consensus_ms2.mgf"
1203
- self.study.export_mgf(filename=str(mgf_file))
1204
- exports_completed += 1
1205
- self.logger.info(f"Exported MGF: {mgf_file}")
1206
-
1207
- # Export as Parquet for efficient storage
1208
- if "parquet" in self.params.export_formats and hasattr(self.study, 'export_parquet'):
1209
- parquet_file = self.study_folder_path / "study_data.parquet"
1210
- self.study.export_parquet(filename=str(parquet_file))
1211
- exports_completed += 1
1212
- self.logger.info(f"Exported Parquet: {parquet_file}")
1213
-
1214
- self._log_progress(f"Completed {exports_completed} exports")
1215
- self._save_checkpoint()
1216
-
1217
- # Always perform additional export methods as requested
1218
- self._export_additional_formats()
1219
-
1220
- return True
1221
-
1222
- except Exception as e:
1223
- self.logger.error(f"Failed to export results: {e}")
1224
- return False
1225
-
1226
- def _export_additional_formats(self):
1227
- """Export additional formats: xlsx, parquet, save, and mgf."""
1228
- self.logger.info("=== Exporting Additional Formats ===")
1229
-
1230
- try:
1231
- # Force export xlsx (study results in Excel format)
1232
- xlsx_file = self.study_folder_path / "study_results.xlsx"
1233
- if hasattr(self.study, 'export_xlsx'):
1234
- self.study.export_xlsx(filename=str(xlsx_file))
1235
- self.logger.info(f"Exported Excel: {xlsx_file}")
1236
-
1237
- # Force export parquet (efficient binary format)
1238
- parquet_file = self.study_folder_path / "study_data.parquet"
1239
- if hasattr(self.study, 'export_parquet'):
1240
- self.study.export_parquet(filename=str(parquet_file))
1241
- self.logger.info(f"Exported Parquet: {parquet_file}")
1242
-
1243
- # Force save the study in study5 format
1244
- study_file = self.study_folder_path / "final_study.study5"
1245
- self.study.save(filename=str(study_file))
1246
- self.logger.info(f"Saved study: {study_file}")
1247
-
1248
- # Force export MGF for MS2 spectra
1249
- mgf_file = self.study_folder_path / "consensus_ms2.mgf"
1250
- if hasattr(self.study, 'export_mgf'):
1251
- self.study.export_mgf(filename=str(mgf_file))
1252
- self.logger.info(f"Exported MGF: {mgf_file}")
1253
-
1254
- except Exception as e:
1255
- self.logger.warning(f"Some additional exports failed: {e}")
1256
-
1257
- def save_study(self) -> bool:
1258
- """
1259
- Save the final study in optimized format.
1260
-
1261
- Returns:
1262
- True if study was saved successfully
1263
- """
1264
- self._log_progress("=== Saving Final Study ===")
1265
- self.current_step = "saving_study"
1266
-
1267
- if self.study is None:
1268
- self.logger.error("Study not available. Complete previous steps first.")
1269
- return False
1270
-
1271
- try:
1272
- study_file = self.study_folder_path / "final_study.study5"
1273
-
1274
- # Determine optimal save format based on study size
1275
- num_samples = len(self.study.samples_df)
1276
- num_features = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0
1277
-
1278
- if self.params.adaptive_compression:
1279
- # Use compressed format for large studies
1280
- if num_samples > 50 or num_features > 10000:
1281
- self.logger.info(f"Large study detected ({num_samples} samples, {num_features} features) - using compressed format")
1282
- self.params.compress_output = True
1283
- else:
1284
- self.logger.info(f"Small study ({num_samples} samples, {num_features} features) - using standard format")
1285
- self.params.compress_output = False
1286
-
1287
- # Save study
1288
- if self.params.compress_output and hasattr(self.study, 'save_compressed'):
1289
- self.study.save_compressed(filename=str(study_file))
1290
- self.logger.info(f"Saved compressed study: {study_file}")
1291
- else:
1292
- self.study.save(filename=str(study_file))
1293
- self.logger.info(f"Saved study: {study_file}")
1294
-
1295
- # Save metadata summary
1296
- metadata_file = self.study_folder_path / "study_metadata.txt"
1297
- with open(metadata_file, "w") as f:
1298
- f.write("Study Processing Summary\n")
1299
- f.write("========================\n")
1300
- f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
1301
- f.write(f"Polarity: {self.polarity}\n")
1302
- f.write(f"Adducts: {', '.join(self.adducts)}\n")
1303
- f.write(f"Number of Samples: {num_samples}\n")
1304
- f.write(f"Number of Consensus Features: {num_features}\n")
1305
- f.write(f"Successful Files: {len(self.processed_files)}\n")
1306
- f.write(f"Failed Files: {len(self.failed_files)}\n")
1307
- f.write(f"RT Tolerance: {self.params.rt_tolerance}s\n")
1308
- f.write(f"m/z Max Diff: {self.params.mz_max_diff} Da\n")
1309
- f.write(f"Merge Method: {self.params.merge_method}\n")
1310
- f.write(f"Processing Time: {self._get_total_processing_time()}\n")
1311
-
1312
- self._log_progress(f"Saved study metadata: {metadata_file}")
1313
- self._save_checkpoint()
1314
- return True
1315
-
1316
- except Exception as e:
1317
- self.logger.error(f"Failed to save study: {e}")
1318
- return False
1319
-
1320
- def cleanup_temp_files(self) -> bool:
1321
- """
1322
- Clean up temporary files if requested.
1323
-
1324
- Returns:
1325
- True if cleanup was successful
1326
- """
1327
- if not self.params.cleanup_temp_files:
1328
- return True
1329
-
1330
- self._log_progress("=== Cleaning Up Temporary Files ===")
1331
-
1332
- try:
1333
- cleaned_count = 0
1334
-
1335
- # Remove individual sample plots if study plots were generated
1336
- if self.params.generate_plots:
1337
- temp_plots = list(self.study_folder_path.glob("*_2d.html"))
1338
- for plot_file in temp_plots:
1339
- if plot_file.name not in ["alignment_plot.html", "consensus_2d.html", "pca_plot.html"]:
1340
- plot_file.unlink()
1341
- cleaned_count += 1
1342
-
1343
- # Remove checkpoint file
1344
- if self.checkpoint_file.exists():
1345
- self.checkpoint_file.unlink()
1346
- cleaned_count += 1
1347
-
1348
- self._log_progress(f"Cleaned up {cleaned_count} temporary files")
1349
- return True
1350
-
1351
- except Exception as e:
1352
- self.logger.error(f"Failed to cleanup temp files: {e}")
1353
- return False
1354
-
1355
- def run_full_pipeline(self) -> bool:
1356
- """
1357
- Run the complete automated processing pipeline.
1358
-
1359
- This method executes all processing steps in sequence:
1360
- 1. Convert raw files to sample5 format
1361
- 2. Assemble study from sample5 files
1362
- 3. Align and merge features
1363
- 4. Generate visualization plots
1364
- 5. Export results in requested formats
1365
- 6. Save final study
1366
- 7. Clean up temporary files
1367
-
1368
- Returns:
1369
- True if the entire pipeline completed successfully
1370
- """
1371
- self._log_progress("=" * 60)
1372
- self._log_progress("STARTING AUTOMATED STUDY PROCESSING PIPELINE")
1373
- self._log_progress("=" * 60)
1374
-
1375
- self.start_time = time.time()
1376
- pipeline_success = True
1377
-
1378
- try:
1379
- # Step 1: Convert to sample5
1380
- if not self.convert_to_sample5():
1381
- self.logger.error("Sample5 conversion failed")
1382
- return False
1383
-
1384
- # Step 2: Assemble study
1385
- if not self.assemble_study():
1386
- self.logger.error("Study assembly failed")
1387
- return False
1388
-
1389
- # Step 3: Align and merge
1390
- if not self.align_and_merge():
1391
- self.logger.error("Feature alignment and merging failed")
1392
- return False
1393
-
1394
- # Step 4: Generate plots
1395
- if not self.generate_plots():
1396
- self.logger.warning("Plot generation failed, continuing...")
1397
- pipeline_success = False
1398
-
1399
- # Step 5: Export results
1400
- if not self.export_results():
1401
- self.logger.warning("Result export failed, continuing...")
1402
- pipeline_success = False
1403
-
1404
- # Step 6: Save study
1405
- if not self.save_study():
1406
- self.logger.error("Study saving failed")
1407
- return False
1408
-
1409
- # Step 7: Cleanup
1410
- if not self.cleanup_temp_files():
1411
- self.logger.warning("Cleanup failed, continuing...")
1412
-
1413
- # Final summary
1414
- total_time = time.time() - self.start_time
1415
- self._log_progress("=" * 60)
1416
- self._log_progress("PIPELINE COMPLETED SUCCESSFULLY")
1417
- self._log_progress(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
1418
- self._log_progress(f"Files processed: {len(self.processed_files)}")
1419
- self._log_progress(f"Files failed: {len(self.failed_files)}")
1420
- if hasattr(self.study, 'consensus_df'):
1421
- self._log_progress(f"Consensus features: {len(self.study.consensus_df)}")
1422
- self._log_progress("=" * 60)
1423
-
1424
- return pipeline_success
1425
-
1426
- except KeyboardInterrupt:
1427
- self.logger.info("Pipeline interrupted by user")
1428
- self._save_checkpoint()
1429
- return False
1430
- except Exception as e:
1431
- self.logger.error(f"Pipeline failed with unexpected error: {e}")
1432
- self._save_checkpoint()
1433
- return False
1434
-
1435
- def _get_total_processing_time(self) -> str:
1436
- """Get formatted total processing time."""
1437
- if self.start_time is None:
1438
- return "Unknown"
1439
-
1440
- total_seconds = time.time() - self.start_time
1441
- hours = int(total_seconds // 3600)
1442
- minutes = int((total_seconds % 3600) // 60)
1443
- seconds = int(total_seconds % 60)
1444
-
1445
- if hours > 0:
1446
- return f"{hours}h {minutes}m {seconds}s"
1447
- elif minutes > 0:
1448
- return f"{minutes}m {seconds}s"
1449
- else:
1450
- return f"{seconds}s"
1451
-
1452
- def get_status(self) -> Dict[str, Any]:
1453
- """
1454
- Get current processing status.
1455
-
1456
- Returns:
1457
- Dictionary with current status information
1458
- """
1459
- return {
1460
- "current_step": self.current_step,
1461
- "processed_files": len(self.processed_files),
1462
- "failed_files": len(self.failed_files),
1463
- "study_loaded": self.study is not None,
1464
- "start_time": self.start_time.isoformat() if self.start_time else None,
1465
- "processing_time": self._get_total_processing_time(),
1466
- "parameters": {
1467
- "data_source": self.params.data_source,
1468
- "study_folder": self.params.study_folder,
1469
- "polarity": self.params.polarity,
1470
- "num_cores": self.params.num_cores,
1471
- "adducts": self.params.adducts,
1472
- }
1473
- }
1474
-
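The removed `get_status()` returned a plain dictionary; a short usage sketch, assuming a Wizard instance named `wizard` and using only keys present in the removed return statement:

```
status = wizard.get_status()
print(f"step: {status['current_step']}, "
      f"processed: {status['processed_files']}, failed: {status['failed_files']}")
print(f"elapsed: {status['processing_time']} (study loaded: {status['study_loaded']})")
```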
1475
- def execute(self) -> bool:
1476
- """
1477
- Execute the complete automated processing pipeline.
1478
-
1479
- This is a convenience method that runs the full pipeline with the wizard's
1480
- current configuration. It performs standalone analysis of the samples/studies
1481
- as proposed by the Wizard.
1482
-
1483
- Returns:
1484
- True if execution completed successfully, False otherwise
1485
- """
1486
- self._log_progress("Executing Wizard automated processing...")
1487
- return self.run_full_pipeline()
1488
-
1489
- def export_script(self, filename: str) -> bool:
1490
- """
1491
- Generate a Python script that replicates the wizard's processing steps.
1492
-
1493
- Creates a standalone Python script that can be executed independently
1494
- to perform the same analysis as the wizard with the current configuration.
1495
- The script will be saved in the study folder.
1496
-
1497
- This is useful for:
1498
- - Creating reproducible analysis scripts
1499
- - Customizing processing steps
1500
- - Running analysis in different environments
1501
- - Batch processing automation
1502
-
1503
- Parameters:
1504
- filename: Filename for the script (should end with .py). Script will be saved in the study folder.
1505
-
1506
- Returns:
1507
- True if script was generated successfully, False otherwise
1508
- """
1509
- self._log_progress("Generating analysis script...")
1510
-
1511
- try:
1512
- # Ensure the filename is just a filename, not a full path
1513
- script_filename = Path(filename).name
1514
- if not script_filename.endswith('.py'):
1515
- script_filename = script_filename.replace(Path(script_filename).suffix, '') + '.py'
1516
-
1517
- # Place the script in the study folder
1518
- script_path = self.study_folder_path / script_filename
1519
-
1520
- # Generate the script content
1521
- script_content = self._generate_script_content()
1522
-
1523
- # Write the script
1524
- with open(script_path, 'w', encoding='utf-8') as f:
1525
- f.write(script_content)
1526
-
1527
- self._log_progress(f"Analysis script saved: {os.path.abspath(script_path)}")
1528
- self.logger.info(f"Generated standalone analysis script: {os.path.abspath(script_path)}")
1529
-
1530
- return True
1531
-
1532
- except Exception as e:
1533
- self.logger.error(f"Failed to generate script: {e}")
1534
- return False
1535
-
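Usage of the `export_script()` method removed here was a single call, with the script written into the study folder as the docstring above notes. A sketch (the filename is illustrative; `wizard` is an already-configured instance):

```
# Writes <study folder>/reprocess.py and returns True on success.
if wizard.export_script("reprocess.py"):
    print("analysis script written to the study folder")
```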
1536
- def to_script(self, filename: str) -> bool:
1537
- """
1538
- [DEPRECATED] Use export_script() instead.
1539
-
1540
- Backward compatibility alias for export_script().
1541
- """
1542
- return self.export_script(filename)
1543
-
1544
- def _generate_script_content(self) -> str:
1545
- """
1546
- Generate the content for the standalone analysis script.
1547
-
1548
- Returns:
1549
- Complete Python script content as string
1550
- """
1551
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
1552
-
1553
- # Create script header
1554
- script_lines = [
1555
- '#!/usr/bin/env python3',
1556
- '"""',
1557
- 'Standalone Mass Spectrometry Analysis Script',
1558
- f'Generated by masster.Wizard on {timestamp}',
504
+ # Add the main function and pipeline
505
+ script_lines.extend([
1559
506
  '',
1560
- 'This script replicates the automated processing pipeline configured',
1561
- 'in the Wizard with the following settings:',
1562
- f'- Data Source: {self.params.data_source.replace(chr(92), chr(92)*2)}',
1563
- f'- Study Folder: {self.params.study_folder.replace(chr(92), chr(92)*2)}',
1564
- f'- Polarity: {self.params.polarity}',
1565
- f'- Number of Cores: {self.params.num_cores}',
1566
- f'- Adducts: {", ".join(self.params.adducts)}',
1567
- f'- RT Tolerance: {self.params.rt_tolerance}s',
1568
- f'- m/z Max Diff: {self.params.mz_max_diff} Da',
1569
- f'- Merge Method: {self.params.merge_method}',
1570
- '"""',
1571
- '',
1572
- 'import os',
1573
- 'import sys',
1574
- 'import time',
1575
- 'import multiprocessing',
1576
- 'from pathlib import Path',
1577
- 'from typing import List, Optional',
1578
- 'import concurrent.futures',
1579
- 'from datetime import datetime',
1580
507
  '',
1581
- '# Add error handling for masster import',
1582
- '# First, try to add the masster directory to the Python path',
1583
- 'try:',
1584
- ' # Try to find masster by looking for it in common development locations',
1585
- ' possible_paths = [',
1586
- ' Path(__file__).parent.parent, # Script is in masster subfolder',
1587
- ' Path(__file__).parent.parent.parent, # Script is in study folder',
1588
- ' Path(os.getcwd()), # Current working directory',
1589
- ' Path(os.getcwd()).parent, # Parent of current directory',
1590
- ' Path(r"D:\\SW\\massistant"), # Specific development path',
1591
- ' Path.home() / "massistant", # Home directory',
1592
- ' Path.home() / "SW" / "massistant", # Common dev location',
1593
- ' ]',
508
+ 'def discover_raw_files(source_folder, file_extensions, search_subfolders=True):',
509
+ ' """Discover raw data files in the source folder."""',
510
+ ' source_path = Path(source_folder)',
511
+ ' raw_files = []',
1594
512
  ' ',
1595
- ' masster_found = False',
1596
- ' for possible_path in possible_paths:',
1597
- ' masster_dir = possible_path / "masster"',
1598
- ' if masster_dir.exists() and (masster_dir / "__init__.py").exists():',
1599
- ' if str(possible_path) not in sys.path:',
1600
- ' sys.path.insert(0, str(possible_path))',
1601
- ' masster_found = True',
1602
- ' print(f"Found masster at: {possible_path}")',
1603
- ' break',
1604
- ' ',
1605
- ' if not masster_found:',
1606
- ' # Try adding current directory to path as fallback',
1607
- ' current_dir = Path(os.getcwd())',
1608
- ' if str(current_dir) not in sys.path:',
1609
- ' sys.path.insert(0, str(current_dir))',
1610
- ' ',
1611
- ' import masster',
1612
- 'except ImportError as e:',
1613
- ' print(f"Error: masster library not found. {e}")',
1614
- ' print("Please ensure masster is installed or run this script from the masster directory.")',
1615
- ' print("You can install masster with: pip install -e .")',
1616
- ' sys.exit(1)',
1617
- '',
1618
- '',
1619
- 'def infer_polarity_from_first_file():',
1620
- ' """Infer polarity from the first available raw data file."""',
1621
- ' try:',
1622
- ' data_source_path = Path(DATA_SOURCE)',
1623
- ' # Find first file',
1624
- ' for extension in [\'.wiff\', \'.raw\', \'.mzML\', \'.d\']:',
1625
- ' pattern = f"**/*{extension}"',
1626
- ' files = list(data_source_path.rglob(pattern))',
1627
- ' if files:',
1628
- ' first_file = files[0]',
1629
- ' break',
513
+ ' for ext in file_extensions:',
514
+ ' if search_subfolders:',
515
+ ' pattern = f"**/*{ext}"',
516
+ ' files = list(source_path.rglob(pattern))',
1630
517
  ' else:',
1631
- ' return None',
1632
- ' ',
1633
- ' # Only implement for .wiff files initially',
1634
- ' if first_file.suffix.lower() == \'.wiff\':',
1635
- ' from masster.sample.load import _wiff_to_dict',
1636
- ' ',
1637
- ' # Extract metadata from first file',
1638
- ' metadata_df = _wiff_to_dict(str(first_file))',
1639
- ' ',
1640
- ' if not metadata_df.empty and \'polarity\' in metadata_df.columns:',
1641
- ' # Get polarity from first experiment',
1642
- ' first_polarity = metadata_df[\'polarity\'].iloc[0]',
1643
- ' ',
1644
- ' # Convert numeric polarity codes to string',
1645
- ' if first_polarity == 1 or str(first_polarity).lower() in [\'positive\', \'pos\', \'+\']:',
1646
- ' return "positive"',
1647
- ' elif first_polarity == -1 or str(first_polarity).lower() in [\'negative\', \'neg\', \'-\']:',
1648
- ' return "negative"',
1649
- ' except Exception:',
1650
- ' pass',
1651
- ' return None',
1652
- '',
1653
- '',
1654
- '# Configuration Parameters',
1655
- f'DATA_SOURCE = r"{self.params.data_source}"',
1656
- f'STUDY_FOLDER = r"{self.params.study_folder}"',
1657
- '',
1658
- '# Auto-infer polarity from first file, fall back to default',
1659
- 'detected_polarity = infer_polarity_from_first_file()',
1660
- f'POLARITY = detected_polarity or "{self.params.polarity}"',
1661
- 'NUM_CORES = max(1, int(multiprocessing.cpu_count() * 0.75)) # Auto-detect 75% of cores',
1662
- '',
1663
- '# Set adducts based on detected polarity',
1664
- 'if POLARITY.lower() in ["positive", "pos"]:',
1665
- ' ADDUCTS = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]',
1666
- 'elif POLARITY.lower() in ["negative", "neg"]:',
1667
- ' ADDUCTS = ["H-1:-:1.0", "CH2O2:0:0.5"]',
1668
- 'else:',
1669
- f' ADDUCTS = {self.params.adducts!r} # Fall back to original',
1670
- f'RT_TOLERANCE = {self.params.rt_tolerance}',
1671
- f'MZ_TOLERANCE = {self.params.mz_max_diff}',
1672
- f'MERGE_METHOD = "{self.params.merge_method}"',
1673
- f'BATCH_SIZE = {self.params.batch_size}',
1674
- f'CHROM_FWHM = {self.params.chrom_fwhm}',
1675
- f'NOISE_THRESHOLD = {self.params.noise_threshold}',
1676
- f'CHROM_PEAK_SNR = {self.params.chrom_peak_snr}',
1677
- f'TOL_PPM = {self.params.tol_ppm}',
1678
- f'MIN_SAMPLES_FOR_MERGE = {self.params.min_samples_for_merge}',
1679
- '',
1680
- '# File discovery settings',
1681
- "FILE_EXTENSIONS = ['.wiff', '.raw', '.mzML']",
1682
- f'SEARCH_SUBFOLDERS = {self.params.search_subfolders}',
1683
- "SKIP_PATTERNS = []",
1684
- f'MAX_FILE_SIZE_GB = {self.params.max_file_size_gb}',
1685
- '',
1686
- '# Output settings',
1687
- f'GENERATE_PLOTS = {self.params.generate_plots}',
1688
- f'EXPORT_FORMATS = {self.params.export_formats!r}',
1689
- f'COMPRESS_OUTPUT = {self.params.compress_output}',
1690
- f'CLEANUP_TEMP_FILES = {self.params.cleanup_temp_files}',
1691
- '',
1692
- '',
1693
- 'def log_progress(message: str):',
1694
- ' """Log progress message with timestamp."""',
1695
- ' timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")',
1696
- ' print(f"[{timestamp}] {message}")',
1697
- '',
1698
- '',
1699
- 'def discover_files() -> List[Path]:',
1700
- ' """Discover raw data files in the source directory."""',
1701
- ' log_progress("Discovering raw data files...")',
1702
- ' data_source_path = Path(DATA_SOURCE)',
1703
- ' found_files = []',
1704
- ' ',
1705
- ' for extension in FILE_EXTENSIONS:',
1706
- ' if SEARCH_SUBFOLDERS:',
1707
- ' pattern = f"**/*{extension}"',
1708
- ' files = list(data_source_path.rglob(pattern))',
1709
- ' else:',
1710
- ' pattern = f"*{extension}"',
1711
- ' files = list(data_source_path.glob(pattern))',
1712
- ' ',
1713
- ' # Filter out files matching skip patterns',
1714
- ' filtered_files = []',
1715
- ' for file_path in files:',
1716
- ' skip_file = False',
1717
- ' for pattern in SKIP_PATTERNS:',
1718
- ' if pattern.lower() in file_path.name.lower():',
1719
- ' skip_file = True',
1720
- ' print(f"Skipping file (matches pattern \'{pattern}\'): {file_path.name}")',
1721
- ' break',
1722
- ' ',
1723
- ' if not skip_file:',
1724
- ' # Check file size',
1725
- ' try:',
1726
- ' file_size_gb = file_path.stat().st_size / (1024**3)',
1727
- ' if file_size_gb > MAX_FILE_SIZE_GB:',
1728
- ' print(f"Large file ({file_size_gb:.1f}GB): {file_path.name}")',
1729
- ' filtered_files.append(file_path)',
1730
- ' except Exception as e:',
1731
- ' print(f"Could not check file size for {file_path}: {e}")',
1732
- ' filtered_files.append(file_path)',
1733
- ' ',
1734
- ' found_files.extend(filtered_files)',
1735
- ' log_progress(f"Found {len(filtered_files)} {extension} files")',
518
+ ' pattern = f"*{ext}"',
519
+ ' files = list(source_path.glob(pattern))',
520
+ ' raw_files.extend(files)',
1736
521
  ' ',
1737
- ' # Remove duplicates and sort',
1738
- ' found_files = sorted(list(set(found_files)))',
1739
- ' log_progress(f"Total files discovered: {len(found_files)}")',
1740
- ' return found_files',
522
+ ' return raw_files',
1741
523
  '',
1742
524
  '',
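As a plain, unquoted illustration of the discovery helper the generated script now defines: it is called once with the source folder, the extension list, and the recursion flag. The arguments below are example values only (the function exists inside the generated script, not as a masster import):

```
# Illustrative call; "./raw_data" and the extension list are placeholders.
raw_files = discover_raw_files(
    source_folder="./raw_data",
    file_extensions=[".wiff", ".raw", ".mzML"],
    search_subfolders=True,
)
print(f"found {len(raw_files)} candidate files")
```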
1743
- 'def process_single_file(file_path: Path) -> Optional[str]:',
1744
- ' """Process a single file to sample5 format."""',
1745
- ' import gc',
1746
- ' study_folder_path = Path(STUDY_FOLDER)',
1747
- ' ',
1748
- ' # Generate output filename',
1749
- ' file_out = file_path.stem + ".sample5"',
1750
- ' output_file = study_folder_path / file_out',
1751
- ' ',
1752
- ' # Check if file already exists',
1753
- ' if output_file.exists():',
1754
- ' try:',
1755
- ' # Try to load existing file to verify it\'s valid',
1756
- ' sample = masster.Sample(log_level="ERROR")',
1757
- ' sample.load(str(output_file))',
1758
- ' print(f"Skipping {file_path.name} (already processed)")',
1759
- ' return output_file.stem',
1760
- ' except Exception:',
1761
- ' # If loading fails, file needs to be reprocessed',
1762
- ' pass',
1763
- ' ',
1764
- ' print(f"Processing {file_path.name}")',
525
+ 'def process_single_file(args):',
526
+ ' """Process a single raw file to sample5 format - module level for multiprocessing."""',
527
+ ' raw_file, output_folder = args',
528
+ ' from masster.sample import Sample',
1765
529
  ' ',
1766
530
  ' try:',
1767
- ' # Initialize sample',
1768
- ' sample = masster.Sample(',
1769
- ' log_label=file_path.name,',
1770
- ' log_level="ERROR" # Reduce logging overhead',
1771
- ' )',
531
+ ' # Create sample5 filename',
532
+ ' sample_name = raw_file.stem',
533
+ ' sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
1772
534
  ' ',
1773
- ' # STEP 1: Load raw data',
1774
- ' sample.load(str(file_path))',
535
+ ' # Skip if sample5 already exists',
536
+ ' if sample5_path.exists():',
537
+ ' print(f" Skipping {raw_file.name} (sample5 already exists)")',
538
+ ' return str(sample5_path)',
1775
539
  ' ',
1776
- ' # STEP 2: Feature detection - First pass (strict parameters)',
1777
- ' sample.find_features(',
1778
- ' chrom_fwhm=CHROM_FWHM,',
1779
- ' noise=NOISE_THRESHOLD,',
1780
- ' tol_ppm=TOL_PPM,',
1781
- ' chrom_peak_snr=CHROM_PEAK_SNR,',
1782
- ' min_trace_length_multiplier=0.5,',
1783
- ' chrom_fwhm_min=CHROM_FWHM',
1784
- ' )',
540
+ ' print(f" Converting {raw_file.name}...")',
1785
541
  ' ',
1786
- ' # STEP 3: Feature detection - Second pass (relaxed parameters)',
542
+ ' # Load and process raw file with full pipeline',
543
+ ' sample = Sample(log_label=sample_name)',
544
+ ' sample.load(filename=str(raw_file))',
1787
545
  ' sample.find_features(',
1788
- ' chrom_peak_snr=CHROM_PEAK_SNR,',
1789
- ' noise=NOISE_THRESHOLD / 10, # Lower noise threshold',
1790
- ' chrom_fwhm=2.0 # Wider peaks',
546
+ ' noise=PARAMS[\'noise\'],',
547
+ ' chrom_fwhm=PARAMS[\'chrom_fwhm\'],',
548
+ ' chrom_peak_snr=PARAMS[\'chrom_peak_snr\']',
1791
549
  ' )',
1792
- ' ',
1793
- ' # STEP 3.5: Validate feature detection results',
1794
- ' if not hasattr(sample, "features_df") or sample.features_df is None or len(sample.features_df) == 0:',
1795
- ' print(f"WARNING: No features detected in {file_path.name} - skipping additional processing")',
1796
- ' # Still save the sample5 file for record keeping',
1797
- ' sample.save(filename=str(output_file))',
1798
- ' return output_file.stem',
1799
- ' ',
1800
- ' print(f"Detected {len(sample.features_df)} features in {file_path.name}")',
1801
- ' ',
1802
- ' # STEP 4: Adduct detection',
1803
- ' sample.find_adducts(adducts=ADDUCTS)',
1804
- ' ',
1805
- ' # STEP 5: MS2 spectrum identification',
550
+ ' sample.find_adducts(adducts=PARAMS[\'adducts\'])',
1806
551
  ' sample.find_ms2()',
552
+ ' # sample.find_iso()',
553
+ ' # sample.export_mgf()',
554
+ ' # sample.export_mztab()',
555
+ ' # sample.plot_2d(filename="{sample_name}.html")',
556
+ ' sample.save(str(sample5_path))',
1807
557
  ' ',
1808
- ' # STEP 6: Save processed data',
1809
- ' sample.save(filename=str(output_file))',
1810
- ' ',
1811
- ' # STEP 7: Generate additional outputs (only for samples with features)',
1812
- ' # Skip CSV export and individual MGF export as requested',
1813
- ' ',
1814
- ' if GENERATE_PLOTS:',
1815
- ' plot_file = output_file.parent / (output_file.stem + "_2d.html")',
1816
- ' sample.plot_2d(filename=str(plot_file), markersize=4)',
1817
- ' ',
1818
- ' # Memory cleanup',
1819
- ' result = output_file.stem',
1820
- ' del sample',
1821
- ' gc.collect()',
1822
- ' return result',
558
+ ' # print(f" Completed {raw_file.name} -> {sample5_path.name}")',
559
+ ' return str(sample5_path)',
1823
560
  ' ',
1824
561
  ' except Exception as e:',
1825
- ' print(f"Error processing {file_path.name}: {e}")',
1826
- ' gc.collect()',
562
+ ' print(f" ERROR processing {raw_file.name}: {e}")',
1827
563
  ' return None',
1828
564
  '',
1829
565
  '',
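Stripped of the `script_lines` string quoting, the per-file conversion that the new `process_single_file()` performs reduces to the Sample calls below. This is a sketch: the file path, thresholds, and adduct list are placeholders for what the generated script reads from its `PARAMS` block.

```
from pathlib import Path
from masster.sample import Sample

raw_file = Path("example.wiff")                          # placeholder input file
out_path = Path("./processed_study") / f"{raw_file.stem}.sample5"

sample = Sample(log_label=raw_file.stem)
sample.load(filename=str(raw_file))
sample.find_features(noise=50.0, chrom_fwhm=0.5, chrom_peak_snr=5.0)   # example thresholds
sample.find_adducts(adducts=["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"])      # example positive-mode adducts
sample.find_ms2()
sample.save(str(out_path))
```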
1830
- 'def convert_to_sample5(file_list: List[Path]) -> bool:',
1831
- ' """Convert raw data files to sample5 format in parallel."""',
1832
- ' log_progress("=== Starting Sample5 Conversion ===")',
1833
- ' log_progress(f"Converting {len(file_list)} files to sample5 format")',
566
+ 'def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):',
567
+ ' """Convert raw data files to sample5 format."""',
568
+ ' import concurrent.futures',
569
+ ' import os',
570
+ ' ',
571
+ ' # Create output directory',
572
+ ' os.makedirs(output_folder, exist_ok=True)',
1834
573
  ' ',
1835
- ' conversion_start = time.time()',
1836
- ' successful_count = 0',
1837
- ' failed_count = 0',
574
+ ' # Prepare arguments for multiprocessing',
575
+ ' file_args = [(raw_file, output_folder) for raw_file in raw_files]',
1838
576
  ' ',
1839
- ' with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_CORES) as executor:',
1840
- ' futures = [executor.submit(process_single_file, file_path) for file_path in file_list]',
577
+ ' # Process files in parallel',
578
+ ' sample5_files = []',
579
+ ' with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:',
580
+ ' futures = [executor.submit(process_single_file, args) for args in file_args]',
1841
581
  ' ',
1842
- ' for i, future in enumerate(concurrent.futures.as_completed(futures)):',
582
+ ' for future in concurrent.futures.as_completed(futures):',
1843
583
  ' result = future.result()',
1844
584
  ' if result:',
1845
- ' successful_count += 1',
1846
- ' else:',
1847
- ' failed_count += 1',
1848
- ' ',
1849
- ' # Progress update',
1850
- ' if (successful_count + failed_count) % 10 == 0:',
1851
- ' progress = (successful_count + failed_count) / len(file_list) * 100',
1852
- ' log_progress(f"Progress: {progress:.1f}% ({successful_count} successful, {failed_count} failed)")',
585
+ ' sample5_files.append(result)',
1853
586
  ' ',
1854
- ' conversion_time = time.time() - conversion_start',
1855
- ' log_progress("=== Sample5 Conversion Complete ===")',
1856
- ' log_progress(f"Successful: {successful_count}")',
1857
- ' log_progress(f"Failed: {failed_count}")',
1858
- ' log_progress(f"Total time: {conversion_time:.1f} seconds")',
1859
- ' ',
1860
- ' return successful_count > 0',
1861
- '',
1862
- '',
1863
- 'def assemble_study() -> masster.Study:',
1864
- ' """Assemble processed sample5 files into a study."""',
1865
- ' log_progress("=== Starting Study Assembly ===")',
1866
- ' study_folder_path = Path(STUDY_FOLDER)',
1867
- ' ',
1868
- ' # Find all sample5 files',
1869
- ' sample5_files = list(study_folder_path.glob("*.sample5"))',
1870
- ' if not sample5_files:',
1871
- ' raise RuntimeError("No sample5 files found for study assembly")',
1872
- ' ',
1873
- ' log_progress(f"Assembling study from {len(sample5_files)} sample5 files")',
1874
- ' ',
1875
- ' # Create study with optimized settings',
1876
- ' from masster.study.defaults.study_def import study_defaults',
1877
- ' study_params = study_defaults(',
1878
- ' folder=str(study_folder_path),',
1879
- ' polarity=POLARITY,',
1880
- ' log_level="INFO",',
1881
- f' log_label="Study-{self.params.polarity}",',
1882
- ' adducts=ADDUCTS',
1883
- ' )',
1884
- ' ',
1885
- ' study = masster.Study(params=study_params)',
1886
- ' ',
1887
- ' # Add all sample5 files',
1888
- ' sample5_pattern = str(study_folder_path / "*.sample5")',
1889
- ' study.add(sample5_pattern)',
1890
- ' log_progress(f"Added {len(study.samples_df)} samples to study")',
1891
- ' ',
1892
- ' # Filter features based on quality criteria',
1893
- ' if hasattr(study, "features_filter"):',
1894
- ' initial_features = len(study.features_df) if hasattr(study, "features_df") else 0',
1895
- ' feature_selection = study.features_select(',
1896
- ' chrom_coherence=0.3,',
1897
- ' chrom_prominence_scaled=1',
1898
- ' )',
1899
- ' study.features_filter(feature_selection)',
1900
- ' final_features = len(study.features_df) if hasattr(study, "features_df") else 0',
1901
- ' log_progress(f"Feature filtering: {initial_features} -> {final_features} features")',
1902
- ' ',
1903
- ' return study',
1904
- '',
1905
- '',
1906
- 'def align_and_merge(study: masster.Study) -> masster.Study:',
1907
- ' """Perform feature alignment and merging."""',
1908
- ' log_progress("=== Starting Feature Alignment and Merging ===")',
1909
- ' ',
1910
- ' # Import alignment and merge defaults',
1911
- ' from masster.study.defaults.align_def import align_defaults',
1912
- ' from masster.study.defaults.merge_def import merge_defaults',
1913
- ' ',
1914
- ' # Determine optimal algorithms based on study size',
1915
- ' num_samples = len(study.samples_df)',
1916
- ' ',
1917
- ' if num_samples < 500:',
1918
- ' # For smaller studies: use qt for both alignment and merge',
1919
- ' alignment_algorithm = "qt"',
1920
- ' merge_method = "qt"',
1921
- ' log_progress(f"Small study ({num_samples} samples) - using qt algorithms")',
1922
- ' else:',
1923
- ' # For larger studies: use kd for alignment and qt-chunked for merge',
1924
- ' alignment_algorithm = "kd"',
1925
- ' merge_method = "qt-chunked"',
1926
- ' log_progress(f"Large study ({num_samples} samples) - using kd alignment and qt-chunked merge")',
1927
- ' ',
1928
- ' # Align features across samples',
1929
- ' align_params = align_defaults(',
1930
- ' rt_tol=RT_TOLERANCE,',
1931
- ' mz_max_diff=MZ_TOLERANCE,',
1932
- ' algorithm=alignment_algorithm',
1933
- ' )',
1934
- ' ',
1935
- ' log_progress(f"Aligning features with RT tolerance {RT_TOLERANCE}s, m/z tolerance {MZ_TOLERANCE} Da, algorithm: {alignment_algorithm}")',
1936
- ' study.align(params=align_params)',
1937
- ' ',
1938
- ' # Merge aligned features',
1939
- ' merge_params = merge_defaults(',
1940
- ' method=merge_method,',
1941
- ' rt_tol=RT_TOLERANCE,',
1942
- ' mz_tol=MZ_TOLERANCE,',
1943
- ' min_samples=MIN_SAMPLES_FOR_MERGE',
1944
- ' )',
1945
- ' ',
1946
- ' log_progress(f"Merging features using {merge_method} method")',
1947
- ' study.merge(params=merge_params)',
1948
- ' ',
1949
- ' # Log results',
1950
- ' num_consensus = len(study.consensus_df) if hasattr(study, "consensus_df") else 0',
1951
- ' log_progress(f"Generated {num_consensus} consensus features")',
1952
- ' ',
1953
- ' # Get study info',
1954
- ' if hasattr(study, "info"):',
1955
- ' study.info()',
1956
- ' ',
1957
- ' return study',
1958
- '',
1959
- '',
1960
- 'def generate_plots(study: masster.Study) -> bool:',
1961
- ' """Generate visualization plots for the study."""',
1962
- ' if not GENERATE_PLOTS:',
1963
- ' log_progress("Plot generation disabled, skipping...")',
1964
- ' return True',
1965
- ' ',
1966
- ' log_progress("=== Generating Visualization Plots ===")',
1967
- ' study_folder_path = Path(STUDY_FOLDER)',
1968
- ' plots_generated = 0',
1969
- ' ',
1970
- ' try:',
1971
- ' # Alignment plot',
1972
- ' if hasattr(study, "plot_alignment"):',
1973
- ' alignment_plot = study_folder_path / "alignment_plot.html"',
1974
- ' study.plot_alignment(filename=str(alignment_plot))',
1975
- ' plots_generated += 1',
1976
- ' log_progress(f"Generated alignment plot: {alignment_plot}")',
1977
- ' ',
1978
- ' # Consensus 2D plot',
1979
- ' if hasattr(study, "plot_consensus_2d"):',
1980
- ' consensus_2d_plot = study_folder_path / "consensus_2d.html"',
1981
- ' study.plot_consensus_2d(filename=str(consensus_2d_plot))',
1982
- ' plots_generated += 1',
1983
- ' log_progress(f"Generated consensus 2D plot: {consensus_2d_plot}")',
1984
- ' ',
1985
- ' # PCA plot',
1986
- ' if hasattr(study, "plot_pca"):',
1987
- ' pca_plot = study_folder_path / "pca_plot.html"',
1988
- ' study.plot_pca(filename=str(pca_plot))',
1989
- ' plots_generated += 1',
1990
- ' log_progress(f"Generated PCA plot: {pca_plot}")',
1991
- ' ',
1992
- ' # Consensus statistics',
1993
- ' if hasattr(study, "plot_consensus_stats"):',
1994
- ' stats_plot = study_folder_path / "consensus_stats.html"',
1995
- ' study.plot_consensus_stats(filename=str(stats_plot))',
1996
- ' plots_generated += 1',
1997
- ' log_progress(f"Generated statistics plot: {stats_plot}")',
1998
- ' ',
1999
- ' log_progress(f"Generated {plots_generated} visualization plots")',
2000
- ' return True',
2001
- ' ',
2002
- ' except Exception as e:',
2003
- ' print(f"Failed to generate plots: {e}")',
2004
- ' return False',
2005
- '',
2006
- '',
2007
- 'def export_results(study: masster.Study) -> bool:',
2008
- ' """Export study results in requested formats."""',
2009
- ' log_progress("=== Exporting Study Results ===")',
2010
- ' study_folder_path = Path(STUDY_FOLDER)',
2011
- ' exports_completed = 0',
2012
- ' ',
2013
- ' try:',
2014
- ' # Skip CSV export as requested',
2015
- ' ',
2016
- ' # Export as Excel',
2017
- ' if "xlsx" in EXPORT_FORMATS and hasattr(study, "export_xlsx"):',
2018
- ' xlsx_file = study_folder_path / "study_results.xlsx"',
2019
- ' study.export_xlsx(filename=str(xlsx_file))',
2020
- ' exports_completed += 1',
2021
- ' log_progress(f"Exported Excel: {xlsx_file}")',
2022
- ' ',
2023
- ' # Export MGF for MS2 spectra',
2024
- ' if "mgf" in EXPORT_FORMATS and hasattr(study, "export_mgf"):',
2025
- ' mgf_file = study_folder_path / "consensus_ms2.mgf"',
2026
- ' study.export_mgf(filename=str(mgf_file))',
2027
- ' exports_completed += 1',
2028
- ' log_progress(f"Exported MGF: {mgf_file}")',
2029
- ' ',
2030
- ' # Export as Parquet for efficient storage',
2031
- ' if "parquet" in EXPORT_FORMATS and hasattr(study, "export_parquet"):',
2032
- ' parquet_file = study_folder_path / "study_data.parquet"',
2033
- ' study.export_parquet(filename=str(parquet_file))',
2034
- ' exports_completed += 1',
2035
- ' log_progress(f"Exported Parquet: {parquet_file}")',
2036
- ' ',
2037
- ' log_progress(f"Completed {exports_completed} exports")',
2038
- ' ',
2039
- ' # Always perform additional exports as requested',
2040
- ' log_progress("=== Exporting Additional Formats ===")',
2041
- ' ',
2042
- ' try:',
2043
- ' # Force export xlsx (study results in Excel format)',
2044
- ' xlsx_file = study_folder_path / "study_results.xlsx"',
2045
- ' if hasattr(study, "export_xlsx"):',
2046
- ' study.export_xlsx(filename=str(xlsx_file))',
2047
- ' log_progress(f"Exported Excel: {xlsx_file}")',
2048
- ' ',
2049
- ' # Force export parquet (efficient binary format)',
2050
- ' parquet_file = study_folder_path / "study_data.parquet"',
2051
- ' if hasattr(study, "export_parquet"):',
2052
- ' study.export_parquet(filename=str(parquet_file))',
2053
- ' log_progress(f"Exported Parquet: {parquet_file}")',
2054
- ' ',
2055
- ' # Force save the study in study5 format',
2056
- ' study_file = study_folder_path / "final_study.study5"',
2057
- ' study.save(filename=str(study_file))',
2058
- ' log_progress(f"Saved study: {study_file}")',
2059
- ' ',
2060
- ' # Force export MGF for MS2 spectra',
2061
- ' mgf_file = study_folder_path / "consensus_ms2.mgf"',
2062
- ' if hasattr(study, "export_mgf"):',
2063
- ' study.export_mgf(filename=str(mgf_file))',
2064
- ' log_progress(f"Exported MGF: {mgf_file}")',
2065
- ' ',
2066
- ' except Exception as e:',
2067
- ' print(f"Some additional exports failed: {e}")',
2068
- ' ',
2069
- ' return True',
2070
- ' ',
2071
- ' except Exception as e:',
2072
- ' print(f"Failed to export results: {e}")',
2073
- ' return False',
2074
- '',
2075
- '',
2076
- 'def save_study(study: masster.Study) -> bool:',
2077
- ' """Save the final study in optimized format."""',
2078
- ' log_progress("=== Saving Final Study ===")',
2079
- ' study_folder_path = Path(STUDY_FOLDER)',
2080
- ' ',
2081
- ' try:',
2082
- ' study_file = study_folder_path / "final_study.study5"',
2083
- ' ',
2084
- ' # Determine optimal save format based on study size',
2085
- ' num_samples = len(study.samples_df)',
2086
- ' num_features = len(study.consensus_df) if hasattr(study, "consensus_df") else 0',
2087
- ' ',
2088
- ' if num_samples > 50 or num_features > 10000:',
2089
- ' log_progress(f"Large study detected ({num_samples} samples, {num_features} features) - using compressed format")',
2090
- ' compress_output = True',
2091
- ' else:',
2092
- ' log_progress(f"Small study ({num_samples} samples, {num_features} features) - using standard format")',
2093
- ' compress_output = False',
2094
- ' ',
2095
- ' # Save study',
2096
- ' if compress_output and hasattr(study, "save_compressed"):',
2097
- ' study.save_compressed(filename=str(study_file))',
2098
- ' log_progress(f"Saved compressed study: {study_file}")',
2099
- ' else:',
2100
- ' study.save(filename=str(study_file))',
2101
- ' log_progress(f"Saved study: {study_file}")',
2102
- ' ',
2103
- ' # Save metadata summary',
2104
- ' metadata_file = study_folder_path / "study_metadata.txt"',
2105
- ' with open(metadata_file, "w") as f:',
2106
- ' f.write("Study Processing Summary\\n")',
2107
- ' f.write("========================\\n")',
2108
- ' f.write(f"Processing Date: {datetime.now().strftime(\'%Y-%m-%d %H:%M:%S\')}\\n")',
2109
- ' f.write(f"Polarity: {POLARITY}\\n")',
2110
- ' f.write(f"Adducts: {\', \'.join(ADDUCTS)}\\n")',
2111
- ' f.write(f"Number of Samples: {num_samples}\\n")',
2112
- ' f.write(f"Number of Consensus Features: {num_features}\\n")',
2113
- ' f.write(f"RT Tolerance: {RT_TOLERANCE}s\\n")',
2114
- ' f.write(f"m/z Tolerance: {MZ_TOLERANCE} Da\\n")',
2115
- ' f.write(f"Merge Method: {MERGE_METHOD}\\n")',
2116
- ' ',
2117
- ' log_progress(f"Saved study metadata: {metadata_file}")',
2118
- ' return True',
2119
- ' ',
2120
- ' except Exception as e:',
2121
- ' print(f"Failed to save study: {e}")',
2122
- ' return False',
2123
- '',
2124
- '',
2125
- 'def cleanup_temp_files() -> bool:',
2126
- ' """Clean up temporary files if requested."""',
2127
- ' if not CLEANUP_TEMP_FILES:',
2128
- ' return True',
2129
- ' ',
2130
- ' log_progress("=== Cleaning Up Temporary Files ===")',
2131
- ' study_folder_path = Path(STUDY_FOLDER)',
2132
- ' ',
2133
- ' try:',
2134
- ' cleaned_count = 0',
2135
- ' ',
2136
- ' # Remove individual sample plots if study plots were generated',
2137
- ' if GENERATE_PLOTS:',
2138
- ' temp_plots = list(study_folder_path.glob("*_2d.html"))',
2139
- ' for plot_file in temp_plots:',
2140
- ' if plot_file.name not in ["alignment_plot.html", "consensus_2d.html", "pca_plot.html"]:',
2141
- ' plot_file.unlink()',
2142
- ' cleaned_count += 1',
2143
- ' ',
2144
- ' log_progress(f"Cleaned up {cleaned_count} temporary files")',
2145
- ' return True',
2146
- ' ',
2147
- ' except Exception as e:',
2148
- ' print(f"Failed to cleanup temp files: {e}")',
2149
- ' return False',
587
+ ' return sample5_files',
2150
588
  '',
2151
589
  '',
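`convert_raw_to_sample5()` above is a standard ProcessPoolExecutor fan-out; `process_single_file` is kept at module level so it can be pickled by the executor (a requirement with spawn-based multiprocessing, e.g. on Windows). The same pattern outside the quoted strings, with an example output folder and core count:

```
import concurrent.futures

# `raw_files` would come from discover_raw_files(); folder and worker count are examples.
file_args = [(raw_file, "./processed_study") for raw_file in raw_files]
sample5_files = []
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(process_single_file, args) for args in file_args]
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        if result is not None:            # None marks a failed conversion
            sample5_files.append(result)
```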
2152
590
  'def main():',
2153
- ' """Main execution function."""',
2154
- ' print("=" * 70)',
2155
- ' print("AUTOMATED MASS SPECTROMETRY ANALYSIS SCRIPT")',
2156
- f' print("Generated by masster.Wizard on {timestamp}")',
2157
- ' print("=" * 70)',
2158
- ' ',
2159
- ' start_time = time.time()',
2160
- ' ',
591
+ ' """Main analysis pipeline."""',
2161
592
  ' try:',
2162
- ' # Ensure output directory exists',
2163
- ' Path(STUDY_FOLDER).mkdir(parents=True, exist_ok=True)',
593
+ ' print("=" * 70)',
594
+ f' print("masster {version} - Automated MS Data Analysis")',
595
+ ' print("=" * 70)',
596
+ ' print(f"Source: {PARAMS[\'source\']}")',
597
+ ' print(f"Output: {PARAMS[\'folder\']}")',
598
+ ' print(f"Polarity: {PARAMS[\'polarity\']}")',
599
+ ' print(f"CPU Cores: {PARAMS[\'num_cores\']}")',
600
+ ' print("=" * 70)',
2164
601
  ' ',
2165
- ' # Step 1: Discover files',
2166
- ' file_list = discover_files()',
2167
- ' if not file_list:',
2168
- ' print("No files found for processing")',
2169
- ' return False',
602
+ ' start_time = time.time()',
603
+ ' ',
604
+ ' # Step 1: Discover raw data files',
605
+ ' print("\\nStep 1/7: Discovering raw data files...")',
606
+ ' raw_files = discover_raw_files(',
607
+ ' PARAMS[\'source\'],',
608
+ ' PARAMS[\'file_extensions\'],',
609
+ ' PARAMS[\'search_subfolders\']',
610
+ ' )',
2170
611
  ' ',
2171
- ' # Step 2: Convert to sample5',
2172
- ' if not convert_to_sample5(file_list):',
2173
- ' print("Sample5 conversion failed")',
612
+ ' if not raw_files:',
613
+ ' print("No raw data files found!")',
2174
614
  ' return False',
2175
615
  ' ',
2176
- ' # Step 3: Assemble study',
2177
- ' study = assemble_study()',
616
+ ' print(f"Found {len(raw_files)} raw data files")',
617
+ ' for f in raw_files[:5]: # Show first 5 files',
618
+ ' print(f" {f.name}")',
619
+ ' if len(raw_files) > 5:',
620
+ ' print(f" ... and {len(raw_files) - 5} more")',
621
+ ' ',
622
+ ' # Step 2: Process raw files',
623
+ ' print("\\nStep 2/7: Processing raw files...")',
624
+ ' sample5_files = convert_raw_to_sample5(',
625
+ ' raw_files,',
626
+ ' PARAMS[\'folder\'],',
627
+ ' PARAMS[\'polarity\'],',
628
+ ' PARAMS[\'num_cores\']',
629
+ ' )',
2178
630
  ' ',
2179
- ' # Step 4: Align and merge',
2180
- ' study = align_and_merge(study)',
631
+ ' if not sample5_files:',
632
+ ' print("No sample5 files were created!")',
633
+ ' return False',
2181
634
  ' ',
2182
- ' # Step 5: Generate plots',
2183
- ' generate_plots(study)',
635
+ ' print(f"Successfully processed {len(sample5_files)} files to sample5")',
2184
636
  ' ',
2185
- ' # Step 6: Export results',
2186
- ' export_results(study)',
637
+ ' # Step 3: Create and configure study',
638
+ ' print("\\nStep 3/7: Initializing study...")',
639
+ ' study = Study(folder=PARAMS[\'folder\'])',
640
+ ' study.polarity = PARAMS[\'polarity\']',
641
+ ' study.adducts = PARAMS[\'adducts\']',
2187
642
  ' ',
2188
- ' # Step 7: Save study',
2189
- ' save_study(study)',
643
+ ' # Step 4: Add sample5 files to study',
644
+ ' print("\\nStep 4/7: Adding samples to study...")',
645
+ ' study.add(str(Path(PARAMS[\'folder\']) / "*.sample5"))',
2190
646
  ' ',
2191
- ' # Step 8: Cleanup',
2192
- ' cleanup_temp_files()',
647
+ ' # Step 5: Core processing',
648
+ ' print("\\nStep 5/7: Processing...")',
649
+ ' study.align(',
650
+ ' algorithm=PARAMS[\'alignment_method\'],',
651
+ ' rt_tol=PARAMS[\'rt_tol\']',
652
+ ' )',
2193
653
  ' ',
2194
- ' # Final summary',
654
+ ' # Merge and create consensus features',
655
+ ' study.merge(',
656
+ ' min_samples=PARAMS[\'min_samples_per_feature\'],',
657
+ ' threads=PARAMS[\'num_cores\'],',
658
+ ' rt_tol=PARAMS[\'rt_tol\'],',
659
+ ' mz_tol=PARAMS[\'mz_tol\']',
660
+ ' )',
661
+ ' study.find_iso()',
662
+ ' study.fill(min_samples_rel=0.0)',
663
+ ' study.integrate()',
664
+ ' ',
665
+ ' # Step 6/7: Saving results',
666
+ ' print("\\nStep 6/7: Saving results...")',
667
+ ' study.save()',
668
+ ' study.export_xlsx()',
669
+ ' study.export_mgf()',
670
+ ' study.export_mztab()',
671
+ ' ',
672
+ ' # Step 7: Plots',
673
+ ' print("\\nStep 7/7: Exporting plots...")',
674
+ ' study.plot_consensus_2d(filename="consensus.html")',
675
+ ' study.plot_consensus_2d(filename="consensus.png")',
676
+ ' study.plot_alignment(filename="alignment.html")',
677
+ ' study.plot_alignment(filename="alignment.png")',
678
+ ' study.plot_pca(filename="pca.html")',
679
+ ' study.plot_pca(filename="pca.png")',
680
+ ' study.plot_bpc(filename="bpc.html")',
681
+ ' study.plot_bpc(filename="bpc.png")',
682
+ ' study.plot_rt_correction(filename="rt_correction.html")',
683
+ ' study.plot_rt_correction(filename="rt_correction.png")',
684
+
685
+ ' ',
686
+ ' # Print summary',
687
+ ' study.info()',
2195
688
  ' total_time = time.time() - start_time',
689
+ ' print("\\n" + "=" * 70)',
690
+ ' print("ANALYSIS COMPLETE")',
2196
691
  ' print("=" * 70)',
2197
- ' print("ANALYSIS COMPLETED SUCCESSFULLY")',
2198
692
  ' print(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")',
693
+ ' print(f"Raw files processed: {len(raw_files)}")',
694
+ ' print(f"Sample5 files created: {len(sample5_files)}")',
2199
695
  ' if hasattr(study, "consensus_df"):',
2200
696
  ' print(f"Consensus features generated: {len(study.consensus_df)}")',
2201
697
  ' print("=" * 70)',
@@ -2215,43 +711,15 @@ class Wizard:
2215
711
  'if __name__ == "__main__":',
2216
712
  ' success = main()',
2217
713
  ' sys.exit(0 if success else 1)',
2218
- ]
714
+ ])
2219
715
 
2220
716
  return '\n'.join(script_lines)
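Stripped of the string quoting, the study-level portion of the script emitted by `_generate_script_content()` boils down to the Study calls below. This is a sketch: the folder, tolerances, thresholds, and adducts are example values standing in for the `PARAMS` block the script defines, and the `from masster import Study` import is assumed (the generated script resolves masster on `sys.path` before this point).

```
from pathlib import Path
from masster import Study   # assumed import path

study = Study(folder="./processed_study")
study.polarity = "positive"
study.adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]

study.add(str(Path("./processed_study") / "*.sample5"))
study.align(algorithm="kd", rt_tol=1.5)                          # example alignment settings
study.merge(min_samples=2, threads=4, rt_tol=1.5, mz_tol=0.01)   # example merge settings
study.find_iso()
study.fill(min_samples_rel=0.0)
study.integrate()

study.save()
study.export_xlsx()
study.export_mgf()
study.export_mztab()
study.plot_consensus_2d(filename="consensus.html")               # plus the other plots the script writes
study.plot_pca(filename="pca.html")
```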
2221
-
2222
- def info(self):
2223
- """Print comprehensive wizard status information."""
2224
- status = self.get_status()
2225
-
2226
- print("\n" + "=" * 50)
2227
- print("WIZARD STATUS")
2228
- print("=" * 50)
2229
- print(f"Current Step: {status['current_step']}")
2230
- print(f"Data Source: {self.params.data_source}")
2231
- print(f"Study Folder: {self.params.study_folder}")
2232
- print(f"Polarity: {status['parameters']['polarity']}")
2233
- print(f"CPU Cores: {status['parameters']['num_cores']}")
2234
- print(f"Adducts: {', '.join(status['parameters']['adducts'])}")
2235
- print(f"Detector Type: {self.params.detector_type}")
2236
- print(f"Noise Threshold: {self.params.noise_threshold}")
2237
- print(f"Processing Time: {status['processing_time']}")
2238
- print(f"Files Processed: {status['processed_files']}")
2239
- print(f"Files Failed: {status['failed_files']}")
2240
- print(f"Study Loaded: {status['study_loaded']}")
2241
-
2242
- if self.study is not None and hasattr(self.study, 'samples_df'):
2243
- print(f"Samples in Study: {len(self.study.samples_df)}")
2244
-
2245
- if self.study is not None and hasattr(self.study, 'consensus_df'):
2246
- print(f"Consensus Features: {len(self.study.consensus_df)}")
2247
-
2248
- print("=" * 50)
2249
717
 
2250
718
 
2251
719
  def create_script(
2252
720
  source: str,
2253
- study_folder: str,
2254
- filename: str,
721
+ folder: str,
722
+ filename: str = 'run_masster.py',
2255
723
  polarity: str = "positive",
2256
724
  adducts: Optional[List[str]] = None,
2257
725
  params: Optional[wizard_def] = None,
@@ -2267,7 +735,7 @@ def create_script(
2267
735
 
2268
736
  Parameters:
2269
737
  source: Directory containing raw data files
2270
- study_folder: Output directory for processed study
738
+ folder: Output directory for processed study
2271
739
  filename: Filename for the generated script (should end with .py)
2272
740
  polarity: Ion polarity mode ("positive" or "negative")
2273
741
  adducts: List of adduct specifications (auto-set if None)
@@ -2282,7 +750,7 @@ def create_script(
2282
750
  >>> from masster.wizard import create_script
2283
751
  >>> create_script(
2284
752
  ... source=r'D:\\Data\\raw_files',
2285
- ... study_folder=r'D:\\Data\\output',
753
+ ... folder=r'D:\\Data\\output',
2286
754
  ... filename='run_masster.py',
2287
755
  ... polarity='positive'
2288
756
  ... )
@@ -2294,8 +762,8 @@ def create_script(
2294
762
  # Use provided params as base
2295
763
  wizard_params = params
2296
764
  # Update with provided values
2297
- wizard_params.data_source = source
2298
- wizard_params.study_folder = study_folder
765
+ wizard_params.source = source
766
+ wizard_params.folder = folder
2299
767
  if polarity != "positive": # Only override if explicitly different
2300
768
  wizard_params.polarity = polarity
2301
769
  if num_cores > 0:
@@ -2305,8 +773,8 @@ def create_script(
2305
773
  else:
2306
774
  # Create new params with provided values
2307
775
  wizard_params = wizard_def(
2308
- data_source=source,
2309
- study_folder=study_folder,
776
+ source=source,
777
+ folder=folder,
2310
778
  polarity=polarity,
2311
779
  num_cores=max(1, int(multiprocessing.cpu_count() * 0.75)) if num_cores <= 0 else num_cores
2312
780
  )
@@ -2320,28 +788,124 @@ def create_script(
2320
788
  setattr(wizard_params, key, value)
2321
789
 
2322
790
  # Ensure study folder exists
2323
- study_path = Path(study_folder)
791
+ study_path = Path(folder)
2324
792
  study_path.mkdir(parents=True, exist_ok=True)
2325
793
 
2326
794
  # Create a temporary Wizard instance to generate the script
2327
795
  temp_wizard = Wizard(params=wizard_params)
2328
796
 
2329
- # Generate the script using the existing method
2330
- success = temp_wizard.export_script(filename)
797
+ # Generate the script using the instance method
798
+ success = temp_wizard.create_script(filename)
799
+
800
+ return success
801
+
802
+ except Exception as e:
803
+ print(f"Failed to create script: {e}")
804
+ import traceback
805
+ traceback.print_exc()
806
+ return False
807
+
808
+
809
+ def execute(
810
+ source: str,
811
+ folder: str,
812
+ filename: str = 'run_masster.py',
813
+ polarity: str = "positive",
814
+ adducts: Optional[List[str]] = None,
815
+ params: Optional[wizard_def] = None,
816
+ num_cores: int = 0,
817
+ **kwargs
818
+ ) -> bool:
819
+ """
820
+ Create and execute a standalone analysis script for automated MS data processing.
821
+
822
+ This function generates a Python script with the same parameters as create_script()
823
+ and then immediately executes it, combining script generation and execution
824
+ in a single step.
825
+
826
+ Parameters:
827
+ source: Directory containing raw data files
828
+ folder: Output directory for processed study
829
+ filename: Filename for the generated script (should end with .py)
830
+ polarity: Ion polarity mode ("positive" or "negative")
831
+ adducts: List of adduct specifications (auto-set if None)
832
+ params: Custom wizard_def parameters (optional)
833
+ num_cores: Number of CPU cores (0 = auto-detect)
834
+ **kwargs: Additional parameters to override defaults
835
+
836
+ Returns:
837
+ True if script was created and executed successfully, False otherwise
838
+
839
+ Example:
840
+ >>> from masster.wizard import execute
841
+ >>> execute(
842
+ ... source=r'D:\\Data\\raw_files',
843
+ ... folder=r'D:\\Data\\output',
844
+ ... polarity='positive'
845
+ ... )
846
+ """
847
+
848
+ try:
849
+ # First, create the script using create_script()
850
+ script_created = create_script(
851
+ source=source,
852
+ folder=folder,
853
+ filename=filename,
854
+ polarity=polarity,
855
+ adducts=adducts,
856
+ params=params,
857
+ num_cores=num_cores,
858
+ **kwargs
859
+ )
860
+
861
+ if not script_created:
862
+ print("Failed to create analysis script")
863
+ return False
864
+
865
+ # Get the full path to the created script
866
+ study_path = Path(folder)
867
+ script_path = study_path / Path(filename).name
868
+
869
+ if not script_path.exists():
870
+ print(f"Script file not found: {script_path}")
871
+ return False
872
+
873
+ print(f"Executing...")
874
+ #print("=" * 70)
875
+
876
+ # Execute the script using subprocess with real-time output
877
+ import subprocess
878
+
879
+ # Run the script with Python, letting it inherit our stdout/stderr
880
+ try:
881
+ # Use subprocess.run for direct output inheritance - no capturing/re-printing
882
+ result = subprocess.run([
883
+ sys.executable, str(script_path)
884
+ ], cwd=str(study_path))
885
+
886
+ return_code = result.returncode
887
+
888
+ except Exception as e:
889
+ print(f"Error during script execution: {e}")
890
+ return False
891
+
892
+ success = return_code == 0
2331
893
 
2332
894
  if success:
2333
- script_path = study_path / Path(filename).name
2334
- print(f"Analysis script created: {script_path.absolute()}")
2335
- print(f"Run with: python \"{script_path}\"")
895
+ print("=" * 70)
896
+ print("Script execution completed successfully")
897
+ else:
898
+ print("=" * 70)
899
+ print(f"Script execution failed with return code: {return_code}")
2336
900
 
2337
901
  return success
2338
902
 
2339
903
  except Exception as e:
2340
- print(f"Failed to create script: {e}")
904
+ print(f"Failed to execute script: {e}")
2341
905
  import traceback
2342
906
  traceback.print_exc()
2343
907
  return False
2344
908
 
2345
909
 
2346
910
  # Export the main classes and functions
2347
- __all__ = ["Wizard", "wizard_def", "create_script"]
911
+ __all__ = ["Wizard", "wizard_def", "create_script", "execute"]