masster 0.4.19-py3-none-any.whl → 0.4.21-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster has been flagged as potentially problematic; see the package's registry listing for more details.

masster/wizard/wizard.py CHANGED
@@ -23,28 +23,22 @@ from masster import Wizard, wizard_def
23
23
 
24
24
  # Create wizard with default parameters
25
25
  wizard = Wizard(
26
- data_source="./raw_data",
27
- study_folder="./processed_study",
26
+ source="./raw_data",
27
+ folder="./processed_study",
28
28
  polarity="positive",
29
29
  num_cores=4
30
30
  )
31
31
 
32
- # Run complete processing pipeline
33
- wizard.run_full_pipeline()
34
-
35
- # Or run individual steps
36
- wizard.convert_to_sample5()
37
- wizard.assemble_study()
38
- wizard.align_and_merge()
39
- wizard.generate_plots()
40
- wizard.export_results()
41
32
  ```
42
33
  """
43
34
 
44
35
  from __future__ import annotations
45
36
 
46
37
  import os
38
+ import sys
47
39
  import time
40
+ import importlib
41
+ import glob
48
42
  import multiprocessing
49
43
  from pathlib import Path
50
44
  from typing import Optional, Any, Dict, List
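The hunk above shows the headline API change in 0.4.21: the constructor keywords `data_source` and `study_folder` become `source` and `folder`, and the step-by-step pipeline calls disappear from the docstring example. A minimal migration sketch, assuming masster 0.4.21 and the `execute()` entry point introduced later in this diff:

```python
from masster import Wizard

# 0.4.19 style (no longer matches the constructor):
# wizard = Wizard(data_source="./raw_data", study_folder="./processed_study")
# wizard.run_full_pipeline()

# 0.4.21 style: renamed keywords, script-based execution
wizard = Wizard(
    source="./raw_data",         # was data_source
    folder="./processed_study",  # was study_folder
    polarity="positive",
    num_cores=4,
)
wizard.execute()  # writes run_masster.py into the study folder and runs it
```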
@@ -57,6 +51,7 @@ from masster.logger import MassterLogger
57
51
  from masster.study.defaults.study_def import study_defaults
58
52
  from masster.study.defaults.align_def import align_defaults
59
53
  from masster.study.defaults.merge_def import merge_defaults
54
+ from masster._version import __version__ as version
60
55
 
61
56
 
62
57
  @dataclass
@@ -69,8 +64,8 @@ class wizard_def:
69
64
 
70
65
  Attributes:
71
66
  # Core Configuration
72
- data_source (str): Path to directory containing raw data files
73
- study_folder (str): Output directory for processed study
67
+ source (str): Path to directory containing raw data files
68
+ folder (str): Output directory for processed study
74
69
  polarity (str): Ion polarity mode ("positive" or "negative")
75
70
  num_cores (int): Number of CPU cores to use for parallel processing
76
71
 
@@ -101,15 +96,15 @@ class wizard_def:
101
96
  """
102
97
 
103
98
  # === Core Configuration ===
104
- data_source: str = ""
105
- study_folder: str = ""
99
+ source: str = ""
100
+ folder: str = ""
106
101
  polarity: str = "positive"
107
102
  num_cores: int = 4
108
103
 
109
104
  # === File Discovery ===
110
- file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML", ".d"])
105
+ file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML"])
111
106
  search_subfolders: bool = True
112
- skip_patterns: List[str] = field(default_factory=lambda: ["blank", "QC", "test"])
107
+ skip_patterns: List[str] = field(default_factory=lambda: ["blank", "test"])
113
108
 
114
109
  # === Processing Parameters ===
115
110
  adducts: List[str] = field(default_factory=list) # Will be set based on polarity
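Note that 0.4.21 also narrows the discovery defaults above: vendor `.d` directories are no longer searched and files matching `QC` are no longer skipped. A sketch of restoring the 0.4.19 behavior, assuming keyword overrides reach `wizard_def` through `Wizard`'s `**kwargs` (the `setattr` loop shown later in this diff):

```python
# Hypothetical override of the narrowed 0.4.21 discovery defaults.
wizard = Wizard(
    source="./raw_data",
    folder="./study",
    file_extensions=[".wiff", ".raw", ".mzML", ".d"],  # re-include vendor .d folders
    skip_patterns=["blank", "QC", "test"],             # skip QC runs again
)
```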
@@ -124,17 +119,18 @@ class wizard_def:
124
119
  checkpoint_interval: int = 10 # Save progress every N files
125
120
 
126
121
  # === Study Assembly ===
127
- min_samples_for_merge: int = 50
122
+ min_samples_for_merge: int = 2
128
123
  rt_tolerance: float = 1.5
129
- mz_tolerance: float = 0.01
124
+ mz_max_diff: float = 0.01
130
125
  alignment_algorithm: str = "kd"
131
- merge_method: str = "chunked"
126
+ merge_method: str = "qt"
132
127
 
133
128
  # === Feature Detection ===
134
- chrom_fwhm: float = 0.2
135
- noise_threshold: float = 1e5
129
+ chrom_fwhm: float = 0.5
130
+ noise_threshold: float = 200.0
136
131
  chrom_peak_snr: float = 5.0
137
132
  tol_ppm: float = 10.0
133
+ detector_type: str = "unknown" # Detected detector type ("orbitrap", "quadrupole", "unknown")
138
134
 
139
135
  # === Output & Export ===
140
136
  generate_plots: bool = True
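The hunk above bundles several behavioral changes: `mz_tolerance` is renamed to `mz_max_diff`, the default merge method switches from `"chunked"` to `"qt"`, the merge threshold drops from 50 samples to 2, and the feature-detection defaults are relaxed (`chrom_fwhm` 0.2 → 0.5, `noise_threshold` 1e5 → 200.0). A quick sketch of the new defaults, assuming `wizard_def` is importable as in this module's docstring:

```python
from masster import wizard_def  # import path as used in the module docstring

p = wizard_def(source="./raw", folder="./out")
assert p.mz_max_diff == 0.01          # renamed from mz_tolerance
assert p.merge_method == "qt"         # was "chunked" in 0.4.19
assert p.min_samples_for_merge == 2   # was 50
assert p.chrom_fwhm == 0.5            # was 0.2
assert p.noise_threshold == 200.0     # was 1e5
assert p.detector_type == "unknown"   # new field in 0.4.21
```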
@@ -154,15 +150,15 @@ class wizard_def:
154
150
  optimize_memory: bool = True
155
151
  cleanup_temp_files: bool = True
156
152
  validate_outputs: bool = True
157
-
153
+
158
154
  _param_metadata: dict[str, dict[str, Any]] = field(
159
155
  default_factory=lambda: {
160
- "data_source": {
156
+ "source": {
161
157
  "dtype": str,
162
158
  "description": "Path to directory containing raw data files",
163
159
  "required": True,
164
160
  },
165
- "study_folder": {
161
+ "folder": {
166
162
  "dtype": str,
167
163
  "description": "Output directory for processed study",
168
164
  "required": True,
@@ -218,62 +214,63 @@ class wizard_def:
218
214
  self.num_cores = max_cores
219
215
 
220
216
  # Ensure paths are absolute
221
- if self.data_source:
222
- self.data_source = os.path.abspath(self.data_source)
223
- if self.study_folder:
224
- self.study_folder = os.path.abspath(self.study_folder)
217
+ if self.source:
218
+ self.source = os.path.abspath(self.source)
219
+ if self.folder:
220
+ self.folder = os.path.abspath(self.folder)
225
221
 
226
222
 
227
223
  class Wizard:
228
224
  """
229
- Automated processing wizard for mass spectrometry studies.
230
-
231
- The Wizard class provides end-to-end automation for processing collections
232
- of mass spectrometry files from raw data to final study results, including:
225
+ Simplified Wizard for automated mass spectrometry data processing.
233
226
 
234
- 1. Raw data discovery and batch conversion to sample5 format
235
- 2. Study assembly with feature alignment and merging
236
- 3. Automated plot generation and result export
237
- 4. Intelligent resume capability for interrupted processes
238
- 5. Adaptive optimization based on study size and system resources
227
+ The Wizard provides a clean interface for creating and executing analysis scripts
228
+ that process raw MS data through the complete pipeline: file discovery, feature
229
+ detection, sample processing, study assembly, alignment, merging, and export.
239
230
 
240
- The wizard handles the complete workflow with minimal user intervention
241
- while providing comprehensive logging and progress tracking.
231
+ This simplified version focuses on two core functions:
232
+ - create_script(): Generate standalone analysis scripts
233
+ - execute(): Create and run analysis scripts
242
234
  """
243
235
 
244
236
  def __init__(
245
237
  self,
246
- data_source: str = "",
247
- study_folder: str = "",
238
+ source: str = "",
239
+ folder: str = "",
248
240
  polarity: str = "positive",
249
241
  adducts: Optional[List[str]] = None,
250
- num_cores: int = 4,
242
+ num_cores: int = 0,
251
243
  **kwargs
252
244
  ):
253
245
  """
254
- Initialize the Wizard for automated study processing.
246
+ Initialize the Wizard with analysis parameters.
255
247
 
256
248
  Parameters:
257
- data_source: Directory containing raw data files
258
- study_folder: Output directory for processed study
249
+ source: Directory containing raw data files
250
+ folder: Output directory for processed study
259
251
  polarity: Ion polarity mode ("positive" or "negative")
260
252
  adducts: List of adduct specifications (auto-set if None)
261
- num_cores: Number of CPU cores for parallel processing
253
+ num_cores: Number of CPU cores (0 = auto-detect 75% of available)
262
254
  **kwargs: Additional parameters (see wizard_def for full list)
263
255
  """
264
256
 
257
+ # Auto-detect optimal number of cores if not specified
258
+ if num_cores <= 0:
259
+ num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
260
+
265
261
  # Create parameters instance
266
262
  if "params" in kwargs and isinstance(kwargs["params"], wizard_def):
267
263
  self.params = kwargs.pop("params")
268
264
  else:
269
- # Create default parameters and update with provided values
265
+ # Create default parameters
270
266
  self.params = wizard_def(
271
- data_source=data_source,
272
- study_folder=study_folder,
267
+ source=source,
268
+ folder=folder,
273
269
  polarity=polarity,
274
270
  num_cores=num_cores
275
271
  )
276
272
 
273
+ # Set adducts if provided
277
274
  if adducts is not None:
278
275
  self.params.adducts = adducts
279
276
 
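With `num_cores` now defaulting to 0, the constructor above auto-detects a worker count instead of assuming four cores. A worked example of the rule:

```python
import multiprocessing

# The 0.4.21 auto-detection branch from __init__ above:
num_cores = 0
if num_cores <= 0:
    num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
# cpu_count() == 8 -> int(8 * 0.75) == 6 workers
# cpu_count() == 1 -> int(0.75) == 0, clamped to 1 by max(1, ...)
```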
@@ -283,893 +280,632 @@ class Wizard:
283
280
  setattr(self.params, key, value)
284
281
 
285
282
  # Validate required parameters
286
- if not self.params.data_source:
287
- raise ValueError("data_source is required")
288
- if not self.params.study_folder:
289
- raise ValueError("study_folder is required")
290
-
291
- # Create directories
292
- self.data_source_path = Path(self.params.data_source)
293
- self.study_folder_path = Path(self.params.study_folder)
294
- self.study_folder_path.mkdir(parents=True, exist_ok=True)
295
-
296
- # Setup logging
297
- self._setup_logging()
298
-
299
- # Initialize state tracking
300
- self.processed_files = []
301
- self.failed_files = []
302
- self.study = None
303
- self.start_time = None
304
- self.current_step = "initialized"
305
-
306
- # Create checkpoint file path
307
- self.checkpoint_file = self.study_folder_path / "wizard_checkpoint.json"
308
-
309
- self.logger.info(f"Wizard initialized for {self.polarity} mode")
310
- self.logger.info(f"Data source: {self.data_source_path}")
311
- self.logger.info(f"Study folder: {self.study_folder_path}")
312
- self.logger.info(f"Using {self.params.num_cores} CPU cores")
313
-
314
- # Load checkpoint if resuming
315
- if self.params.resume_enabled:
316
- self._load_checkpoint()
317
-
318
- @property
319
- def polarity(self) -> str:
320
- """Get the polarity setting."""
321
- return self.params.polarity
322
-
323
- @property
324
- def adducts(self) -> List[str]:
325
- """Get the adducts list."""
326
- return self.params.adducts
327
-
328
- def _setup_logging(self):
329
- """Setup comprehensive logging system."""
330
- # Create logger
331
- log_label = f"Wizard-{self.polarity}"
332
-
333
- if self.params.log_to_file:
334
- log_file = self.study_folder_path / "wizard.log"
335
- sink = str(log_file)
336
- else:
337
- sink = "sys.stdout"
338
-
339
- self.logger = MassterLogger(
340
- instance_type="wizard",
341
- level=self.params.log_level.upper(),
342
- label=log_label,
343
- sink=sink,
344
- )
345
-
346
- # Also create a simple file logger for critical info
347
- self.log_file = self.study_folder_path / "processing.log"
348
-
349
- def _log_progress(self, message: str, level: str = "INFO"):
350
- """Log progress message with timestamp."""
351
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
352
- full_message = f"[{timestamp}] {message}"
353
-
354
- # Log to masster logger
355
- getattr(self.logger, level.lower())(message)
356
-
357
- # Also write to simple log file
358
- with open(self.log_file, "a", encoding="utf-8") as f:
359
- f.write(f"{full_message}\n")
360
-
361
- if self.params.verbose_progress and level in ["INFO", "WARNING", "ERROR"]:
362
- print(full_message)
363
-
364
- def _save_checkpoint(self):
365
- """Save processing checkpoint for resume capability."""
366
- if not self.params.resume_enabled:
367
- return
368
-
369
- import json
370
- checkpoint_data = {
371
- "timestamp": datetime.now().isoformat(),
372
- "current_step": self.current_step,
373
- "processed_files": self.processed_files,
374
- "failed_files": self.failed_files,
375
- "params": {
376
- "data_source": self.params.data_source,
377
- "study_folder": self.params.study_folder,
378
- "polarity": self.params.polarity,
379
- "adducts": self.params.adducts,
380
- "num_cores": self.params.num_cores,
381
- }
382
- }
383
-
384
- try:
385
- with open(self.checkpoint_file, "w") as f:
386
- json.dump(checkpoint_data, f, indent=2)
387
- self.logger.debug(f"Checkpoint saved: {len(self.processed_files)} files processed")
388
- except Exception as e:
389
- self.logger.warning(f"Failed to save checkpoint: {e}")
390
-
391
- def _load_checkpoint(self):
392
- """Load processing checkpoint for resume capability."""
393
- if not self.checkpoint_file.exists():
394
- return
395
-
396
- import json
397
- try:
398
- with open(self.checkpoint_file, "r") as f:
399
- checkpoint_data = json.load(f)
400
-
401
- self.processed_files = checkpoint_data.get("processed_files", [])
402
- self.failed_files = checkpoint_data.get("failed_files", [])
403
- self.current_step = checkpoint_data.get("current_step", "initialized")
404
-
405
- self.logger.info(f"Resuming from checkpoint: {len(self.processed_files)} files already processed")
406
- self.logger.info(f"Previous step: {self.current_step}")
407
-
408
- except Exception as e:
409
- self.logger.warning(f"Failed to load checkpoint: {e}")
410
- self.processed_files = []
411
- self.failed_files = []
412
-
413
- def discover_files(self) -> List[Path]:
283
+ if not self.params.source:
284
+ raise ValueError("source is required")
285
+ if not self.params.folder:
286
+ raise ValueError("folder is required")
287
+
288
+ # Create and validate paths
289
+ self.source_path = Path(self.params.source)
290
+ self.folder_path = Path(self.params.folder)
291
+ self.folder_path.mkdir(parents=True, exist_ok=True)
292
+
293
+ # Auto-infer polarity from the first file if not explicitly set by user
294
+ if polarity == "positive" and "polarity" not in kwargs:
295
+ inferred_polarity = self._infer_polarity_from_first_file()
296
+ if inferred_polarity:
297
+ self.params.polarity = inferred_polarity
298
+ # Update adducts based on inferred polarity
299
+ self.params.__post_init__()
300
+
301
+ def _infer_polarity_from_first_file(self) -> Optional[str]:
414
302
  """
415
- Discover raw data files in the source directory.
303
+ Infer polarity from the first available raw data file.
416
304
 
417
305
  Returns:
418
- List of file paths found for processing
306
+ Inferred polarity string ("positive" or "negative") or None if detection fails
419
307
  """
420
- self._log_progress("Discovering raw data files...")
421
- self.current_step = "discovering_files"
422
-
423
- found_files = []
424
-
425
- for extension in self.params.file_extensions:
426
- if self.params.search_subfolders:
427
- pattern = f"**/*{extension}"
428
- files = list(self.data_source_path.rglob(pattern))
308
+ try:
309
+ # Find first file
310
+ for extension in ['.wiff', '.raw', '.mzML', '.d']:
311
+ pattern = f"**/*{extension}" if True else f"*{extension}" # search_subfolders=True
312
+ files = list(self.source_path.rglob(pattern))
313
+ if files:
314
+ first_file = files[0]
315
+ break
429
316
  else:
430
- pattern = f"*{extension}"
431
- files = list(self.data_source_path.glob(pattern))
317
+ return None
432
318
 
433
- # Filter out files matching skip patterns
434
- filtered_files = []
435
- for file_path in files:
436
- skip_file = False
437
- for pattern in self.params.skip_patterns:
438
- if pattern.lower() in file_path.name.lower():
439
- skip_file = True
440
- self.logger.debug(f"Skipping file (matches pattern '{pattern}'): {file_path.name}")
441
- break
319
+ # Only implement for .wiff files initially (most common format)
320
+ if first_file.suffix.lower() == '.wiff':
321
+ from masster.sample.load import _wiff_to_dict
322
+
323
+ # Extract metadata from first file
324
+ metadata_df = _wiff_to_dict(str(first_file))
442
325
 
443
- if not skip_file:
444
- # Check file size
445
- try:
446
- file_size_gb = file_path.stat().st_size / (1024**3)
447
- if file_size_gb > self.params.max_file_size_gb:
448
- self.logger.warning(f"Large file ({file_size_gb:.1f}GB): {file_path.name}")
449
- filtered_files.append(file_path)
450
- except Exception as e:
451
- self.logger.warning(f"Could not check file size for {file_path}: {e}")
452
- filtered_files.append(file_path)
326
+ if not metadata_df.empty and 'polarity' in metadata_df.columns:
327
+ # Get polarity from first experiment
328
+ first_polarity = metadata_df['polarity'].iloc[0]
329
+
330
+ # Convert numeric polarity codes to string
331
+ if first_polarity == 1 or str(first_polarity).lower() in ['positive', 'pos', '+']:
332
+ return "positive"
333
+ elif first_polarity == -1 or str(first_polarity).lower() in ['negative', 'neg', '-']:
334
+ return "negative"
335
+
336
+ except Exception:
337
+ # Silently fall back to default if inference fails
338
+ pass
453
339
 
454
- found_files.extend(filtered_files)
455
- self.logger.info(f"Found {len(filtered_files)} {extension} files")
456
-
457
- # Remove duplicates and sort
458
- found_files = sorted(list(set(found_files)))
459
-
460
- self._log_progress(f"Total files discovered: {len(found_files)}")
461
-
462
- return found_files
463
-
464
- def _process_single_file(self, file_path: Path, reset: bool = False) -> Optional[str]:
340
+ return None
341
+
342
+ @property
343
+ def polarity(self) -> str:
344
+ """Get the ion polarity mode."""
345
+ return self.params.polarity
346
+
347
+ @property
348
+ def adducts(self) -> List[str]:
349
+ """Get the adduct specifications."""
350
+ return self.params.adducts
351
+
352
+ def create_script(self, filename: str = "run_masster.py") -> bool:
465
353
  """
466
- Process a single file to sample5 format.
467
-
468
- This method replicates the core processing from parallel_sample_processing.py
469
- but with wizard-specific configuration and error handling.
354
+ Generate a standalone Python script for the analysis pipeline.
470
355
 
471
356
  Parameters:
472
- file_path: Path to the raw data file
473
- reset: Force reprocessing even if output exists
474
-
357
+ filename: Name for the generated script file
358
+
475
359
  Returns:
476
- Base filename of output on success, None on failure
360
+ True if script was generated successfully, False otherwise
477
361
  """
478
- import gc
479
-
480
- # Generate output filename
481
- file_out = file_path.stem + '.sample5'
482
- output_file = self.study_folder_path / file_out
483
-
484
- # Initialize masster Sample with delayed import
485
- import masster
486
- sample = masster.Sample(
487
- log_label=file_path.name,
488
- log_level='ERROR' # Reduce logging overhead in parallel processing
489
- )
490
-
491
- # Check if file should be skipped
492
- skip = False
493
- if not reset and not self.params.force_reprocess and output_file.exists():
494
- try:
495
- # Attempt to load existing processed file to verify it's valid
496
- sample.load(str(output_file))
497
- skip = True
498
- except Exception:
499
- # If loading fails, file needs to be reprocessed
500
- skip = False
501
-
502
- if skip:
503
- self.logger.debug(f"Skipping {file_path.name} (already processed)")
504
- return output_file.stem
505
-
506
- self.logger.info(f"Processing {file_path.name}")
507
-
508
362
  try:
509
- # STEP 1: Load raw data
510
- sample.load(str(file_path))
511
-
512
- # STEP 2: Feature detection - First pass (strict parameters)
513
- sample.find_features(
514
- chrom_fwhm=self.params.chrom_fwhm,
515
- noise=self.params.noise_threshold,
516
- tol_ppm=self.params.tol_ppm,
517
- chrom_peak_snr=self.params.chrom_peak_snr,
518
- min_trace_length_multiplier=0.5,
519
- chrom_fwhm_min=self.params.chrom_fwhm
520
- )
521
-
522
- # STEP 3: Feature detection - Second pass (relaxed parameters)
523
- sample.find_features(
524
- chrom_peak_snr=self.params.chrom_peak_snr,
525
- noise=self.params.noise_threshold / 10, # Lower noise threshold
526
- chrom_fwhm=2.0 # Wider peaks
527
- )
528
-
529
- # STEP 4: Adduct detection
530
- sample.find_adducts(adducts=self.adducts)
531
-
532
- # STEP 5: MS2 spectrum identification
533
- sample.find_ms2()
534
-
535
- # STEP 6: Save processed data
536
- sample.save(filename=str(output_file))
537
-
538
- # STEP 7: Generate additional outputs if requested
539
- if "csv" in self.params.export_formats:
540
- csv_file = output_file.with_suffix('.features.csv')
541
- sample.export_features(filename=str(csv_file))
542
-
543
- if "mgf" in self.params.export_formats:
544
- mgf_file = output_file.with_suffix('.mgf')
545
- sample.export_mgf(filename=str(mgf_file), use_cache=False)
363
+ script_path = self.folder_path / filename
364
+ script_content = self._generate_script_content()
546
365
 
547
- if self.params.generate_plots:
548
- plot_file = output_file.with_suffix('_2d.html')
549
- sample.plot_2d(filename=str(plot_file), markersize=4)
366
+ with open(script_path, 'w', encoding='utf-8') as f:
367
+ f.write(script_content)
550
368
 
551
- # Memory cleanup
552
- result = output_file.stem
553
- del sample
554
- gc.collect()
555
-
556
- return result
369
+ print(f"Analysis script created: {script_path}")
370
+ return True
557
371
 
558
372
  except Exception as e:
559
- self.logger.error(f"Error processing {file_path.name}: {e}")
560
- # Cleanup on error
561
- gc.collect()
562
- return None
563
-
564
- def _process_batch(self, file_batch: List[Path]) -> List[str]:
565
- """Process a batch of files in a single worker."""
566
- results = []
567
- for file_path in file_batch:
568
- result = self._process_single_file(file_path)
569
- if result:
570
- results.append(result)
571
- else:
572
- results.append(None)
573
- return results
574
-
575
- def convert_to_sample5(self, file_list: Optional[List[Path]] = None) -> bool:
373
+ print(f"Failed to create script: {e}")
374
+ return False
375
+
376
+ def execute(self, filename: str = "run_masster.py") -> bool:
576
377
  """
577
- Convert raw data files to sample5 format in parallel.
378
+ Create and execute a standalone analysis script.
578
379
 
579
380
  Parameters:
580
- file_list: List of files to process (None to discover automatically)
581
-
582
- Returns:
583
- True if conversion completed successfully
584
- """
585
- self._log_progress("=== Starting Sample5 Conversion ===")
586
- self.current_step = "converting_to_sample5"
587
-
588
- if file_list is None:
589
- file_list = self.discover_files()
590
-
591
- if not file_list:
592
- self.logger.warning("No files found for conversion")
593
- return False
594
-
595
- # Filter out already processed files if resuming
596
- if self.params.resume_enabled and self.processed_files:
597
- remaining_files = []
598
- for file_path in file_list:
599
- if str(file_path) not in self.processed_files:
600
- remaining_files.append(file_path)
601
- file_list = remaining_files
381
+ filename: Name for the generated script file
602
382
 
603
- if not file_list:
604
- self._log_progress("All files already processed")
605
- return True
606
-
607
- self._log_progress(f"Converting {len(file_list)} files to sample5 format")
608
-
609
- conversion_start = time.time()
610
- successful_count = 0
611
- failed_count = 0
612
-
613
- if self.params.use_process_pool:
614
- # ProcessPoolExecutor approach - better for CPU-intensive work
615
- if len(file_list) <= self.params.batch_size:
616
- # Few files: process individually
617
- self.logger.info(f"Processing {len(file_list)} files individually with {self.params.num_cores} workers")
618
-
619
- with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
620
- futures = [
621
- executor.submit(self._process_single_file, file_path)
622
- for file_path in file_list
623
- ]
624
-
625
- for i, future in enumerate(concurrent.futures.as_completed(futures)):
626
- result = future.result()
627
- if result:
628
- successful_count += 1
629
- self.processed_files.append(str(file_list[i]))
630
- else:
631
- failed_count += 1
632
- self.failed_files.append(str(file_list[i]))
633
-
634
- # Progress update and checkpoint
635
- if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
636
- progress = (successful_count + failed_count) / len(file_list) * 100
637
- self._log_progress(f"Progress: {progress:.1f}% ({successful_count} successful, {failed_count} failed)")
638
- self._save_checkpoint()
639
-
640
- else:
641
- # Many files: process in batches
642
- batches = [
643
- file_list[i:i + self.params.batch_size]
644
- for i in range(0, len(file_list), self.params.batch_size)
645
- ]
646
-
647
- self.logger.info(f"Processing {len(file_list)} files in {len(batches)} batches")
648
-
649
- with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
650
- futures = [executor.submit(self._process_batch, batch) for batch in batches]
651
-
652
- for batch_idx, future in enumerate(concurrent.futures.as_completed(futures)):
653
- batch_results = future.result()
654
- batch = batches[batch_idx]
655
-
656
- for i, result in enumerate(batch_results):
657
- if result:
658
- successful_count += 1
659
- self.processed_files.append(str(batch[i]))
660
- else:
661
- failed_count += 1
662
- self.failed_files.append(str(batch[i]))
663
-
664
- # Progress update
665
- progress = (successful_count + failed_count) / len(file_list) * 100
666
- self._log_progress(f"Batch {batch_idx + 1}/{len(batches)} complete. Progress: {progress:.1f}%")
667
- self._save_checkpoint()
668
-
669
- else:
670
- # ThreadPoolExecutor approach
671
- self.logger.info(f"Processing {len(file_list)} files with {self.params.num_cores} threads")
672
-
673
- with concurrent.futures.ThreadPoolExecutor(max_workers=self.params.num_cores) as executor:
674
- futures = [
675
- executor.submit(self._process_single_file, file_path)
676
- for file_path in file_list
677
- ]
678
-
679
- for i, future in enumerate(concurrent.futures.as_completed(futures)):
680
- result = future.result()
681
- if result:
682
- successful_count += 1
683
- self.processed_files.append(str(file_list[i]))
684
- else:
685
- failed_count += 1
686
- self.failed_files.append(str(file_list[i]))
687
-
688
- if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
689
- progress = (successful_count + failed_count) / len(file_list) * 100
690
- self._log_progress(f"Progress: {progress:.1f}%")
691
- self._save_checkpoint()
692
-
693
- conversion_time = time.time() - conversion_start
694
-
695
- self._log_progress("=== Sample5 Conversion Complete ===")
696
- self._log_progress(f"Successful: {successful_count}")
697
- self._log_progress(f"Failed: {failed_count}")
698
- self._log_progress(f"Total time: {conversion_time:.1f} seconds")
699
-
700
- if failed_count > 0:
701
- self.logger.warning(f"{failed_count} files failed to process")
702
- for failed_file in self.failed_files[-failed_count:]:
703
- self.logger.warning(f"Failed: {failed_file}")
704
-
705
- self._save_checkpoint()
706
- return successful_count > 0
707
-
708
- def assemble_study(self) -> bool:
709
- """
710
- Assemble processed sample5 files into a study.
711
-
712
383
  Returns:
713
- True if study assembly was successful
384
+ True if execution completed successfully, False otherwise
714
385
  """
715
- self._log_progress("=== Starting Study Assembly ===")
716
- self.current_step = "assembling_study"
717
-
718
- # Find all sample5 files
719
- sample5_files = list(self.study_folder_path.glob("*.sample5"))
720
-
721
- if not sample5_files:
722
- self.logger.error("No sample5 files found for study assembly")
386
+ # First create the script
387
+ if not self.create_script(filename):
723
388
  return False
724
-
725
- self._log_progress(f"Assembling study from {len(sample5_files)} sample5 files")
389
+
390
+ # Then execute it
391
+ script_path = self.folder_path / filename
726
392
 
727
393
  try:
728
- # Create study with optimized settings
729
- import masster
730
- study_params = study_defaults(
731
- folder=str(self.study_folder_path),
732
- polarity=self.polarity,
733
- log_level="INFO",
734
- log_label=f"Study-{self.polarity}",
735
- adducts=self.adducts
736
- )
737
-
738
- self.study = masster.Study(params=study_params)
394
+ print("Executing...")
739
395
 
740
- # Add all sample5 files
741
- sample5_pattern = str(self.study_folder_path / "*.sample5")
742
- self.study.add(sample5_pattern)
396
+ import subprocess
397
+ result = subprocess.run([
398
+ sys.executable, str(script_path)
399
+ ], cwd=str(self.folder_path), encoding='utf-8', errors='replace')
743
400
 
744
- self._log_progress(f"Added {len(self.study.samples_df)} samples to study")
401
+ success = result.returncode == 0
745
402
 
746
- # Filter features based on quality criteria
747
- if hasattr(self.study, 'features_filter'):
748
- initial_features = len(self.study.features_df) if hasattr(self.study, 'features_df') else 0
749
-
750
- # Apply feature filtering
751
- feature_selection = self.study.features_select(
752
- chrom_coherence=0.3,
753
- chrom_prominence_scaled=1
754
- )
755
- self.study.features_filter(feature_selection)
403
+ if success:
404
+ print("=" * 70)
405
+ print("Script execution completed successfully")
406
+ else:
407
+ print("=" * 70)
408
+ print(f"Script execution failed with return code: {result.returncode}")
756
409
 
757
- final_features = len(self.study.features_df) if hasattr(self.study, 'features_df') else 0
758
- self._log_progress(f"Feature filtering: {initial_features} -> {final_features} features")
759
-
760
- self._save_checkpoint()
761
- return True
410
+ return success
762
411
 
763
412
  except Exception as e:
764
- self.logger.error(f"Failed to assemble study: {e}")
413
+ print(f"Error during script execution: {e}")
765
414
  return False
766
-
767
- def align_and_merge(self) -> bool:
768
- """
769
- Perform feature alignment and merging.
415
+
416
+ def _generate_script_content(self) -> str:
417
+ """Generate the complete analysis script content."""
418
+
419
+ # Convert Path objects to strings for JSON serialization
420
+ params_dict = {}
421
+ for key, value in self.params.__dict__.items():
422
+ if key == '_param_metadata': # Skip metadata in generated script
423
+ continue
424
+ if isinstance(value, Path):
425
+ params_dict[key] = str(value)
426
+ else:
427
+ params_dict[key] = value
428
+
429
+ # Obtain the list of files in source with extensions .wiff, .raw, or .mzML
430
+ raw_files = []
431
+ for ext in params_dict.get('file_extensions', []):
432
+ raw_files.extend(glob.glob(f"{params_dict.get('source', '')}/**/*{ext}", recursive=True))
433
+
434
+ # Create readable PARAMS dict with comments
435
+ params_lines = []
436
+ params_lines.append('# Analysis parameters')
437
+ params_lines.append('PARAMS = {')
770
438
 
771
- Returns:
772
- True if alignment and merging were successful
773
- """
774
- self._log_progress("=== Starting Feature Alignment and Merging ===")
775
- self.current_step = "aligning_and_merging"
439
+ # Core Configuration
440
+ params_lines.append(' # === Core Configuration ===')
441
+ params_lines.append(f' "source": {params_dict.get("source", "")!r}, # Directory containing raw data files')
442
+ params_lines.append(f' "folder": {params_dict.get("folder", "")!r}, # Output directory for processed study')
443
+ params_lines.append(f' "polarity": {params_dict.get("polarity", "positive")!r}, # Ion polarity mode ("positive" or "negative")')
444
+ params_lines.append(f' "num_cores": {params_dict.get("num_cores", 4)}, # Number of CPU cores for parallel processing')
445
+ params_lines.append('')
776
446
 
777
- if self.study is None:
778
- self.logger.error("Study not assembled. Run assemble_study() first.")
779
- return False
447
+ # File Discovery
448
+ params_lines.append(' # === File Discovery ===')
449
+ params_lines.append(f' "file_extensions": {params_dict.get("file_extensions", [".wiff", ".raw", ".mzML"])!r}, # File extensions to search for')
450
+ params_lines.append(f' "search_subfolders": {params_dict.get("search_subfolders", True)}, # Whether to search subdirectories recursively')
451
+ params_lines.append(f' "skip_patterns": {params_dict.get("skip_patterns", ["blank", "condition"])!r}, # Filename patterns to skip')
452
+ params_lines.append('')
780
453
 
781
- try:
782
- # Align features across samples
783
- align_params = align_defaults(
784
- rt_tol=self.params.rt_tolerance,
785
- mz_tol=self.params.mz_tolerance,
786
- algorithm=self.params.alignment_algorithm
787
- )
788
-
789
- self.logger.info(f"Aligning features with RT tolerance {self.params.rt_tolerance}s, m/z tolerance {self.params.mz_tolerance} Da")
790
- self.study.align(params=align_params)
791
-
792
- # Merge aligned features
793
- merge_params = merge_defaults(
794
- method=self.params.merge_method,
795
- rt_tol=self.params.rt_tolerance,
796
- mz_tol=self.params.mz_tolerance,
797
- min_samples=self.params.min_samples_for_merge
454
+ # Processing Parameters
455
+ params_lines.append(' # === Processing Parameters ===')
456
+ params_lines.append(f' "adducts": {params_dict.get("adducts", [])!r}, # Adduct specifications for feature detection and annotation')
457
+ params_lines.append(f' "detector_type": {params_dict.get("detector_type", "unknown")!r}, # MS detector type ("orbitrap", "tof", "unknown")')
458
+ params_lines.append('')
459
+
460
+ # Alignment & Merging
461
+ params_lines.append(' # === Alignment & Merging ===')
462
+ params_lines.append(f' "rt_tol": {params_dict.get("rt_tol", 2.0)}, # Retention time tolerance for alignment (seconds)')
463
+ params_lines.append(f' "mz_tol": {params_dict.get("mz_tol", 0.01)}, # Mass-to-charge ratio tolerance for alignment (Da)')
464
+ params_lines.append(f' "alignment_method": {params_dict.get("alignment_method", "kd")!r}, # Algorithm for sample alignment')
465
+ params_lines.append(f' "min_samples_per_feature": {params_dict.get("min_samples_per_feature", 1)}, # Minimum samples required per consensus feature')
466
+ params_lines.append(f' "merge_method": {params_dict.get("merge_method", "qt")!r}, # Method for merging consensus features')
467
+ params_lines.append('')
468
+
469
+ # Sample Processing
470
+ params_lines.append(' # === Sample Processing (used in add_samples_from_folder) ===')
471
+ params_lines.append(f' "batch_size": {params_dict.get("batch_size", 8)}, # Number of files to process per batch')
472
+ params_lines.append(f' "memory_limit_gb": {params_dict.get("memory_limit_gb", 16.0)}, # Memory limit for processing (GB)')
473
+ params_lines.append('')
474
+
475
+ # Script Options
476
+ params_lines.append(' # === Script Options ===')
477
+ params_lines.append(f' "resume_enabled": {params_dict.get("resume_enabled", True)}, # Enable automatic resume capability')
478
+ params_lines.append(f' "force_reprocess": {params_dict.get("force_reprocess", False)}, # Force reprocessing of existing files')
479
+ params_lines.append(f' "cleanup_temp_files": {params_dict.get("cleanup_temp_files", True)}, # Clean up temporary files after processing')
480
+
481
+ params_lines.append('}')
482
+
483
+ # Create script lines
484
+ script_lines = [
485
+ '#!/usr/bin/env python3',
486
+ '"""',
487
+ 'Automated Mass Spectrometry Data Analysis Pipeline',
488
+ f'Generated by masster wizard v{version}',
489
+ '"""',
490
+ '',
491
+ 'import sys',
492
+ 'import time',
493
+ 'from pathlib import Path',
494
+ '',
495
+ '# Import masster modules',
496
+ 'from masster.study import Study',
497
+ 'from masster import __version__',
498
+ '',
499
+ ]
500
+
501
+ # Add the formatted PARAMS
502
+ script_lines.extend(params_lines)
503
+
504
+ # Add the main function and pipeline
505
+ script_lines.extend([
506
+ '',
507
+ '',
508
+ 'def discover_raw_files(source_folder, file_extensions, search_subfolders=True):',
509
+ ' """Discover raw data files in the source folder."""',
510
+ ' source_path = Path(source_folder)',
511
+ ' raw_files = []',
512
+ ' ',
513
+ ' for ext in file_extensions:',
514
+ ' if search_subfolders:',
515
+ ' pattern = f"**/*{ext}"',
516
+ ' files = list(source_path.rglob(pattern))',
517
+ ' else:',
518
+ ' pattern = f"*{ext}"',
519
+ ' files = list(source_path.glob(pattern))',
520
+ ' raw_files.extend(files)',
521
+ ' ',
522
+ ' return raw_files',
523
+ '',
524
+ '',
525
+ 'def process_single_file(args):',
526
+ ' """Process a single raw file to sample5 format - module level for multiprocessing."""',
527
+ ' raw_file, output_folder = args',
528
+ ' from masster.sample import Sample',
529
+ ' ',
530
+ ' try:',
531
+ ' # Create sample5 filename',
532
+ ' sample_name = raw_file.stem',
533
+ ' sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
534
+ ' ',
535
+ ' # Skip if sample5 already exists',
536
+ ' if sample5_path.exists():',
537
+ ' print(f" Skipping {raw_file.name} (sample5 already exists)")',
538
+ ' return str(sample5_path)',
539
+ ' ',
540
+ ' print(f" Converting {raw_file.name}...")',
541
+ ' ',
542
+ ' # Load and process raw file with full pipeline',
543
+ ' sample = Sample(log_label=sample_name)',
544
+ ' sample.load(filename=str(raw_file))',
545
+ ' sample.find_features(',
546
+ ' noise=PARAMS.get(\'noise_threshold\', 200.0),',
547
+ ' chrom_fwhm=PARAMS.get(\'chrom_fwhm\', 0.5),',
548
+ ' chrom_peak_snr=PARAMS.get(\'chrom_peak_snr\', 5.0)',
549
+ ' )',
550
+ ' sample.find_adducts(adducts=PARAMS[\'adducts\'])',
551
+ ' sample.find_ms2()',
552
+ ' # sample.find_iso()',
553
+ ' # sample.export_mgf()',
554
+ ' # sample.export_mztab()',
555
+ ' # sample.plot_2d(filename="{sample_name}.html")',
556
+ ' sample.save(str(sample5_path))',
557
+ ' ',
558
+ ' # print(f" Completed {raw_file.name} -> {sample5_path.name}")',
559
+ ' return str(sample5_path)',
560
+ ' ',
561
+ ' except Exception as e:',
562
+ ' print(f" ERROR processing {raw_file.name}: {e}")',
563
+ ' return None',
564
+ '',
565
+ '',
566
+ 'def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):',
567
+ ' """Convert raw data files to sample5 format."""',
568
+ ' import concurrent.futures',
569
+ ' import os',
570
+ ' ',
571
+ ' # Create output directory',
572
+ ' os.makedirs(output_folder, exist_ok=True)',
573
+ ' ',
574
+ ' # Prepare arguments for multiprocessing',
575
+ ' file_args = [(raw_file, output_folder) for raw_file in raw_files]',
576
+ ' ',
577
+ ' # Process files in parallel',
578
+ ' sample5_files = []',
579
+ ' with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:',
580
+ ' futures = [executor.submit(process_single_file, args) for args in file_args]',
581
+ ' ',
582
+ ' for future in concurrent.futures.as_completed(futures):',
583
+ ' result = future.result()',
584
+ ' if result:',
585
+ ' sample5_files.append(result)',
586
+ ' ',
587
+ ' return sample5_files',
588
+ '',
589
+ '',
590
+ 'def main():',
591
+ ' """Main analysis pipeline."""',
592
+ ' try:',
593
+ ' print("=" * 70)',
594
+ f' print("masster {version} - Automated MS Data Analysis")',
595
+ ' print("=" * 70)',
596
+ ' print(f"Source: {PARAMS[\'source\']}")',
597
+ ' print(f"Output: {PARAMS[\'folder\']}")',
598
+ ' print(f"Polarity: {PARAMS[\'polarity\']}")',
599
+ ' print(f"CPU Cores: {PARAMS[\'num_cores\']}")',
600
+ ' print("=" * 70)',
601
+ ' ',
602
+ ' start_time = time.time()',
603
+ ' ',
604
+ ' # Step 1: Discover raw data files',
605
+ ' print("\\nStep 1/7: Discovering raw data files...")',
606
+ ' raw_files = discover_raw_files(',
607
+ ' PARAMS[\'source\'],',
608
+ ' PARAMS[\'file_extensions\'],',
609
+ ' PARAMS[\'search_subfolders\']',
610
+ ' )',
611
+ ' ',
612
+ ' if not raw_files:',
613
+ ' print("No raw data files found!")',
614
+ ' return False',
615
+ ' ',
616
+ ' print(f"Found {len(raw_files)} raw data files")',
617
+ ' for f in raw_files[:5]: # Show first 5 files',
618
+ ' print(f" {f.name}")',
619
+ ' if len(raw_files) > 5:',
620
+ ' print(f" ... and {len(raw_files) - 5} more")',
621
+ ' ',
622
+ ' # Step 2: Process raw files',
623
+ ' print("\\nStep 2/7: Processing raw files...")',
624
+ ' sample5_files = convert_raw_to_sample5(',
625
+ ' raw_files,',
626
+ ' PARAMS[\'folder\'],',
627
+ ' PARAMS[\'polarity\'],',
628
+ ' PARAMS[\'num_cores\']',
629
+ ' )',
630
+ ' ',
631
+ ' if not sample5_files:',
632
+ ' print("No sample5 files were created!")',
633
+ ' return False',
634
+ ' ',
635
+ ' print(f"Successfully processed {len(sample5_files)} files to sample5")',
636
+ ' ',
637
+ ' # Step 3: Create and configure study',
638
+ ' print("\\nStep 3/7: Initializing study...")',
639
+ ' study = Study(folder=PARAMS[\'folder\'])',
640
+ ' study.polarity = PARAMS[\'polarity\']',
641
+ ' study.adducts = PARAMS[\'adducts\']',
642
+ ' ',
643
+ ' # Step 4: Add sample5 files to study',
644
+ ' print("\\nStep 4/7: Adding samples to study...")',
645
+ ' study.add(str(Path(PARAMS[\'folder\']) / "*.sample5"))',
646
+ ' ',
647
+ ' # Step 5: Core processing',
648
+ ' print("\\nStep 5/7: Processing...")',
649
+ ' study.align(',
650
+ ' algorithm=PARAMS[\'alignment_method\'],',
651
+ ' rt_tol=PARAMS[\'rt_tol\']',
652
+ ' )',
653
+ ' ',
654
+ ' # Merge and create consensus features',
655
+ ' study.merge(',
656
+ ' min_samples=PARAMS[\'min_samples_per_feature\'],',
657
+ ' threads=PARAMS[\'num_cores\'],',
658
+ ' rt_tol=PARAMS[\'rt_tol\'],',
659
+ ' mz_tol=PARAMS[\'mz_tol\']',
660
+ ' )',
661
+ ' study.find_iso()',
662
+ ' study.fill(min_samples_rel=0.0)',
663
+ ' study.integrate()',
664
+ ' ',
665
+ ' # Step 6/7: Saving results',
666
+ ' print("\\nStep 6/7: Saving results...")',
667
+ ' study.save()',
668
+ ' study.export_xlsx()',
669
+ ' study.export_mgf()',
670
+ ' study.export_mztab()',
671
+ ' ',
672
+ ' # Step 7: Plots',
673
+ ' print("\\nStep 7/7: Exporting plots...")',
674
+ ' study.plot_consensus_2d(filename="consensus.html")',
675
+ ' study.plot_consensus_2d(filename="consensus.png")',
676
+ ' study.plot_alignment(filename="alignment.html")',
677
+ ' study.plot_alignment(filename="alignment.png")',
678
+ ' study.plot_pca(filename="pca.html")',
679
+ ' study.plot_pca(filename="pca.png")',
680
+ ' study.plot_bpc(filename="bpc.html")',
681
+ ' study.plot_bpc(filename="bpc.png")',
682
+ ' study.plot_rt_correction(filename="rt_correction.html")',
683
+ ' study.plot_rt_correction(filename="rt_correction.png")',
684
+
685
+ ' ',
686
+ ' # Print summary',
687
+ ' study.info()',
688
+ ' total_time = time.time() - start_time',
689
+ ' print("\\n" + "=" * 70)',
690
+ ' print("ANALYSIS COMPLETE")',
691
+ ' print("=" * 70)',
692
+ ' print(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")',
693
+ ' print(f"Raw files processed: {len(raw_files)}")',
694
+ ' print(f"Sample5 files created: {len(sample5_files)}")',
695
+ ' if hasattr(study, "consensus_df"):',
696
+ ' print(f"Consensus features generated: {len(study.consensus_df)}")',
697
+ ' print("=" * 70)',
698
+ ' ',
699
+ ' return True',
700
+ ' ',
701
+ ' except KeyboardInterrupt:',
702
+ ' print("\\nAnalysis interrupted by user")',
703
+ ' return False',
704
+ ' except Exception as e:',
705
+ ' print(f"Analysis failed with error: {e}")',
706
+ ' import traceback',
707
+ ' traceback.print_exc()',
708
+ ' return False',
709
+ '',
710
+ '',
711
+ 'if __name__ == "__main__":',
712
+ ' success = main()',
713
+ ' sys.exit(0 if success else 1)',
714
+ ])
715
+
716
+ return '\n'.join(script_lines)
717
+
718
+
719
+ def create_script(
720
+ source: str,
721
+ folder: str,
722
+ filename: str = 'run_masster.py',
723
+ polarity: str = "positive",
724
+ adducts: Optional[List[str]] = None,
725
+ params: Optional[wizard_def] = None,
726
+ num_cores: int = 0,
727
+ **kwargs
728
+ ) -> bool:
729
+ """
730
+ Create a standalone analysis script without initializing a Wizard instance.
731
+
732
+ This function generates a Python script that replicates automated processing
733
+ steps with the specified configuration. The script can be executed independently
734
+ to perform the same analysis.
735
+
736
+ Parameters:
737
+ source: Directory containing raw data files
738
+ folder: Output directory for processed study
739
+ filename: Filename for the generated script (should end with .py)
740
+ polarity: Ion polarity mode ("positive" or "negative")
741
+ adducts: List of adduct specifications (auto-set if None)
742
+ params: Custom wizard_def parameters (optional)
743
+ num_cores: Number of CPU cores (0 = auto-detect)
744
+ **kwargs: Additional parameters to override defaults
745
+
746
+ Returns:
747
+ True if script was generated successfully, False otherwise
748
+
749
+ Example:
750
+ >>> from masster.wizard import create_script
751
+ >>> create_script(
752
+ ... source=r'D:\\Data\\raw_files',
753
+ ... folder=r'D:\\Data\\output',
754
+ ... filename='run_masster.py',
755
+ ... polarity='positive'
756
+ ... )
757
+ """
758
+
759
+ try:
760
+ # Create parameters
761
+ if params is not None:
762
+ # Use provided params as base
763
+ wizard_params = params
764
+ # Update with provided values
765
+ wizard_params.source = source
766
+ wizard_params.folder = folder
767
+ if polarity != "positive": # Only override if explicitly different
768
+ wizard_params.polarity = polarity
769
+ if num_cores > 0:
770
+ wizard_params.num_cores = num_cores
771
+ if adducts is not None:
772
+ wizard_params.adducts = adducts
773
+ else:
774
+ # Create new params with provided values
775
+ wizard_params = wizard_def(
776
+ source=source,
777
+ folder=folder,
778
+ polarity=polarity,
779
+ num_cores=max(1, int(multiprocessing.cpu_count() * 0.75)) if num_cores <= 0 else num_cores
798
780
  )
799
781
 
800
- self.logger.info(f"Merging features using {self.params.merge_method} method")
801
- self.study.merge(params=merge_params)
802
-
803
- # Log results
804
- num_consensus = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0
805
- self._log_progress(f"Generated {num_consensus} consensus features")
806
-
807
- # Get study info
808
- if hasattr(self.study, 'info'):
809
- self.study.info()
810
-
811
- self._save_checkpoint()
812
- return True
813
-
814
- except Exception as e:
815
- self.logger.error(f"Failed to align and merge: {e}")
816
- return False
817
-
818
- def generate_plots(self) -> bool:
819
- """
820
- Generate visualization plots for the study.
821
-
822
- Returns:
823
- True if plot generation was successful
824
- """
825
- if not self.params.generate_plots:
826
- self._log_progress("Plot generation disabled, skipping...")
827
- return True
782
+ if adducts is not None:
783
+ wizard_params.adducts = adducts
828
784
 
829
- self._log_progress("=== Generating Visualization Plots ===")
830
- self.current_step = "generating_plots"
785
+ # Apply any additional kwargs
786
+ for key, value in kwargs.items():
787
+ if hasattr(wizard_params, key):
788
+ setattr(wizard_params, key, value)
831
789
 
832
- if self.study is None:
833
- self.logger.error("Study not available. Complete previous steps first.")
834
- return False
790
+ # Ensure study folder exists
791
+ study_path = Path(folder)
792
+ study_path.mkdir(parents=True, exist_ok=True)
835
793
 
836
- try:
837
- plots_generated = 0
838
-
839
- # Alignment plot
840
- if hasattr(self.study, 'plot_alignment'):
841
- alignment_plot = self.study_folder_path / "alignment_plot.html"
842
- self.study.plot_alignment(filename=str(alignment_plot))
843
- plots_generated += 1
844
- self.logger.info(f"Generated alignment plot: {alignment_plot}")
845
-
846
- # Consensus 2D plot
847
- if hasattr(self.study, 'plot_consensus_2d'):
848
- consensus_2d_plot = self.study_folder_path / "consensus_2d.html"
849
- self.study.plot_consensus_2d(filename=str(consensus_2d_plot))
850
- plots_generated += 1
851
- self.logger.info(f"Generated consensus 2D plot: {consensus_2d_plot}")
852
-
853
- # PCA plot
854
- if hasattr(self.study, 'plot_pca'):
855
- pca_plot = self.study_folder_path / "pca_plot.html"
856
- self.study.plot_pca(filename=str(pca_plot))
857
- plots_generated += 1
858
- self.logger.info(f"Generated PCA plot: {pca_plot}")
859
-
860
- # Consensus statistics
861
- if hasattr(self.study, 'plot_consensus_stats'):
862
- stats_plot = self.study_folder_path / "consensus_stats.html"
863
- self.study.plot_consensus_stats(filename=str(stats_plot))
864
- plots_generated += 1
865
- self.logger.info(f"Generated statistics plot: {stats_plot}")
866
-
867
- self._log_progress(f"Generated {plots_generated} visualization plots")
868
- self._save_checkpoint()
869
- return True
870
-
871
- except Exception as e:
872
- self.logger.error(f"Failed to generate plots: {e}")
873
- return False
874
-
875
- def export_results(self) -> bool:
876
- """
877
- Export study results in requested formats.
794
+ # Create a temporary Wizard instance to generate the script
795
+ temp_wizard = Wizard(params=wizard_params)
878
796
 
879
- Returns:
880
- True if export was successful
881
- """
882
- self._log_progress("=== Exporting Study Results ===")
883
- self.current_step = "exporting_results"
797
+ # Generate the script using the instance method
798
+ success = temp_wizard.create_script(filename)
884
799
 
885
- if self.study is None:
886
- self.logger.error("Study not available. Complete previous steps first.")
887
- return False
800
+ return success
888
801
 
889
- try:
890
- exports_completed = 0
891
-
892
- # Export consensus features as CSV
893
- if "csv" in self.params.export_formats:
894
- csv_file = self.study_folder_path / "consensus_features.csv"
895
- if hasattr(self.study.consensus_df, 'write_csv'):
896
- self.study.consensus_df.write_csv(str(csv_file))
897
- exports_completed += 1
898
- self.logger.info(f"Exported CSV: {csv_file}")
899
-
900
- # Export as Excel
901
- if "xlsx" in self.params.export_formats and hasattr(self.study, 'export_xlsx'):
902
- xlsx_file = self.study_folder_path / "study_results.xlsx"
903
- self.study.export_xlsx(filename=str(xlsx_file))
904
- exports_completed += 1
905
- self.logger.info(f"Exported Excel: {xlsx_file}")
906
-
907
- # Export MGF for MS2 spectra
908
- if "mgf" in self.params.export_formats and hasattr(self.study, 'export_mgf'):
909
- mgf_file = self.study_folder_path / "consensus_ms2.mgf"
910
- self.study.export_mgf(filename=str(mgf_file))
911
- exports_completed += 1
912
- self.logger.info(f"Exported MGF: {mgf_file}")
913
-
914
- # Export as Parquet for efficient storage
915
- if "parquet" in self.params.export_formats and hasattr(self.study, 'export_parquet'):
916
- parquet_file = self.study_folder_path / "study_data.parquet"
917
- self.study.export_parquet(filename=str(parquet_file))
918
- exports_completed += 1
919
- self.logger.info(f"Exported Parquet: {parquet_file}")
920
-
921
- self._log_progress(f"Completed {exports_completed} exports")
922
- self._save_checkpoint()
923
- return True
924
-
925
- except Exception as e:
926
- self.logger.error(f"Failed to export results: {e}")
927
- return False
802
+ except Exception as e:
803
+ print(f"Failed to create script: {e}")
804
+ import traceback
805
+ traceback.print_exc()
806
+ return False
807
+
808
+
809
+ def execute(
810
+ source: str,
811
+ folder: str,
812
+ filename: str = 'run_masster.py',
813
+ polarity: str = "positive",
814
+ adducts: Optional[List[str]] = None,
815
+ params: Optional[wizard_def] = None,
816
+ num_cores: int = 0,
817
+ **kwargs
818
+ ) -> bool:
819
+ """
820
+ Create and execute a standalone analysis script for automated MS data processing.
821
+
822
+ This function generates a Python script with the same parameters as create_script(),
823
+ but immediately executes it after creation. Combines script generation and execution
824
+ in a single step.
825
+
826
+ Parameters:
827
+ source: Directory containing raw data files
828
+ folder: Output directory for processed study
829
+ filename: Filename for the generated script (should end with .py)
830
+ polarity: Ion polarity mode ("positive" or "negative")
831
+ adducts: List of adduct specifications (auto-set if None)
832
+ params: Custom wizard_def parameters (optional)
833
+ num_cores: Number of CPU cores (0 = auto-detect)
834
+ **kwargs: Additional parameters to override defaults
835
+
836
+ Returns:
837
+ True if script was created and executed successfully, False otherwise
838
+
839
+ Example:
840
+ >>> from masster.wizard import execute
841
+ >>> execute(
842
+ ... source=r'D:\\Data\\raw_files',
843
+ ... folder=r'D:\\Data\\output',
844
+ ... polarity='positive'
845
+ ... )
846
+ """
928
847
 
929
- def save_study(self) -> bool:
930
- """
931
- Save the final study in optimized format.
932
-
933
- Returns:
934
- True if study was saved successfully
935
- """
936
- self._log_progress("=== Saving Final Study ===")
937
- self.current_step = "saving_study"
938
-
939
- if self.study is None:
940
- self.logger.error("Study not available. Complete previous steps first.")
941
- return False
848
+ try:
849
+ # First, create the script using create_script()
850
+ script_created = create_script(
851
+ source=source,
852
+ folder=folder,
853
+ filename=filename,
854
+ polarity=polarity,
855
+ adducts=adducts,
856
+ params=params,
857
+ num_cores=num_cores,
858
+ **kwargs
859
+ )
942
860
 
943
- try:
944
- study_file = self.study_folder_path / "final_study.study5"
945
-
946
- # Determine optimal save format based on study size
947
- num_samples = len(self.study.samples_df)
948
- num_features = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0
949
-
950
- if self.params.adaptive_compression:
951
- # Use compressed format for large studies
952
- if num_samples > 50 or num_features > 10000:
953
- self.logger.info(f"Large study detected ({num_samples} samples, {num_features} features) - using compressed format")
954
- self.params.compress_output = True
955
- else:
956
- self.logger.info(f"Small study ({num_samples} samples, {num_features} features) - using standard format")
957
- self.params.compress_output = False
958
-
959
- # Save study
960
- if self.params.compress_output and hasattr(self.study, 'save_compressed'):
961
- self.study.save_compressed(filename=str(study_file))
962
- self.logger.info(f"Saved compressed study: {study_file}")
963
- else:
964
- self.study.save(filename=str(study_file))
965
- self.logger.info(f"Saved study: {study_file}")
966
-
967
- # Save metadata summary
968
- metadata_file = self.study_folder_path / "study_metadata.txt"
969
- with open(metadata_file, "w") as f:
970
- f.write("Study Processing Summary\n")
971
- f.write("========================\n")
972
- f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
973
- f.write(f"Polarity: {self.polarity}\n")
974
- f.write(f"Adducts: {', '.join(self.adducts)}\n")
975
- f.write(f"Number of Samples: {num_samples}\n")
976
- f.write(f"Number of Consensus Features: {num_features}\n")
977
- f.write(f"Successful Files: {len(self.processed_files)}\n")
978
- f.write(f"Failed Files: {len(self.failed_files)}\n")
979
- f.write(f"RT Tolerance: {self.params.rt_tolerance}s\n")
980
- f.write(f"m/z Tolerance: {self.params.mz_tolerance} Da\n")
981
- f.write(f"Merge Method: {self.params.merge_method}\n")
982
- f.write(f"Processing Time: {self._get_total_processing_time()}\n")
983
-
984
- self._log_progress(f"Saved study metadata: {metadata_file}")
985
- self._save_checkpoint()
986
- return True
987
-
988
- except Exception as e:
989
- self.logger.error(f"Failed to save study: {e}")
861
+ if not script_created:
862
+ print("Failed to create analysis script")
990
863
  return False
991
-
992
- def cleanup_temp_files(self) -> bool:
993
- """
994
- Clean up temporary files if requested.
995
-
996
- Returns:
997
- True if cleanup was successful
998
- """
999
- if not self.params.cleanup_temp_files:
1000
- return True
1001
864
 
1002
- self._log_progress("=== Cleaning Up Temporary Files ===")
865
+ # Get the full path to the created script
866
+ study_path = Path(folder)
867
+ script_path = study_path / Path(filename).name
1003
868
 
1004
- try:
1005
- cleaned_count = 0
1006
-
1007
- # Remove individual sample plots if study plots were generated
1008
- if self.params.generate_plots:
1009
- temp_plots = list(self.study_folder_path.glob("*_2d.html"))
1010
- for plot_file in temp_plots:
1011
- if plot_file.name not in ["alignment_plot.html", "consensus_2d.html", "pca_plot.html"]:
1012
- plot_file.unlink()
1013
- cleaned_count += 1
1014
-
1015
- # Remove checkpoint file
1016
- if self.checkpoint_file.exists():
1017
- self.checkpoint_file.unlink()
1018
- cleaned_count += 1
1019
-
1020
- self._log_progress(f"Cleaned up {cleaned_count} temporary files")
1021
- return True
1022
-
1023
- except Exception as e:
1024
- self.logger.error(f"Failed to cleanup temp files: {e}")
869
+ if not script_path.exists():
870
+ print(f"Script file not found: {script_path}")
1025
871
  return False
1026
-
1027
- def run_full_pipeline(self) -> bool:
1028
- """
1029
- Run the complete automated processing pipeline.
1030
-
1031
- This method executes all processing steps in sequence:
1032
- 1. Convert raw files to sample5 format
1033
- 2. Assemble study from sample5 files
1034
- 3. Align and merge features
1035
- 4. Generate visualization plots
1036
- 5. Export results in requested formats
1037
- 6. Save final study
1038
- 7. Clean up temporary files
1039
-
1040
- Returns:
1041
- True if the entire pipeline completed successfully
1042
- """
1043
- self._log_progress("=" * 60)
1044
- self._log_progress("STARTING AUTOMATED STUDY PROCESSING PIPELINE")
1045
- self._log_progress("=" * 60)
872
+
873
+ print(f"Executing...")
874
+ #print("=" * 70)
1046
875
 
1047
- self.start_time = time.time()
1048
- pipeline_success = True
876
+ # Execute the script using subprocess with real-time output
877
+ import subprocess
1049
878
 
879
+ # Run the script with Python, letting it inherit our stdout/stderr
1050
880
  try:
1051
- # Step 1: Convert to sample5
1052
- if not self.convert_to_sample5():
1053
- self.logger.error("Sample5 conversion failed")
1054
- return False
1055
-
1056
- # Step 2: Assemble study
1057
- if not self.assemble_study():
1058
- self.logger.error("Study assembly failed")
1059
- return False
1060
-
1061
- # Step 3: Align and merge
1062
- if not self.align_and_merge():
1063
- self.logger.error("Feature alignment and merging failed")
1064
- return False
1065
-
1066
- # Step 4: Generate plots
1067
- if not self.generate_plots():
1068
- self.logger.warning("Plot generation failed, continuing...")
1069
- pipeline_success = False
1070
-
1071
- # Step 5: Export results
1072
- if not self.export_results():
1073
- self.logger.warning("Result export failed, continuing...")
1074
- pipeline_success = False
881
+ # Use subprocess.run for direct output inheritance - no capturing/re-printing
882
+ result = subprocess.run([
883
+ sys.executable, str(script_path)
884
+ ], cwd=str(study_path))
1075
885
 
1076
- # Step 6: Save study
1077
- if not self.save_study():
1078
- self.logger.error("Study saving failed")
1079
- return False
886
+ return_code = result.returncode
1080
887
 
1081
- # Step 7: Cleanup
1082
- if not self.cleanup_temp_files():
1083
- self.logger.warning("Cleanup failed, continuing...")
1084
-
1085
- # Final summary
1086
- total_time = time.time() - self.start_time
1087
- self._log_progress("=" * 60)
1088
- self._log_progress("PIPELINE COMPLETED SUCCESSFULLY")
1089
- self._log_progress(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
1090
- self._log_progress(f"Files processed: {len(self.processed_files)}")
1091
- self._log_progress(f"Files failed: {len(self.failed_files)}")
1092
- if hasattr(self.study, 'consensus_df'):
1093
- self._log_progress(f"Consensus features: {len(self.study.consensus_df)}")
1094
- self._log_progress("=" * 60)
1095
-
1096
- return pipeline_success
1097
-
1098
- except KeyboardInterrupt:
1099
- self.logger.info("Pipeline interrupted by user")
1100
- self._save_checkpoint()
1101
- return False
1102
888
  except Exception as e:
1103
- self.logger.error(f"Pipeline failed with unexpected error: {e}")
1104
- self._save_checkpoint()
889
+ print(f"Error during script execution: {e}")
1105
890
  return False
1106
-
1107
- def _get_total_processing_time(self) -> str:
1108
- """Get formatted total processing time."""
1109
- if self.start_time is None:
1110
- return "Unknown"
1111
891
 
1112
- total_seconds = time.time() - self.start_time
1113
- hours = int(total_seconds // 3600)
1114
- minutes = int((total_seconds % 3600) // 60)
1115
- seconds = int(total_seconds % 60)
892
+ success = return_code == 0
1116
893
 
1117
- if hours > 0:
1118
- return f"{hours}h {minutes}m {seconds}s"
1119
- elif minutes > 0:
1120
- return f"{minutes}m {seconds}s"
894
+ if success:
895
+ print("=" * 70)
896
+ print("Script execution completed successfully")
1121
897
  else:
1122
- return f"{seconds}s"
1123
-
1124
- def get_status(self) -> Dict[str, Any]:
1125
- """
1126
- Get current processing status.
1127
-
1128
- Returns:
1129
- Dictionary with current status information
1130
- """
1131
- return {
1132
- "current_step": self.current_step,
1133
- "processed_files": len(self.processed_files),
1134
- "failed_files": len(self.failed_files),
1135
- "study_loaded": self.study is not None,
1136
- "start_time": self.start_time.isoformat() if self.start_time else None,
1137
- "processing_time": self._get_total_processing_time(),
1138
- "parameters": {
1139
- "data_source": self.params.data_source,
1140
- "study_folder": self.params.study_folder,
1141
- "polarity": self.params.polarity,
1142
- "num_cores": self.params.num_cores,
1143
- "adducts": self.params.adducts,
1144
- }
1145
- }
1146
-
1147
- def info(self):
1148
- """Print comprehensive wizard status information."""
1149
- status = self.get_status()
1150
-
1151
- print("\n" + "=" * 50)
1152
- print("WIZARD STATUS")
1153
- print("=" * 50)
1154
- print(f"Current Step: {status['current_step']}")
1155
- print(f"Data Source: {self.params.data_source}")
1156
- print(f"Study Folder: {self.params.study_folder}")
1157
- print(f"Polarity: {status['parameters']['polarity']}")
1158
- print(f"CPU Cores: {status['parameters']['num_cores']}")
1159
- print(f"Adducts: {', '.join(status['parameters']['adducts'])}")
1160
- print(f"Processing Time: {status['processing_time']}")
1161
- print(f"Files Processed: {status['processed_files']}")
1162
- print(f"Files Failed: {status['failed_files']}")
1163
- print(f"Study Loaded: {status['study_loaded']}")
1164
-
1165
- if self.study is not None and hasattr(self.study, 'samples_df'):
1166
- print(f"Samples in Study: {len(self.study.samples_df)}")
1167
-
1168
- if self.study is not None and hasattr(self.study, 'consensus_df'):
1169
- print(f"Consensus Features: {len(self.study.consensus_df)}")
898
+ print("=" * 70)
899
+ print(f"Script execution failed with return code: {return_code}")
900
+
901
+ return success
1170
902
 
1171
- print("=" * 50)
903
+ except Exception as e:
904
+ print(f"Failed to execute script: {e}")
905
+ import traceback
906
+ traceback.print_exc()
907
+ return False
1172
908
 
1173
909
 
1174
- # Export the main classes
1175
- __all__ = ["Wizard", "wizard_def"]
910
+ # Export the main classes and functions
911
+ __all__ = ["Wizard", "wizard_def", "create_script", "execute"]
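
Taken together, 0.4.21 replaces the in-process pipeline methods with a generate-and-run model, exposed both as `Wizard` methods and as the module-level helpers exported above. A usage sketch, with the import path taken from the docstring examples in this diff:

```python
from masster.wizard import create_script, execute

# Generate <folder>/run_masster.py without running it:
create_script(source=r"D:\Data\raw_files", folder=r"D:\Data\output",
              polarity="positive")
# ...review or edit the generated script, then run it manually:
#   python D:\Data\output\run_masster.py

# Or generate and execute in one step (True means the script exited with code 0):
ok = execute(source=r"D:\Data\raw_files", folder=r"D:\Data\output")
```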