masster 0.5.15__py3-none-any.whl → 0.5.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.
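The headline change in 0.5.17 is a reworked Wizard API: the old analyze() entry point is gone, replaced by create_scripts(), test_only(), test_and_run(), and run(). A minimal usage sketch based on the recommended workflow in the new class docstring (the source/folder paths are placeholders, and the import assumes Wizard is re-exported from masster.wizard as in the package's own examples):

    from masster.wizard import Wizard

    # Point source/folder at your own data; these names are illustrative.
    wizard = Wizard(source="raw_data", folder="output")

    wizard.create_scripts()  # generate 1_masster_workflow.py and the marimo notebook
    wizard.test_only()       # validate parameters against the first raw file only
    wizard.run()             # process the full batch once the test looks good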

masster/wizard/wizard.py CHANGED
@@ -228,9 +228,17 @@ class Wizard:
  that process raw MS data through the complete pipeline: file discovery, feature
  detection, sample processing, study assembly, alignment, merging, and export.

- This simplified version focuses on three core functions:
- - create_scripts(): Generate workflow and interactive analysis scripts
- - analyze(): Create and run analysis scripts with interactive notebook
+ Core functions:
+ - create_scripts(): Generate standalone analysis scripts
+ - test_only(): Process only one file for parameter validation
+ - test_and_run(): Test with single file, then run full batch if successful
+ - run(): Execute full batch processing on all files
+
+ Recommended workflow:
+ 1. wizard = Wizard(source="raw_data", folder="output")
+ 2. wizard.create_scripts() # Generate analysis scripts
+ 3. wizard.test_only() # Validate with single file
+ 4. wizard.run() # Process all files
  """

  def __init__(
@@ -239,7 +247,7 @@ class Wizard:
  folder: str = "",
  polarity: str = "positive",
  adducts: Optional[List[str]] = None,
- num_cores: int = 0,
+ num_cores: int = 6,
  **kwargs
  ):
  """
@@ -290,14 +298,6 @@ class Wizard:
  self.folder_path = Path(self.params.folder)
  self.folder_path.mkdir(parents=True, exist_ok=True)

- # Initialize logger
- self.logger = MassterLogger(
- instance_type="wizard",
- level="INFO",
- label="Wizard",
- sink=None
- )
-
  # Auto-infer polarity from the first file if not explicitly set by user
  if polarity == "positive" and "polarity" not in kwargs:
  inferred_polarity = self._infer_polarity_from_first_file()
@@ -306,164 +306,6 @@ class Wizard:
  # Update adducts based on inferred polarity
  self.params.__post_init__()

- def _analyze_source_files(self) -> Dict[str, Any]:
- """
- Analyze source files to extract metadata: number of files, file type, polarity, and acquisition length.
-
- Returns:
- Dictionary containing:
- - number_of_files: Total count of data files found
- - file_types: List of file extensions found
- - polarity: Detected polarity ("positive" or "negative")
- - length_minutes: Acquisition length in minutes
- - first_file: Path to first file analyzed
- """
- result = {
- 'number_of_files': 0,
- 'file_types': [],
- 'polarity': 'positive',
- 'length_minutes': 0.0,
- 'first_file': None
- }
-
- try:
- # Find all data files
- all_files = []
- file_types_found = set()
-
- for extension in self.params.file_extensions:
- if self.params.search_subfolders:
- pattern = f"**/*{extension}"
- files = list(self.source_path.rglob(pattern))
- else:
- pattern = f"*{extension}"
- files = list(self.source_path.glob(pattern))
-
- if files:
- all_files.extend(files)
- file_types_found.add(extension)
-
- result['number_of_files'] = len(all_files)
- result['file_types'] = list(file_types_found)
-
- if not all_files:
- return result
-
- # Analyze first file for polarity and acquisition length
- first_file = all_files[0]
- result['first_file'] = str(first_file)
-
- # Extract metadata based on file type
- if first_file.suffix.lower() == '.wiff':
- metadata = self._analyze_wiff_file(first_file)
- elif first_file.suffix.lower() == '.mzml':
- metadata = self._analyze_mzml_file(first_file)
- elif first_file.suffix.lower() == '.raw':
- metadata = self._analyze_raw_file(first_file)
- else:
- metadata = {'polarity': 'positive', 'length_minutes': 0.0}
-
- result['polarity'] = metadata.get('polarity', 'positive')
- result['length_minutes'] = metadata.get('length_minutes', 0.0)
-
- except Exception as e:
- self.logger.warning(f"Failed to analyze source files: {e}")
-
- return result
-
- def _analyze_wiff_file(self, file_path: Path) -> Dict[str, Any]:
- """Analyze WIFF file to extract polarity and acquisition length."""
- try:
- from masster.sample.load import _wiff_to_dict
-
- # Extract metadata from WIFF file
- metadata_df = _wiff_to_dict(str(file_path))
-
- result = {'polarity': 'positive', 'length_minutes': 0.0}
-
- if not metadata_df.empty:
- # Get polarity from first experiment
- if 'polarity' in metadata_df.columns:
- first_polarity = metadata_df['polarity'].iloc[0]
-
- # Convert numeric polarity codes to string
- if first_polarity == 1 or str(first_polarity).lower() in ['positive', 'pos', '+']:
- result['polarity'] = "positive"
- elif first_polarity == -1 or str(first_polarity).lower() in ['negative', 'neg', '-']:
- result['polarity'] = "negative"
-
- # Estimate acquisition length by loading the file briefly
- # For a rough estimate, we'll load just the scan info
- from masster.sample import Sample
- sample = Sample()
- sample.logger_update(level="ERROR") # Suppress logs
- sample.load(str(file_path))
-
- if hasattr(sample, 'scans_df') and sample.scans_df is not None:
- if not sample.scans_df.is_empty():
- rt_values = sample.scans_df.select('rt').to_numpy().flatten()
- if len(rt_values) > 0:
- # RT is in seconds, convert to minutes
- result['length_minutes'] = float(rt_values.max()) / 60.0
-
- return result
-
- except Exception as e:
- self.logger.debug(f"Failed to analyze WIFF file {file_path}: {e}")
- return {'polarity': 'positive', 'length_minutes': 0.0}
-
- def _analyze_mzml_file(self, file_path: Path) -> Dict[str, Any]:
- """Analyze mzML file to extract polarity and acquisition length."""
- try:
- from masster.sample import Sample
-
- sample = Sample()
- sample.logger_update(level="ERROR") # Suppress logs
- sample.load(str(file_path))
-
- result = {'polarity': 'positive', 'length_minutes': 0.0}
-
- if hasattr(sample, 'scans_df') and sample.scans_df is not None:
- if not sample.scans_df.is_empty():
- rt_values = sample.scans_df.select('rt').to_numpy().flatten()
- if len(rt_values) > 0:
- # RT is in seconds, convert to minutes
- result['length_minutes'] = float(rt_values.max()) / 60.0
-
- # For mzML, polarity detection would require more detailed parsing
- # For now, use default
- return result
-
- except Exception as e:
- self.logger.debug(f"Failed to analyze mzML file {file_path}: {e}")
- return {'polarity': 'positive', 'length_minutes': 0.0}
-
- def _analyze_raw_file(self, file_path: Path) -> Dict[str, Any]:
- """Analyze RAW file to extract polarity and acquisition length."""
- try:
- from masster.sample import Sample
-
- sample = Sample()
- sample.logger_update(level="ERROR") # Suppress logs
- sample.load(str(file_path))
-
- result = {'polarity': 'positive', 'length_minutes': 0.0}
-
- if hasattr(sample, 'scans_df') and sample.scans_df is not None:
- if not sample.scans_df.is_empty():
- rt_values = sample.scans_df.select('rt').to_numpy().flatten()
- if len(rt_values) > 0:
- # RT is in seconds, convert to minutes
- result['length_minutes'] = float(rt_values.max()) / 60.0
-
- # For RAW files, polarity detection would require more detailed parsing
- # For now, use default
- return result
-
- except Exception as e:
- self.logger.debug(f"Failed to analyze RAW file {file_path}: {e}")
- return {'polarity': 'positive', 'length_minutes': 0.0}
-
  def _infer_polarity_from_first_file(self) -> str:
  """
  Infer polarity from the first available raw data file.
@@ -538,10 +380,8 @@ class Wizard:
  source_info = self._analyze_source_files()

  # Update wizard parameters based on detected metadata
- if source_info['polarity'] != 'positive': # Only update if different from default
+ if source_info.get('polarity') and source_info['polarity'] != 'positive':
  self.params.polarity = source_info['polarity']
- # Update adducts based on detected polarity
- self.params.__post_init__()

  files_created = []

@@ -549,6 +389,9 @@ class Wizard:
  workflow_script_path = self.folder_path / "1_masster_workflow.py"
  workflow_content = self._generate_workflow_script_content(source_info)

+ # Apply test mode modifications
+ workflow_content = self._add_test_mode_support(workflow_content)
+
  with open(workflow_script_path, 'w', encoding='utf-8') as f:
  f.write(workflow_content)
  files_created.append(str(workflow_script_path))
@@ -573,7 +416,6 @@ class Wizard:
  }

  except Exception as e:
- self.logger.error(f"Failed to create scripts: {e}")
  return {
  "status": "error",
  "message": f"Failed to create scripts: {e}",
@@ -582,120 +424,109 @@ class Wizard:
  "source_info": {}
  }

- def _generate_workflow_script_content(self, source_info: Dict[str, Any]) -> str:
- """Generate the content for 1_masster_workflow.py script."""
+ def _analyze_source_files(self) -> Dict[str, Any]:
+ """Analyze source files to extract metadata."""
+ result = {
+ "number_of_files": 0,
+ "file_types": [],
+ "polarity": "positive",
+ "length_minutes": 0.0,
+ "first_file": None
+ }

- # Convert Path objects to strings for JSON serialization
- params_dict = {}
- for key, value in self.params.__dict__.items():
- if key == '_param_metadata': # Skip metadata in generated script
- continue
- if isinstance(value, Path):
- params_dict[key] = str(value)
- else:
- params_dict[key] = value
+ try:
+ # Find raw data files
+ extensions = [".wiff", ".raw", ".mzML"]
+ raw_files = []
+
+ for ext in extensions:
+ pattern = f"**/*{ext}"
+ files = list(self.source_path.rglob(pattern))
+ if files:
+ raw_files.extend(files)
+ if ext not in result["file_types"]:
+ result["file_types"].append(ext)
+
+ result["number_of_files"] = len(raw_files)
+
+ if raw_files:
+ result["first_file"] = str(raw_files[0])
+ # Simple heuristic: assume 30 minutes per file if we can't determine
+ result["length_minutes"] = 30.0
+
+ except Exception as e:
+ print(f"Warning: Could not analyze source files: {e}")
+
+ return result

- # Create readable PARAMS dict with comments including discovered info
- params_lines = []
- params_lines.append('# Analysis parameters (auto-detected from source files)')
- params_lines.append('PARAMS = {')
-
- # File Discovery Summary
- params_lines.append(' # === Source File Analysis ===')
- params_lines.append(f' "number_of_files": {source_info.get("number_of_files", 0)}, # Total raw data files found')
- params_lines.append(f' "file_types": {source_info.get("file_types", [])!r}, # Detected file extensions')
- params_lines.append(f' "length_minutes": {source_info.get("length_minutes", 0.0):.1f}, # Estimated acquisition length per file (minutes)')
- if source_info.get('first_file'):
- params_lines.append(f' "first_file": {source_info["first_file"]!r}, # First file analyzed for metadata')
- params_lines.append('')
-
- # Core Configuration
- params_lines.append(' # === Core Configuration ===')
- params_lines.append(f' "source": {params_dict.get("source", "")!r}, # Directory containing raw data files')
- params_lines.append(f' "folder": {params_dict.get("folder", "")!r}, # Output directory for processed study')
- params_lines.append(f' "polarity": {params_dict.get("polarity", "positive")!r}, # Ion polarity mode (auto-detected)')
- params_lines.append(f' "num_cores": {params_dict.get("num_cores", 4)}, # Number of CPU cores for parallel processing')
- params_lines.append('')
-
- # File Discovery
- params_lines.append(' # === File Discovery ===')
- params_lines.append(f' "file_extensions": {params_dict.get("file_extensions", [".wiff", ".raw", ".mzML"])!r}, # File extensions to search for')
- params_lines.append(f' "search_subfolders": {params_dict.get("search_subfolders", True)}, # Whether to search subdirectories recursively')
- params_lines.append(f' "skip_patterns": {params_dict.get("skip_patterns", ["blank", "condition"])!r}, # Filename patterns to skip')
- params_lines.append('')
-
- # Processing Parameters - Critical values to review
- params_lines.append(' # === Processing Parameters (REVIEW THESE VALUES) ===')
- params_lines.append(f' "adducts": {params_dict.get("adducts", [])!r}, # Adduct specifications for feature detection and annotation')
- params_lines.append(f' "detector_type": {params_dict.get("detector_type", "unknown")!r}, # MS detector type ("orbitrap", "tof", "unknown")')
- params_lines.append(f' "noise": {params_dict.get("noise", 50.0)}, # REVIEW: Noise threshold for feature detection. Set to 1e5 for Orbitraps')
- params_lines.append(f' "chrom_fwhm": {params_dict.get("chrom_fwhm", 0.5)}, # REVIEW: Chromatographic peak FWHM (seconds)')
- params_lines.append(f' "chrom_peak_snr": {params_dict.get("chrom_peak_snr", 5.0)}, # Minimum signal-to-noise ratio for chromatographic peaks')
- params_lines.append('')
-
- # Other parameters...
- params_lines.append(' # === Alignment & Merging ===')
- params_lines.append(f' "rt_tol": {params_dict.get("rt_tol", 5.0)}, # Retention time tolerance for alignment (seconds)')
- params_lines.append(f' "mz_tol": {params_dict.get("mz_tol", 0.01)}, # Mass-to-charge ratio tolerance for alignment (Da)')
- params_lines.append(f' "alignment_method": {params_dict.get("alignment_method", "kd")!r}, # Algorithm for sample alignment')
- params_lines.append(f' "min_samples_per_feature": {params_dict.get("min_samples_per_feature", 1)}, # Minimum samples required per consensus feature')
- params_lines.append(f' "merge_method": {params_dict.get("merge_method", "qt")!r}, # Method for merging consensus features')
- params_lines.append('')
-
- # Other params
- params_lines.append(' # === Sample Processing ===')
- params_lines.append(f' "batch_size": {params_dict.get("batch_size", 8)}, # Number of files to process per batch')
- params_lines.append(f' "memory_limit_gb": {params_dict.get("memory_limit_gb", 16.0)}, # Memory limit for processing (GB)')
- params_lines.append('')
-
- params_lines.append(' # === Script Options ===')
- params_lines.append(f' "resume_enabled": {params_dict.get("resume_enabled", True)}, # Enable automatic resume capability')
- params_lines.append(f' "force_reprocess": {params_dict.get("force_reprocess", False)}, # Force reprocessing of existing files')
- params_lines.append(f' "cleanup_temp_files": {params_dict.get("cleanup_temp_files", True)}, # Clean up temporary files after processing')
-
- params_lines.append('}')
+ def _generate_workflow_script_content(self, source_info: Dict[str, Any]) -> str:
+ """Generate the content for 1_masster_workflow.py script."""

- # Create script lines
  script_lines = [
  '#!/usr/bin/env python3',
  '"""',
- 'MASSter Workflow Script - Sample Processing',
- f'Generated by masster wizard v{version}',
- '',
- 'Source Analysis:',
- f' - Files found: {source_info.get("number_of_files", 0)}',
- f' - File types: {", ".join(source_info.get("file_types", []))}',
- f' - Polarity detected: {source_info.get("polarity", "unknown")}',
- f' - Acquisition length: ~{source_info.get("length_minutes", 0.0):.1f} minutes per file',
- '',
- 'This script processes raw MS data files into sample5 format.',
- 'Review the NOISE and CHROM_FWHM parameters below before running.',
+ 'Automated Mass Spectrometry Data Analysis Pipeline',
+ 'Generated by masster wizard',
  '"""',
  '',
+ 'import os',
  'import sys',
  'import time',
  'from pathlib import Path',
- 'import concurrent.futures',
- 'import os',
  '',
  '# Import masster modules',
- 'from masster.sample import Sample',
+ 'from masster.study import Study',
  'from masster import __version__',
  '',
- ]
-
- # Add the formatted PARAMS
- script_lines.extend(params_lines)
-
- # Add the functions
- script_lines.extend([
+ '# Test mode configuration',
+ 'TEST_MODE = os.environ.get("MASSTER_TEST_MODE", "0") == "1"',
+ 'TEST_ONLY = os.environ.get("MASSTER_TEST_ONLY", "0") == "1" # Only run test, don\'t continue to full batch',
+ '',
+ '# Analysis parameters',
+ 'PARAMS = {',
+ ' # === Core Configuration ===',
+ f' "source": {str(self.source_path)!r}, # Directory containing raw data files',
+ f' "folder": {str(self.folder_path)!r}, # Output directory for processed study',
+ f' "polarity": {self.params.polarity!r}, # Ion polarity mode ("positive" or "negative")',
+ f' "num_cores": {self.params.num_cores}, # Number of CPU cores for parallel processing',
+ '',
+ ' # === Test Mode ===',
+ ' "test_mode": TEST_MODE, # Process only first file for testing',
+ ' "test_only": TEST_ONLY, # Stop after test, don\'t run full batch',
+ '',
+ ' # === File Discovery ===',
+ f' "file_extensions": {self.params.file_extensions!r}, # File extensions to search for',
+ f' "search_subfolders": {self.params.search_subfolders}, # Whether to search subdirectories recursively',
+ f' "skip_patterns": {self.params.skip_patterns!r}, # Filename patterns to skip',
+ '',
+ ' # === Processing Parameters ===',
+ f' "adducts": {self.params.adducts!r}, # Adduct specifications for feature detection and annotation',
+ f' "noise": {self.params.noise}, # Noise threshold for feature detection',
+ f' "chrom_fwhm": {self.params.chrom_fwhm}, # Chromatographic peak full width at half maximum (seconds)',
+ f' "chrom_peak_snr": {self.params.chrom_peak_snr}, # Minimum signal-to-noise ratio for chromatographic peaks',
+ '',
+ ' # === Alignment & Merging ===',
+ f' "rt_tol": {self.params.rt_tolerance}, # Retention time tolerance for alignment (seconds)',
+ f' "mz_tol": {self.params.mz_max_diff}, # Mass-to-charge ratio tolerance for alignment (Da)',
+ f' "alignment_method": {self.params.alignment_algorithm!r}, # Algorithm for sample alignment',
+ f' "min_samples_per_feature": {self.params.min_samples_for_merge}, # Minimum samples required per consensus feature',
+ f' "merge_method": {self.params.merge_method!r}, # Method for merging consensus features',
+ '',
+ ' # === Sample Processing (used in add_samples_from_folder) ===',
+ f' "batch_size": {self.params.batch_size}, # Number of files to process per batch',
+ f' "memory_limit_gb": {self.params.memory_limit_gb}, # Memory limit for processing (GB)',
+ '',
+ ' # === Script Options ===',
+ f' "resume_enabled": {self.params.resume_enabled}, # Enable automatic resume capability',
+ f' "force_reprocess": {self.params.force_reprocess}, # Force reprocessing of existing files',
+ f' "cleanup_temp_files": {self.params.cleanup_temp_files}, # Clean up temporary files after processing',
+ '}',
  '',
  '',
- 'def discover_raw_files(source_folder, file_extensions, search_subfolders=True, skip_patterns=None):',
+ 'def discover_raw_files(source_folder, file_extensions, search_subfolders=True):',
  ' """Discover raw data files in the source folder."""',
  ' source_path = Path(source_folder)',
  ' raw_files = []',
- ' skip_patterns = skip_patterns or []',
  ' ',
  ' for ext in file_extensions:',
  ' if search_subfolders:',
@@ -704,191 +535,189 @@ class Wizard:
  ' else:',
  ' pattern = f"*{ext}"',
  ' files = list(source_path.glob(pattern))',
- ' ',
- ' # Filter out files matching skip patterns',
- ' for file in files:',
- ' skip_file = False',
- ' for skip_pattern in skip_patterns:',
- ' if skip_pattern.lower() in file.name.lower():',
- ' skip_file = True',
- ' break',
- ' if not skip_file:',
- ' raw_files.append(file)',
+ ' raw_files.extend(files)',
  ' ',
  ' return raw_files',
  '',
  '',
  'def process_single_file(args):',
- ' """Process a single raw file to sample5 format - designed for multiprocessing."""',
- ' raw_file, output_folder, params = args',
+ ' """Process a single raw file to sample5 format - module level for multiprocessing."""',
+ ' raw_file, output_folder = args',
+ ' from masster.sample import Sample',
  ' ',
  ' try:',
  ' # Create sample5 filename',
  ' sample_name = raw_file.stem',
  ' sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
  ' ',
- ' # Skip if sample5 already exists and resume is enabled',
- ' if sample5_path.exists() and params["resume_enabled"]:',
- ' print(f"Skipping {raw_file.name} (sample5 already exists)")',
- ' return {"status": "skipped", "file": str(sample5_path), "message": "Already exists"}',
+ ' # Skip if sample5 already exists',
+ ' if sample5_path.exists() and not PARAMS["force_reprocess"]:',
+ ' print(f" Skipping {raw_file.name} (sample5 already exists)")',
+ ' return str(sample5_path)',
  ' ',
- ' print(f" 🔄 Processing {raw_file.name}...")',
- ' start_time = time.time()',
+ ' print(f" Converting {raw_file.name}...")',
  ' ',
  ' # Load and process raw file with full pipeline',
  ' sample = Sample(log_label=sample_name)',
  ' sample.load(filename=str(raw_file))',
  ' sample.find_features(',
- ' noise=params["noise"],',
- ' chrom_fwhm=params["chrom_fwhm"],',
- ' chrom_peak_snr=params["chrom_peak_snr"]',
+ ' noise=PARAMS["noise"],',
+ ' chrom_fwhm=PARAMS["chrom_fwhm"],',
+ ' chrom_peak_snr=PARAMS["chrom_peak_snr"]',
  ' )',
- ' # sample.find_adducts(adducts=params["adducts"])',
  ' sample.find_ms2()',
- ' # sample.find_iso() # Optional - can be uncommented if needed',
+ ' sample.find_iso()',
+ ' # sample.export_mgf()',
+ ' # sample.plot_2d(filename=f"{sample5_path.replace(".sample5", ".html")}")',
  ' sample.save(str(sample5_path))',
  ' ',
- ' elapsed = time.time() - start_time',
- ' print(f" ✅ Completed {raw_file.name} -> {sample5_path.name} ({elapsed:.1f}s)")',
- ' ',
- ' return {"status": "success", "file": str(sample5_path), "elapsed": elapsed}',
+ ' # print(f" Completed {raw_file.name} -> {sample5_path.name}")',
+ ' return str(sample5_path)',
  ' ',
  ' except Exception as e:',
- ' print(f"ERROR processing {raw_file.name}: {e}")',
- ' return {"status": "error", "file": str(raw_file), "error": str(e)}',
+ ' print(f" ERROR processing {raw_file.name}: {e}")',
+ ' return None',
  '',
  '',
- 'def convert_raw_to_sample5_parallel(raw_files, output_folder, params):',
- ' """Convert raw data files to sample5 format with parallel processing and progress tracking."""',
+ 'def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):',
+ ' """Convert raw data files to sample5 format."""',
  ' import concurrent.futures',
  ' import os',
  ' ',
  ' # Create output directory',
  ' os.makedirs(output_folder, exist_ok=True)',
  ' ',
- ' print(f"\\n🚀 Processing {len(raw_files)} files using {params[\'num_cores\']} CPU cores...")',
- ' print("=" * 70)',
- ' ',
  ' # Prepare arguments for multiprocessing',
- ' file_args = [(raw_file, output_folder, params) for raw_file in raw_files]',
- ' ',
- ' # Process files in parallel with progress tracking',
- ' results = []',
- ' successful = 0',
- ' skipped = 0',
- ' failed = 0',
- ' total_elapsed = 0',
+ ' file_args = [(raw_file, output_folder) for raw_file in raw_files]',
  ' ',
- ' with concurrent.futures.ProcessPoolExecutor(max_workers=params["num_cores"]) as executor:',
- ' # Submit all jobs',
- ' future_to_file = {executor.submit(process_single_file, args): args[0] for args in file_args}',
+ ' # Process files in parallel',
+ ' sample5_files = []',
+ ' with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:',
+ ' futures = [executor.submit(process_single_file, args) for args in file_args]',
  ' ',
- ' # Collect results as they complete',
- ' for i, future in enumerate(concurrent.futures.as_completed(future_to_file), 1):',
+ ' for future in concurrent.futures.as_completed(futures):',
  ' result = future.result()',
- ' results.append(result)',
- ' ',
- ' if result["status"] == "success":',
- ' successful += 1',
- ' total_elapsed += result.get("elapsed", 0)',
- ' elif result["status"] == "skipped":',
- ' skipped += 1',
- ' else:',
- ' failed += 1',
- ' ',
- ' # Progress update',
- ' print(f"\\r Progress: {i}/{len(raw_files)} files completed ({successful} success, {skipped} skipped, {failed} failed)", end="", flush=True)',
- ' ',
- ' print() # New line after progress',
- ' print("=" * 70)',
- ' ',
- ' # Summary',
- ' if successful > 0:',
- ' avg_time = total_elapsed / successful',
- ' print(f"✅ Successfully processed {successful} files (avg: {avg_time:.1f}s per file)")',
- ' if skipped > 0:',
- ' print(f"⏩ Skipped {skipped} files (already exist)")',
- ' if failed > 0:',
- ' print(f"❌ Failed to process {failed} files")',
- ' for result in results:',
- ' if result["status"] == "error":',
- ' print(f" - {Path(result[\'file\']).name}: {result[\'error\']}")',
+ ' if result:',
+ ' sample5_files.append(result)',
  ' ',
- ' # Return list of successful sample5 files',
- ' sample5_files = [result["file"] for result in results if result["status"] in ["success", "skipped"]]',
  ' return sample5_files',
  '',
  '',
  'def main():',
- ' """Main sample processing workflow."""',
+ ' """Main analysis pipeline."""',
  ' try:',
  ' print("=" * 70)',
- f' print("MASSter {version} - Sample Processing Workflow")',
+ f' print("masster {version} - Automated MS Data Analysis")',
  ' print("=" * 70)',
  ' print(f"Source: {PARAMS[\'source\']}")',
  ' print(f"Output: {PARAMS[\'folder\']}")',
- ' print(f"Polarity: {PARAMS[\'polarity\']} (detected)")',
+ ' print(f"Polarity: {PARAMS[\'polarity\']}")',
  ' print(f"CPU Cores: {PARAMS[\'num_cores\']}")',
  ' print("=" * 70)',
- ' print("\\n⚙️ IMPORTANT: Review these parameters before processing:")',
- ' print(f" NOISE threshold: {PARAMS[\'noise\']} (adjust based on your instrument)")',
- ' print(f" CHROM_FWHM: {PARAMS[\'chrom_fwhm\']}s (adjust based on your chromatography)")',
- ' print(" You can edit these values in the PARAMS section above.")',
- ' print("=" * 70)',
  ' ',
  ' start_time = time.time()',
  ' ',
  ' # Step 1: Discover raw data files',
- ' print("\\n📁 Step 1/2: Discovering raw data files...")',
+ ' print("\\nStep 1/7: Discovering raw data files...")',
  ' raw_files = discover_raw_files(',
  ' PARAMS[\'source\'],',
  ' PARAMS[\'file_extensions\'],',
- ' PARAMS[\'search_subfolders\'],',
- ' PARAMS[\'skip_patterns\']',
+ ' PARAMS[\'search_subfolders\']',
  ' )',
  ' ',
  ' if not raw_files:',
- ' print("No raw data files found!")',
+ ' print("No raw data files found!")',
  ' return False',
  ' ',
  ' print(f"Found {len(raw_files)} raw data files")',
- ' for i, f in enumerate(raw_files[:5]): # Show first 5 files',
- ' print(f" {i+1}. {f.name}")',
+ ' for f in raw_files[:5]: # Show first 5 files',
+ ' print(f" {f.name}")',
  ' if len(raw_files) > 5:',
- ' print(f" ... and {len(raw_files) - 5} more files")',
+ ' print(f" ... and {len(raw_files) - 5} more")',
  ' ',
- ' # Step 2: Process raw files to sample5',
- ' print("\\n🔄 Step 2/2: Processing raw files to sample5 format...")',
- ' sample5_files = convert_raw_to_sample5_parallel(',
+ ' # Step 2: Process raw files',
+ ' print("\\nStep 2/7: Processing raw files...")',
+ ' sample5_files = convert_raw_to_sample5(',
  ' raw_files,',
  ' PARAMS[\'folder\'],',
- ' PARAMS',
+ ' PARAMS[\'polarity\'],',
+ ' PARAMS[\'num_cores\']',
  ' )',
  ' ',
  ' if not sample5_files:',
- ' print("No sample5 files were created!")',
+ ' print("No sample5 files were created!")',
  ' return False',
  ' ',
- ' # Summary',
+ ' print(f"Successfully processed {len(sample5_files)} files to sample5")',
+ ' ',
+ ' # Step 3: Create and configure study',
+ ' print("\\nStep 3/7: Initializing study...")',
+ ' study = Study(folder=PARAMS[\'folder\'])',
+ ' study.polarity = PARAMS[\'polarity\']',
+ ' study.adducts = PARAMS[\'adducts\']',
+ ' ',
+ ' # Step 4: Add sample5 files to study',
+ ' print("\\nStep 4/7: Adding samples to study...")',
+ ' study.add(str(Path(PARAMS[\'folder\']) / "*.sample5"))',
+ ' study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))',
+ ' ',
+ ' # Step 5: Core processing',
+ ' print("\\nStep 5/7: Processing...")',
+ ' study.align(',
+ ' algorithm=PARAMS[\'alignment_method\'],',
+ ' rt_tol=PARAMS[\'rt_tol\']',
+ ' )',
+ ' ',
+ ' study.merge(',
+ ' method="qt",',
+ ' min_samples=PARAMS[\'min_samples_per_feature\'],',
+ ' threads=PARAMS[\'num_cores\'],',
+ ' rt_tol=PARAMS[\'rt_tol\']',
+ ' )',
+ ' study.find_iso()',
+ ' study.fill()',
+ ' study.integrate()',
+ ' ',
+ ' # Step 6/7: Saving results',
+ ' print("\\nStep 6/7: Saving results...")',
+ ' study.save()',
+ ' study.export_xlsx()',
+ ' study.export_mgf()',
+ ' study.export_mztab()',
+ ' ',
+ ' # Step 7: Plots',
+ ' print("\\nStep 7/7: Exporting plots...")',
+ ' study.plot_consensus_2d(filename="consensus.html")',
+ ' study.plot_consensus_2d(filename="consensus.png")',
+ ' study.plot_alignment(filename="alignment.html")',
+ ' study.plot_alignment(filename="alignment.png")',
+ ' study.plot_samples_pca(filename="pca.html")',
+ ' study.plot_samples_pca(filename="pca.png")',
+ ' study.plot_bpc(filename="bpc.html")',
+ ' study.plot_bpc(filename="bpc.png")',
+ ' study.plot_rt_correction(filename="rt_correction.html")',
+ ' study.plot_rt_correction(filename="rt_correction.png")',
+ ' ',
+ ' # Print summary',
+ ' study.info()',
  ' total_time = time.time() - start_time',
  ' print("\\n" + "=" * 70)',
- ' print("🎉 SAMPLE PROCESSING COMPLETE")',
+ ' print("ANALYSIS COMPLETE")',
  ' print("=" * 70)',
- ' print(f"Processing time: {total_time/60:.1f} minutes")',
- ' print(f"Raw files found: {len(raw_files)}")',
+ ' print(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")',
+ ' print(f"Raw files processed: {len(raw_files)}")',
  ' print(f"Sample5 files created: {len(sample5_files)}")',
- ' print("\\nNext steps:")',
- ' print("1. Run the interactive analysis: uv run marimo edit 2_interactive_analysis.py")',
- ' print("2. Or use the sample5 files in your own analysis scripts")',
+ ' if hasattr(study, "consensus_df"):',
+ ' print(f"Consensus features generated: {len(study.consensus_df)}")',
  ' print("=" * 70)',
  ' ',
  ' return True',
  ' ',
  ' except KeyboardInterrupt:',
- ' print("\\n❌ Processing interrupted by user")',
+ ' print("\\nAnalysis interrupted by user")',
  ' return False',
  ' except Exception as e:',
- ' print(f" Processing failed with error: {e}")',
+ ' print(f"Analysis failed with error: {e}")',
  ' import traceback',
  ' traceback.print_exc()',
  ' return False',
@@ -897,115 +726,292 @@ class Wizard:
  'if __name__ == "__main__":',
  ' success = main()',
  ' sys.exit(0 if success else 1)',
- ])
+ ]
+
+ return '\n'.join(script_lines)
+
+ def _generate_interactive_notebook_content(self, source_info: Dict[str, Any]) -> str:
+ """Generate the content for 2_interactive_analysis.py marimo notebook."""
+
+ notebook_lines = [
+ 'import marimo',
+ '',
+ '__generated_with = "0.9.14"',
+ 'app = marimo.App(width="medium")',
+ '',
+ '@app.cell',
+ 'def __():',
+ ' import marimo as mo',
+ ' return (mo,)',
+ '',
+ '@app.cell',
+ 'def __(mo):',
+ ' mo.md(r"""',
+ ' # MASSter Interactive Analysis',
+ ' ',
+ f' **Source:** {source_info.get("number_of_files", 0)} files detected',
+ f' **Polarity:** {source_info.get("polarity", "unknown")}',
+ ' ',
+ ' This notebook provides interactive exploration of your processed study.',
+ ' Make sure you have run `python 1_masster_workflow.py` first.',
+ ' """)',
+ ' return ()',
+ '',
+ '@app.cell',
+ 'def __():',
+ ' import masster',
+ ' return (masster,)',
+ '',
+ '@app.cell',
+ 'def __(masster):',
+ ' study = masster.Study(folder=".")',
+ ' return (study,)',
+ '',
+ '@app.cell',
+ 'def __(study):',
+ ' study.info()',
+ ' return ()',
+ '',
+ 'if __name__ == "__main__":',
+ ' app.run()',
+ ]
+
+ return '\n'.join(notebook_lines)
+
+ def _generate_instructions(self, source_info: Dict[str, Any], files_created: List[str]) -> List[str]:
+ """Generate usage instructions for the created scripts."""
+ instructions = [f"Source analysis: {source_info.get('number_of_files', 0)} files found",
+ f"Polarity detected: {source_info.get('polarity', 'unknown')}",
+ "Files created:"]
+ for file_path in files_created:
+ instructions.append(f" ✅ {str(Path(file_path).resolve())}")
+
+ # Find the workflow script name from created files
+ workflow_script_name = "1_masster_workflow.py"
+ for file_path in files_created:
+ if Path(file_path).name == "1_masster_workflow.py":
+ workflow_script_name = Path(file_path).name
+ break
+
+ instructions.extend([
+ "",
+ "Next steps:",
+ f"1. REVIEW PARAMETERS in {workflow_script_name}:",
+ f" In particular, verify the NOISE, CHROM_FWHM, and MIN_SAMPLES_FOR_MERGE",
+ "",
+ "2. TEST SINGLE FILE (RECOMMENDED):",
+ f" wizard.test_only() # Validate parameters with first file only",
+ "",
+ "3. EXECUTE FULL BATCH:",
+ f" wizard.run() # Process all files",
+ f" # OR: wizard.test_and_run() # Test first, then run all",
+ f" # OR: uv run python {workflow_script_name}",
+ "",
+ "4. INTERACTIVE ANALYSIS:",
+ f" uv run marimo edit {Path('2_interactive_analysis.py').name}",
+ ""]
+ )
+
+ return instructions
+
+ def _add_test_mode_support(self, workflow_content: str) -> str:
+ """Add test mode functionality to the generated workflow script."""
+ lines = workflow_content.split('\n')
+
+ # Insert test mode code after print statements in main function
+ for i, line in enumerate(lines):
+ # Add test mode print after the masster version line
+ if 'print("masster' in line and 'Automated MS Data Analysis")' in line:
+ lines.insert(i + 1, ' if TEST_MODE:')
+ lines.insert(i + 2, ' print("🧪 TEST MODE: Processing single file only")')
+ break
+
+ # Add mode info after num_cores print
+ for i, line in enumerate(lines):
+ if 'print(f"CPU Cores: {PARAMS[\'num_cores\']}")' in line:
+ lines.insert(i + 1, ' if TEST_MODE:')
+ lines.insert(i + 2, ' print(f"Mode: {\'Test Only\' if TEST_ONLY else \'Test + Full Batch\'}")')
+ break
+
+ # Add file limitation logic after file listing
+ for i, line in enumerate(lines):
+ if 'print(f" ... and {len(raw_files) - 5} more")' in line:
+ lines.insert(i + 1, ' ')
+ lines.insert(i + 2, ' # Limit to first file in test mode')
+ lines.insert(i + 3, ' if TEST_MODE:')
+ lines.insert(i + 4, ' raw_files = raw_files[:1]')
+ lines.insert(i + 5, ' print(f"\\n🧪 TEST MODE: Processing only first file: {raw_files[0].name}")')
+ break
+
+ # Modify num_cores for test mode
+ for i, line in enumerate(lines):
+ if 'PARAMS[\'num_cores\']' in line and 'convert_raw_to_sample5(' in lines[i-2:i+3]:
+ lines[i] = line.replace('PARAMS[\'num_cores\']', 'PARAMS[\'num_cores\'] if not TEST_MODE else 1 # Use single core for test')
+ break
+
+ # Add test-only exit logic after successful processing
+ for i, line in enumerate(lines):
+ if 'print(f"Successfully processed {len(sample5_files)} files to sample5")' in line:
+ lines.insert(i + 1, ' ')
+ lines.insert(i + 2, ' # Stop here if test-only mode')
+ lines.insert(i + 3, ' if TEST_ONLY:')
+ lines.insert(i + 4, ' print("\\n🧪 TEST ONLY mode: Stopping after successful single file processing")')
+ lines.insert(i + 5, ' print(f"Test file created: {sample5_files[0]}")')
+ lines.insert(i + 6, ' print("\\nTo run full batch, use: wizard.run()")')
+ lines.insert(i + 7, ' total_time = time.time() - start_time')
+ lines.insert(i + 8, ' print(f"\\nTest processing time: {total_time:.1f} seconds")')
+ lines.insert(i + 9, ' return True')
+ break
+
+ return '\n'.join(lines)
+
+ def test_and_run(self) -> Dict[str, Any]:
+ """
+ Test the sample processing workflow with a single file, then run full batch.
+
+ This method runs the 1_masster_workflow.py script in test mode to process
+ the first raw file for validation, then automatically continues with the
+ full batch if the test succeeds. The script must already exist - call
+ create_scripts() first if needed.
+
+ Returns:
+ Dictionary containing:
+ - status: "success" or "error"
+ - message: Status message
+ - instructions: List of next steps
+ """
+ return self._execute_workflow(test_mode=True)
+
+ def test_only(self) -> Dict[str, Any]:
+ """
+ Test the sample processing workflow with a single file only.

- return '\n'.join(script_lines)
+ This method runs the 1_masster_workflow.py script in test-only mode to process
+ only the first raw file and then stops (does not continue to full study processing).
+ The script must already exist - call create_scripts() first if needed.
+
+ Returns:
+ Dictionary containing:
+ - status: "success" or "error"
+ - message: Status message
+ - instructions: List of next steps
+ - test_file: Path to the processed test file (if successful)
+ """
+ return self._execute_workflow(test_mode=True, test_only=True)

- def analyze(self) -> Dict[str, Any]:
+ def run(self) -> Dict[str, Any]:
  """
- Execute the complete analysis workflow.
+ Run the sample processing workflow.

- This method:
- 1. Checks if 1_masster_workflow.py exists and runs it
- 2. If not, creates scripts first then runs the workflow
- 3. Provides clear feedback about next steps
+ This method runs the 1_masster_workflow.py script to process raw files.
+ The script must already exist - call create_scripts() first if needed.

  Returns:
- Dictionary containing execution results and instructions
+ Dictionary containing:
+ - status: "success" or "error"
+ - message: Status message
+ - instructions: List of next steps
+ """
+ return self._execute_workflow(test_mode=False)
+
+ def _execute_workflow(self, test_mode: bool = False, test_only: bool = False) -> Dict[str, Any]:
  """
- workflow_script = self.folder_path / "1_masster_workflow.py"
+ Execute the workflow script in either test or full mode.

+ Args:
+ test_mode: If True, run in test mode (single file), otherwise full batch
+ test_only: If True, stop after single file test (only used with test_mode=True)
+ """
  try:
+ workflow_script_path = self.folder_path / "1_masster_workflow.py"
+
  # Check if workflow script exists
- if workflow_script.exists():
- print("📋 Found existing workflow script, executing...")
- return self._execute_workflow_script(workflow_script)
+ if not workflow_script_path.exists():
+ return {
+ "status": "error",
+ "message": "Workflow script not found. Please run create_scripts() first.",
+ "instructions": [
+ "❌ Missing 1_masster_workflow.py",
+ "Run: wizard.create_scripts()",
+ "Then: wizard.run()"
+ ]
+ }
+
+ # Setup execution mode
+ if test_only:
+ mode_label = "test-only"
+ elif test_mode:
+ mode_label = "test"
  else:
- print("📝 Creating analysis scripts...")
- # Create scripts first
- result = self.create_scripts()
-
- if result["status"] != "success":
- return result
-
- # Print instructions
- print("\n" + "="*70)
- for instruction in result["instructions"]:
- print(instruction)
- print("="*70)
+ mode_label = "full batch"

- # Ask user if they want to proceed with execution
- print("\n🤔 Would you like to proceed with sample processing now?")
- print(" This will execute 1_masster_workflow.py")
- response = input(" Proceed? [y/N]: ").strip().lower()
+ env = None
+ if test_mode:
+ import os
+ env = os.environ.copy()
+ env['MASSTER_TEST_MODE'] = '1'
+ if test_only:
+ env['MASSTER_TEST_ONLY'] = '1'

- if response in ['y', 'yes']:
- return self._execute_workflow_script(workflow_script)
- else:
- print(" Processing paused. Run the scripts manually when ready.")
- return {
- "status": "scripts_created",
- "message": "Scripts created successfully, execution deferred",
- "instructions": result["instructions"],
- "files_created": result["files_created"]
- }
-
- except Exception as e:
- self.logger.error(f"Analysis failed: {e}")
- return {
- "status": "error",
- "message": f"Analysis failed: {e}",
- "instructions": [],
- "files_created": []
- }
-
- def _execute_workflow_script(self, script_path: Path) -> Dict[str, Any]:
- """Execute the workflow script and return results."""
- try:
- print(f"🚀 Executing {script_path.name}...")
+ # Execute the workflow script
+ print(f"🚀 Executing {mode_label} processing workflow...")
+ print(f"📄 Running: {workflow_script_path.name}")
+ print("=" * 60)

  import subprocess
  result = subprocess.run([
- sys.executable, str(script_path)
- ], cwd=str(self.folder_path), capture_output=False, text=True)
+ sys.executable, str(workflow_script_path)
+ ], cwd=str(self.folder_path), env=env)

  success = result.returncode == 0

  if success:
- print("="*70)
- print("✅ Workflow execution completed successfully!")
- print("="*70)
- print("Next step: Run interactive analysis")
- print(" uv run marimo edit 2_interactive_analysis.py")
- print("="*70)
+ print("=" * 60)
+ if test_only:
+ print("✅ Test-only processing completed successfully!")
+ print("📋 Single file validated - ready for full batch")
+ print(" wizard.run()")
+ elif test_mode:
+ print("✅ Test processing completed successfully!")
+ print("📋 Next step: Run full batch")
+ print(" wizard.run()")
+ else:
+ print("✅ Sample processing completed successfully!")
+ print("📋 Next step: Run interactive analysis")
+ print(" uv run marimo edit 2_interactive_analysis.py")
+ print("=" * 60)
+
+ next_step = ("Next: wizard.run()" if test_mode else
+ "Next: uv run marimo edit 2_interactive_analysis.py")

  return {
  "status": "success",
- "message": "Workflow completed successfully",
+ "message": f"{mode_label.capitalize()} processing completed successfully",
  "instructions": [
- "✅ Sample processing completed",
- "Next: uv run marimo edit 2_interactive_analysis.py"
- ],
- "files_created": []
+ f"✅ {mode_label.capitalize()} processing completed",
+ next_step
+ ]
  }
  else:
  return {
  "status": "error",
- "message": f"Workflow execution failed with code {result.returncode}",
+ "message": f"Workflow execution failed with return code {result.returncode}",
  "instructions": [
  "❌ Check the error messages above",
  "Review parameters in 1_masster_workflow.py",
- "Try running: python 1_masster_workflow.py"
- ],
- "files_created": []
+ f"Try running manually: python {workflow_script_path.name}"
+ ]
  }

  except Exception as e:
  return {
  "status": "error",
  "message": f"Failed to execute workflow: {e}",
- "instructions": [],
- "files_created": []
+ "instructions": [
+ "❌ Execution failed",
+ "Check that source files exist and are accessible",
+ "Verify folder permissions"
+ ]
  }

  def _generate_script_content(self) -> str:
@@ -1313,413 +1319,74 @@ class Wizard:

  return '\n'.join(script_lines)

- def _generate_notebook_content(self) -> str:
- """Generate the content for a marimo interactive notebook."""
-
- notebook_lines = [
- 'import marimo',
- '',
- '__generated_with = "0.9.14"',
- 'app = marimo.App(width="medium")',
- '',
- '',
- '@app.cell',
- 'def __():',
- ' import marimo as mo',
- ' return (mo,)',
- '',
- '',
- '@app.cell',
- 'def __(mo):',
- ' mo.md(r"""',
- ' # MASSter Interactive Analysis',
- ' ',
- ' This notebook provides interactive exploration of your mass spectrometry study results.',
- ' The study has been processed and is ready for analysis.',
- ' """)',
- '',
- '',
- '@app.cell',
- 'def __():',
- ' # Import masster',
- ' import masster',
- ' return (masster,)',
- '',
- '',
- '@app.cell',
- 'def __(masster):',
- ' # Load the processed study',
- ' study = masster.Study(folder=".")',
- ' study.load()',
- ' return (study,)',
- '',
- '',
- '@app.cell',
- 'def __(mo, study):',
- ' # Display study information',
- ' mo.md(f"""',
- ' ## Study Overview',
- ' ',
- ' **Samples:** {len(study.samples) if hasattr(study, "samples") else "Not loaded"}',
- ' ',
- ' **Features:** {len(study.consensus_df) if hasattr(study, "consensus_df") else "Not available"}',
- ' ',
- ' **Polarity:** {study.polarity if hasattr(study, "polarity") else "Unknown"}',
- ' """)',
- '',
- '',
- '@app.cell',
- 'def __(study):',
- ' # Print detailed study info',
- ' study.info()',
- '',
- '',
- '@app.cell',
- 'def __(mo):',
- ' mo.md(r"""',
- ' ## Quick Visualizations',
- ' ',
- ' Use the cells below to create interactive plots of your data.',
- ' """)',
- '',
- '',
- '@app.cell',
- 'def __(study):',
- ' # Generate consensus 2D plot',
- ' if hasattr(study, "consensus_df") and len(study.consensus_df) > 0:',
- ' study.plot_consensus_2d(filename="consensus_interactive.html")',
- ' print("Consensus 2D plot saved as: consensus_interactive.html")',
- ' else:',
- ' print("No consensus features available for plotting")',
- '',
- '',
- '@app.cell',
- 'def __(study):',
- ' # Generate PCA plot',
- ' if hasattr(study, "samples") and len(study.samples) > 1:',
- ' study.plot_samples_pca(filename="pca_interactive.html")',
- ' print("PCA plot saved as: pca_interactive.html")',
- ' else:',
- ' print("Not enough samples for PCA analysis")',
- '',
- '',
- '@app.cell',
- 'def __(mo):',
- ' mo.md(r"""',
- ' ## Data Export',
- ' ',
- ' Export your processed data in various formats.',
- ' """)',
- '',
- '',
- '@app.cell',
- 'def __(study):',
- ' # Export options',
- ' if hasattr(study, "consensus_df"):',
- ' # Export to Excel',
- ' study.export_xlsx(filename="study_results.xlsx")',
- ' print("✓ Results exported to: study_results.xlsx")',
- ' ',
- ' # Export to MGF',
- ' study.export_mgf(filename="study_spectra.mgf")',
- ' print("✓ Spectra exported to: study_spectra.mgf")',
- ' else:',
- ' print("No data available for export")',
- '',
- '',
- '@app.cell',
- 'def __(mo):',
- ' mo.md(r"""',
- ' ## Custom Analysis',
- ' ',
- ' Add your own analysis code in the cells below.',
- ' """)',
- '',
- '',
- '@app.cell',
- 'def __(study):',
- ' # Access consensus features dataframe',
- ' if hasattr(study, "consensus_df"):',
- ' df = study.consensus_df',
- ' print(f"Consensus features shape: {df.shape}")',
- ' print("\\nFirst 5 features:")',
- ' print(df.head())',
- ' return (df,) if "df" in locals() else ()',
- '',
- '',
- '@app.cell',
- 'def __():',
- ' # Your custom analysis here',
- ' pass',
- '',
- '',
- 'if __name__ == "__main__":',
- ' app.run()',
- ]
-
- return '\n'.join(notebook_lines)
-
-
- def create_analysis(
- source: str,
- folder: str,
- filename: str = 'run_masster.py',
- polarity: str = "positive",
- adducts: Optional[List[str]] = None,
- params: Optional[wizard_def] = None,
- num_cores: int = 0,
- **kwargs
- ) -> bool:
- """
- Create standalone analysis scripts without initializing a Wizard instance.
-
- This function generates analysis scripts with the specified configuration.
-
- Parameters:
- source: Directory containing raw data files
- folder: Output directory for processed study
- filename: Filename for the generated script (deprecated, will create standard files)
- polarity: Ion polarity mode ("positive" or "negative")
- adducts: List of adduct specifications (auto-set if None)
- params: Custom wizard_def parameters (optional)
- num_cores: Number of CPU cores (0 = auto-detect)
- **kwargs: Additional parameters to override defaults
-
- Returns:
- True if scripts were generated successfully, False otherwise
-
- Example:
- >>> from masster.wizard import create_analysis
- >>> create_analysis(
- ... source=r'D:\\Data\\raw_files',
- ... folder=r'D:\\Data\\output',
- ... polarity='positive'
- ... )
- """
-
- try:
- # Create parameters
- if params is not None:
- # Use provided params as base
- wizard_params = params
- # Update with provided values
- wizard_params.source = source
- wizard_params.folder = folder
- if polarity != "positive": # Only override if explicitly different
- wizard_params.polarity = polarity
- if num_cores > 0:
- wizard_params.num_cores = num_cores
- if adducts is not None:
- wizard_params.adducts = adducts
- else:
- # Create new params with provided values
- wizard_params = wizard_def(
- source=source,
- folder=folder,
- polarity=polarity,
- num_cores=max(1, int(multiprocessing.cpu_count() * 0.75)) if num_cores <= 0 else num_cores
- )
-
- if adducts is not None:
- wizard_params.adducts = adducts
-
- # Apply any additional kwargs
- for key, value in kwargs.items():
- if hasattr(wizard_params, key):
- setattr(wizard_params, key, value)
-
- # Ensure study folder exists
- study_path = Path(folder)
- study_path.mkdir(parents=True, exist_ok=True)
-
- # Create a temporary Wizard instance to generate the scripts
- temp_wizard = Wizard(params=wizard_params)
-
- # Generate the scripts using the new method
- result = temp_wizard.create_scripts()
-
- if result["status"] == "success":
- print("Scripts created successfully!")
- for instruction in result["instructions"]:
- print(instruction)
-
- return result["status"] == "success"
-
- except Exception as e:
- print(f"Failed to create scripts: {e}")
- import traceback
- traceback.print_exc()
- return False
-

- def analyze(
- source: str,
- folder: str,
- filename: str = 'run_masster.py',
+ def create_scripts(
+ source: str = "",
+ folder: str = "",
  polarity: str = "positive",
  adducts: Optional[List[str]] = None,
- params: Optional[wizard_def] = None,
  num_cores: int = 0,
  **kwargs
- ) -> bool:
+ ) -> Dict[str, Any]:
  """
- Create and execute a standalone analysis script for automated MS data processing.
+ Create analysis scripts without explicitly instantiating a Wizard.

- This function generates a Python script with the same parameters as create_analysis(),
- but immediately executes it after creation. Combines script generation and execution
- in a single step.
+ This is a convenience function that creates a Wizard instance internally
+ and calls its create_scripts() method.

  Parameters:
  source: Directory containing raw data files
- folder: Output directory for processed study
- filename: Filename for the generated script (should end with .py)
+ folder: Output directory for processed study
  polarity: Ion polarity mode ("positive" or "negative")
  adducts: List of adduct specifications (auto-set if None)
- params: Custom wizard_def parameters (optional)
  num_cores: Number of CPU cores (0 = auto-detect)
- **kwargs: Additional parameters to override defaults
+ **kwargs: Additional parameters

  Returns:
- True if script was created and executed successfully, False otherwise
+ Dictionary containing:
+ - status: "success" or "error"
+ - message: Status message
+ - instructions: List of next steps
+ - files_created: List of created file paths
+ - source_info: Metadata about source files

  Example:
- >>> from masster.wizard import analyze
- >>> analyze(
+ >>> import masster.wizard
+ >>> result = masster.wizard.create_scripts(
  ... source=r'D:\\Data\\raw_files',
  ... folder=r'D:\\Data\\output',
- ... polarity='positive'
+ ... polarity='negative'
  ... )
+ >>> print("Status:", result["status"])
  """

  try:
- # Create parameters (same logic as create_analysis)
- if params is not None:
- wizard_params = params
- wizard_params.source = source
- wizard_params.folder = folder
- if polarity != "positive":
- wizard_params.polarity = polarity
- if num_cores > 0:
- wizard_params.num_cores = num_cores
- if adducts is not None:
- wizard_params.adducts = adducts
- else:
- wizard_params = wizard_def(
- source=source,
- folder=folder,
- polarity=polarity,
- num_cores=max(1, int(multiprocessing.cpu_count() * 0.75)) if num_cores <= 0 else num_cores
- )
-
- if adducts is not None:
- wizard_params.adducts = adducts
-
- # Apply any additional kwargs
- for key, value in kwargs.items():
- if hasattr(wizard_params, key):
- setattr(wizard_params, key, value)
+ # Auto-detect optimal number of cores if not specified
+ if num_cores <= 0:
+ num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))

- # Create Wizard instance and run analysis
- wizard = Wizard(params=wizard_params)
- result = wizard.analyze()
+ # Create Wizard instance
+ wizard = Wizard(
+ source=source,
+ folder=folder,
+ polarity=polarity,
+ adducts=adducts,
+ num_cores=num_cores,
+ **kwargs
+ )

- # Return success status
- return result.get("status") in ["success", "scripts_created"]
+ # Call the instance method
+ return wizard.create_scripts()

  except Exception as e:
- print(f"Failed to execute script: {e}")
- import traceback
- traceback.print_exc()
- return False
-
-
- def _generate_interactive_notebook_content(self, source_info: Dict[str, Any]) -> str:
- """Generate the content for 2_interactive_analysis.py marimo notebook."""
-
- notebook_lines = [
- 'import marimo',
- '',
- '__generated_with = "0.9.14"',
- 'app = marimo.App(width="medium")',
- '',
- '',
- '@app.cell',
- 'def __():',
- ' import marimo as mo',
- ' return (mo,)',
- '',
- '',
- '@app.cell',
- 'def __(mo):',
- ' mo.md(r"""',
- ' # MASSter Interactive Analysis',
- ' ',
- f' **Source:** {source_info.get("number_of_files", 0)} files ({", ".join(source_info.get("file_types", []))}) detected',
- f' **Polarity:** {source_info.get("polarity", "unknown")} (auto-detected)',
- f' **Acquisition length:** ~{source_info.get("length_minutes", 0.0):.1f} minutes per file',
- ' ',
- ' This notebook provides interactive exploration of your processed mass spectrometry study.',
- ' Make sure you have run `python 1_masster_workflow.py` first to create the sample5 files.',
- ' """)',
- '',
- '',
- '@app.cell',
- 'def __():',
- ' # Import masster',
- ' import masster',
- ' return (masster,)',
- '',
- '',
- '@app.cell',
- 'def __(masster):',
- ' # Load the study from sample5 files',
- ' study = masster.Study(folder=".")',
- ' return (study,)',
- '',
- '',
- '@app.cell',
- 'def __(mo, study):',
- ' # Display study information',
- ' study.info()',
- ' return ()',
- '',
- '',
- 'if __name__ == "__main__":',
- ' app.run()',
- ]
-
- return '\n'.join(notebook_lines)
-
- def _generate_instructions(self, source_info: Dict[str, Any], files_created: List[str]) -> List[str]:
- """Generate usage instructions for the created scripts."""
- instructions = [
- "🎯 NEXT STEPS:",
- "",
- f"Source analysis completed: {source_info.get('number_of_files', 0)} files found",
- f"Polarity detected: {source_info.get('polarity', 'unknown')}",
- f"Estimated processing time: {source_info.get('number_of_files', 0) * source_info.get('length_minutes', 0.0) * 0.1:.1f} minutes",
- "",
- "1. REVIEW PARAMETERS:",
- " Edit 1_masster_workflow.py and verify these key settings:",
- " - NOISE threshold (adjust based on your instrument sensitivity)",
- " - CHROM_FWHM (adjust based on your chromatography peak width)",
- "",
- "2. EXECUTE SAMPLE PROCESSING:",
- " python 1_masster_workflow.py",
- " (This will process all raw files to sample5 format)",
- "",
- "3. INTERACTIVE ANALYSIS:",
- " uv run marimo edit 2_interactive_analysis.py",
- " (This opens an interactive notebook for data exploration)",
- "",
- "FILES CREATED:"
- ]
-
- for file_path in files_created:
- instructions.append(f" ✅ {Path(file_path).name}")
-
- return instructions
+ return {
+ "status": "error",
+ "message": f"Failed to create scripts: {e}",
+ "instructions": [],
+ "files_created": [],
+ "source_info": {}
+ }


  # Export the main classes and functions
- __all__ = ["Wizard", "wizard_def", "create_analysis", "analyze"]
+ __all__ = ["Wizard", "wizard_def", "create_scripts"]
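
For non-interactive use, the module-level create_analysis()/analyze() helpers are likewise replaced by a single create_scripts() function, and the generated 1_masster_workflow.py reads two environment variables to enable test mode. A sketch of driving both from your own code, under the assumption that create_scripts is importable from masster.wizard as the new __all__ suggests (the Windows paths are placeholders copied from the docstring example):

    import os
    import subprocess
    import sys

    from masster.wizard import create_scripts  # assumed re-export per __all__

    # Generate the workflow script and notebook without keeping a Wizard around.
    result = create_scripts(
        source=r"D:\Data\raw_files",   # placeholder: your raw data directory
        folder=r"D:\Data\output",      # placeholder: your output directory
        polarity="negative",
        num_cores=0,                   # 0 = auto-detect (75% of available cores)
    )
    print("Status:", result["status"])

    # The generated script honors MASSTER_TEST_MODE and MASSTER_TEST_ONLY,
    # so a single-file dry run can also be launched directly:
    env = os.environ.copy()
    env["MASSTER_TEST_MODE"] = "1"  # process only the first raw file
    env["MASSTER_TEST_ONLY"] = "1"  # stop after the test instead of the full batch
    subprocess.run([sys.executable, "1_masster_workflow.py"],
                   cwd=r"D:\Data\output", env=env, check=True)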