masster 0.5.13__py3-none-any.whl → 0.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

masster/wizard/wizard.py CHANGED
@@ -228,9 +228,9 @@ class Wizard:
  that process raw MS data through the complete pipeline: file discovery, feature
  detection, sample processing, study assembly, alignment, merging, and export.

- This simplified version focuses on two core functions:
- - create_script(): Generate standalone analysis scripts
- - execute(): Create and run analysis scripts
+ This simplified version focuses on three core functions:
+ - create_scripts(): Generate workflow and interactive analysis scripts
+ - analyze(): Create and run analysis scripts with interactive notebook
  """

  def __init__(
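
The renamed entry points above are used roughly as follows; a minimal sketch, assuming the wizard_def defaults shown later in this diff and placeholder paths:

    # Hypothetical usage of the renamed Wizard API (paths are placeholders)
    from masster.wizard import Wizard, wizard_def

    wizard = Wizard(params=wizard_def(source=r"D:\Data\raw_files",
                                      folder=r"D:\Data\output"))

    # Writes 1_masster_workflow.py and 2_interactive_analysis.py; returns a status dict
    result = wizard.create_scripts()
    print(result["status"], result["files_created"])

    # Runs the workflow script, creating the scripts first (with a prompt) if needed
    wizard.analyze()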
@@ -290,6 +290,14 @@ class Wizard:
  self.folder_path = Path(self.params.folder)
  self.folder_path.mkdir(parents=True, exist_ok=True)

+ # Initialize logger
+ self.logger = MassterLogger(
+ instance_type="wizard",
+ level="INFO",
+ label="Wizard",
+ sink=None
+ )
+
  # Auto-infer polarity from the first file if not explicitly set by user
  if polarity == "positive" and "polarity" not in kwargs:
  inferred_polarity = self._infer_polarity_from_first_file()
@@ -298,6 +306,164 @@ class Wizard:
  # Update adducts based on inferred polarity
  self.params.__post_init__()

+ def _analyze_source_files(self) -> Dict[str, Any]:
+ """
+ Analyze source files to extract metadata: number of files, file type, polarity, and acquisition length.
+
+ Returns:
+ Dictionary containing:
+ - number_of_files: Total count of data files found
+ - file_types: List of file extensions found
+ - polarity: Detected polarity ("positive" or "negative")
+ - length_minutes: Acquisition length in minutes
+ - first_file: Path to first file analyzed
+ """
+ result = {
+ 'number_of_files': 0,
+ 'file_types': [],
+ 'polarity': 'positive',
+ 'length_minutes': 0.0,
+ 'first_file': None
+ }
+
+ try:
+ # Find all data files
+ all_files = []
+ file_types_found = set()
+
+ for extension in self.params.file_extensions:
+ if self.params.search_subfolders:
+ pattern = f"**/*{extension}"
+ files = list(self.source_path.rglob(pattern))
+ else:
+ pattern = f"*{extension}"
+ files = list(self.source_path.glob(pattern))
+
+ if files:
+ all_files.extend(files)
+ file_types_found.add(extension)
+
+ result['number_of_files'] = len(all_files)
+ result['file_types'] = list(file_types_found)
+
+ if not all_files:
+ return result
+
+ # Analyze first file for polarity and acquisition length
+ first_file = all_files[0]
+ result['first_file'] = str(first_file)
+
+ # Extract metadata based on file type
+ if first_file.suffix.lower() == '.wiff':
+ metadata = self._analyze_wiff_file(first_file)
+ elif first_file.suffix.lower() == '.mzml':
+ metadata = self._analyze_mzml_file(first_file)
+ elif first_file.suffix.lower() == '.raw':
+ metadata = self._analyze_raw_file(first_file)
+ else:
+ metadata = {'polarity': 'positive', 'length_minutes': 0.0}
+
+ result['polarity'] = metadata.get('polarity', 'positive')
+ result['length_minutes'] = metadata.get('length_minutes', 0.0)
+
+ except Exception as e:
+ self.logger.warning(f"Failed to analyze source files: {e}")
+
+ return result
+
+ def _analyze_wiff_file(self, file_path: Path) -> Dict[str, Any]:
+ """Analyze WIFF file to extract polarity and acquisition length."""
+ try:
+ from masster.sample.load import _wiff_to_dict
+
+ # Extract metadata from WIFF file
+ metadata_df = _wiff_to_dict(str(file_path))
+
+ result = {'polarity': 'positive', 'length_minutes': 0.0}
+
+ if not metadata_df.empty:
+ # Get polarity from first experiment
+ if 'polarity' in metadata_df.columns:
+ first_polarity = metadata_df['polarity'].iloc[0]
+
+ # Convert numeric polarity codes to string
+ if first_polarity == 1 or str(first_polarity).lower() in ['positive', 'pos', '+']:
+ result['polarity'] = "positive"
+ elif first_polarity == -1 or str(first_polarity).lower() in ['negative', 'neg', '-']:
+ result['polarity'] = "negative"
+
+ # Estimate acquisition length by loading the file briefly
+ # For a rough estimate, we'll load just the scan info
+ from masster.sample import Sample
+ sample = Sample()
+ sample.logger_update(level="ERROR") # Suppress logs
+ sample.load(str(file_path))
+
+ if hasattr(sample, 'scans_df') and sample.scans_df is not None:
+ if not sample.scans_df.is_empty():
+ rt_values = sample.scans_df.select('rt').to_numpy().flatten()
+ if len(rt_values) > 0:
+ # RT is in seconds, convert to minutes
+ result['length_minutes'] = float(rt_values.max()) / 60.0
+
+ return result
+
+ except Exception as e:
+ self.logger.debug(f"Failed to analyze WIFF file {file_path}: {e}")
+ return {'polarity': 'positive', 'length_minutes': 0.0}
+
+ def _analyze_mzml_file(self, file_path: Path) -> Dict[str, Any]:
+ """Analyze mzML file to extract polarity and acquisition length."""
+ try:
+ from masster.sample import Sample
+
+ sample = Sample()
+ sample.logger_update(level="ERROR") # Suppress logs
+ sample.load(str(file_path))
+
+ result = {'polarity': 'positive', 'length_minutes': 0.0}
+
+ if hasattr(sample, 'scans_df') and sample.scans_df is not None:
+ if not sample.scans_df.is_empty():
+ rt_values = sample.scans_df.select('rt').to_numpy().flatten()
+ if len(rt_values) > 0:
+ # RT is in seconds, convert to minutes
+ result['length_minutes'] = float(rt_values.max()) / 60.0
+
+ # For mzML, polarity detection would require more detailed parsing
+ # For now, use default
+ return result
+
+ except Exception as e:
+ self.logger.debug(f"Failed to analyze mzML file {file_path}: {e}")
+ return {'polarity': 'positive', 'length_minutes': 0.0}
+
+ def _analyze_raw_file(self, file_path: Path) -> Dict[str, Any]:
+ """Analyze RAW file to extract polarity and acquisition length."""
+ try:
+ from masster.sample import Sample
+
+ sample = Sample()
+ sample.logger_update(level="ERROR") # Suppress logs
+ sample.load(str(file_path))
+
+ result = {'polarity': 'positive', 'length_minutes': 0.0}
+
+ if hasattr(sample, 'scans_df') and sample.scans_df is not None:
+ if not sample.scans_df.is_empty():
+ rt_values = sample.scans_df.select('rt').to_numpy().flatten()
+ if len(rt_values) > 0:
+ # RT is in seconds, convert to minutes
+ result['length_minutes'] = float(rt_values.max()) / 60.0
+
+ # For RAW files, polarity detection would require more detailed parsing
+ # For now, use default
+ return result
+
+ except Exception as e:
+ self.logger.debug(f"Failed to analyze RAW file {file_path}: {e}")
+ return {'polarity': 'positive', 'length_minutes': 0.0}
+
  def _infer_polarity_from_first_file(self) -> str:
  """
  Infer polarity from the first available raw data file.
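
Per its docstring, the dictionary returned by the new _analyze_source_files() helper has this shape; a sketch with made-up values:

    # Illustrative _analyze_source_files() result; all values are invented
    {
        'number_of_files': 24,                       # data files found under the source folder
        'file_types': ['.wiff'],                     # extensions actually present
        'polarity': 'negative',                      # read from the first file; defaults to 'positive'
        'length_minutes': 12.5,                      # max scan RT of the first file, seconds / 60
        'first_file': 'D:/Data/raw_files/s01.wiff',  # hypothetical path
    }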
@@ -349,69 +515,498 @@ class Wizard:
  """Get the adduct specifications."""
  return self.params.adducts

- def create_script(self, filename: str = "run_masster.py") -> bool:
+ def create_scripts(self) -> Dict[str, Any]:
  """
- Generate a standalone Python script for the analysis pipeline.
+ Generate analysis scripts based on source file analysis.
+
+ This method:
+ 1. Analyzes the source files to extract metadata
+ 2. Creates 1_masster_workflow.py with sample processing logic
+ 3. Creates 2_interactive_analysis.py marimo notebook for study exploration
+ 4. Returns instructions for next steps

- Parameters:
- filename: Name for the generated script file
-
  Returns:
- True if script was generated successfully, False otherwise
+ Dictionary containing:
+ - status: "success" or "error"
+ - message: Status message
+ - instructions: List of next steps
+ - files_created: List of created file paths
+ - source_info: Metadata about source files
  """
  try:
- script_path = self.folder_path / filename
- script_content = self._generate_script_content()
+ # Step 1: Analyze source files to extract metadata
+ source_info = self._analyze_source_files()
+
+ # Update wizard parameters based on detected metadata
+ if source_info['polarity'] != 'positive': # Only update if different from default
+ self.params.polarity = source_info['polarity']
+ # Update adducts based on detected polarity
+ self.params.__post_init__()
+
+ files_created = []
+
+ # Step 2: Create 1_masster_workflow.py
+ workflow_script_path = self.folder_path / "1_masster_workflow.py"
+ workflow_content = self._generate_workflow_script_content(source_info)
+
+ with open(workflow_script_path, 'w', encoding='utf-8') as f:
+ f.write(workflow_content)
+ files_created.append(str(workflow_script_path))

- with open(script_path, 'w', encoding='utf-8') as f:
- f.write(script_content)
+ # Step 3: Create 2_interactive_analysis.py marimo notebook
+ notebook_path = self.folder_path / "2_interactive_analysis.py"
+ notebook_content = self._generate_interactive_notebook_content(source_info)

- print(f"Analysis script created: {script_path}")
- return True
+ with open(notebook_path, 'w', encoding='utf-8') as f:
+ f.write(notebook_content)
+ files_created.append(str(notebook_path))
+
+ # Step 4: Generate instructions
+ instructions = self._generate_instructions(source_info, files_created)
+
+ return {
+ "status": "success",
+ "message": f"Successfully created {len(files_created)} script files",
+ "instructions": instructions,
+ "files_created": files_created,
+ "source_info": source_info
+ }

  except Exception as e:
- print(f"Failed to create script: {e}")
- return False
+ self.logger.error(f"Failed to create scripts: {e}")
+ return {
+ "status": "error",
+ "message": f"Failed to create scripts: {e}",
+ "instructions": [],
+ "files_created": [],
+ "source_info": {}
+ }
+
+ def _generate_workflow_script_content(self, source_info: Dict[str, Any]) -> str:
+ """Generate the content for 1_masster_workflow.py script."""
+
+ # Convert Path objects to strings for JSON serialization
+ params_dict = {}
+ for key, value in self.params.__dict__.items():
+ if key == '_param_metadata': # Skip metadata in generated script
+ continue
+ if isinstance(value, Path):
+ params_dict[key] = str(value)
+ else:
+ params_dict[key] = value

- def execute(self, filename: str = "run_masster.py") -> bool:
+ # Create readable PARAMS dict with comments including discovered info
+ params_lines = []
+ params_lines.append('# Analysis parameters (auto-detected from source files)')
+ params_lines.append('PARAMS = {')
+
+ # File Discovery Summary
+ params_lines.append(' # === Source File Analysis ===')
+ params_lines.append(f' "number_of_files": {source_info.get("number_of_files", 0)}, # Total raw data files found')
+ params_lines.append(f' "file_types": {source_info.get("file_types", [])!r}, # Detected file extensions')
+ params_lines.append(f' "length_minutes": {source_info.get("length_minutes", 0.0):.1f}, # Estimated acquisition length per file (minutes)')
+ if source_info.get('first_file'):
+ params_lines.append(f' "first_file": {source_info["first_file"]!r}, # First file analyzed for metadata')
+ params_lines.append('')
+
+ # Core Configuration
+ params_lines.append(' # === Core Configuration ===')
+ params_lines.append(f' "source": {params_dict.get("source", "")!r}, # Directory containing raw data files')
+ params_lines.append(f' "folder": {params_dict.get("folder", "")!r}, # Output directory for processed study')
+ params_lines.append(f' "polarity": {params_dict.get("polarity", "positive")!r}, # Ion polarity mode (auto-detected)')
+ params_lines.append(f' "num_cores": {params_dict.get("num_cores", 4)}, # Number of CPU cores for parallel processing')
+ params_lines.append('')
+
+ # File Discovery
+ params_lines.append(' # === File Discovery ===')
+ params_lines.append(f' "file_extensions": {params_dict.get("file_extensions", [".wiff", ".raw", ".mzML"])!r}, # File extensions to search for')
+ params_lines.append(f' "search_subfolders": {params_dict.get("search_subfolders", True)}, # Whether to search subdirectories recursively')
+ params_lines.append(f' "skip_patterns": {params_dict.get("skip_patterns", ["blank", "condition"])!r}, # Filename patterns to skip')
+ params_lines.append('')
+
+ # Processing Parameters - Critical values to review
+ params_lines.append(' # === Processing Parameters (REVIEW THESE VALUES) ===')
+ params_lines.append(f' "adducts": {params_dict.get("adducts", [])!r}, # Adduct specifications for feature detection and annotation')
+ params_lines.append(f' "detector_type": {params_dict.get("detector_type", "unknown")!r}, # MS detector type ("orbitrap", "tof", "unknown")')
+ params_lines.append(f' "noise": {params_dict.get("noise", 50.0)}, # REVIEW: Noise threshold for feature detection. Set to 1e5 for Orbitraps')
+ params_lines.append(f' "chrom_fwhm": {params_dict.get("chrom_fwhm", 0.5)}, # REVIEW: Chromatographic peak FWHM (seconds)')
+ params_lines.append(f' "chrom_peak_snr": {params_dict.get("chrom_peak_snr", 5.0)}, # Minimum signal-to-noise ratio for chromatographic peaks')
+ params_lines.append('')
+
+ # Other parameters...
+ params_lines.append(' # === Alignment & Merging ===')
+ params_lines.append(f' "rt_tol": {params_dict.get("rt_tol", 5.0)}, # Retention time tolerance for alignment (seconds)')
+ params_lines.append(f' "mz_tol": {params_dict.get("mz_tol", 0.01)}, # Mass-to-charge ratio tolerance for alignment (Da)')
+ params_lines.append(f' "alignment_method": {params_dict.get("alignment_method", "kd")!r}, # Algorithm for sample alignment')
+ params_lines.append(f' "min_samples_per_feature": {params_dict.get("min_samples_per_feature", 1)}, # Minimum samples required per consensus feature')
+ params_lines.append(f' "merge_method": {params_dict.get("merge_method", "qt")!r}, # Method for merging consensus features')
+ params_lines.append('')
+
+ # Other params
+ params_lines.append(' # === Sample Processing ===')
+ params_lines.append(f' "batch_size": {params_dict.get("batch_size", 8)}, # Number of files to process per batch')
+ params_lines.append(f' "memory_limit_gb": {params_dict.get("memory_limit_gb", 16.0)}, # Memory limit for processing (GB)')
+ params_lines.append('')
+
+ params_lines.append(' # === Script Options ===')
+ params_lines.append(f' "resume_enabled": {params_dict.get("resume_enabled", True)}, # Enable automatic resume capability')
+ params_lines.append(f' "force_reprocess": {params_dict.get("force_reprocess", False)}, # Force reprocessing of existing files')
+ params_lines.append(f' "cleanup_temp_files": {params_dict.get("cleanup_temp_files", True)}, # Clean up temporary files after processing')
+
+ params_lines.append('}')
+
+ # Create script lines
+ script_lines = [
+ '#!/usr/bin/env python3',
+ '"""',
+ 'MASSter Workflow Script - Sample Processing',
+ f'Generated by masster wizard v{version}',
+ '',
+ 'Source Analysis:',
+ f' - Files found: {source_info.get("number_of_files", 0)}',
+ f' - File types: {", ".join(source_info.get("file_types", []))}',
+ f' - Polarity detected: {source_info.get("polarity", "unknown")}',
+ f' - Acquisition length: ~{source_info.get("length_minutes", 0.0):.1f} minutes per file',
+ '',
+ 'This script processes raw MS data files into sample5 format.',
+ 'Review the NOISE and CHROM_FWHM parameters below before running.',
+ '"""',
+ '',
+ 'import sys',
+ 'import time',
+ 'from pathlib import Path',
+ 'import concurrent.futures',
+ 'import os',
+ '',
+ '# Import masster modules',
+ 'from masster.sample import Sample',
+ 'from masster import __version__',
+ '',
+ ]
+
+ # Add the formatted PARAMS
+ script_lines.extend(params_lines)
+
+ # Add the functions
+ script_lines.extend([
+ '',
+ '',
+ 'def discover_raw_files(source_folder, file_extensions, search_subfolders=True, skip_patterns=None):',
+ ' """Discover raw data files in the source folder."""',
+ ' source_path = Path(source_folder)',
+ ' raw_files = []',
+ ' skip_patterns = skip_patterns or []',
+ ' ',
+ ' for ext in file_extensions:',
+ ' if search_subfolders:',
+ ' pattern = f"**/*{ext}"',
+ ' files = list(source_path.rglob(pattern))',
+ ' else:',
+ ' pattern = f"*{ext}"',
+ ' files = list(source_path.glob(pattern))',
+ ' ',
+ ' # Filter out files matching skip patterns',
+ ' for file in files:',
+ ' skip_file = False',
+ ' for skip_pattern in skip_patterns:',
+ ' if skip_pattern.lower() in file.name.lower():',
+ ' skip_file = True',
+ ' break',
+ ' if not skip_file:',
+ ' raw_files.append(file)',
+ ' ',
+ ' return raw_files',
+ '',
+ '',
+ 'def process_single_file(args):',
+ ' """Process a single raw file to sample5 format - designed for multiprocessing."""',
+ ' raw_file, output_folder, params = args',
+ ' ',
+ ' try:',
+ ' # Create sample5 filename',
+ ' sample_name = raw_file.stem',
+ ' sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
+ ' ',
+ ' # Skip if sample5 already exists and resume is enabled',
+ ' if sample5_path.exists() and params["resume_enabled"]:',
+ ' print(f" ✓ Skipping {raw_file.name} (sample5 already exists)")',
+ ' return {"status": "skipped", "file": str(sample5_path), "message": "Already exists"}',
+ ' ',
+ ' print(f" 🔄 Processing {raw_file.name}...")',
+ ' start_time = time.time()',
+ ' ',
+ ' # Load and process raw file with full pipeline',
+ ' sample = Sample(log_label=sample_name)',
+ ' sample.load(filename=str(raw_file))',
+ ' sample.find_features(',
+ ' noise=params["noise"],',
+ ' chrom_fwhm=params["chrom_fwhm"],',
+ ' chrom_peak_snr=params["chrom_peak_snr"]',
+ ' )',
+ ' # sample.find_adducts(adducts=params["adducts"])',
+ ' sample.find_ms2()',
+ ' # sample.find_iso() # Optional - can be uncommented if needed',
+ ' sample.save(str(sample5_path))',
+ ' ',
+ ' elapsed = time.time() - start_time',
+ ' print(f" ✅ Completed {raw_file.name} -> {sample5_path.name} ({elapsed:.1f}s)")',
+ ' ',
+ ' return {"status": "success", "file": str(sample5_path), "elapsed": elapsed}',
+ ' ',
+ ' except Exception as e:',
+ ' print(f" ❌ ERROR processing {raw_file.name}: {e}")',
+ ' return {"status": "error", "file": str(raw_file), "error": str(e)}',
+ '',
+ '',
+ 'def convert_raw_to_sample5_parallel(raw_files, output_folder, params):',
+ ' """Convert raw data files to sample5 format with parallel processing and progress tracking."""',
+ ' import concurrent.futures',
+ ' import os',
+ ' ',
+ ' # Create output directory',
+ ' os.makedirs(output_folder, exist_ok=True)',
+ ' ',
+ ' print(f"\\n🚀 Processing {len(raw_files)} files using {params[\'num_cores\']} CPU cores...")',
+ ' print("=" * 70)',
+ ' ',
+ ' # Prepare arguments for multiprocessing',
+ ' file_args = [(raw_file, output_folder, params) for raw_file in raw_files]',
+ ' ',
+ ' # Process files in parallel with progress tracking',
+ ' results = []',
+ ' successful = 0',
+ ' skipped = 0',
+ ' failed = 0',
+ ' total_elapsed = 0',
+ ' ',
+ ' with concurrent.futures.ProcessPoolExecutor(max_workers=params["num_cores"]) as executor:',
+ ' # Submit all jobs',
+ ' future_to_file = {executor.submit(process_single_file, args): args[0] for args in file_args}',
+ ' ',
+ ' # Collect results as they complete',
+ ' for i, future in enumerate(concurrent.futures.as_completed(future_to_file), 1):',
+ ' result = future.result()',
+ ' results.append(result)',
+ ' ',
+ ' if result["status"] == "success":',
+ ' successful += 1',
+ ' total_elapsed += result.get("elapsed", 0)',
+ ' elif result["status"] == "skipped":',
+ ' skipped += 1',
+ ' else:',
+ ' failed += 1',
+ ' ',
+ ' # Progress update',
+ ' print(f"\\r Progress: {i}/{len(raw_files)} files completed ({successful} success, {skipped} skipped, {failed} failed)", end="", flush=True)',
+ ' ',
+ ' print() # New line after progress',
+ ' print("=" * 70)',
+ ' ',
+ ' # Summary',
+ ' if successful > 0:',
+ ' avg_time = total_elapsed / successful',
+ ' print(f"✅ Successfully processed {successful} files (avg: {avg_time:.1f}s per file)")',
+ ' if skipped > 0:',
+ ' print(f"⏩ Skipped {skipped} files (already exist)")',
+ ' if failed > 0:',
+ ' print(f"❌ Failed to process {failed} files")',
+ ' for result in results:',
+ ' if result["status"] == "error":',
+ ' print(f" - {Path(result[\'file\']).name}: {result[\'error\']}")',
+ ' ',
+ ' # Return list of successful sample5 files',
+ ' sample5_files = [result["file"] for result in results if result["status"] in ["success", "skipped"]]',
+ ' return sample5_files',
+ '',
+ '',
+ 'def main():',
+ ' """Main sample processing workflow."""',
+ ' try:',
+ ' print("=" * 70)',
+ f' print("MASSter {version} - Sample Processing Workflow")',
+ ' print("=" * 70)',
+ ' print(f"Source: {PARAMS[\'source\']}")',
+ ' print(f"Output: {PARAMS[\'folder\']}")',
+ ' print(f"Polarity: {PARAMS[\'polarity\']} (detected)")',
+ ' print(f"CPU Cores: {PARAMS[\'num_cores\']}")',
+ ' print("=" * 70)',
+ ' print("\\n⚙️ IMPORTANT: Review these parameters before processing:")',
+ ' print(f" NOISE threshold: {PARAMS[\'noise\']} (adjust based on your instrument)")',
+ ' print(f" CHROM_FWHM: {PARAMS[\'chrom_fwhm\']}s (adjust based on your chromatography)")',
+ ' print(" You can edit these values in the PARAMS section above.")',
+ ' print("=" * 70)',
+ ' ',
+ ' start_time = time.time()',
+ ' ',
+ ' # Step 1: Discover raw data files',
+ ' print("\\n📁 Step 1/2: Discovering raw data files...")',
+ ' raw_files = discover_raw_files(',
+ ' PARAMS[\'source\'],',
+ ' PARAMS[\'file_extensions\'],',
+ ' PARAMS[\'search_subfolders\'],',
+ ' PARAMS[\'skip_patterns\']',
+ ' )',
+ ' ',
+ ' if not raw_files:',
+ ' print("❌ No raw data files found!")',
+ ' return False',
+ ' ',
+ ' print(f"Found {len(raw_files)} raw data files")',
+ ' for i, f in enumerate(raw_files[:5]): # Show first 5 files',
+ ' print(f" {i+1}. {f.name}")',
+ ' if len(raw_files) > 5:',
+ ' print(f" ... and {len(raw_files) - 5} more files")',
+ ' ',
+ ' # Step 2: Process raw files to sample5',
+ ' print("\\n🔄 Step 2/2: Processing raw files to sample5 format...")',
+ ' sample5_files = convert_raw_to_sample5_parallel(',
+ ' raw_files,',
+ ' PARAMS[\'folder\'],',
+ ' PARAMS',
+ ' )',
+ ' ',
+ ' if not sample5_files:',
+ ' print("❌ No sample5 files were created!")',
+ ' return False',
+ ' ',
+ ' # Summary',
+ ' total_time = time.time() - start_time',
+ ' print("\\n" + "=" * 70)',
+ ' print("🎉 SAMPLE PROCESSING COMPLETE")',
+ ' print("=" * 70)',
+ ' print(f"Processing time: {total_time/60:.1f} minutes")',
+ ' print(f"Raw files found: {len(raw_files)}")',
+ ' print(f"Sample5 files created: {len(sample5_files)}")',
+ ' print("\\nNext steps:")',
+ ' print("1. Run the interactive analysis: uv run marimo edit 2_interactive_analysis.py")',
+ ' print("2. Or use the sample5 files in your own analysis scripts")',
+ ' print("=" * 70)',
+ ' ',
+ ' return True',
+ ' ',
+ ' except KeyboardInterrupt:',
+ ' print("\\n❌ Processing interrupted by user")',
+ ' return False',
+ ' except Exception as e:',
+ ' print(f"❌ Processing failed with error: {e}")',
+ ' import traceback',
+ ' traceback.print_exc()',
+ ' return False',
+ '',
+ '',
+ 'if __name__ == "__main__":',
+ ' success = main()',
+ ' sys.exit(0 if success else 1)',
+ ])
+
+ return '\n'.join(script_lines)
+
+ def analyze(self) -> Dict[str, Any]:
  """
- Create and execute a standalone analysis script.
+ Execute the complete analysis workflow.
+
+ This method:
+ 1. Checks if 1_masster_workflow.py exists and runs it
+ 2. If not, creates scripts first then runs the workflow
+ 3. Provides clear feedback about next steps

- Parameters:
- filename: Name for the generated script file
-
  Returns:
- True if execution completed successfully, False otherwise
+ Dictionary containing execution results and instructions
  """
- # First create the script
- if not self.create_script(filename):
- return False
-
- # Then execute it
- script_path = self.folder_path / filename
+ workflow_script = self.folder_path / "1_masster_workflow.py"

  try:
- print("Executing...")
+ # Check if workflow script exists
+ if workflow_script.exists():
+ print("📋 Found existing workflow script, executing...")
+ return self._execute_workflow_script(workflow_script)
+ else:
+ print("📝 Creating analysis scripts...")
+ # Create scripts first
+ result = self.create_scripts()
+
+ if result["status"] != "success":
+ return result
+
+ # Print instructions
+ print("\n" + "="*70)
+ for instruction in result["instructions"]:
+ print(instruction)
+ print("="*70)
+
+ # Ask user if they want to proceed with execution
+ print("\n🤔 Would you like to proceed with sample processing now?")
+ print(" This will execute 1_masster_workflow.py")
+ response = input(" Proceed? [y/N]: ").strip().lower()
+
+ if response in ['y', 'yes']:
+ return self._execute_workflow_script(workflow_script)
+ else:
+ print("✋ Processing paused. Run the scripts manually when ready.")
+ return {
+ "status": "scripts_created",
+ "message": "Scripts created successfully, execution deferred",
+ "instructions": result["instructions"],
+ "files_created": result["files_created"]
+ }
+
+ except Exception as e:
+ self.logger.error(f"Analysis failed: {e}")
+ return {
+ "status": "error",
+ "message": f"Analysis failed: {e}",
+ "instructions": [],
+ "files_created": []
+ }
+
+ def _execute_workflow_script(self, script_path: Path) -> Dict[str, Any]:
+ """Execute the workflow script and return results."""
+ try:
+ print(f"🚀 Executing {script_path.name}...")

  import subprocess
  result = subprocess.run([
  sys.executable, str(script_path)
- ], cwd=str(self.folder_path), encoding='utf-8', errors='replace')
+ ], cwd=str(self.folder_path), capture_output=False, text=True)

  success = result.returncode == 0

  if success:
- print("=" * 70)
- print("Script execution completed successfully")
+ print("="*70)
+ print(" Workflow execution completed successfully!")
+ print("="*70)
+ print("Next step: Run interactive analysis")
+ print(" uv run marimo edit 2_interactive_analysis.py")
+ print("="*70)
+
+ return {
+ "status": "success",
+ "message": "Workflow completed successfully",
+ "instructions": [
+ "✅ Sample processing completed",
+ "Next: uv run marimo edit 2_interactive_analysis.py"
+ ],
+ "files_created": []
+ }
  else:
- print("=" * 70)
- print(f"Script execution failed with return code: {result.returncode}")
+ return {
+ "status": "error",
+ "message": f"Workflow execution failed with code {result.returncode}",
+ "instructions": [
+ "❌ Check the error messages above",
+ "Review parameters in 1_masster_workflow.py",
+ "Try running: python 1_masster_workflow.py"
+ ],
+ "files_created": []
+ }

- return success
-
  except Exception as e:
- print(f"Error during script execution: {e}")
- return False
+ return {
+ "status": "error",
+ "message": f"Failed to execute workflow: {e}",
+ "instructions": [],
+ "files_created": []
+ }

  def _generate_script_content(self) -> str:
  """Generate the complete analysis script content."""
@@ -718,8 +1313,154 @@ class Wizard:

  return '\n'.join(script_lines)

+ def _generate_notebook_content(self) -> str:
+ """Generate the content for a marimo interactive notebook."""
+
+ notebook_lines = [
+ 'import marimo',
+ '',
+ '__generated_with = "0.9.14"',
+ 'app = marimo.App(width="medium")',
+ '',
+ '',
+ '@app.cell',
+ 'def __():',
+ ' import marimo as mo',
+ ' return (mo,)',
+ '',
+ '',
+ '@app.cell',
+ 'def __(mo):',
+ ' mo.md(r"""',
+ ' # MASSter Interactive Analysis',
+ ' ',
+ ' This notebook provides interactive exploration of your mass spectrometry study results.',
+ ' The study has been processed and is ready for analysis.',
+ ' """)',
+ '',
+ '',
+ '@app.cell',
+ 'def __():',
+ ' # Import masster',
+ ' import masster',
+ ' return (masster,)',
+ '',
+ '',
+ '@app.cell',
+ 'def __(masster):',
+ ' # Load the processed study',
+ ' study = masster.Study(folder=".")',
+ ' study.load()',
+ ' return (study,)',
+ '',
+ '',
+ '@app.cell',
+ 'def __(mo, study):',
+ ' # Display study information',
+ ' mo.md(f"""',
+ ' ## Study Overview',
+ ' ',
+ ' **Samples:** {len(study.samples) if hasattr(study, "samples") else "Not loaded"}',
+ ' ',
+ ' **Features:** {len(study.consensus_df) if hasattr(study, "consensus_df") else "Not available"}',
+ ' ',
+ ' **Polarity:** {study.polarity if hasattr(study, "polarity") else "Unknown"}',
+ ' """)',
+ '',
+ '',
+ '@app.cell',
+ 'def __(study):',
+ ' # Print detailed study info',
+ ' study.info()',
+ '',
+ '',
+ '@app.cell',
+ 'def __(mo):',
+ ' mo.md(r"""',
+ ' ## Quick Visualizations',
+ ' ',
+ ' Use the cells below to create interactive plots of your data.',
+ ' """)',
+ '',
+ '',
+ '@app.cell',
+ 'def __(study):',
+ ' # Generate consensus 2D plot',
+ ' if hasattr(study, "consensus_df") and len(study.consensus_df) > 0:',
+ ' study.plot_consensus_2d(filename="consensus_interactive.html")',
+ ' print("Consensus 2D plot saved as: consensus_interactive.html")',
+ ' else:',
+ ' print("No consensus features available for plotting")',
+ '',
+ '',
+ '@app.cell',
+ 'def __(study):',
+ ' # Generate PCA plot',
+ ' if hasattr(study, "samples") and len(study.samples) > 1:',
+ ' study.plot_samples_pca(filename="pca_interactive.html")',
+ ' print("PCA plot saved as: pca_interactive.html")',
+ ' else:',
+ ' print("Not enough samples for PCA analysis")',
+ '',
+ '',
+ '@app.cell',
+ 'def __(mo):',
+ ' mo.md(r"""',
+ ' ## Data Export',
+ ' ',
+ ' Export your processed data in various formats.',
+ ' """)',
+ '',
+ '',
+ '@app.cell',
+ 'def __(study):',
+ ' # Export options',
+ ' if hasattr(study, "consensus_df"):',
+ ' # Export to Excel',
+ ' study.export_xlsx(filename="study_results.xlsx")',
+ ' print("✓ Results exported to: study_results.xlsx")',
+ ' ',
+ ' # Export to MGF',
+ ' study.export_mgf(filename="study_spectra.mgf")',
+ ' print("✓ Spectra exported to: study_spectra.mgf")',
+ ' else:',
+ ' print("No data available for export")',
+ '',
+ '',
+ '@app.cell',
+ 'def __(mo):',
+ ' mo.md(r"""',
+ ' ## Custom Analysis',
+ ' ',
+ ' Add your own analysis code in the cells below.',
+ ' """)',
+ '',
+ '',
+ '@app.cell',
+ 'def __(study):',
+ ' # Access consensus features dataframe',
+ ' if hasattr(study, "consensus_df"):',
+ ' df = study.consensus_df',
+ ' print(f"Consensus features shape: {df.shape}")',
+ ' print("\\nFirst 5 features:")',
+ ' print(df.head())',
+ ' return (df,) if "df" in locals() else ()',
+ '',
+ '',
+ '@app.cell',
+ 'def __():',
+ ' # Your custom analysis here',
+ ' pass',
+ '',
+ '',
+ 'if __name__ == "__main__":',
+ ' app.run()',
+ ]
+
+ return '\n'.join(notebook_lines)
+

- def create_script(
+ def create_analysis(
  source: str,
  folder: str,
  filename: str = 'run_masster.py',
@@ -730,16 +1471,14 @@ def create_script(
  **kwargs
  ) -> bool:
  """
- Create a standalone analysis script without initializing a Wizard instance.
+ Create standalone analysis scripts without initializing a Wizard instance.

- This function generates a Python script that replicates automated processing
- steps with the specified configuration. The script can be executed independently
- to perform the same analysis.
+ This function generates analysis scripts with the specified configuration.

  Parameters:
  source: Directory containing raw data files
  folder: Output directory for processed study
- filename: Filename for the generated script (should end with .py)
+ filename: Filename for the generated script (deprecated, will create standard files)
  polarity: Ion polarity mode ("positive" or "negative")
  adducts: List of adduct specifications (auto-set if None)
  params: Custom wizard_def parameters (optional)
@@ -747,14 +1486,13 @@ def create_script(
  **kwargs: Additional parameters to override defaults

  Returns:
- True if script was generated successfully, False otherwise
+ True if scripts were generated successfully, False otherwise

  Example:
- >>> from masster.wizard import create_script
- >>> create_script(
+ >>> from masster.wizard import create_analysis
+ >>> create_analysis(
  ... source=r'D:\\Data\\raw_files',
  ... folder=r'D:\\Data\\output',
- ... filename='run_masster.py',
  ... polarity='positive'
  ... )
  """
@@ -794,22 +1532,27 @@ def create_script(
  study_path = Path(folder)
  study_path.mkdir(parents=True, exist_ok=True)

- # Create a temporary Wizard instance to generate the script
+ # Create a temporary Wizard instance to generate the scripts
  temp_wizard = Wizard(params=wizard_params)

- # Generate the script using the instance method
- success = temp_wizard.create_script(filename)
+ # Generate the scripts using the new method
+ result = temp_wizard.create_scripts()

- return success
+ if result["status"] == "success":
+ print("Scripts created successfully!")
+ for instruction in result["instructions"]:
+ print(instruction)
+
+ return result["status"] == "success"

  except Exception as e:
- print(f"Failed to create script: {e}")
+ print(f"Failed to create scripts: {e}")
  import traceback
  traceback.print_exc()
  return False


- def execute(
+ def analyze(
  source: str,
  folder: str,
  filename: str = 'run_masster.py',
@@ -822,7 +1565,7 @@ def execute(
  """
  Create and execute a standalone analysis script for automated MS data processing.

- This function generates a Python script with the same parameters as create_script(),
+ This function generates a Python script with the same parameters as create_analysis(),
  but immediately executes it after creation. Combines script generation and execution
  in a single step.

@@ -840,8 +1583,8 @@ def execute(
  True if script was created and executed successfully, False otherwise

  Example:
- >>> from masster.wizard import execute
- >>> execute(
+ >>> from masster.wizard import analyze
+ >>> analyze(
  ... source=r'D:\\Data\\raw_files',
  ... folder=r'D:\\Data\\output',
  ... polarity='positive'
@@ -849,59 +1592,39 @@ def execute(
  """

  try:
- # First, create the script using create_script()
- script_created = create_script(
- source=source,
- folder=folder,
- filename=filename,
- polarity=polarity,
- adducts=adducts,
- params=params,
- num_cores=num_cores,
- **kwargs
- )
-
- if not script_created:
- print("Failed to create analysis script")
- return False
-
- # Get the full path to the created script
- study_path = Path(folder)
- script_path = study_path / Path(filename).name
-
- if not script_path.exists():
- print(f"Script file not found: {script_path}")
- return False
-
- print(f"Executing...")
- #print("=" * 70)
-
- # Execute the script using subprocess with real-time output
- import subprocess
-
- # Run the script with Python, letting it inherit our stdout/stderr
- try:
- # Use subprocess.run for direct output inheritance - no capturing/re-printing
- result = subprocess.run([
- sys.executable, str(script_path)
- ], cwd=str(study_path))
+ # Create parameters (same logic as create_analysis)
+ if params is not None:
+ wizard_params = params
+ wizard_params.source = source
+ wizard_params.folder = folder
+ if polarity != "positive":
+ wizard_params.polarity = polarity
+ if num_cores > 0:
+ wizard_params.num_cores = num_cores
+ if adducts is not None:
+ wizard_params.adducts = adducts
+ else:
+ wizard_params = wizard_def(
+ source=source,
+ folder=folder,
+ polarity=polarity,
+ num_cores=max(1, int(multiprocessing.cpu_count() * 0.75)) if num_cores <= 0 else num_cores
+ )

- return_code = result.returncode
+ if adducts is not None:
+ wizard_params.adducts = adducts

- except Exception as e:
- print(f"Error during script execution: {e}")
- return False
+ # Apply any additional kwargs
+ for key, value in kwargs.items():
+ if hasattr(wizard_params, key):
+ setattr(wizard_params, key, value)

- success = return_code == 0
+ # Create Wizard instance and run analysis
+ wizard = Wizard(params=wizard_params)
+ result = wizard.analyze()

- if success:
- print("=" * 70)
- print("Script execution completed successfully")
- else:
- print("=" * 70)
- print(f"Script execution failed with return code: {return_code}")
-
- return success
+ # Return success status
+ return result.get("status") in ["success", "scripts_created"]

  except Exception as e:
  print(f"Failed to execute script: {e}")
@@ -910,5 +1633,93 @@ def execute(
  return False


+ def _generate_interactive_notebook_content(self, source_info: Dict[str, Any]) -> str:
+ """Generate the content for 2_interactive_analysis.py marimo notebook."""
+
+ notebook_lines = [
+ 'import marimo',
+ '',
+ '__generated_with = "0.9.14"',
+ 'app = marimo.App(width="medium")',
+ '',
+ '',
+ '@app.cell',
+ 'def __():',
+ ' import marimo as mo',
+ ' return (mo,)',
+ '',
+ '',
+ '@app.cell',
+ 'def __(mo):',
+ ' mo.md(r"""',
+ ' # MASSter Interactive Analysis',
+ ' ',
+ f' **Source:** {source_info.get("number_of_files", 0)} files ({", ".join(source_info.get("file_types", []))}) detected',
+ f' **Polarity:** {source_info.get("polarity", "unknown")} (auto-detected)',
+ f' **Acquisition length:** ~{source_info.get("length_minutes", 0.0):.1f} minutes per file',
+ ' ',
+ ' This notebook provides interactive exploration of your processed mass spectrometry study.',
+ ' Make sure you have run `python 1_masster_workflow.py` first to create the sample5 files.',
+ ' """)',
+ '',
+ '',
+ '@app.cell',
+ 'def __():',
+ ' # Import masster',
+ ' import masster',
+ ' return (masster,)',
+ '',
+ '',
+ '@app.cell',
+ 'def __(masster):',
+ ' # Load the study from sample5 files',
+ ' study = masster.Study(folder=".")',
+ ' return (study,)',
+ '',
+ '',
+ '@app.cell',
+ 'def __(mo, study):',
+ ' # Display study information',
+ ' study.info()',
+ ' return ()',
+ '',
+ '',
+ 'if __name__ == "__main__":',
+ ' app.run()',
+ ]
+
+ return '\n'.join(notebook_lines)
+
+ def _generate_instructions(self, source_info: Dict[str, Any], files_created: List[str]) -> List[str]:
+ """Generate usage instructions for the created scripts."""
+ instructions = [
+ "🎯 NEXT STEPS:",
+ "",
+ f"Source analysis completed: {source_info.get('number_of_files', 0)} files found",
+ f"Polarity detected: {source_info.get('polarity', 'unknown')}",
+ f"Estimated processing time: {source_info.get('number_of_files', 0) * source_info.get('length_minutes', 0.0) * 0.1:.1f} minutes",
+ "",
+ "1. REVIEW PARAMETERS:",
+ " Edit 1_masster_workflow.py and verify these key settings:",
+ " - NOISE threshold (adjust based on your instrument sensitivity)",
+ " - CHROM_FWHM (adjust based on your chromatography peak width)",
+ "",
+ "2. EXECUTE SAMPLE PROCESSING:",
+ " python 1_masster_workflow.py",
+ " (This will process all raw files to sample5 format)",
+ "",
+ "3. INTERACTIVE ANALYSIS:",
+ " uv run marimo edit 2_interactive_analysis.py",
+ " (This opens an interactive notebook for data exploration)",
+ "",
+ "FILES CREATED:"
+ ]
+
+ for file_path in files_created:
+ instructions.append(f" ✅ {Path(file_path).name}")
+
+ return instructions
+
+
  # Export the main classes and functions
- __all__ = ["Wizard", "wizard_def", "create_script", "execute"]
+ __all__ = ["Wizard", "wizard_def", "create_analysis", "analyze"]