masster 0.4.18__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

masster/wizard/wizard.py CHANGED
@@ -44,7 +44,9 @@ wizard.export_results()
  from __future__ import annotations

  import os
+ import sys
  import time
+ import importlib
  import multiprocessing
  from pathlib import Path
  from typing import Optional, Any, Dict, List
@@ -124,17 +126,18 @@ class wizard_def:
      checkpoint_interval: int = 10 # Save progress every N files

      # === Study Assembly ===
-     min_samples_for_merge: int = 50
+     min_samples_for_merge: int = 2
      rt_tolerance: float = 1.5
-     mz_tolerance: float = 0.01
+     mz_max_diff: float = 0.01
      alignment_algorithm: str = "kd"
      merge_method: str = "chunked"

      # === Feature Detection ===
-     chrom_fwhm: float = 0.2
-     noise_threshold: float = 1e5
+     chrom_fwhm: float = 0.5
+     noise_threshold: float = 200.0
      chrom_peak_snr: float = 5.0
      tol_ppm: float = 10.0
+     detector_type: str = "unknown" # Detected detector type ("orbitrap", "quadrupole", "unknown")

      # === Output & Export ===
      generate_plots: bool = True
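
Note that mz_tolerance was renamed to mz_max_diff in this release, so constructor calls using the old field name will break. A minimal usage sketch of the new defaults, assuming wizard_def is a plain dataclass whose fields can be passed as keywords (paths are hypothetical):

# Hedged sketch, not documented API: construct wizard_def with the 0.4.20 field names.
from masster.wizard.wizard import wizard_def

params = wizard_def(
    data_source=r"D:\Data\raw_files",   # hypothetical input folder
    study_folder=r"D:\Data\output",     # hypothetical output folder
    min_samples_for_merge=2,            # new default (was 50)
    mz_max_diff=0.01,                   # renamed from mz_tolerance
    chrom_fwhm=0.5,                     # new default (was 0.2)
    noise_threshold=200.0,              # new default (was 1e5)
)
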
@@ -232,10 +235,16 @@ class Wizard:
      of mass spectrometry files from raw data to final study results, including:

      1. Raw data discovery and batch conversion to sample5 format
-     2. Study assembly with feature alignment and merging
-     3. Automated plot generation and result export
-     4. Intelligent resume capability for interrupted processes
-     5. Adaptive optimization based on study size and system resources
+     2. Automatic detector type detection and parameter optimization
+     3. Study assembly with feature alignment and merging
+     4. Automated plot generation and result export
+     5. Intelligent resume capability for interrupted processes
+     6. Adaptive optimization based on study size and system resources
+
+     The wizard automatically detects the type of MS detector using simplified rules:
+     - .raw files: Assume Orbitrap (noise threshold = 1e5)
+     - .wiff files: Assume Quadrupole (noise threshold = 200)
+     - .mzML files: Check metadata for Orbitrap detection

      The wizard handles the complete workflow with minimal user intervention
      while providing comprehensive logging and progress tracking.
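
The detection rules from the docstring, restated as an illustrative lookup table (not part of the package API):

# Illustrative only: the simplified detector rules described above.
DETECTOR_RULES = {
    ".raw":  ("orbitrap", 1e5),      # Thermo -> assume Orbitrap, high noise floor
    ".wiff": ("quadrupole", 200.0),  # SCIEX -> assume Quadrupole, low noise floor
    ".mzml": ("unknown", 200.0),     # inspect mzML metadata; 1e5 only if Orbitrap found
}
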
@@ -262,6 +271,10 @@ class Wizard:
              **kwargs: Additional parameters (see wizard_def for full list)
          """

+         # Auto-detect optimal number of cores (75% of total)
+         if num_cores <= 0:
+             num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
+
          # Create parameters instance
          if "params" in kwargs and isinstance(kwargs["params"], wizard_def):
              self.params = kwargs.pop("params")
@@ -293,6 +306,14 @@ class Wizard:
          self.study_folder_path = Path(self.params.study_folder)
          self.study_folder_path.mkdir(parents=True, exist_ok=True)

+         # Auto-infer polarity from the first file if not explicitly set by user
+         if polarity == "positive" and "polarity" not in kwargs:
+             inferred_polarity = self._infer_polarity_from_first_file()
+             if inferred_polarity:
+                 self.params.polarity = inferred_polarity
+                 # Update adducts based on inferred polarity
+                 self.params.__post_init__()
+
          # Setup logging
          self._setup_logging()

@@ -315,6 +336,47 @@ class Wizard:
          if self.params.resume_enabled:
              self._load_checkpoint()

+     def _infer_polarity_from_first_file(self) -> str:
+         """
+         Infer polarity from the first available raw data file.
+
+         Returns:
+             Inferred polarity string ("positive" or "negative") or None if detection fails
+         """
+         try:
+             # Find first file
+             for extension in ['.wiff', '.raw', '.mzML', '.d']:
+                 pattern = f"**/*{extension}" if True else f"*{extension}" # search_subfolders=True
+                 files = list(self.data_source_path.rglob(pattern))
+                 if files:
+                     first_file = files[0]
+                     break
+             else:
+                 return None
+
+             # Only implement for .wiff files initially (most common format)
+             if first_file.suffix.lower() == '.wiff':
+                 from masster.sample.load import _wiff_to_dict
+
+                 # Extract metadata from first file
+                 metadata_df = _wiff_to_dict(str(first_file))
+
+                 if not metadata_df.empty and 'polarity' in metadata_df.columns:
+                     # Get polarity from first experiment
+                     first_polarity = metadata_df['polarity'].iloc[0]
+
+                     # Convert numeric polarity codes to string
+                     if first_polarity == 1 or str(first_polarity).lower() in ['positive', 'pos', '+']:
+                         return "positive"
+                     elif first_polarity == -1 or str(first_polarity).lower() in ['negative', 'neg', '-']:
+                         return "negative"
+
+         except Exception:
+             # Silently fall back to default if inference fails
+             pass
+
+         return None
+
      @property
      def polarity(self) -> str:
          """Get the polarity setting."""
@@ -324,6 +386,93 @@ class Wizard:
      def adducts(self) -> List[str]:
          """Get the adducts list."""
          return self.params.adducts
+
+     def _reload(self):
+         """
+         Reloads all masster modules to pick up any changes to their source code,
+         and updates the instance's class reference to the newly reloaded class version.
+         This ensures that the instance uses the latest implementation without restarting the interpreter.
+         """
+         # Reset logger configuration flags to allow proper reconfiguration after reload
+         try:
+             import masster.logger as logger_module
+
+             if hasattr(logger_module, "_WIZARD_LOGGER_CONFIGURED"):
+                 logger_module._WIZARD_LOGGER_CONFIGURED = False
+         except Exception:
+             pass
+
+         # Get the base module name (masster)
+         base_modname = self.__class__.__module__.split(".")[0]
+         current_module = self.__class__.__module__
+
+         # Dynamically find all wizard submodules
+         wizard_modules = []
+         wizard_module_prefix = f"{base_modname}.wizard."
+
+         # Get all currently loaded modules that are part of the wizard package
+         for module_name in sys.modules:
+             if (
+                 module_name.startswith(wizard_module_prefix)
+                 and module_name != current_module
+             ):
+                 wizard_modules.append(module_name)
+
+         # Add core masster modules
+         core_modules = [
+             f"{base_modname}._version",
+             f"{base_modname}.chromatogram",
+             f"{base_modname}.spectrum",
+             f"{base_modname}.logger",
+         ]
+
+         # Add sample submodules
+         sample_modules = []
+         sample_module_prefix = f"{base_modname}.sample."
+         for module_name in sys.modules:
+             if (
+                 module_name.startswith(sample_module_prefix)
+                 and module_name != current_module
+             ):
+                 sample_modules.append(module_name)
+
+         # Add study submodules
+         study_modules = []
+         study_module_prefix = f"{base_modname}.study."
+         for module_name in sys.modules:
+             if (
+                 module_name.startswith(study_module_prefix)
+                 and module_name != current_module
+             ):
+                 study_modules.append(module_name)
+
+         all_modules_to_reload = (
+             core_modules + wizard_modules + sample_modules + study_modules
+         )
+
+         # Reload all discovered modules
+         for full_module_name in all_modules_to_reload:
+             try:
+                 if full_module_name in sys.modules:
+                     mod = sys.modules[full_module_name]
+                     importlib.reload(mod)
+                     self.logger.debug(f"Reloaded module: {full_module_name}")
+             except Exception as e:
+                 self.logger.warning(f"Failed to reload module {full_module_name}: {e}")
+
+         # Finally, reload the current module (wizard.py)
+         try:
+             mod = __import__(current_module, fromlist=[current_module.split(".")[0]])
+             importlib.reload(mod)
+
+             # Get the updated class reference from the reloaded module
+             new = getattr(mod, self.__class__.__name__)
+             # Update the class reference of the instance
+             self.__class__ = new
+
+             self.logger.debug("Module reload completed")
+         except Exception as e:
+             self.logger.error(f"Failed to reload current module {current_module}: {e}")

      def _setup_logging(self):
          """Setup comprehensive logging system."""
@@ -526,6 +675,15 @@ class Wizard:
              chrom_fwhm=2.0 # Wider peaks
          )

+         # STEP 3.5: Validate feature detection results
+         if not hasattr(sample, 'features_df') or sample.features_df is None or len(sample.features_df) == 0:
+             self.logger.warning(f"No features detected in {file_path.name} - skipping additional processing")
+             # Still save the sample5 file for record keeping
+             sample.save(filename=str(output_file))
+             return output_file.stem
+
+         self.logger.info(f"Detected {len(sample.features_df)} features in {file_path.name}")
+
          # STEP 4: Adduct detection
          sample.find_adducts(adducts=self.adducts)

@@ -535,17 +693,11 @@ class Wizard:
          # STEP 6: Save processed data
          sample.save(filename=str(output_file))

-         # STEP 7: Generate additional outputs if requested
-         if "csv" in self.params.export_formats:
-             csv_file = output_file.with_suffix('.features.csv')
-             sample.export_features(filename=str(csv_file))
-
-         if "mgf" in self.params.export_formats:
-             mgf_file = output_file.with_suffix('.mgf')
-             sample.export_mgf(filename=str(mgf_file), use_cache=False)
+         # STEP 7: Generate additional outputs (only for samples with features)
+         # Skip CSV export and individual MGF export as requested

          if self.params.generate_plots:
-             plot_file = output_file.with_suffix('_2d.html')
+             plot_file = output_file.parent / (output_file.stem + "_2d.html")
              sample.plot_2d(filename=str(plot_file), markersize=4)

          # Memory cleanup
@@ -592,6 +744,10 @@ class Wizard:
              self.logger.warning("No files found for conversion")
              return False

+         # Detect detector type and adjust parameters before processing
+         detector_type = self._detect_detector_type()
+         self._adjust_parameters_for_detector(detector_type)
+
          # Filter out already processed files if resuming
          if self.params.resume_enabled and self.processed_files:
              remaining_files = []
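
The same detect-then-adjust pair is also invoked again in assemble_study (see the later hunk). A condensed sketch of the flow these two calls implement (illustrative, not package API; `wizard` stands for a configured Wizard instance):

# Illustrative only: how 0.4.20 wires detector detection into the pipeline.
detector_type = wizard._detect_detector_type()          # "orbitrap" | "quadrupole" | "unknown"
wizard._adjust_parameters_for_detector(detector_type)   # rewrites params.noise_threshold to match
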
@@ -705,6 +861,125 @@ class Wizard:
          self._save_checkpoint()
          return successful_count > 0

+     def _detect_detector_type(self) -> str:
+         """
+         Detect the type of MS detector from the first available file.
+
+         Simplified detection rules:
+         - .raw files: Assume Orbitrap (Thermo instruments)
+         - .wiff files: Assume Quadrupole (SCIEX instruments)
+         - .mzML files: Check metadata for Orbitrap detection
+
+         Returns:
+             String indicating detector type ("orbitrap", "quadrupole", "unknown")
+         """
+         try:
+             # Find first raw file to analyze
+             for extension in ['.raw', '.wiff', '.mzML', '.d']:
+                 if self.params.search_subfolders:
+                     pattern = f"**/*{extension}"
+                     files = list(self.data_source_path.rglob(pattern))
+                 else:
+                     pattern = f"*{extension}"
+                     files = list(self.data_source_path.glob(pattern))
+                 if files:
+                     first_file = files[0]
+                     break
+             else:
+                 self.logger.warning("No raw files found for detector detection")
+                 return "unknown"
+
+             self.logger.info(f"Detecting detector type from: {first_file.name}")
+
+             # Simplified detection rules
+             if first_file.suffix.lower() == '.raw':
+                 # RAW files are Thermo -> assume Orbitrap
+                 detector_type = "orbitrap"
+                 self.logger.info("Detected .raw file -> Thermo Orbitrap detector")
+                 return detector_type
+
+             elif first_file.suffix.lower() in ['.wiff', '.wiff2']:
+                 # WIFF files are SCIEX -> assume Quadrupole
+                 detector_type = "quadrupole"
+                 self.logger.info("Detected .wiff file -> SCIEX Quadrupole detector")
+                 return detector_type
+
+             elif first_file.suffix.lower() == '.mzml':
+                 # For mzML files, check metadata for Orbitrap detection
+                 try:
+                     import warnings
+                     with warnings.catch_warnings():
+                         warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH environment variable already exists.*", category=UserWarning)
+                         import pyopenms as oms
+
+                     exp = oms.MSExperiment()
+                     oms.MzMLFile().load(str(first_file), exp)
+
+                     # Check instrument metadata for Orbitrap keywords
+                     instrument_info = []
+                     if hasattr(exp, 'getExperimentalSettings'):
+                         settings = exp.getExperimentalSettings()
+                         if hasattr(settings, 'getInstrument'):
+                             instrument = settings.getInstrument()
+                             if hasattr(instrument, 'getName'):
+                                 name = instrument.getName().decode() if hasattr(instrument.getName(), 'decode') else str(instrument.getName())
+                                 instrument_info.append(name.lower())
+                             if hasattr(instrument, 'getModel'):
+                                 model = instrument.getModel().decode() if hasattr(instrument.getModel(), 'decode') else str(instrument.getModel())
+                                 instrument_info.append(model.lower())
+
+                     # Check for Orbitrap keywords in instrument info
+                     orbitrap_keywords = ['orbitrap', 'exactive', 'q-exactive', 'exploris', 'fusion', 'lumos', 'velos', 'elite']
+                     instrument_text = ' '.join(instrument_info)
+
+                     if any(keyword in instrument_text for keyword in orbitrap_keywords):
+                         detector_type = "orbitrap"
+                         self.logger.info(f"Detected mzML with Orbitrap instrument: {instrument_text}")
+                     else:
+                         detector_type = "unknown"
+                         self.logger.info(f"Detected mzML with unknown instrument: {instrument_text}")
+
+                     return detector_type
+
+                 except Exception as e:
+                     self.logger.warning(f"Failed to analyze mzML file for detector type: {e}")
+                     return "unknown"
+
+         except Exception as e:
+             self.logger.warning(f"Detector type detection failed: {e}")
+
+         return "unknown"
+
+     def _adjust_parameters_for_detector(self, detector_type: str):
+         """
+         Adjust processing parameters based on detected detector type.
+
+         Simplified rules:
+         - "orbitrap": Use 1e5 noise threshold (high background noise)
+         - "quadrupole": Use 200 noise threshold (default, lower noise)
+         - "unknown": Use 200 noise threshold (default)
+
+         Parameters:
+             detector_type: Type of detector detected ("orbitrap", "quadrupole", "unknown")
+         """
+         original_noise = self.params.noise_threshold
+         self.params.detector_type = detector_type # Store the detected type
+
+         if detector_type == "orbitrap":
+             # Orbitraps have much higher background noise, use 1e5 threshold
+             self.params.noise_threshold = 1e5
+             self._log_progress(f"Detector: Orbitrap detected - adjusted noise threshold: {original_noise} -> {self.params.noise_threshold}")
+
+         elif detector_type == "quadrupole":
+             # Quadrupole instruments have lower noise, use default threshold
+             self.params.noise_threshold = 200.0
+             self._log_progress(f"Detector: Quadrupole detected - noise threshold: {self.params.noise_threshold}")
+
+         else:
+             # Unknown detector type, keep default
+             self.params.noise_threshold = 200.0
+             self._log_progress(f"Detector: Unknown type detected - using default noise threshold: {self.params.noise_threshold}")
+
      def assemble_study(self) -> bool:
          """
          Assemble processed sample5 files into a study.
@@ -725,6 +1000,10 @@ class Wizard:
          self._log_progress(f"Assembling study from {len(sample5_files)} sample5 files")

          try:
+             # Detect detector type and adjust parameters if needed
+             detector_type = self._detect_detector_type()
+             self._adjust_parameters_for_detector(detector_type)
+
              # Create study with optimized settings
              import masster
              study_params = study_defaults(
@@ -779,25 +1058,39 @@ class Wizard:
              return False

          try:
+             # Determine optimal algorithms based on study size
+             num_samples = len(self.study.samples_df)
+
+             if num_samples < 500:
+                 # For smaller studies: use qt for both alignment and merge
+                 alignment_algorithm = "qt"
+                 merge_method = "qt"
+                 self.logger.info(f"Small study ({num_samples} samples) - using qt algorithms")
+             else:
+                 # For larger studies: use kd for alignment and qt-chunked for merge
+                 alignment_algorithm = "kd"
+                 merge_method = "qt-chunked"
+                 self.logger.info(f"Large study ({num_samples} samples) - using kd alignment and qt-chunked merge")
+
              # Align features across samples
              align_params = align_defaults(
                  rt_tol=self.params.rt_tolerance,
-                 mz_tol=self.params.mz_tolerance,
-                 algorithm=self.params.alignment_algorithm
+                 mz_max_diff=self.params.mz_max_diff,
+                 algorithm=alignment_algorithm
              )
-
-             self.logger.info(f"Aligning features with RT tolerance {self.params.rt_tolerance}s, m/z tolerance {self.params.mz_tolerance} Da")
+
+             self.logger.info(f"Aligning features with RT tolerance {self.params.rt_tolerance}s, m/z max diff {self.params.mz_max_diff} Da, algorithm: {alignment_algorithm}")
              self.study.align(params=align_params)

              # Merge aligned features
              merge_params = merge_defaults(
-                 method=self.params.merge_method,
+                 method=merge_method,
                  rt_tol=self.params.rt_tolerance,
-                 mz_tol=self.params.mz_tolerance,
+                 mz_tol=self.params.mz_max_diff,
                  min_samples=self.params.min_samples_for_merge
              )

-             self.logger.info(f"Merging features using {self.params.merge_method} method")
+             self.logger.info(f"Merging features using {merge_method} method")
              self.study.merge(params=merge_params)

              # Log results
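
The new size-based algorithm selection, restated as a standalone sketch (the helper name is illustrative, not package API); note that it overrides the alignment_algorithm and merge_method stored in wizard_def:

# Illustrative only: 0.4.20 picks alignment/merge algorithms from study size.
def choose_algorithms(num_samples: int) -> tuple:
    if num_samples < 500:
        return "qt", "qt"          # small study: QT clustering for alignment and merge
    return "kd", "qt-chunked"      # large study: KD-tree alignment, chunked QT merge
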
@@ -920,12 +1213,47 @@ class Wizard:
              self._log_progress(f"Completed {exports_completed} exports")
              self._save_checkpoint()
+
+             # Always perform additional export methods as requested
+             self._export_additional_formats()
+
              return True

          except Exception as e:
              self.logger.error(f"Failed to export results: {e}")
              return False

+     def _export_additional_formats(self):
+         """Export additional formats: xlsx, parquet, save, and mgf."""
+         self.logger.info("=== Exporting Additional Formats ===")
+
+         try:
+             # Force export xlsx (study results in Excel format)
+             xlsx_file = self.study_folder_path / "study_results.xlsx"
+             if hasattr(self.study, 'export_xlsx'):
+                 self.study.export_xlsx(filename=str(xlsx_file))
+                 self.logger.info(f"Exported Excel: {xlsx_file}")
+
+             # Force export parquet (efficient binary format)
+             parquet_file = self.study_folder_path / "study_data.parquet"
+             if hasattr(self.study, 'export_parquet'):
+                 self.study.export_parquet(filename=str(parquet_file))
+                 self.logger.info(f"Exported Parquet: {parquet_file}")
+
+             # Force save the study in study5 format
+             study_file = self.study_folder_path / "final_study.study5"
+             self.study.save(filename=str(study_file))
+             self.logger.info(f"Saved study: {study_file}")
+
+             # Force export MGF for MS2 spectra
+             mgf_file = self.study_folder_path / "consensus_ms2.mgf"
+             if hasattr(self.study, 'export_mgf'):
+                 self.study.export_mgf(filename=str(mgf_file))
+                 self.logger.info(f"Exported MGF: {mgf_file}")
+
+         except Exception as e:
+             self.logger.warning(f"Some additional exports failed: {e}")
+
      def save_study(self) -> bool:
          """
          Save the final study in optimized format.
@@ -977,7 +1305,7 @@ class Wizard:
              f.write(f"Successful Files: {len(self.processed_files)}\n")
              f.write(f"Failed Files: {len(self.failed_files)}\n")
              f.write(f"RT Tolerance: {self.params.rt_tolerance}s\n")
-             f.write(f"m/z Tolerance: {self.params.mz_tolerance} Da\n")
+             f.write(f"m/z Max Diff: {self.params.mz_max_diff} Da\n")
              f.write(f"Merge Method: {self.params.merge_method}\n")
              f.write(f"Processing Time: {self._get_total_processing_time()}\n")

@@ -1144,6 +1472,753 @@ class Wizard:
              }
          }

+     def execute(self) -> bool:
+         """
+         Execute the complete automated processing pipeline.
+
+         This is a convenience method that runs the full pipeline with the wizard's
+         current configuration. It performs standalone analysis of the samples/studies
+         as proposed by the Wizard.
+
+         Returns:
+             True if execution completed successfully, False otherwise
+         """
+         self._log_progress("Executing Wizard automated processing...")
+         return self.run_full_pipeline()
+
+     def export_script(self, filename: str) -> bool:
+         """
+         Generate a Python script that replicates the wizard's processing steps.
+
+         Creates a standalone Python script that can be executed independently
+         to perform the same analysis as the wizard with the current configuration.
+         The script will be saved in the study folder.
+
+         This is useful for:
+         - Creating reproducible analysis scripts
+         - Customizing processing steps
+         - Running analysis in different environments
+         - Batch processing automation
+
+         Parameters:
+             filename: Filename for the script (should end with .py). Script will be saved in the study folder.
+
+         Returns:
+             True if script was generated successfully, False otherwise
+         """
+         self._log_progress("Generating analysis script...")
+
+         try:
+             # Ensure the filename is just a filename, not a full path
+             script_filename = Path(filename).name
+             if not script_filename.endswith('.py'):
+                 script_filename = script_filename.replace(Path(script_filename).suffix, '') + '.py'
+
+             # Place the script in the study folder
+             script_path = self.study_folder_path / script_filename
+
+             # Generate the script content
+             script_content = self._generate_script_content()
+
+             # Write the script
+             with open(script_path, 'w', encoding='utf-8') as f:
+                 f.write(script_content)
+
+             self._log_progress(f"Analysis script saved: {os.path.abspath(script_path)}")
+             self.logger.info(f"Generated standalone analysis script: {os.path.abspath(script_path)}")
+
+             return True
+
+         except Exception as e:
+             self.logger.error(f"Failed to generate script: {e}")
+             return False
+
+     def to_script(self, filename: str) -> bool:
+         """
+         [DEPRECATED] Use export_script() instead.
+
+         Backward compatibility alias for export_script().
+         """
+         return self.export_script(filename)
+
+     def _generate_script_content(self) -> str:
+         """
+         Generate the content for the standalone analysis script.
+
+         Returns:
+             Complete Python script content as string
+         """
+         timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+         # Create script header
+         script_lines = [
+             '#!/usr/bin/env python3',
+             '"""',
+             'Standalone Mass Spectrometry Analysis Script',
+             f'Generated by masster.Wizard on {timestamp}',
+             '',
+             'This script replicates the automated processing pipeline configured',
+             'in the Wizard with the following settings:',
+             f'- Data Source: {self.params.data_source.replace(chr(92), chr(92)*2)}',
+             f'- Study Folder: {self.params.study_folder.replace(chr(92), chr(92)*2)}',
+             f'- Polarity: {self.params.polarity}',
+             f'- Number of Cores: {self.params.num_cores}',
+             f'- Adducts: {", ".join(self.params.adducts)}',
+             f'- RT Tolerance: {self.params.rt_tolerance}s',
+             f'- m/z Max Diff: {self.params.mz_max_diff} Da',
+             f'- Merge Method: {self.params.merge_method}',
+             '"""',
+             '',
+             'import os',
+             'import sys',
+             'import time',
+             'import multiprocessing',
+             'from pathlib import Path',
+             'from typing import List, Optional',
+             'import concurrent.futures',
+             'from datetime import datetime',
+             '',
+             '# Add error handling for masster import',
+             '# First, try to add the masster directory to the Python path',
+             'try:',
+             '    # Try to find masster by looking for it in common development locations',
+             '    possible_paths = [',
+             '        Path(__file__).parent.parent, # Script is in masster subfolder',
+             '        Path(__file__).parent.parent.parent, # Script is in study folder',
+             '        Path(os.getcwd()), # Current working directory',
+             '        Path(os.getcwd()).parent, # Parent of current directory',
+             '        Path(r"D:\\SW\\massistant"), # Specific development path',
+             '        Path.home() / "massistant", # Home directory',
+             '        Path.home() / "SW" / "massistant", # Common dev location',
+             '    ]',
+             '    ',
+             '    masster_found = False',
+             '    for possible_path in possible_paths:',
+             '        masster_dir = possible_path / "masster"',
+             '        if masster_dir.exists() and (masster_dir / "__init__.py").exists():',
+             '            if str(possible_path) not in sys.path:',
+             '                sys.path.insert(0, str(possible_path))',
+             '            masster_found = True',
+             '            print(f"Found masster at: {possible_path}")',
+             '            break',
+             '    ',
+             '    if not masster_found:',
+             '        # Try adding current directory to path as fallback',
+             '        current_dir = Path(os.getcwd())',
+             '        if str(current_dir) not in sys.path:',
+             '            sys.path.insert(0, str(current_dir))',
+             '    ',
+             '    import masster',
+             'except ImportError as e:',
+             '    print(f"Error: masster library not found. {e}")',
+             '    print("Please ensure masster is installed or run this script from the masster directory.")',
+             '    print("You can install masster with: pip install -e .")',
+             '    sys.exit(1)',
+             '',
+             '',
+             'def infer_polarity_from_first_file():',
+             '    """Infer polarity from the first available raw data file."""',
+             '    try:',
+             '        data_source_path = Path(DATA_SOURCE)',
+             '        # Find first file',
+             '        for extension in [\'.wiff\', \'.raw\', \'.mzML\', \'.d\']:',
+             '            pattern = f"**/*{extension}"',
+             '            files = list(data_source_path.rglob(pattern))',
+             '            if files:',
+             '                first_file = files[0]',
+             '                break',
+             '        else:',
+             '            return None',
+             '        ',
+             '        # Only implement for .wiff files initially',
+             '        if first_file.suffix.lower() == \'.wiff\':',
+             '            from masster.sample.load import _wiff_to_dict',
+             '            ',
+             '            # Extract metadata from first file',
+             '            metadata_df = _wiff_to_dict(str(first_file))',
+             '            ',
+             '            if not metadata_df.empty and \'polarity\' in metadata_df.columns:',
+             '                # Get polarity from first experiment',
+             '                first_polarity = metadata_df[\'polarity\'].iloc[0]',
+             '                ',
+             '                # Convert numeric polarity codes to string',
+             '                if first_polarity == 1 or str(first_polarity).lower() in [\'positive\', \'pos\', \'+\']:',
+             '                    return "positive"',
+             '                elif first_polarity == -1 or str(first_polarity).lower() in [\'negative\', \'neg\', \'-\']:',
+             '                    return "negative"',
+             '    except Exception:',
+             '        pass',
+             '    return None',
+             '',
+             '',
+             '# Configuration Parameters',
+             f'DATA_SOURCE = r"{self.params.data_source}"',
+             f'STUDY_FOLDER = r"{self.params.study_folder}"',
+             '',
+             '# Auto-infer polarity from first file, fall back to default',
+             'detected_polarity = infer_polarity_from_first_file()',
+             f'POLARITY = detected_polarity or "{self.params.polarity}"',
+             'NUM_CORES = max(1, int(multiprocessing.cpu_count() * 0.75)) # Auto-detect 75% of cores',
+             '',
+             '# Set adducts based on detected polarity',
+             'if POLARITY.lower() in ["positive", "pos"]:',
+             '    ADDUCTS = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]',
+             'elif POLARITY.lower() in ["negative", "neg"]:',
+             '    ADDUCTS = ["H-1:-:1.0", "CH2O2:0:0.5"]',
+             'else:',
+             f'    ADDUCTS = {self.params.adducts!r} # Fall back to original',
+             f'RT_TOLERANCE = {self.params.rt_tolerance}',
+             f'MZ_TOLERANCE = {self.params.mz_max_diff}',
+             f'MERGE_METHOD = "{self.params.merge_method}"',
+             f'BATCH_SIZE = {self.params.batch_size}',
+             f'CHROM_FWHM = {self.params.chrom_fwhm}',
+             f'NOISE_THRESHOLD = {self.params.noise_threshold}',
+             f'CHROM_PEAK_SNR = {self.params.chrom_peak_snr}',
+             f'TOL_PPM = {self.params.tol_ppm}',
+             f'MIN_SAMPLES_FOR_MERGE = {self.params.min_samples_for_merge}',
+             '',
+             '# File discovery settings',
+             "FILE_EXTENSIONS = ['.wiff', '.raw', '.mzML']",
+             f'SEARCH_SUBFOLDERS = {self.params.search_subfolders}',
+             "SKIP_PATTERNS = []",
+             f'MAX_FILE_SIZE_GB = {self.params.max_file_size_gb}',
+             '',
+             '# Output settings',
+             f'GENERATE_PLOTS = {self.params.generate_plots}',
+             f'EXPORT_FORMATS = {self.params.export_formats!r}',
+             f'COMPRESS_OUTPUT = {self.params.compress_output}',
+             f'CLEANUP_TEMP_FILES = {self.params.cleanup_temp_files}',
+             '',
+             '',
+             'def log_progress(message: str):',
+             '    """Log progress message with timestamp."""',
+             '    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")',
+             '    print(f"[{timestamp}] {message}")',
+             '',
+             '',
+             'def discover_files() -> List[Path]:',
+             '    """Discover raw data files in the source directory."""',
+             '    log_progress("Discovering raw data files...")',
+             '    data_source_path = Path(DATA_SOURCE)',
+             '    found_files = []',
+             '    ',
+             '    for extension in FILE_EXTENSIONS:',
+             '        if SEARCH_SUBFOLDERS:',
+             '            pattern = f"**/*{extension}"',
+             '            files = list(data_source_path.rglob(pattern))',
+             '        else:',
+             '            pattern = f"*{extension}"',
+             '            files = list(data_source_path.glob(pattern))',
+             '        ',
+             '        # Filter out files matching skip patterns',
+             '        filtered_files = []',
+             '        for file_path in files:',
+             '            skip_file = False',
+             '            for pattern in SKIP_PATTERNS:',
+             '                if pattern.lower() in file_path.name.lower():',
+             '                    skip_file = True',
+             '                    print(f"Skipping file (matches pattern \'{pattern}\'): {file_path.name}")',
+             '                    break',
+             '            ',
+             '            if not skip_file:',
+             '                # Check file size',
+             '                try:',
+             '                    file_size_gb = file_path.stat().st_size / (1024**3)',
+             '                    if file_size_gb > MAX_FILE_SIZE_GB:',
+             '                        print(f"Large file ({file_size_gb:.1f}GB): {file_path.name}")',
+             '                    filtered_files.append(file_path)',
+             '                except Exception as e:',
+             '                    print(f"Could not check file size for {file_path}: {e}")',
+             '                    filtered_files.append(file_path)',
+             '        ',
+             '        found_files.extend(filtered_files)',
+             '        log_progress(f"Found {len(filtered_files)} {extension} files")',
+             '    ',
+             '    # Remove duplicates and sort',
+             '    found_files = sorted(list(set(found_files)))',
+             '    log_progress(f"Total files discovered: {len(found_files)}")',
+             '    return found_files',
+             '',
+             '',
+             'def process_single_file(file_path: Path) -> Optional[str]:',
+             '    """Process a single file to sample5 format."""',
+             '    import gc',
+             '    study_folder_path = Path(STUDY_FOLDER)',
+             '    ',
+             '    # Generate output filename',
+             '    file_out = file_path.stem + ".sample5"',
+             '    output_file = study_folder_path / file_out',
+             '    ',
+             '    # Check if file already exists',
+             '    if output_file.exists():',
+             '        try:',
+             '            # Try to load existing file to verify it\'s valid',
+             '            sample = masster.Sample(log_level="ERROR")',
+             '            sample.load(str(output_file))',
+             '            print(f"Skipping {file_path.name} (already processed)")',
+             '            return output_file.stem',
+             '        except Exception:',
+             '            # If loading fails, file needs to be reprocessed',
+             '            pass',
+             '    ',
+             '    print(f"Processing {file_path.name}")',
+             '    ',
+             '    try:',
+             '        # Initialize sample',
+             '        sample = masster.Sample(',
+             '            log_label=file_path.name,',
+             '            log_level="ERROR" # Reduce logging overhead',
+             '        )',
+             '        ',
+             '        # STEP 1: Load raw data',
+             '        sample.load(str(file_path))',
+             '        ',
+             '        # STEP 2: Feature detection - First pass (strict parameters)',
+             '        sample.find_features(',
+             '            chrom_fwhm=CHROM_FWHM,',
+             '            noise=NOISE_THRESHOLD,',
+             '            tol_ppm=TOL_PPM,',
+             '            chrom_peak_snr=CHROM_PEAK_SNR,',
+             '            min_trace_length_multiplier=0.5,',
+             '            chrom_fwhm_min=CHROM_FWHM',
+             '        )',
+             '        ',
+             '        # STEP 3: Feature detection - Second pass (relaxed parameters)',
+             '        sample.find_features(',
+             '            chrom_peak_snr=CHROM_PEAK_SNR,',
+             '            noise=NOISE_THRESHOLD / 10, # Lower noise threshold',
+             '            chrom_fwhm=2.0 # Wider peaks',
+             '        )',
+             '        ',
+             '        # STEP 3.5: Validate feature detection results',
+             '        if not hasattr(sample, "features_df") or sample.features_df is None or len(sample.features_df) == 0:',
+             '            print(f"WARNING: No features detected in {file_path.name} - skipping additional processing")',
+             '            # Still save the sample5 file for record keeping',
+             '            sample.save(filename=str(output_file))',
+             '            return output_file.stem',
+             '        ',
+             '        print(f"Detected {len(sample.features_df)} features in {file_path.name}")',
+             '        ',
+             '        # STEP 4: Adduct detection',
+             '        sample.find_adducts(adducts=ADDUCTS)',
+             '        ',
+             '        # STEP 5: MS2 spectrum identification',
+             '        sample.find_ms2()',
+             '        ',
+             '        # STEP 6: Save processed data',
+             '        sample.save(filename=str(output_file))',
+             '        ',
+             '        # STEP 7: Generate additional outputs (only for samples with features)',
+             '        # Skip CSV export and individual MGF export as requested',
+             '        ',
+             '        if GENERATE_PLOTS:',
+             '            plot_file = output_file.parent / (output_file.stem + "_2d.html")',
+             '            sample.plot_2d(filename=str(plot_file), markersize=4)',
+             '        ',
+             '        # Memory cleanup',
+             '        result = output_file.stem',
+             '        del sample',
+             '        gc.collect()',
+             '        return result',
+             '    ',
+             '    except Exception as e:',
+             '        print(f"Error processing {file_path.name}: {e}")',
+             '        gc.collect()',
+             '        return None',
+             '',
+             '',
+             'def convert_to_sample5(file_list: List[Path]) -> bool:',
+             '    """Convert raw data files to sample5 format in parallel."""',
+             '    log_progress("=== Starting Sample5 Conversion ===")',
+             '    log_progress(f"Converting {len(file_list)} files to sample5 format")',
+             '    ',
+             '    conversion_start = time.time()',
+             '    successful_count = 0',
+             '    failed_count = 0',
+             '    ',
+             '    with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_CORES) as executor:',
+             '        futures = [executor.submit(process_single_file, file_path) for file_path in file_list]',
+             '        ',
+             '        for i, future in enumerate(concurrent.futures.as_completed(futures)):',
+             '            result = future.result()',
+             '            if result:',
+             '                successful_count += 1',
+             '            else:',
+             '                failed_count += 1',
+             '            ',
+             '            # Progress update',
+             '            if (successful_count + failed_count) % 10 == 0:',
+             '                progress = (successful_count + failed_count) / len(file_list) * 100',
+             '                log_progress(f"Progress: {progress:.1f}% ({successful_count} successful, {failed_count} failed)")',
+             '    ',
+             '    conversion_time = time.time() - conversion_start',
+             '    log_progress("=== Sample5 Conversion Complete ===")',
+             '    log_progress(f"Successful: {successful_count}")',
+             '    log_progress(f"Failed: {failed_count}")',
+             '    log_progress(f"Total time: {conversion_time:.1f} seconds")',
+             '    ',
+             '    return successful_count > 0',
+             '',
+             '',
+             'def assemble_study() -> masster.Study:',
+             '    """Assemble processed sample5 files into a study."""',
+             '    log_progress("=== Starting Study Assembly ===")',
+             '    study_folder_path = Path(STUDY_FOLDER)',
+             '    ',
+             '    # Find all sample5 files',
+             '    sample5_files = list(study_folder_path.glob("*.sample5"))',
+             '    if not sample5_files:',
+             '        raise RuntimeError("No sample5 files found for study assembly")',
+             '    ',
+             '    log_progress(f"Assembling study from {len(sample5_files)} sample5 files")',
+             '    ',
+             '    # Create study with optimized settings',
+             '    from masster.study.defaults.study_def import study_defaults',
+             '    study_params = study_defaults(',
+             '        folder=str(study_folder_path),',
+             '        polarity=POLARITY,',
+             '        log_level="INFO",',
+             f'        log_label="Study-{self.params.polarity}",',
+             '        adducts=ADDUCTS',
+             '    )',
+             '    ',
+             '    study = masster.Study(params=study_params)',
+             '    ',
+             '    # Add all sample5 files',
+             '    sample5_pattern = str(study_folder_path / "*.sample5")',
+             '    study.add(sample5_pattern)',
+             '    log_progress(f"Added {len(study.samples_df)} samples to study")',
+             '    ',
+             '    # Filter features based on quality criteria',
+             '    if hasattr(study, "features_filter"):',
+             '        initial_features = len(study.features_df) if hasattr(study, "features_df") else 0',
+             '        feature_selection = study.features_select(',
+             '            chrom_coherence=0.3,',
+             '            chrom_prominence_scaled=1',
+             '        )',
+             '        study.features_filter(feature_selection)',
+             '        final_features = len(study.features_df) if hasattr(study, "features_df") else 0',
+             '        log_progress(f"Feature filtering: {initial_features} -> {final_features} features")',
+             '    ',
+             '    return study',
+             '',
+             '',
+             'def align_and_merge(study: masster.Study) -> masster.Study:',
+             '    """Perform feature alignment and merging."""',
+             '    log_progress("=== Starting Feature Alignment and Merging ===")',
+             '    ',
+             '    # Import alignment and merge defaults',
+             '    from masster.study.defaults.align_def import align_defaults',
+             '    from masster.study.defaults.merge_def import merge_defaults',
+             '    ',
+             '    # Determine optimal algorithms based on study size',
+             '    num_samples = len(study.samples_df)',
+             '    ',
+             '    if num_samples < 500:',
+             '        # For smaller studies: use qt for both alignment and merge',
+             '        alignment_algorithm = "qt"',
+             '        merge_method = "qt"',
+             '        log_progress(f"Small study ({num_samples} samples) - using qt algorithms")',
+             '    else:',
+             '        # For larger studies: use kd for alignment and qt-chunked for merge',
+             '        alignment_algorithm = "kd"',
+             '        merge_method = "qt-chunked"',
+             '        log_progress(f"Large study ({num_samples} samples) - using kd alignment and qt-chunked merge")',
+             '    ',
+             '    # Align features across samples',
+             '    align_params = align_defaults(',
+             '        rt_tol=RT_TOLERANCE,',
+             '        mz_max_diff=MZ_TOLERANCE,',
+             '        algorithm=alignment_algorithm',
+             '    )',
+             '    ',
+             '    log_progress(f"Aligning features with RT tolerance {RT_TOLERANCE}s, m/z tolerance {MZ_TOLERANCE} Da, algorithm: {alignment_algorithm}")',
+             '    study.align(params=align_params)',
+             '    ',
+             '    # Merge aligned features',
+             '    merge_params = merge_defaults(',
+             '        method=merge_method,',
+             '        rt_tol=RT_TOLERANCE,',
+             '        mz_tol=MZ_TOLERANCE,',
+             '        min_samples=MIN_SAMPLES_FOR_MERGE',
+             '    )',
+             '    ',
+             '    log_progress(f"Merging features using {merge_method} method")',
+             '    study.merge(params=merge_params)',
+             '    ',
+             '    # Log results',
+             '    num_consensus = len(study.consensus_df) if hasattr(study, "consensus_df") else 0',
+             '    log_progress(f"Generated {num_consensus} consensus features")',
+             '    ',
+             '    # Get study info',
+             '    if hasattr(study, "info"):',
+             '        study.info()',
+             '    ',
+             '    return study',
+             '',
+             '',
+             'def generate_plots(study: masster.Study) -> bool:',
+             '    """Generate visualization plots for the study."""',
+             '    if not GENERATE_PLOTS:',
+             '        log_progress("Plot generation disabled, skipping...")',
+             '        return True',
+             '    ',
+             '    log_progress("=== Generating Visualization Plots ===")',
+             '    study_folder_path = Path(STUDY_FOLDER)',
+             '    plots_generated = 0',
+             '    ',
+             '    try:',
+             '        # Alignment plot',
+             '        if hasattr(study, "plot_alignment"):',
+             '            alignment_plot = study_folder_path / "alignment_plot.html"',
+             '            study.plot_alignment(filename=str(alignment_plot))',
+             '            plots_generated += 1',
+             '            log_progress(f"Generated alignment plot: {alignment_plot}")',
+             '        ',
+             '        # Consensus 2D plot',
+             '        if hasattr(study, "plot_consensus_2d"):',
+             '            consensus_2d_plot = study_folder_path / "consensus_2d.html"',
+             '            study.plot_consensus_2d(filename=str(consensus_2d_plot))',
+             '            plots_generated += 1',
+             '            log_progress(f"Generated consensus 2D plot: {consensus_2d_plot}")',
+             '        ',
+             '        # PCA plot',
+             '        if hasattr(study, "plot_pca"):',
+             '            pca_plot = study_folder_path / "pca_plot.html"',
+             '            study.plot_pca(filename=str(pca_plot))',
+             '            plots_generated += 1',
+             '            log_progress(f"Generated PCA plot: {pca_plot}")',
+             '        ',
+             '        # Consensus statistics',
+             '        if hasattr(study, "plot_consensus_stats"):',
+             '            stats_plot = study_folder_path / "consensus_stats.html"',
+             '            study.plot_consensus_stats(filename=str(stats_plot))',
+             '            plots_generated += 1',
+             '            log_progress(f"Generated statistics plot: {stats_plot}")',
+             '        ',
+             '        log_progress(f"Generated {plots_generated} visualization plots")',
+             '        return True',
+             '    ',
+             '    except Exception as e:',
+             '        print(f"Failed to generate plots: {e}")',
+             '        return False',
+             '',
+             '',
+             'def export_results(study: masster.Study) -> bool:',
+             '    """Export study results in requested formats."""',
+             '    log_progress("=== Exporting Study Results ===")',
+             '    study_folder_path = Path(STUDY_FOLDER)',
+             '    exports_completed = 0',
+             '    ',
+             '    try:',
+             '        # Skip CSV export as requested',
+             '        ',
+             '        # Export as Excel',
+             '        if "xlsx" in EXPORT_FORMATS and hasattr(study, "export_xlsx"):',
+             '            xlsx_file = study_folder_path / "study_results.xlsx"',
+             '            study.export_xlsx(filename=str(xlsx_file))',
+             '            exports_completed += 1',
+             '            log_progress(f"Exported Excel: {xlsx_file}")',
+             '        ',
+             '        # Export MGF for MS2 spectra',
+             '        if "mgf" in EXPORT_FORMATS and hasattr(study, "export_mgf"):',
+             '            mgf_file = study_folder_path / "consensus_ms2.mgf"',
+             '            study.export_mgf(filename=str(mgf_file))',
+             '            exports_completed += 1',
+             '            log_progress(f"Exported MGF: {mgf_file}")',
+             '        ',
+             '        # Export as Parquet for efficient storage',
+             '        if "parquet" in EXPORT_FORMATS and hasattr(study, "export_parquet"):',
+             '            parquet_file = study_folder_path / "study_data.parquet"',
+             '            study.export_parquet(filename=str(parquet_file))',
+             '            exports_completed += 1',
+             '            log_progress(f"Exported Parquet: {parquet_file}")',
+             '        ',
+             '        log_progress(f"Completed {exports_completed} exports")',
+             '        ',
+             '        # Always perform additional exports as requested',
+             '        log_progress("=== Exporting Additional Formats ===")',
+             '        ',
+             '        try:',
+             '            # Force export xlsx (study results in Excel format)',
+             '            xlsx_file = study_folder_path / "study_results.xlsx"',
+             '            if hasattr(study, "export_xlsx"):',
+             '                study.export_xlsx(filename=str(xlsx_file))',
+             '                log_progress(f"Exported Excel: {xlsx_file}")',
+             '            ',
+             '            # Force export parquet (efficient binary format)',
+             '            parquet_file = study_folder_path / "study_data.parquet"',
+             '            if hasattr(study, "export_parquet"):',
+             '                study.export_parquet(filename=str(parquet_file))',
+             '                log_progress(f"Exported Parquet: {parquet_file}")',
+             '            ',
+             '            # Force save the study in study5 format',
+             '            study_file = study_folder_path / "final_study.study5"',
+             '            study.save(filename=str(study_file))',
+             '            log_progress(f"Saved study: {study_file}")',
+             '            ',
+             '            # Force export MGF for MS2 spectra',
+             '            mgf_file = study_folder_path / "consensus_ms2.mgf"',
+             '            if hasattr(study, "export_mgf"):',
+             '                study.export_mgf(filename=str(mgf_file))',
+             '                log_progress(f"Exported MGF: {mgf_file}")',
+             '        ',
+             '        except Exception as e:',
+             '            print(f"Some additional exports failed: {e}")',
+             '        ',
+             '        return True',
+             '    ',
+             '    except Exception as e:',
+             '        print(f"Failed to export results: {e}")',
+             '        return False',
+             '',
+             '',
+             'def save_study(study: masster.Study) -> bool:',
+             '    """Save the final study in optimized format."""',
+             '    log_progress("=== Saving Final Study ===")',
+             '    study_folder_path = Path(STUDY_FOLDER)',
+             '    ',
+             '    try:',
+             '        study_file = study_folder_path / "final_study.study5"',
+             '        ',
+             '        # Determine optimal save format based on study size',
+             '        num_samples = len(study.samples_df)',
+             '        num_features = len(study.consensus_df) if hasattr(study, "consensus_df") else 0',
+             '        ',
+             '        if num_samples > 50 or num_features > 10000:',
+             '            log_progress(f"Large study detected ({num_samples} samples, {num_features} features) - using compressed format")',
+             '            compress_output = True',
+             '        else:',
+             '            log_progress(f"Small study ({num_samples} samples, {num_features} features) - using standard format")',
+             '            compress_output = False',
+             '        ',
+             '        # Save study',
+             '        if compress_output and hasattr(study, "save_compressed"):',
+             '            study.save_compressed(filename=str(study_file))',
+             '            log_progress(f"Saved compressed study: {study_file}")',
+             '        else:',
+             '            study.save(filename=str(study_file))',
+             '            log_progress(f"Saved study: {study_file}")',
+             '        ',
+             '        # Save metadata summary',
+             '        metadata_file = study_folder_path / "study_metadata.txt"',
+             '        with open(metadata_file, "w") as f:',
+             '            f.write("Study Processing Summary\\n")',
+             '            f.write("========================\\n")',
+             '            f.write(f"Processing Date: {datetime.now().strftime(\'%Y-%m-%d %H:%M:%S\')}\\n")',
+             '            f.write(f"Polarity: {POLARITY}\\n")',
+             '            f.write(f"Adducts: {\', \'.join(ADDUCTS)}\\n")',
+             '            f.write(f"Number of Samples: {num_samples}\\n")',
+             '            f.write(f"Number of Consensus Features: {num_features}\\n")',
+             '            f.write(f"RT Tolerance: {RT_TOLERANCE}s\\n")',
+             '            f.write(f"m/z Tolerance: {MZ_TOLERANCE} Da\\n")',
+             '            f.write(f"Merge Method: {MERGE_METHOD}\\n")',
+             '        ',
+             '        log_progress(f"Saved study metadata: {metadata_file}")',
+             '        return True',
+             '    ',
+             '    except Exception as e:',
+             '        print(f"Failed to save study: {e}")',
+             '        return False',
+             '',
+             '',
+             'def cleanup_temp_files() -> bool:',
+             '    """Clean up temporary files if requested."""',
+             '    if not CLEANUP_TEMP_FILES:',
+             '        return True',
+             '    ',
+             '    log_progress("=== Cleaning Up Temporary Files ===")',
+             '    study_folder_path = Path(STUDY_FOLDER)',
+             '    ',
+             '    try:',
+             '        cleaned_count = 0',
+             '        ',
+             '        # Remove individual sample plots if study plots were generated',
+             '        if GENERATE_PLOTS:',
+             '            temp_plots = list(study_folder_path.glob("*_2d.html"))',
+             '            for plot_file in temp_plots:',
+             '                if plot_file.name not in ["alignment_plot.html", "consensus_2d.html", "pca_plot.html"]:',
+             '                    plot_file.unlink()',
+             '                    cleaned_count += 1',
+             '        ',
+             '        log_progress(f"Cleaned up {cleaned_count} temporary files")',
+             '        return True',
+             '    ',
+             '    except Exception as e:',
+             '        print(f"Failed to cleanup temp files: {e}")',
+             '        return False',
+             '',
+             '',
+             'def main():',
+             '    """Main execution function."""',
+             '    print("=" * 70)',
+             '    print("AUTOMATED MASS SPECTROMETRY ANALYSIS SCRIPT")',
+             f'    print("Generated by masster.Wizard on {timestamp}")',
+             '    print("=" * 70)',
+             '    ',
+             '    start_time = time.time()',
+             '    ',
+             '    try:',
+             '        # Ensure output directory exists',
+             '        Path(STUDY_FOLDER).mkdir(parents=True, exist_ok=True)',
+             '        ',
+             '        # Step 1: Discover files',
+             '        file_list = discover_files()',
+             '        if not file_list:',
+             '            print("No files found for processing")',
+             '            return False',
+             '        ',
+             '        # Step 2: Convert to sample5',
+             '        if not convert_to_sample5(file_list):',
+             '            print("Sample5 conversion failed")',
+             '            return False',
+             '        ',
+             '        # Step 3: Assemble study',
+             '        study = assemble_study()',
+             '        ',
+             '        # Step 4: Align and merge',
+             '        study = align_and_merge(study)',
+             '        ',
+             '        # Step 5: Generate plots',
+             '        generate_plots(study)',
+             '        ',
+             '        # Step 6: Export results',
+             '        export_results(study)',
+             '        ',
+             '        # Step 7: Save study',
+             '        save_study(study)',
+             '        ',
+             '        # Step 8: Cleanup',
+             '        cleanup_temp_files()',
+             '        ',
+             '        # Final summary',
+             '        total_time = time.time() - start_time',
+             '        print("=" * 70)',
+             '        print("ANALYSIS COMPLETED SUCCESSFULLY")',
+             '        print(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")',
+             '        if hasattr(study, "consensus_df"):',
+             '            print(f"Consensus features generated: {len(study.consensus_df)}")',
+             '        print("=" * 70)',
+             '        ',
+             '        return True',
+             '    ',
+             '    except KeyboardInterrupt:',
+             '        print("\\nAnalysis interrupted by user")',
+             '        return False',
+             '    except Exception as e:',
+             '        print(f"Analysis failed with error: {e}")',
+             '        import traceback',
+             '        traceback.print_exc()',
+             '        return False',
+             '',
+             '',
+             'if __name__ == "__main__":',
+             '    success = main()',
+             '    sys.exit(0 if success else 1)',
+         ]
+
+         return '\n'.join(script_lines)
+
      def info(self):
          """Print comprehensive wizard status information."""
          status = self.get_status()
@@ -1157,6 +2232,8 @@ class Wizard:
          print(f"Polarity: {status['parameters']['polarity']}")
          print(f"CPU Cores: {status['parameters']['num_cores']}")
          print(f"Adducts: {', '.join(status['parameters']['adducts'])}")
+         print(f"Detector Type: {self.params.detector_type}")
+         print(f"Noise Threshold: {self.params.noise_threshold}")
          print(f"Processing Time: {status['processing_time']}")
          print(f"Files Processed: {status['processed_files']}")
          print(f"Files Failed: {status['failed_files']}")
@@ -1171,5 +2248,100 @@ class Wizard:
          print("=" * 50)


- # Export the main classes
- __all__ = ["Wizard", "wizard_def"]
+ def create_script(
+     source: str,
+     study_folder: str,
+     filename: str,
+     polarity: str = "positive",
+     adducts: Optional[List[str]] = None,
+     params: Optional[wizard_def] = None,
+     num_cores: int = 0,
+     **kwargs
+ ) -> bool:
+     """
+     Create a standalone analysis script without initializing a Wizard instance.
+
+     This function generates a Python script that replicates automated processing
+     steps with the specified configuration. The script can be executed independently
+     to perform the same analysis.
+
+     Parameters:
+         source: Directory containing raw data files
+         study_folder: Output directory for processed study
+         filename: Filename for the generated script (should end with .py)
+         polarity: Ion polarity mode ("positive" or "negative")
+         adducts: List of adduct specifications (auto-set if None)
+         params: Custom wizard_def parameters (optional)
+         num_cores: Number of CPU cores (0 = auto-detect)
+         **kwargs: Additional parameters to override defaults
+
+     Returns:
+         True if script was generated successfully, False otherwise
+
+     Example:
+         >>> from masster.wizard import create_script
+         >>> create_script(
+         ...     source=r'D:\\Data\\raw_files',
+         ...     study_folder=r'D:\\Data\\output',
+         ...     filename='run_masster.py',
+         ...     polarity='positive'
+         ... )
+     """
+
+     try:
+         # Create parameters
+         if params is not None:
+             # Use provided params as base
+             wizard_params = params
+             # Update with provided values
+             wizard_params.data_source = source
+             wizard_params.study_folder = study_folder
+             if polarity != "positive": # Only override if explicitly different
+                 wizard_params.polarity = polarity
+             if num_cores > 0:
+                 wizard_params.num_cores = num_cores
+             if adducts is not None:
+                 wizard_params.adducts = adducts
+         else:
+             # Create new params with provided values
+             wizard_params = wizard_def(
+                 data_source=source,
+                 study_folder=study_folder,
+                 polarity=polarity,
+                 num_cores=max(1, int(multiprocessing.cpu_count() * 0.75)) if num_cores <= 0 else num_cores
+             )
+
+             if adducts is not None:
+                 wizard_params.adducts = adducts
+
+         # Apply any additional kwargs
+         for key, value in kwargs.items():
+             if hasattr(wizard_params, key):
+                 setattr(wizard_params, key, value)
+
+         # Ensure study folder exists
+         study_path = Path(study_folder)
+         study_path.mkdir(parents=True, exist_ok=True)
+
+         # Create a temporary Wizard instance to generate the script
+         temp_wizard = Wizard(params=wizard_params)
+
+         # Generate the script using the existing method
+         success = temp_wizard.export_script(filename)
+
+         if success:
+             script_path = study_path / Path(filename).name
+             print(f"Analysis script created: {script_path.absolute()}")
+             print(f"Run with: python \"{script_path}\"")
+
+         return success
+
+     except Exception as e:
+         print(f"Failed to create script: {e}")
+         import traceback
+         traceback.print_exc()
+         return False
+
+
+ # Export the main classes and functions
+ __all__ = ["Wizard", "wizard_def", "create_script"]