masster-0.5.22-py3-none-any.whl → masster-0.5.24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of masster might be problematic; see the registry advisory for details.

masster/wizard/wizard.py CHANGED
@@ -34,6 +34,7 @@ wizard = Wizard(
 
 from __future__ import annotations
 
+from math import e
 import os
 import sys
 import time
@@ -58,99 +59,99 @@ from masster._version import __version__ as version
 class wizard_def:
     """
     Default parameters for the Wizard automated processing system.
-
+
     This class provides comprehensive configuration for all stages of automated
     mass spectrometry data processing from raw files to final results.
-
+
     Attributes:
         # Core Configuration
         source (str): Path to directory containing raw data files
         folder (str): Output directory for processed study
         polarity (Optional[str]): Ion polarity mode ("positive", "negative", or None for auto-detection)
        num_cores (int): Number of CPU cores to use for parallel processing
-
+
        # File Discovery
        file_extensions (List[str]): File extensions to search for
        search_subfolders (bool): Whether to search subdirectories
        skip_patterns (List[str]): Filename patterns to skip
-
+
        # Processing Parameters
        adducts (List[str]): Adduct specifications for given polarity
        batch_size (int): Number of files to process per batch
        memory_limit_gb (float): Memory limit for processing (GB)
-
+
        # Resume & Recovery
        resume_enabled (bool): Enable automatic resume capability
        force_reprocess (bool): Force reprocessing of existing files
        backup_enabled (bool): Create backups of intermediate results
-
+
        # Output & Export
        generate_plots (bool): Generate visualization plots
        export_formats (List[str]): Output formats to generate
        compress_output (bool): Compress final study file
-
+
        # Logging
        log_level (str): Logging detail level
        log_to_file (bool): Save logs to file
        progress_interval (int): Progress update interval (seconds)
    """
-
+
    # === Core Configuration ===
    source: str = ""
-    folder: str = ""
+    folder: str = ""
    polarity: Optional[str] = None
    num_cores: int = 4
-
+
    # === File Discovery ===
    file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML"])
    search_subfolders: bool = True
    skip_patterns: List[str] = field(default_factory=lambda: ["blank", "test"])
-
+
    # === Processing Parameters ===
    adducts: List[str] = field(default_factory=list)  # Will be set based on polarity
    batch_size: int = 8
    memory_limit_gb: float = 16.0
    max_file_size_gb: float = 4.0
-
+
    # === Resume & Recovery ===
    resume_enabled: bool = True
    force_reprocess: bool = False
    backup_enabled: bool = True
    checkpoint_interval: int = 10  # Save progress every N files
-
+
    # === Study Assembly ===
    min_samples_for_merge: int = 2
    rt_tolerance: float = 1.5
    mz_max_diff: float = 0.01
    alignment_algorithm: str = "kd"
    merge_method: str = "qt"
-
+
    # === Feature Detection ===
-    chrom_fwhm: float = 0.5
-    noise: float = 50.0
+    chrom_fwhm: float | None = None
+    noise: float | None = None
    chrom_peak_snr: float = 5.0
    tol_ppm: float = 10.0
    detector_type: str = "unknown"  # Detected detector type ("orbitrap", "quadrupole", "unknown")
-
+
    # === Output & Export ===
    generate_plots: bool = True
    generate_interactive: bool = True
    export_formats: List[str] = field(default_factory=lambda: ["csv", "mgf", "xlsx"])
    compress_output: bool = True
    adaptive_compression: bool = True  # Adapt based on study size
-
+
    # === Logging ===
    log_level: str = "INFO"
    log_to_file: bool = True
    progress_interval: int = 30  # seconds
    verbose_progress: bool = True
-
+
    # === Advanced Options ===
    use_process_pool: bool = True  # vs ThreadPoolExecutor
    optimize_memory: bool = True
    cleanup_temp_files: bool = True
    validate_outputs: bool = True
-
+
    _param_metadata: dict[str, dict[str, Any]] = field(
        default_factory=lambda: {
            "source": {
@@ -159,7 +160,7 @@ class wizard_def:
                 "required": True,
             },
             "folder": {
-                "dtype": str,
+                "dtype": str,
                 "description": "Output directory for processed study",
                 "required": True,
             },
@@ -193,26 +194,26 @@ class wizard_def:
         },
         repr=False,
     )
-
+
     def __post_init__(self):
         """Set polarity-specific defaults after initialization."""
         # Set default adducts based on polarity if not provided
         if not self.adducts:
             if self.polarity and self.polarity.lower() in ["positive", "pos"]:
                 self.adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]
-            elif self.polarity and self.polarity.lower() in ["negative", "neg"]:
+            elif self.polarity and self.polarity.lower() in ["negative", "neg"]:
                 self.adducts = ["H-1:-:1.0", "CH2O2:0:0.5"]
             else:
                 # Default to positive if polarity is None or unknown
                 self.adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]
-
+
         # Validate num_cores
         max_cores = multiprocessing.cpu_count()
         if self.num_cores <= 0:
             self.num_cores = max_cores
         elif self.num_cores > max_cores:
             self.num_cores = max_cores
-
+
         # Ensure paths are absolute
         if self.source:
             self.source = os.path.abspath(self.source)
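
The __post_init__ hunk is whitespace-only, but it is the code that turns the new "positive" constructor default (later in this diff) into concrete adducts. A runnable check of the mapping, restated from the logic above:

# Adduct defaults chosen by wizard_def.__post_init__ for each polarity input
for pol in ["positive", "neg", None]:
    if pol and pol.lower() in ["positive", "pos"]:
        adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]
    elif pol and pol.lower() in ["negative", "neg"]:
        adducts = ["H-1:-:1.0", "CH2O2:0:0.5"]
    else:
        adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]  # None/unknown -> positive
    print(pol, adducts)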
@@ -223,24 +224,24 @@ class wizard_def:
 class Wizard:
     """
     Simplified Wizard for automated mass spectrometry data processing.
-
+
     The Wizard provides a clean interface for creating and executing analysis scripts
     that process raw MS data through the complete pipeline: file discovery, feature
     detection, sample processing, study assembly, alignment, merging, and export.
-
+
     Core functions:
     - create_scripts(): Generate standalone analysis scripts
     - test_only(): Process only one file for parameter validation
     - test_and_run(): Test with single file, then run full batch if successful
     - run(): Execute full batch processing on all files
-
+
     Recommended workflow:
     1. wizard = Wizard(source="raw_data", folder="output")
     2. wizard.create_scripts()  # Generate analysis scripts
     3. wizard.test_only()  # Validate with single file
     4. wizard.run()  # Process all files
     """
-
+
     def __init__(
         self,
         source: str = "",
@@ -248,11 +249,11 @@ class Wizard:
         polarity: Optional[str] = None,
         adducts: Optional[List[str]] = None,
         num_cores: int = 6,
-        **kwargs
+        **kwargs,
     ):
         """
         Initialize the Wizard with analysis parameters.
-
+
         Parameters:
             source: Directory containing raw data files
             folder: Output directory for processed study
@@ -261,145 +262,43 @@ class Wizard:
             num_cores: Number of CPU cores (0 = auto-detect 75% of available)
             **kwargs: Additional parameters (see wizard_def for full list)
         """
-
+
         # Auto-detect optimal number of cores if not specified
         if num_cores <= 0:
             num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
-
+
         # Create parameters instance
         if "params" in kwargs and isinstance(kwargs["params"], wizard_def):
             self.params = kwargs.pop("params")
         else:
             # Create default parameters
-            self.params = wizard_def(
-                source=source,
-                folder=folder,
-                polarity=polarity,
-                num_cores=num_cores
-            )
-
+            self.params = wizard_def(source=source, folder=folder, polarity=polarity, num_cores=num_cores)
+
         # Set adducts if provided
         if adducts is not None:
             self.params.adducts = adducts
-
+
         # Update with any additional parameters
         for key, value in kwargs.items():
             if hasattr(self.params, key):
                 setattr(self.params, key, value)
-
+
         # Validate required parameters
         if not self.params.source:
             raise ValueError("source is required")
         if not self.params.folder:
             raise ValueError("folder is required")
-
+
         # Create and validate paths
         self.source_path = Path(self.params.source)
-        self.folder_path = Path(self.params.folder)
+        self.folder_path = Path(self.params.folder)
         self.folder_path.mkdir(parents=True, exist_ok=True)
-
-        # Auto-infer polarity from the first file if polarity is None
-        if self.params.polarity is None:
-            inferred_polarity = self._infer_polarity_from_first_file()
-            if inferred_polarity:
-                self.params.polarity = inferred_polarity
-                # Update adducts based on inferred polarity
-                self.params.__post_init__()
 
-    def _infer_polarity_from_first_file(self) -> str:
-        """
-        Infer polarity from the first available raw data file.
-
-        Returns:
-            Inferred polarity string ("positive" or "negative") or "positive" as fallback
-        """
-        try:
-            # Find first file
-            for extension in ['.wiff', '.raw', '.mzML']:
-                pattern = f"**/*{extension}" if True else f"*{extension}"  # search_subfolders=True
-                files = list(self.source_path.rglob(pattern))
-                if files:
-                    first_file = files[0]
-                    break
-            else:
-                return 'positive'
-
-            # Handle different file formats
-            if first_file.suffix.lower() == '.wiff':
-                return self._infer_polarity_from_wiff(str(first_file))
-            elif first_file.suffix.lower() == '.raw':
-                return self._infer_polarity_from_raw(str(first_file))
-            elif first_file.suffix.lower() == '.mzml':
-                return self._infer_polarity_from_mzml(str(first_file))
-
-        except Exception:
-            # Silently fall back to default if inference fails
-            pass
-
-        return 'positive'
-
-    def _infer_polarity_from_wiff(self, filename: str) -> str:
-        """Infer polarity from WIFF file."""
-        try:
-            from masster.sample.load import _wiff_to_dict
-
-            # Extract metadata from first file
-            metadata_df = _wiff_to_dict(filename)
-
-            if not metadata_df.empty and 'polarity' in metadata_df.columns:
-                # Get polarity from first experiment
-                first_polarity = metadata_df['polarity'].iloc[0]
-
-                # Convert numeric polarity codes to string
-                if first_polarity == 1 or str(first_polarity).lower() in ['positive', 'pos', '+']:
-                    return "positive"
-                elif first_polarity == -1 or str(first_polarity).lower() in ['negative', 'neg', '-']:
-                    return "negative"
-        except Exception:
-            pass
-        return 'positive'
-
-    def _infer_polarity_from_raw(self, filename: str) -> str:
-        """Infer polarity from Thermo RAW file."""
-        try:
-            from masster.sample.thermo import ThermoRawFileReader
-
-            with ThermoRawFileReader(filename) as raw_reader:
-                # Get polarity from first scan
-                first_scan = 1
-                polarity = raw_reader.get_polarity_from_scan_event(first_scan)
-                if polarity in ['positive', 'negative']:
-                    return polarity
-        except Exception:
-            pass
-        return 'positive'
-
-    def _infer_polarity_from_mzml(self, filename: str) -> str:
-        """Infer polarity from mzML file."""
-        try:
-            # Import pyopenms with warnings suppression
-            import warnings
-            with warnings.catch_warnings():
-                warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)
-                import pyopenms as oms
-
-            # Load the first few spectra to check polarity
-            omsexp = oms.MSExperiment()
-            oms.MzMLFile().load(filename, omsexp)
-
-            if omsexp.getNrSpectra() > 0:
-                first_spectrum = omsexp.getSpectra()[0]
-                try:
-                    pol = first_spectrum.getInstrumentSettings().getPolarity()
-                    if pol == 1:
-                        return "positive"
-                    elif pol == 2:
-                        return "negative"
-                except Exception:
-                    pass
-        except Exception:
-            pass
-        return 'positive'
+        # Set default polarity if not specified
+        if self.params.polarity is None:
+            self.params.polarity = "positive"
+            # Update adducts based on default polarity
+            self.params.__post_init__()
 
     @property
     def polarity(self) -> Optional[str]:
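
This is the main behavioral change of the release: the constructor no longer opens the first WIFF/RAW/mzML file to infer polarity (the four _infer_polarity_* helpers are deleted); it defaults to "positive" and defers real detection to _analyze_source_files() during create_scripts(). A hedged usage sketch of the new constructor behavior (paths are illustrative):

from masster.wizard.wizard import Wizard

# 0.5.24: polarity=None now silently becomes "positive" at construction time,
# so negative-mode users should pass it explicitly.
wizard = Wizard(source="raw_data", folder="output", polarity="negative")
assert wizard.polarity == "negative"
assert wizard.params.adducts == ["H-1:-:1.0", "CH2O2:0:0.5"]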
@@ -414,13 +313,13 @@ class Wizard:
     def create_scripts(self) -> Dict[str, Any]:
         """
         Generate analysis scripts based on source file analysis.
-
+
         This method:
         1. Analyzes the source files to extract metadata
         2. Creates 1_masster_workflow.py with sample processing logic
         3. Creates 2_interactive_analysis.py marimo notebook for study exploration
         4. Returns instructions for next steps
-
+
         Returns:
             Dictionary containing:
             - status: "success" or "error"
@@ -432,50 +331,79 @@ class Wizard:
         try:
             # Step 1: Analyze source files to extract metadata
             source_info = self._analyze_source_files()
-
+
+            # Report extracted information from first file
+            print("\nInformation retrieval complete:")
+            print(f"  Number of files found: {source_info.get('number_of_files', 0)}")
+            print(f"  File types detected: {', '.join(source_info.get('file_types', []))}")
+            #print(f"  First file: {source_info.get('first_file', 'None')}")
+            if source_info.get('first_file'):
+                print(f"  Detected polarity: {source_info.get('polarity', 'unknown')}")
+                print(f"  Detected detector type: {source_info.get('detector_type', 'unknown')}")
+                if source_info.get('baseline', 0) > 0:
+                    print(f"  Estimated baseline intensity: {source_info.get('baseline', 0):.1f}")
+                if source_info.get('length_minutes', 0) > 0:
+                    print(f"  Run length: {source_info.get('length_minutes', 0):.1f} minutes")
+                else:
+                    print("  Run length: Could not determine")
+                if source_info.get('ms1_scans_per_second', 0) > 0:
+                    print(f"  MS1 scans per second: {source_info.get('ms1_scans_per_second', 0):.2f}")
+            print()
+
             # Update wizard parameters based on detected metadata
-            if source_info.get('polarity') and source_info['polarity'] != 'positive':
-                self.params.polarity = source_info['polarity']
-
+            if source_info.get("polarity") and source_info["polarity"] != "positive":
+                self.params.polarity = source_info["polarity"]
+
             files_created = []
-
+
             # Step 2: Create 1_masster_workflow.py
             workflow_script_path = self.folder_path / "1_masster_workflow.py"
+            print(f"Creating workflow script: {workflow_script_path}")
             workflow_content = self._generate_workflow_script_content(source_info)
 
             # Apply test mode modifications
             workflow_content = self._add_test_mode_support(workflow_content)
-
-            with open(workflow_script_path, 'w', encoding='utf-8') as f:
+
+            with open(workflow_script_path, "w", encoding="utf-8") as f:
                 f.write(workflow_content)
             files_created.append(str(workflow_script_path))
-
+
             # Step 3: Create 2_interactive_analysis.py marimo notebook
             notebook_path = self.folder_path / "2_interactive_analysis.py"
+            print(f"Creating interactive analysis notebook: {notebook_path}")
             notebook_content = self._generate_interactive_notebook_content(source_info)
-
-            with open(notebook_path, 'w', encoding='utf-8') as f:
+
+            with open(notebook_path, "w", encoding="utf-8") as f:
                 f.write(notebook_content)
             files_created.append(str(notebook_path))
-
+
             # Step 4: Generate instructions
             instructions = self._generate_instructions(source_info, files_created)
-
+
             return {
                 "status": "success",
                 "message": f"Successfully created {len(files_created)} script files",
                 "instructions": instructions,
                 "files_created": files_created,
-                "source_info": source_info
+                "source_info": source_info,
             }
-
+
         except Exception as e:
+            import traceback
+            tb = traceback.extract_tb(e.__traceback__)
+            if tb:
+                line_number = tb[-1].lineno
+                function_name = tb[-1].name
+                error_location = f" (at line {line_number} in {function_name})"
+            else:
+                error_location = ""
+
             return {
-                "status": "error",
-                "message": f"Failed to create scripts: {e}",
+                "status": "error",
+                "message": f"Failed to create scripts: {e}{error_location}",
                 "instructions": [],
                 "files_created": [],
-                "source_info": {}
+                "source_info": {},
             }
 
     def _analyze_source_files(self) -> Dict[str, Any]:
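
create_scripts() now also annotates failures with the innermost traceback frame. The same pattern, isolated into a standalone helper (the describe_failure name is ours):

import traceback


def describe_failure(e: Exception) -> str:
    # Same idea as the new error handler: report line and function of the innermost frame
    tb = traceback.extract_tb(e.__traceback__)
    if tb:
        return f"{e} (at line {tb[-1].lineno} in {tb[-1].name})"
    return str(e)


try:
    1 / 0
except Exception as e:
    print(describe_failure(e))  # e.g. "division by zero (at line 13 in <module>)"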
@@ -483,16 +411,20 @@ class Wizard:
         result = {
             "number_of_files": 0,
             "file_types": [],
-            "polarity": "positive",
+            "detector_type": "tof",
+            "polarity": None,
+            "baseline": None,
             "length_minutes": 0.0,
-            "first_file": None
+            "ms1_scans_per_second": 0.0,
+            "first_file": None,
+
         }
-
+
         try:
             # Find raw data files
             extensions = [".wiff", ".raw", ".mzML"]
             raw_files = []
-
+
             for ext in extensions:
                 pattern = f"**/*{ext}"
                 files = list(self.source_path.rglob(pattern))
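
The result skeleton gains four keys that the downstream heuristics rely on. For reference, the shape _analyze_source_files() now returns after a successful scan (values are illustrative):

# Shape of the dict returned by _analyze_source_files() in 0.5.24
source_info = {
    "number_of_files": 12,
    "file_types": [".wiff"],
    "detector_type": "tof",        # new: "orbitrap" or "tof", inferred from baseline intensity
    "polarity": "positive",        # new: read from the first Sample, no longer assumed
    "baseline": 230.0,             # new: 0.1% intensity quantile of the MS1 data
    "length_minutes": 14.8,
    "ms1_scans_per_second": 2.4,   # new
    "first_file": "raw_data/a.wiff",
}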
@@ -500,246 +432,284 @@ class Wizard:
                     raw_files.extend(files)
                     if ext not in result["file_types"]:
                         result["file_types"].append(ext)
-
+
             result["number_of_files"] = len(raw_files)
-
+
             if raw_files:
                 result["first_file"] = str(raw_files[0])
-                # Simple heuristic: assume 30 minutes per file if we can't determine
-                result["length_minutes"] = 30.0
-
+                # load first file to infer polarity and length
+                print(f"Analyzing first file: {raw_files[0]}")
+                from masster import Sample
+                sample = Sample(filename=result["first_file"], logging_level='WARNING')
+                result['polarity'] = sample.polarity
+                # take max from polars ms1_df['rt']
+                if sample.ms1_df is not None:
+                    if not sample.ms1_df.is_empty() and 'rt' in sample.ms1_df.columns:
+                        max_rt = sample.ms1_df['rt'].max()
+                        if max_rt is not None and isinstance(max_rt, (int, float)) and max_rt > 0:
+                            result["length_minutes"] = float(max_rt) / 60.0
+                            result["ms1_scans_per_second"] = len(sample.ms1_df) / float(max_rt) / 60.0
+
+                    baseline = sample.ms1_df['inty'].quantile(0.001)
+                    if baseline is not None and isinstance(baseline, (int, float)):
+                        result["baseline"] = float(baseline)
+                        if baseline > 5e3:
+                            result["detector_type"] = "orbitrap"
+                        else:
+                            result["detector_type"] = "tof"
+
         except Exception as e:
             print(f"Warning: Could not analyze source files: {e}")
-
+
         return result
 
     def _generate_workflow_script_content(self, source_info: Dict[str, Any]) -> str:
         """Generate the content for 1_masster_workflow.py script."""
-
+
+        # Logic
+        noise = self.params.noise
+        if noise is None:
+            if source_info.get("detector_type") == "orbitrap":
+                noise = max(self.params.noise or 50.0, 5e4)
+            elif source_info.get("detector_type") == "tof":
+                default_noise = self.params.noise or 50.0
+                baseline = source_info.get("baseline", default_noise / 2.0)
+                noise = baseline * 2
+
+        chrom_fwhm = self.params.chrom_fwhm
+        if chrom_fwhm is None:
+            if source_info.get("length_minutes", 0) > 0:
+                if source_info["length_minutes"] < 10:
+                    chrom_fwhm = 0.5
+                else:
+                    chrom_fwhm = 2.0
+
+
+        # Generate script content
         script_lines = [
-            '#!/usr/bin/env python3',
-            '"""',
-            'Automated Mass Spectrometry Data Analysis Pipeline',
-            'Generated by masster wizard',
+            "#!/usr/bin/env python3",
+            '"""',
+            "Automated Mass Spectrometry Data Analysis Pipeline",
+            "Generated by masster wizard",
             '"""',
-            '',
-            'import os',
-            'import sys',
-            'import time',
-            'from pathlib import Path',
-            '',
-            '# Import masster modules',
-            'from masster.study import Study',
-            'from masster import __version__',
-            '',
-            '# Test mode configuration',
-            'TEST_MODE = os.environ.get("MASSTER_TEST_MODE", "0") == "1"',
-            'TEST_ONLY = os.environ.get("MASSTER_TEST_ONLY", "0") == "1"  # Only run test, don\'t continue to full batch',
-            '',
-            '# Analysis parameters',
-            'PARAMS = {',
-            '    # === Core Configuration ===',
+            "",
+            "import os",
+            "import sys",
+            "import time",
+            "from pathlib import Path",
+            "",
+            "# Import masster modules",
+            "from masster.study import Study",
+            "from masster import __version__",
+            "",
+            "# Test mode configuration",
+            'TEST = os.environ.get("MASSTER_TEST", "0") == "1"',
+            'STOP_AFTER_TEST = os.environ.get("MASSTER_STOP_AFTER_TEST", "0") == "1"  # Only run test, don\'t continue to full batch',
+            "",
+            "# Analysis parameters",
+            "PARAMS = {",
+            "    # === Core Configuration ===",
             f'    "source": {str(self.source_path)!r},  # Directory containing raw data files',
             f'    "folder": {str(self.folder_path)!r},  # Output directory for processed study',
             f'    "polarity": {self.params.polarity!r},  # Ion polarity mode ("positive" or "negative")',
             f'    "num_cores": {self.params.num_cores},  # Number of CPU cores for parallel processing',
-            '',
-            '    # === Test Mode ===',
-            '    "test_mode": TEST_MODE,  # Process only first file for testing',
-            '    "test_only": TEST_ONLY,  # Stop after test, don\'t run full batch',
-            '',
-            '    # === File Discovery ===',
+            "",
+            "    # === Test Mode ===",
+            '    "test": TEST,  # Process only first file for testing',
+            '    "stop_after_test": STOP_AFTER_TEST,  # Stop after test, don\'t run full batch',
+            "",
+            "    # === File Discovery ===",
             f'    "file_extensions": {self.params.file_extensions!r},  # File extensions to search for',
             f'    "search_subfolders": {self.params.search_subfolders},  # Whether to search subdirectories recursively',
             f'    "skip_patterns": {self.params.skip_patterns!r},  # Filename patterns to skip',
-            '',
-            '    # === Processing Parameters ===',
+            "",
+            "    # === Processing Parameters ===",
             f'    "adducts": {self.params.adducts!r},  # Adduct specifications for feature detection and annotation',
-            f'    "noise": {self.params.noise},  # Noise threshold for feature detection',
-            f'    "chrom_fwhm": {self.params.chrom_fwhm},  # Chromatographic peak full width at half maximum (seconds)',
+            f'    "noise": {noise},  # Noise threshold for feature detection',
+            f'    "chrom_fwhm": {chrom_fwhm},  # Chromatographic peak full width at half maximum (seconds)',
             f'    "chrom_peak_snr": {self.params.chrom_peak_snr},  # Minimum signal-to-noise ratio for chromatographic peaks',
-            '',
-            '    # === Alignment & Merging ===',
+            "",
+            "    # === Alignment & Merging ===",
             f'    "rt_tol": {self.params.rt_tolerance},  # Retention time tolerance for alignment (seconds)',
             f'    "mz_tol": {self.params.mz_max_diff},  # Mass-to-charge ratio tolerance for alignment (Da)',
             f'    "alignment_method": {self.params.alignment_algorithm!r},  # Algorithm for sample alignment',
             f'    "min_samples_per_feature": {self.params.min_samples_for_merge},  # Minimum samples required per consensus feature',
             f'    "merge_method": {self.params.merge_method!r},  # Method for merging consensus features',
-            '',
-            '    # === Sample Processing (used in add_samples_from_folder) ===',
+            "",
+            "    # === Sample Processing (used in add_samples_from_folder) ===",
             f'    "batch_size": {self.params.batch_size},  # Number of files to process per batch',
             f'    "memory_limit_gb": {self.params.memory_limit_gb},  # Memory limit for processing (GB)',
-            '',
-            '    # === Script Options ===',
+            "",
+            "    # === Script Options ===",
             f'    "resume_enabled": {self.params.resume_enabled},  # Enable automatic resume capability',
             f'    "force_reprocess": {self.params.force_reprocess},  # Force reprocessing of existing files',
             f'    "cleanup_temp_files": {self.params.cleanup_temp_files},  # Clean up temporary files after processing',
-            '}',
-            '',
-            '',
-            'def discover_raw_files(source_folder, file_extensions, search_subfolders=True):',
+            "}",
+            "",
+            "",
+            "def discover_raw_files(source_folder, file_extensions, search_subfolders=True):",
             '    """Discover raw data files in the source folder."""',
-            '    source_path = Path(source_folder)',
-            '    raw_files = []',
-            '    ',
-            '    for ext in file_extensions:',
-            '        if search_subfolders:',
+            "    source_path = Path(source_folder)",
+            "    raw_files = []",
+            "    ",
+            "    for ext in file_extensions:",
+            "        if search_subfolders:",
             '            pattern = f"**/*{ext}"',
-            '            files = list(source_path.rglob(pattern))',
-            '        else:',
+            "            files = list(source_path.rglob(pattern))",
+            "        else:",
             '            pattern = f"*{ext}"',
-            '            files = list(source_path.glob(pattern))',
-            '        raw_files.extend(files)',
-            '    ',
-            '    return raw_files',
-            '',
-            '',
-            'def process_single_file(args):',
+            "            files = list(source_path.glob(pattern))",
+            "        raw_files.extend(files)",
+            "    ",
+            "    return raw_files",
+            "",
+            "",
+            "def process_single_file(args):",
             '    """Process a single raw file to sample5 format - module level for multiprocessing."""',
-            '    raw_file, output_folder = args',
-            '    from masster.sample import Sample',
-            '    ',
-            '    try:',
-            '        # Create sample5 filename',
-            '        sample_name = raw_file.stem',
+            "    raw_file, output_folder = args",
+            "    from masster.sample import Sample",
+            "    ",
+            "    try:",
+            "        # Create sample5 filename",
+            "        sample_name = raw_file.stem",
             '        sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
-            '        ',
-            '        # Skip if sample5 already exists',
+            "        ",
+            "        # Skip if sample5 already exists",
             '        if sample5_path.exists() and not PARAMS["force_reprocess"]:',
             '            print(f"  Skipping {raw_file.name} (sample5 already exists)")',
-            '            return str(sample5_path)',
-            '        ',
+            "            return str(sample5_path)",
+            "        ",
             '        print(f"  Converting {raw_file.name}...")',
-            '        ',
-            '        # Load and process raw file with full pipeline',
-            '        sample = Sample(log_label=sample_name)',
-            '        sample.load(filename=str(raw_file))',
-            '        sample.find_features(',
+            "        ",
+            "        # Load and process raw file with full pipeline",
+            "        sample = Sample(log_label=sample_name)",
+            "        sample.load(filename=str(raw_file))",
+            "        sample.find_features(",
             '            noise=PARAMS["noise"],',
             '            chrom_fwhm=PARAMS["chrom_fwhm"],',
             '            chrom_peak_snr=PARAMS["chrom_peak_snr"]',
-            '        )',
-            '        sample.find_ms2()',
-            '        sample.find_iso()',
-            '        # sample.export_mgf()',
+            "        )",
+            "        sample.find_ms2()",
+            "        sample.find_iso()",
+            "        # sample.export_mgf()",
             '        # sample.plot_2d(filename=f"{sample5_path.replace(".sample5", ".html")}")',
-            '        sample.save(str(sample5_path))',
-            '        ',
+            "        sample.save(str(sample5_path))",
+            "        ",
             '        # print(f"  Completed {raw_file.name} -> {sample5_path.name}")',
-            '        return str(sample5_path)',
-            '        ',
-            '    except Exception as e:',
+            "        return str(sample5_path)",
+            "        ",
+            "    except Exception as e:",
             '        print(f"  ERROR processing {raw_file.name}: {e}")',
-            '        return None',
-            '',
-            '',
-            'def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):',
+            "        return None",
+            "",
+            "",
+            "def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):",
             '    """Convert raw data files to sample5 format."""',
-            '    import concurrent.futures',
-            '    import os',
-            '    ',
-            '    # Create output directory',
-            '    os.makedirs(output_folder, exist_ok=True)',
-            '    ',
-            '    # Prepare arguments for multiprocessing',
-            '    file_args = [(raw_file, output_folder) for raw_file in raw_files]',
-            '    ',
-            '    # Process files in parallel',
-            '    sample5_files = []',
-            '    with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:',
-            '        futures = [executor.submit(process_single_file, args) for args in file_args]',
-            '        ',
-            '        for future in concurrent.futures.as_completed(futures):',
-            '            result = future.result()',
-            '            if result:',
-            '                sample5_files.append(result)',
-            '    ',
-            '    return sample5_files',
-            '',
-            '',
-            'def main():',
+            "    import concurrent.futures",
+            "    import os",
+            "    ",
+            "    # Create output directory",
+            "    os.makedirs(output_folder, exist_ok=True)",
+            "    ",
+            "    # Prepare arguments for multiprocessing",
+            "    file_args = [(raw_file, output_folder) for raw_file in raw_files]",
+            "    ",
+            "    # Process files in parallel",
+            "    sample5_files = []",
+            "    with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:",
+            "        futures = [executor.submit(process_single_file, args) for args in file_args]",
+            "        ",
+            "        for future in concurrent.futures.as_completed(futures):",
+            "            result = future.result()",
+            "            if result:",
+            "                sample5_files.append(result)",
+            "    ",
+            "    return sample5_files",
+            "",
+            "",
+            "def main():",
             '    """Main analysis pipeline."""',
-            '    try:',
+            "    try:",
             '        print("=" * 70)',
             f'        print("masster {version} - Automated MS Data Analysis")',
             '        print("=" * 70)',
-            '        print(f"Source: {PARAMS[\'source\']}")',
-            '        print(f"Output: {PARAMS[\'folder\']}")',
-            '        print(f"Polarity: {PARAMS[\'polarity\']}")',
-            '        print(f"CPU Cores: {PARAMS[\'num_cores\']}")',
+            "        print(f\"Source: {PARAMS['source']}\")",
+            "        print(f\"Output: {PARAMS['folder']}\")",
+            "        print(f\"Polarity: {PARAMS['polarity']}\")",
+            "        print(f\"CPU Cores: {PARAMS['num_cores']}\")",
             '        print("=" * 70)',
-            '        ',
-            '        start_time = time.time()',
-            '        ',
-            '        # Step 1: Discover raw data files',
+            "        ",
+            "        start_time = time.time()",
+            "        ",
+            "        # Step 1: Discover raw data files",
             '        print("\\nStep 1/7: Discovering raw data files...")',
-            '        raw_files = discover_raw_files(',
-            '            PARAMS[\'source\'],',
-            '            PARAMS[\'file_extensions\'],',
-            '            PARAMS[\'search_subfolders\']',
-            '        )',
-            '        ',
-            '        if not raw_files:',
+            "        raw_files = discover_raw_files(",
+            "            PARAMS['source'],",
+            "            PARAMS['file_extensions'],",
+            "            PARAMS['search_subfolders']",
+            "        )",
+            "        ",
+            "        if not raw_files:",
             '            print("No raw data files found!")',
-            '            return False',
-            '        ',
+            "            return False",
+            "        ",
             '        print(f"Found {len(raw_files)} raw data files")',
-            '        for f in raw_files[:5]:  # Show first 5 files',
+            "        for f in raw_files[:5]:  # Show first 5 files",
             '            print(f"  {f.name}")',
-            '        if len(raw_files) > 5:',
+            "        if len(raw_files) > 5:",
             '            print(f"  ... and {len(raw_files) - 5} more")',
-            '        ',
-            '        # Step 2: Process raw files',
+            "        ",
+            "        # Step 2: Process raw files",
             '        print("\\nStep 2/7: Processing raw files...")',
-            '        sample5_files = convert_raw_to_sample5(',
-            '            raw_files,',
-            '            PARAMS[\'folder\'],',
-            '            PARAMS[\'polarity\'],',
-            '            PARAMS[\'num_cores\']',
-            '        )',
-            '        ',
-            '        if not sample5_files:',
+            "        sample5_files = convert_raw_to_sample5(",
+            "            raw_files,",
+            "            PARAMS['folder'],",
+            "            PARAMS['polarity'],",
+            "            PARAMS['num_cores']",
+            "        )",
+            "        ",
+            "        if not sample5_files:",
             '            print("No sample5 files were created!")',
-            '            return False',
-            '        ',
+            "            return False",
+            "        ",
             '        print(f"Successfully processed {len(sample5_files)} files to sample5")',
-            '        ',
-            '        # Step 3: Create and configure study',
+            "        ",
+            "        # Step 3: Create and configure study",
             '        print("\\nStep 3/7: Initializing study...")',
-            '        study = Study(folder=PARAMS[\'folder\'])',
-            '        study.polarity = PARAMS[\'polarity\']',
-            '        study.adducts = PARAMS[\'adducts\']',
-            '        ',
-            '        # Step 4: Add sample5 files to study',
+            "        study = Study(folder=PARAMS['folder'])",
+            "        study.polarity = PARAMS['polarity']",
+            "        study.adducts = PARAMS['adducts']",
+            "        ",
+            "        # Step 4: Add sample5 files to study",
             '        print("\\nStep 4/7: Adding samples to study...")',
-            '        study.add(str(Path(PARAMS[\'folder\']) / "*.sample5"))',
-            '        study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))',
-            '        ',
-            '        # Step 5: Core processing',
+            "        study.add(str(Path(PARAMS['folder']) / \"*.sample5\"))",
+            "        study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))",
+            "        ",
+            "        # Step 5: Core processing",
             '        print("\\nStep 5/7: Processing...")',
-            '        study.align(',
-            '            algorithm=PARAMS[\'alignment_method\'],',
-            '            rt_tol=PARAMS[\'rt_tol\']',
-            '        )',
-            '        ',
-            '        study.merge(',
+            "        study.align(",
+            "            algorithm=PARAMS['alignment_method'],",
+            "            rt_tol=PARAMS['rt_tol']",
+            "        )",
+            "        ",
+            "        study.merge(",
             '            method="qt",',
-            '            min_samples=PARAMS[\'min_samples_per_feature\'],',
-            '            threads=PARAMS[\'num_cores\'],',
-            '            rt_tol=PARAMS[\'rt_tol\']',
-            '        )',
-            '        study.find_iso()',
-            '        study.fill()',
-            '        study.integrate()',
-            '        ',
-            '        # Step 6/7: Saving results',
+            "            min_samples=PARAMS['min_samples_per_feature'],",
+            "            threads=PARAMS['num_cores'],",
+            "            rt_tol=PARAMS['rt_tol']",
+            "        )",
+            "        study.find_iso()",
+            "        study.fill()",
+            "        study.integrate()",
+            "        ",
+            "        # Step 6/7: Saving results",
             '        print("\\nStep 6/7: Saving results...")',
-            '        study.save()',
-            '        study.export_xlsx()',
-            '        study.export_mgf()',
-            '        study.export_mztab()',
-            '        ',
-            '        # Step 7: Plots',
+            "        study.save()",
+            "        study.export_xlsx()",
+            "        study.export_mgf()",
+            "        study.export_mztab()",
+            "        ",
+            "        # Step 7: Plots",
             '        print("\\nStep 7/7: Exporting plots...")',
             '        study.plot_consensus_2d(filename="consensus.html")',
             '        study.plot_consensus_2d(filename="consensus.png")',
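
The added block before script_lines is where the new None defaults from wizard_def get resolved. A simplified, runnable mirror of the heuristic (it sidesteps the dict.get edge case in the original where "baseline" is present but None):

def pick_defaults(detector_type: str, baseline: float | None, length_minutes: float):
    noise = None
    if detector_type == "orbitrap":
        noise = max(50.0, 5e4)           # orbitraps: at least 50,000 counts
    elif detector_type == "tof":
        noise = (baseline or 25.0) * 2   # TOFs: twice the estimated baseline
    chrom_fwhm = None
    if length_minutes > 0:
        chrom_fwhm = 0.5 if length_minutes < 10 else 2.0  # short vs. long gradients
    return noise, chrom_fwhm


print(pick_defaults("tof", 230.0, 14.8))  # (460.0, 2.0)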
@@ -751,10 +721,10 @@ class Wizard:
             '        study.plot_bpc(filename="bpc.png")',
             '        study.plot_rt_correction(filename="rt_correction.html")',
             '        study.plot_rt_correction(filename="rt_correction.png")',
-            '        ',
-            '        # Print summary',
-            '        study.info()',
-            '        total_time = time.time() - start_time',
+            "        ",
+            "        # Print summary",
+            "        study.info()",
+            "        total_time = time.time() - start_time",
             '        print("\\n" + "=" * 70)',
             '        print("ANALYSIS COMPLETE")',
             '        print("=" * 70)',
@@ -764,50 +734,50 @@ class Wizard:
             '        if hasattr(study, "consensus_df"):',
             '            print(f"Consensus features generated: {len(study.consensus_df)}")',
             '        print("=" * 70)',
-            '        ',
-            '        return True',
-            '        ',
-            '    except KeyboardInterrupt:',
+            "        ",
+            "        return True",
+            "        ",
+            "    except KeyboardInterrupt:",
             '        print("\\nAnalysis interrupted by user")',
-            '        return False',
-            '    except Exception as e:',
+            "        return False",
+            "    except Exception as e:",
             '        print(f"Analysis failed with error: {e}")',
-            '        import traceback',
-            '        traceback.print_exc()',
-            '        return False',
-            '',
-            '',
+            "        import traceback",
+            "        traceback.print_exc()",
+            "        return False",
+            "",
+            "",
             'if __name__ == "__main__":',
-            '    success = main()',
-            '    sys.exit(0 if success else 1)',
+            "    success = main()",
+            "    sys.exit(0 if success else 1)",
         ]
-
-        return '\n'.join(script_lines)
+
+        return "\n".join(script_lines)
 
     def _generate_interactive_notebook_content(self, source_info: Dict[str, Any]) -> str:
         """Generate the content for 2_interactive_analysis.py marimo notebook."""
-
+
         notebook_lines = [
-            'import marimo',
-            '',
+            "import marimo",
+            "",
             '__generated_with = "0.9.14"',
             'app = marimo.App(width="medium")',
-            '',
-            '@app.cell',
-            'def __():',
-            '    import marimo as mo',
-            '    return (mo,)',
-            '',
-            '@app.cell',
-            'def __(mo):',
+            "",
+            "@app.cell",
+            "def __():",
+            "    import marimo as mo",
+            "    return (mo,)",
+            "",
+            "@app.cell",
+            "def __(mo):",
             '    mo.md(r"""',
-            '    # MASSter Interactive Analysis',
-            '    ',
-            f'    **Source:** {source_info.get("number_of_files", 0)} files detected',
-            f'    **Polarity:** {source_info.get("polarity", "unknown")}',
-            '    ',
-            '    This notebook provides interactive exploration of your processed study.',
-            '    Make sure you have run `python 1_masster_workflow.py` first.',
+            "    # MASSter Interactive Analysis",
+            "    ",
+            f"    **Source:** {source_info.get('number_of_files', 0)} files detected",
+            f"    **Polarity:** {source_info.get('polarity', 'unknown')}",
+            "    ",
+            "    This notebook provides interactive exploration of your processed study.",
+            "    Make sure you have run `python 1_masster_workflow.py` first.",
             '    """)',
             '    return ()',
             '',
@@ -828,18 +798,20 @@ class Wizard:
             '    return ()',
             '',
             'if __name__ == "__main__":',
-            '    app.run()',
+            "    app.run()",
         ]
-
-        return '\n'.join(notebook_lines)
+
+        return "\n".join(notebook_lines)
 
     def _generate_instructions(self, source_info: Dict[str, Any], files_created: List[str]) -> List[str]:
         """Generate usage instructions for the created scripts."""
-        instructions = [f"Source analysis: {source_info.get('number_of_files', 0)} files found",
+        instructions = [
+            f"Source analysis: {source_info.get('number_of_files', 0)} files found",
             f"Polarity detected: {source_info.get('polarity', 'unknown')}",
-            "Files created:"]
+            "Files created:",
+        ]
         for file_path in files_created:
-            instructions.append(f"  [OK] {str(Path(file_path).resolve())}")
+            instructions.append(f"  {str(Path(file_path).resolve())}")
 
         # Find the workflow script name from created files
         workflow_script_name = "1_masster_workflow.py"
@@ -847,7 +819,7 @@ class Wizard:
             if Path(file_path).name == "1_masster_workflow.py":
                 workflow_script_name = Path(file_path).name
                 break
-
+
         instructions.extend([
             "",
             "Next steps:",
@@ -864,87 +836,127 @@ class Wizard:
             "",
             "4. INTERACTIVE ANALYSIS:",
             f"   uv run marimo edit {Path('2_interactive_analysis.py').name}",
-            ""]
-        )
-
+            "",
+        ])
+
         return instructions
 
     def _add_test_mode_support(self, workflow_content: str) -> str:
         """Add test mode functionality to the generated workflow script."""
-        lines = workflow_content.split('\n')
-
+        lines = workflow_content.split("\n")
+
         # Insert test mode code after print statements in main function
         for i, line in enumerate(lines):
             # Add test mode print after the masster version line
             if 'print("masster' in line and 'Automated MS Data Analysis")' in line:
-                lines.insert(i + 1, '        if TEST_MODE:')
-                lines.insert(i + 2, '            print("[TEST] TEST MODE: Processing single file only")')
+                lines.insert(i + 1, '        if TEST:')
+                lines.insert(i + 2, '            print("TEST MODE: Processing single file only")')
                 break
-
+
         # Add mode info after num_cores print
         for i, line in enumerate(lines):
-            if 'print(f"CPU Cores: {PARAMS[\'num_cores\']}")' in line:
-                lines.insert(i + 1, '        if TEST_MODE:')
-                lines.insert(i + 2, '            print(f"Mode: {\'Test Only\' if TEST_ONLY else \'Test + Full Batch\'}")')
+            if "print(f\"CPU Cores: {PARAMS['num_cores']}\")" in line:
+                lines.insert(i + 1, "        if TEST:")
+                lines.insert(i + 2, "            print(f\"Mode: {'Test Only' if STOP_AFTER_TEST else 'Test + Full Batch'}\")")
                 break
-
+
         # Add file limitation logic after file listing
         for i, line in enumerate(lines):
             if 'print(f"  ... and {len(raw_files) - 5} more")' in line:
                 lines.insert(i + 1, '        ')
                 lines.insert(i + 2, '        # Limit to first file in test mode')
-                lines.insert(i + 3, '        if TEST_MODE:')
+                lines.insert(i + 3, '        if TEST:')
                 lines.insert(i + 4, '            raw_files = raw_files[:1]')
-                lines.insert(i + 5, '            print(f"\\n[TEST] TEST MODE: Processing only first file: {raw_files[0].name}")')
+                lines.insert(i + 5, '            print(f"\\nTEST MODE: Processing only first file: {raw_files[0].name}")')
                 break
-
+
         # Modify num_cores for test mode
         for i, line in enumerate(lines):
-            if 'PARAMS[\'num_cores\']' in line and 'convert_raw_to_sample5(' in lines[i-2:i+3]:
-                lines[i] = line.replace('PARAMS[\'num_cores\']', 'PARAMS[\'num_cores\'] if not TEST_MODE else 1  # Use single core for test')
+            if "PARAMS['num_cores']" in line and "convert_raw_to_sample5(" in lines[i - 2 : i + 3]:
+                lines[i] = line.replace(
+                    "PARAMS['num_cores']", "PARAMS['num_cores'] if not TEST else 1  # Use single core for test"
+                )
                 break
-
+
         # Add test-only exit logic after successful processing
         for i, line in enumerate(lines):
             if 'print(f"Successfully processed {len(sample5_files)} files to sample5")' in line:
                 lines.insert(i + 1, '        ')
-                lines.insert(i + 2, '        # Stop here if test-only mode')
-                lines.insert(i + 3, '        if TEST_ONLY:')
-                lines.insert(i + 4, '            print("\\n[TEST] TEST ONLY mode: Stopping after successful single file processing")')
+                lines.insert(i + 2, '        # Stop here if stop-after-test mode')
+                lines.insert(i + 3, '        if STOP_AFTER_TEST:')
+                lines.insert(i + 4, '            print("\\nSTOP AFTER TEST mode: Stopping after successful single file processing")')
                 lines.insert(i + 5, '            print(f"Test file created: {sample5_files[0]}")')
                 lines.insert(i + 6, '            print("\\nTo run full batch, use: wizard.run()")')
-                lines.insert(i + 7, '            total_time = time.time() - start_time')
+                lines.insert(i + 7, "            total_time = time.time() - start_time")
                 lines.insert(i + 8, '            print(f"\\nTest processing time: {total_time:.1f} seconds")')
-                lines.insert(i + 9, '            return True')
+                lines.insert(i + 9, "            return True")
                 break
-
-        return '\n'.join(lines)
+
+        return "\n".join(lines)
 
     def test_and_run(self) -> Dict[str, Any]:
         """
         Test the sample processing workflow with a single file, then run full batch.
-
-        This method runs the 1_masster_workflow.py script in test mode to process
-        the first raw file for validation, then automatically continues with the
-        full batch if the test succeeds. The script must already exist - call
+
+        This method first runs the 1_masster_workflow.py script in test-only mode to process
+        the first raw file for validation, then automatically continues with the
+        full batch if the test succeeds. The script must already exist - call
         create_scripts() first if needed.
-
+
         Returns:
             Dictionary containing:
             - status: "success" or "error"
             - message: Status message
             - instructions: List of next steps
         """
-        return self._execute_workflow(test_mode=True)
+        # Step 1: Run test-only mode first
+        print("Step 1/2: Testing with single file...")
+        test_result = self._execute_workflow(test=True, run=False)
+
+        if test_result["status"] != "success":
+            return {
+                "status": "error",
+                "message": f"Test failed: {test_result['message']}",
+                "instructions": [
+                    "Single file test failed",
+                    "Review parameters in 1_masster_workflow.py",
+                    "Fix issues and try again",
+                ],
+            }
+
+        print("Test successful! Proceeding with full batch...")
+        print("Step 2/2: Processing all files...")
+
+        # Step 2: Run full batch mode
+        full_result = self._execute_workflow(test=False, run=True)
+
+        return full_result
 
     def test_only(self) -> Dict[str, Any]:
         """
         Test the sample processing workflow with a single file only.
-
+
         This method runs the 1_masster_workflow.py script in test-only mode to process
         only the first raw file and then stops (does not continue to full study processing).
         The script must already exist - call create_scripts() first if needed.
-
+
+        Returns:
+            Dictionary containing:
+            - status: "success" or "error"
+            - message: Status message
+            - instructions: List of next steps
+            - test_file: Path to the processed test file (if successful)
+        """
+        return self._execute_workflow(test=True, run=False)
+
+    def test(self) -> Dict[str, Any]:
+        """
+        Test the sample processing workflow with a single file only.
+
+        This method runs the 1_masster_workflow.py script in test-only mode to process
+        only the first raw file and then stops (does not continue to full study processing).
+        The script must already exist - call create_scripts() first if needed.
+
         Returns:
             Dictionary containing:
             - status: "success" or "error"
@@ -952,98 +964,107 @@ class Wizard:
             - instructions: List of next steps
             - test_file: Path to the processed test file (if successful)
         """
-        return self._execute_workflow(test_mode=True, test_only=True)
+        return self._execute_workflow(test=True, run=False)
+
 
     def run(self) -> Dict[str, Any]:
         """
         Run the sample processing workflow.
-
+
         This method runs the 1_masster_workflow.py script to process raw files.
         The script must already exist - call create_scripts() first if needed.
-
+
         Returns:
             Dictionary containing:
             - status: "success" or "error"
             - message: Status message
             - instructions: List of next steps
         """
-        return self._execute_workflow(test_mode=False)
+        return self._execute_workflow(test=False, run=True)
 
-    def _execute_workflow(self, test_mode: bool = False, test_only: bool = False) -> Dict[str, Any]:
+    def _execute_workflow(self, test: bool = False, run: bool = True) -> Dict[str, Any]:
         """
         Execute the workflow script in either test or full mode.
-
+
         Args:
-            test_mode: If True, run in test mode (single file), otherwise full batch
-            test_only: If True, stop after single file test (only used with test_mode=True)
+            test: If True, run in test mode (single file), otherwise full batch
+            run: If False, stop after test (only used with test=True), if True continue with full processing
         """
         try:
             workflow_script_path = self.folder_path / "1_masster_workflow.py"
-
+
             # Check if workflow script exists
             if not workflow_script_path.exists():
-                return {
-                    "status": "error",
-                    "message": "Workflow script not found. Please run create_scripts() first.",
-                    "instructions": [
-                        "[ERROR] Missing 1_masster_workflow.py",
-                        "Run: wizard.create_scripts()",
-                        "Then: wizard.run()"
-                    ]
-                }
-
+                print("Workflow script not found. Creating scripts automatically...")
+                create_result = self.create_scripts()
+
+                if create_result["status"] == "error":
+                    return {
+                        "status": "error",
+                        "message": f"Failed to create workflow script: {create_result['message']}",
+                        "instructions": [
+                            "Could not create 1_masster_workflow.py",
+                            "Please check source path and permissions",
+                        ],
+                    }
+
+                print(f"Created {len(create_result.get('files_created', []))} script files")
+                for file_path in create_result.get('files_created', []):
+                    print(f"  - {file_path}")
+                print("Proceeding with workflow execution...")
+
             # Setup execution mode
-            if test_only:
+            if test and not run:
                 mode_label = "test-only"
-            elif test_mode:
+            elif test:
                 mode_label = "test"
             else:
                 mode_label = "full batch"
-
+
             env = None
-            if test_mode:
+            if test:
                 import os
+
                 env = os.environ.copy()
-                env['MASSTER_TEST_MODE'] = '1'
-                if test_only:
-                    env['MASSTER_TEST_ONLY'] = '1'
-
+                env["MASSTER_TEST"] = "1"
+                if not run:
+                    env["MASSTER_STOP_AFTER_TEST"] = "1"
+
             # Execute the workflow script
             print(f">> Executing {mode_label} processing workflow...")
-            print(f"[SCRIPT] Running: {workflow_script_path.name}")
+            print(f"Running: {workflow_script_path.name}")
             print("=" * 60)
-
+
             import subprocess
-            result = subprocess.run([
-                sys.executable, str(workflow_script_path)
-            ], cwd=str(self.folder_path), env=env)
-
+
+            result = subprocess.run([sys.executable, str(workflow_script_path)], cwd=str(self.folder_path), env=env)
+
             success = result.returncode == 0
-
+
             if success:
                 print("=" * 60)
-                if test_only:
-                    print("[OK] Test-only processing completed successfully!")
-                    print("[INFO] Single file validated - ready for full batch")
+                if test and not run:
+                    print("Test-only processing completed successfully!")
+                    print("Single file validated - ready for full batch")
                     print("   wizard.run()")
-                elif test_mode:
-                    print("[OK] Test processing completed successfully!")
-                    print("[INFO] Next step: Run full batch")
+                elif test:
+                    print("Test processing completed successfully!")
+                    print("Next step: Run full batch")
                     print("   wizard.run()")
                 else:
-                    print("[OK] Sample processing completed successfully!")
-                    print("[INFO] Next step: Run interactive analysis")
-                    print("   uv run marimo edit 2_interactive_analysis.py")
+                    notebook_path = self.folder_path / "2_interactive_analysis.py"
+                    print("Sample processing completed successfully!")
+                    print("Next step: Run interactive analysis")
+                    print(f"   uv run marimo edit {notebook_path}")
                 print("=" * 60)
-
-                next_step = ("Next: wizard.run()" if test_mode else
-                             "Next: uv run marimo edit 2_interactive_analysis.py")
-
+
+                next_step = "Next: wizard.run()" if test else f"Next: uv run marimo edit {self.folder_path / '2_interactive_analysis.py'}"
+
                 return {
                     "status": "success",
                     "message": f"{mode_label.capitalize()} processing completed successfully",
                     "instructions": [
-                        f"[OK] {mode_label.capitalize()} processing completed",
+                        f"{mode_label.capitalize()} processing completed",
                         next_step
                     ]
                 }
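
Because _execute_workflow now falls back to create_scripts() when 1_masster_workflow.py is absent, the shortest end-to-end sequence shrinks accordingly (a sketch; paths are illustrative):

from masster.wizard.wizard import Wizard

wizard = Wizard(source="raw_data", folder="output", polarity="positive")
wizard.run()  # creates the scripts on demand if missing, then executes the full batch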
@@ -1052,30 +1073,30 @@ class Wizard:
                     "status": "error",
                     "message": f"Workflow execution failed with return code {result.returncode}",
                     "instructions": [
-                        "[ERROR] Check the error messages above",
+                        "Check the error messages above",
                         "Review parameters in 1_masster_workflow.py",
-                        f"Try running manually: python {workflow_script_path.name}"
-                    ]
+                        f"Try running manually: python {workflow_script_path.name}",
+                    ],
                 }
-
+
         except Exception as e:
             return {
                 "status": "error",
                 "message": f"Failed to execute workflow: {e}",
                 "instructions": [
-                    "[ERROR] Execution failed",
+                    "Execution failed",
                     "Check that source files exist and are accessible",
-                    "Verify folder permissions"
-                ]
+                    "Verify folder permissions",
+                ],
             }
 
     def _generate_script_content(self) -> str:
         """Generate the complete analysis script content."""
-
+
         # Convert Path objects to strings for JSON serialization
         params_dict = {}
         for key, value in self.params.__dict__.items():
-            if key == '_param_metadata':  # Skip metadata in generated script
+            if key == "_param_metadata":  # Skip metadata in generated script
                 continue
             if isinstance(value, Path):
                 params_dict[key] = str(value)
@@ -1084,251 +1105,288 @@ class Wizard:
 
         # Obtain list of files in source with extension wiff, .raw, .mzML
         raw_files = []
-        for ext in params_dict.get('file_extensions', []):
+        for ext in params_dict.get("file_extensions", []):
             raw_files.extend(glob.glob(f"{params_dict.get('source', '')}/**/*{ext}", recursive=True))
 
         # Create readable PARAMS dict with comments
         params_lines = []
-        params_lines.append('# Analysis parameters')
-        params_lines.append('PARAMS = {')
-
+        params_lines.append("# Analysis parameters")
+        params_lines.append("PARAMS = {")
+
         # Core Configuration
-        params_lines.append('    # === Core Configuration ===')
+        params_lines.append("    # === Core Configuration ===")
         params_lines.append(f'    "source": {params_dict.get("source", "")!r},  # Directory containing raw data files')
         params_lines.append(f'    "folder": {params_dict.get("folder", "")!r},  # Output directory for processed study')
-        params_lines.append(f'    "polarity": {params_dict.get("polarity", "positive")!r},  # Ion polarity mode ("positive" or "negative")')
-        params_lines.append(f'    "num_cores": {params_dict.get("num_cores", 4)},  # Number of CPU cores for parallel processing')
-        params_lines.append('')
-
+        params_lines.append(
+            f'    "polarity": {params_dict.get("polarity", "positive")!r},  # Ion polarity mode ("positive" or "negative")'
+        )
+        params_lines.append(
+            f'    "num_cores": {params_dict.get("num_cores", 4)},  # Number of CPU cores for parallel processing'
+        )
+        params_lines.append("")
+
         # File Discovery
-        params_lines.append('    # === File Discovery ===')
-        params_lines.append(f'    "file_extensions": {params_dict.get("file_extensions", [".wiff", ".raw", ".mzML"])!r},  # File extensions to search for')
-        params_lines.append(f'    "search_subfolders": {params_dict.get("search_subfolders", True)},  # Whether to search subdirectories recursively')
-        params_lines.append(f'    "skip_patterns": {params_dict.get("skip_patterns", ["blank", "condition"])!r},  # Filename patterns to skip')
-        params_lines.append('')
-
+        params_lines.append("    # === File Discovery ===")
+        params_lines.append(
+            f'    "file_extensions": {params_dict.get("file_extensions", [".wiff", ".raw", ".mzML"])!r},  # File extensions to search for'
+        )
+        params_lines.append(
+            f'    "search_subfolders": {params_dict.get("search_subfolders", True)},  # Whether to search subdirectories recursively'
+        )
+        params_lines.append(
+            f'    "skip_patterns": {params_dict.get("skip_patterns", ["blank", "condition"])!r},  # Filename patterns to skip'
+        )
+        params_lines.append("")
+
         # Processing Parameters
-        params_lines.append('    # === Processing Parameters ===')
-        params_lines.append(f'    "adducts": {params_dict.get("adducts", [])!r},  # Adduct specifications for feature detection and annotation')
-        params_lines.append(f'    "detector_type": {params_dict.get("detector_type", "unknown")!r},  # MS detector type ("orbitrap", "tof", "unknown")')
+        params_lines.append("    # === Processing Parameters ===")
+        params_lines.append(
+            f'    "adducts": {params_dict.get("adducts", [])!r},  # Adduct specifications for feature detection and annotation'
+        )
+        params_lines.append(
+            f'    "detector_type": {params_dict.get("detector_type", "unknown")!r},  # MS detector type ("orbitrap", "tof", "unknown")'
+        )
         params_lines.append(f'    "noise": {params_dict.get("noise", 50.0)},  # Noise threshold for feature detection')
-        params_lines.append(f'    "chrom_fwhm": {params_dict.get("chrom_fwhm", 0.5)},  # Chromatographic peak full width at half maximum (seconds)')
-        params_lines.append(f'    "chrom_peak_snr": {params_dict.get("chrom_peak_snr", 5.0)},  # Minimum signal-to-noise ratio for chromatographic peaks')
-        params_lines.append('')
-
+        params_lines.append(
+            f'    "chrom_fwhm": {params_dict.get("chrom_fwhm", 0.5)},  # Chromatographic peak full width at half maximum (seconds)'
+        )
+        params_lines.append(
+            f'    "chrom_peak_snr": {params_dict.get("chrom_peak_snr", 5.0)},  # Minimum signal-to-noise ratio for chromatographic peaks'
+        )
+        params_lines.append("")
+
         # Alignment & Merging
-        params_lines.append('    # === Alignment & Merging ===')
-        params_lines.append(f'    "rt_tol": {params_dict.get("rt_tol", 2.0)},  # Retention time tolerance for alignment (seconds)')
-        params_lines.append(f'    "mz_tol": {params_dict.get("mz_tol", 0.01)},  # Mass-to-charge ratio tolerance for alignment (Da)')
-        params_lines.append(f'    "alignment_method": {params_dict.get("alignment_method", "kd")!r},  # Algorithm for sample alignment')
-        params_lines.append(f'    "min_samples_per_feature": {params_dict.get("min_samples_per_feature", 1)},  # Minimum samples required per consensus feature')
-        params_lines.append(f'    "merge_method": {params_dict.get("merge_method", "qt")!r},  # Method for merging consensus features')
-        params_lines.append('')
+        params_lines.append("    # === Alignment & Merging ===")
+        params_lines.append(
+            f'    "rt_tol": {params_dict.get("rt_tol", 2.0)},  # Retention time tolerance for alignment (seconds)'
+        )
+        params_lines.append(
+            f'    "mz_tol": {params_dict.get("mz_tol", 0.01)},  # Mass-to-charge ratio tolerance for alignment (Da)'
+        )
+        params_lines.append(
+            f'    "alignment_method": {params_dict.get("alignment_method", "kd")!r},  # Algorithm for sample alignment'
+        )
+        params_lines.append(
+            f'    "min_samples_per_feature": {params_dict.get("min_samples_per_feature", 1)},  # Minimum samples required per consensus feature'
+        )
+        params_lines.append(
+            f'    "merge_method": {params_dict.get("merge_method", "qt")!r},  # Method for merging consensus features'
1174
+ )
1175
+ params_lines.append("")
1127
1176
 
1128
1177
  # Sample Processing
1129
- params_lines.append(' # === Sample Processing (used in add_samples_from_folder) ===')
1130
- params_lines.append(f' "batch_size": {params_dict.get("batch_size", 8)}, # Number of files to process per batch')
1131
- params_lines.append(f' "memory_limit_gb": {params_dict.get("memory_limit_gb", 16.0)}, # Memory limit for processing (GB)')
1132
- params_lines.append('')
1133
-
1178
+ params_lines.append(" # === Sample Processing (used in add_samples_from_folder) ===")
1179
+ params_lines.append(
1180
+ f' "batch_size": {params_dict.get("batch_size", 8)}, # Number of files to process per batch'
1181
+ )
1182
+ params_lines.append(
1183
+ f' "memory_limit_gb": {params_dict.get("memory_limit_gb", 16.0)}, # Memory limit for processing (GB)'
1184
+ )
1185
+ params_lines.append("")
1186
+
1134
1187
  # Script Options
1135
- params_lines.append(' # === Script Options ===')
1136
- params_lines.append(f' "resume_enabled": {params_dict.get("resume_enabled", True)}, # Enable automatic resume capability')
1137
- params_lines.append(f' "force_reprocess": {params_dict.get("force_reprocess", False)}, # Force reprocessing of existing files')
1138
- params_lines.append(f' "cleanup_temp_files": {params_dict.get("cleanup_temp_files", True)}, # Clean up temporary files after processing')
1139
-
1140
- params_lines.append('}')
1141
-
1188
+ params_lines.append(" # === Script Options ===")
1189
+ params_lines.append(
1190
+ f' "resume_enabled": {params_dict.get("resume_enabled", True)}, # Enable automatic resume capability'
1191
+ )
1192
+ params_lines.append(
1193
+ f' "force_reprocess": {params_dict.get("force_reprocess", False)}, # Force reprocessing of existing files'
1194
+ )
1195
+ params_lines.append(
1196
+ f' "cleanup_temp_files": {params_dict.get("cleanup_temp_files", True)}, # Clean up temporary files after processing'
1197
+ )
1198
+
1199
+ params_lines.append("}")
1200
+
1142
1201
  # Create script lines
1143
1202
  script_lines = [
1144
- '#!/usr/bin/env python3',
1203
+ "#!/usr/bin/env python3",
1145
1204
  '"""',
1146
- 'Automated Mass Spectrometry Data Analysis Pipeline',
1147
- f'Generated by masster wizard v{version}',
1205
+ "Automated Mass Spectrometry Data Analysis Pipeline",
1206
+ f"Generated by masster wizard v{version}",
1148
1207
  '"""',
1149
- '',
1150
- 'import sys',
1151
- 'import time',
1152
- 'from pathlib import Path',
1153
- '',
1154
- '# Import masster modules',
1155
- 'from masster.study import Study',
1156
- 'from masster import __version__',
1157
- '',
1208
+ "",
1209
+ "import sys",
1210
+ "import time",
1211
+ "from pathlib import Path",
1212
+ "",
1213
+ "# Import masster modules",
1214
+ "from masster.study import Study",
1215
+ "from masster import __version__",
1216
+ "",
1158
1217
  ]
1159
-
1218
+
1160
1219
  # Add the formatted PARAMS
1161
1220
  script_lines.extend(params_lines)
1162
-
1221
+
1163
1222
  # Add the main function and pipeline
1164
1223
  script_lines.extend([
1165
- '',
1166
- '',
1167
- 'def discover_raw_files(source_folder, file_extensions, search_subfolders=True):',
1224
+ "",
1225
+ "",
1226
+ "def discover_raw_files(source_folder, file_extensions, search_subfolders=True):",
1168
1227
  ' """Discover raw data files in the source folder."""',
1169
- ' source_path = Path(source_folder)',
1170
- ' raw_files = []',
1171
- ' ',
1172
- ' for ext in file_extensions:',
1173
- ' if search_subfolders:',
1228
+ " source_path = Path(source_folder)",
1229
+ " raw_files = []",
1230
+ " ",
1231
+ " for ext in file_extensions:",
1232
+ " if search_subfolders:",
1174
1233
  ' pattern = f"**/*{ext}"',
1175
- ' files = list(source_path.rglob(pattern))',
1176
- ' else:',
1234
+ " files = list(source_path.rglob(pattern))",
1235
+ " else:",
1177
1236
  ' pattern = f"*{ext}"',
1178
- ' files = list(source_path.glob(pattern))',
1179
- ' raw_files.extend(files)',
1180
- ' ',
1181
- ' return raw_files',
1182
- '',
1183
- '',
1184
- 'def process_single_file(args):',
1237
+ " files = list(source_path.glob(pattern))",
1238
+ " raw_files.extend(files)",
1239
+ " ",
1240
+ " return raw_files",
1241
+ "",
1242
+ "",
1243
+ "def process_single_file(args):",
1185
1244
  ' """Process a single raw file to sample5 format - module level for multiprocessing."""',
1186
- ' raw_file, output_folder = args',
1187
- ' from masster.sample import Sample',
1188
- ' ',
1189
- ' try:',
1190
- ' # Create sample5 filename',
1191
- ' sample_name = raw_file.stem',
1245
+ " raw_file, output_folder = args",
1246
+ " from masster.sample import Sample",
1247
+ " ",
1248
+ " try:",
1249
+ " # Create sample5 filename",
1250
+ " sample_name = raw_file.stem",
1192
1251
  ' sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
1193
- ' ',
1194
- ' # Skip if sample5 already exists',
1195
- ' if sample5_path.exists():',
1252
+ " ",
1253
+ " # Skip if sample5 already exists",
1254
+ " if sample5_path.exists():",
1196
1255
  ' print(f" Skipping {raw_file.name} (sample5 already exists)")',
1197
- ' return str(sample5_path)',
1198
- ' ',
1256
+ " return str(sample5_path)",
1257
+ " ",
1199
1258
  ' print(f" Converting {raw_file.name}...")',
1200
- ' ',
1201
- ' # Load and process raw file with full pipeline',
1202
- ' sample = Sample(log_label=sample_name)',
1203
- ' sample.load(filename=str(raw_file))',
1204
- ' sample.find_features(',
1205
- ' noise=PARAMS[\'noise\'],',
1206
- ' chrom_fwhm=PARAMS[\'chrom_fwhm\'],',
1207
- ' chrom_peak_snr=PARAMS[\'chrom_peak_snr\']',
1208
- ' )',
1209
- ' sample.find_adducts(adducts=PARAMS[\'adducts\'])',
1210
- ' sample.find_ms2()',
1211
- ' # sample.find_iso()',
1212
- ' # sample.export_mgf()',
1213
- ' # sample.export_mztab()',
1259
+ " ",
1260
+ " # Load and process raw file with full pipeline",
1261
+ " sample = Sample(log_label=sample_name)",
1262
+ " sample.load(filename=str(raw_file))",
1263
+ " sample.find_features(",
1264
+ " noise=PARAMS['noise'],",
1265
+ " chrom_fwhm=PARAMS['chrom_fwhm'],",
1266
+ " chrom_peak_snr=PARAMS['chrom_peak_snr']",
1267
+ " )",
1268
+ " sample.find_adducts(adducts=PARAMS['adducts'])",
1269
+ " sample.find_ms2()",
1270
+ " sample.find_iso()",
1271
+ " # sample.export_mgf()",
1272
+ " # sample.export_mztab()",
1214
1273
  ' # sample.plot_2d(filename="{sample_name}.html")',
1215
- ' sample.save(str(sample5_path))',
1216
- ' ',
1274
+ " sample.save(str(sample5_path))",
1275
+ " ",
1217
1276
  ' # print(f" Completed {raw_file.name} -> {sample5_path.name}")',
1218
- ' return str(sample5_path)',
1219
- ' ',
1220
- ' except Exception as e:',
1277
+ " return str(sample5_path)",
1278
+ " ",
1279
+ " except Exception as e:",
1221
1280
  ' print(f" ERROR processing {raw_file.name}: {e}")',
1222
- ' return None',
1223
- '',
1224
- '',
1225
- 'def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):',
1281
+ " return None",
1282
+ "",
1283
+ "",
1284
+ "def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):",
1226
1285
  ' """Convert raw data files to sample5 format."""',
1227
- ' import concurrent.futures',
1228
- ' import os',
1229
- ' ',
1230
- ' # Create output directory',
1231
- ' os.makedirs(output_folder, exist_ok=True)',
1232
- ' ',
1233
- ' # Prepare arguments for multiprocessing',
1234
- ' file_args = [(raw_file, output_folder) for raw_file in raw_files]',
1235
- ' ',
1236
- ' # Process files in parallel',
1237
- ' sample5_files = []',
1238
- ' with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:',
1239
- ' futures = [executor.submit(process_single_file, args) for args in file_args]',
1240
- ' ',
1241
- ' for future in concurrent.futures.as_completed(futures):',
1242
- ' result = future.result()',
1243
- ' if result:',
1244
- ' sample5_files.append(result)',
1245
- ' ',
1246
- ' return sample5_files',
1247
- '',
1248
- '',
1249
- 'def main():',
1286
+ " import concurrent.futures",
1287
+ " import os",
1288
+ " ",
1289
+ " # Create output directory",
1290
+ " os.makedirs(output_folder, exist_ok=True)",
1291
+ " ",
1292
+ " # Prepare arguments for multiprocessing",
1293
+ " file_args = [(raw_file, output_folder) for raw_file in raw_files]",
1294
+ " ",
1295
+ " # Process files in parallel",
1296
+ " sample5_files = []",
1297
+ " with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:",
1298
+ " futures = [executor.submit(process_single_file, args) for args in file_args]",
1299
+ " ",
1300
+ " for future in concurrent.futures.as_completed(futures):",
1301
+ " result = future.result()",
1302
+ " if result:",
1303
+ " sample5_files.append(result)",
1304
+ " ",
1305
+ " return sample5_files",
1306
+ "",
1307
+ "",
1308
+ "def main():",
1250
1309
  ' """Main analysis pipeline."""',
1251
- ' try:',
1310
+ " try:",
1252
1311
  ' print("=" * 70)',
1253
1312
  f' print("masster {version} - Automated MS Data Analysis")',
1254
1313
  ' print("=" * 70)',
1255
- ' print(f"Source: {PARAMS[\'source\']}")',
1256
- ' print(f"Output: {PARAMS[\'folder\']}")',
1257
- ' print(f"Polarity: {PARAMS[\'polarity\']}")',
1258
- ' print(f"CPU Cores: {PARAMS[\'num_cores\']}")',
1314
+ " print(f\"Source: {PARAMS['source']}\")",
1315
+ " print(f\"Output: {PARAMS['folder']}\")",
1316
+ " print(f\"Polarity: {PARAMS['polarity']}\")",
1317
+ " print(f\"CPU Cores: {PARAMS['num_cores']}\")",
1259
1318
  ' print("=" * 70)',
1260
- ' ',
1261
- ' start_time = time.time()',
1262
- ' ',
1263
- ' # Step 1: Discover raw data files',
1319
+ " ",
1320
+ " start_time = time.time()",
1321
+ " ",
1322
+ " # Step 1: Discover raw data files",
1264
1323
  ' print("\\nStep 1/7: Discovering raw data files...")',
1265
- ' raw_files = discover_raw_files(',
1266
- ' PARAMS[\'source\'],',
1267
- ' PARAMS[\'file_extensions\'],',
1268
- ' PARAMS[\'search_subfolders\']',
1269
- ' )',
1270
- ' ',
1271
- ' if not raw_files:',
1324
+ " raw_files = discover_raw_files(",
1325
+ " PARAMS['source'],",
1326
+ " PARAMS['file_extensions'],",
1327
+ " PARAMS['search_subfolders']",
1328
+ " )",
1329
+ " ",
1330
+ " if not raw_files:",
1272
1331
  ' print("No raw data files found!")',
1273
- ' return False',
1274
- ' ',
1332
+ " return False",
1333
+ " ",
1275
1334
  ' print(f"Found {len(raw_files)} raw data files")',
1276
- ' for f in raw_files[:5]: # Show first 5 files',
1335
+ " for f in raw_files[:5]: # Show first 5 files",
1277
1336
  ' print(f" {f.name}")',
1278
- ' if len(raw_files) > 5:',
1337
+ " if len(raw_files) > 5:",
1279
1338
  ' print(f" ... and {len(raw_files) - 5} more")',
1280
- ' ',
1281
- ' # Step 2: Process raw files',
1339
+ " ",
1340
+ " # Step 2: Process raw files",
1282
1341
  ' print("\\nStep 2/7: Processing raw files...")',
1283
- ' sample5_files = convert_raw_to_sample5(',
1284
- ' raw_files,',
1285
- ' PARAMS[\'folder\'],',
1286
- ' PARAMS[\'polarity\'],',
1287
- ' PARAMS[\'num_cores\']',
1288
- ' )',
1289
- ' ',
1290
- ' if not sample5_files:',
1342
+ " sample5_files = convert_raw_to_sample5(",
1343
+ " raw_files,",
1344
+ " PARAMS['folder'],",
1345
+ " PARAMS['polarity'],",
1346
+ " PARAMS['num_cores']",
1347
+ " )",
1348
+ " ",
1349
+ " if not sample5_files:",
1291
1350
  ' print("No sample5 files were created!")',
1292
- ' return False',
1293
- ' ',
1351
+ " return False",
1352
+ " ",
1294
1353
  ' print(f"Successfully processed {len(sample5_files)} files to sample5")',
1295
- ' ',
1296
- ' # Step 3: Create and configure study',
1354
+ " ",
1355
+ " # Step 3: Create and configure study",
1297
1356
  ' print("\\nStep 3/7: Initializing study...")',
1298
- ' study = Study(folder=PARAMS[\'folder\'])',
1299
- ' study.polarity = PARAMS[\'polarity\']',
1300
- ' study.adducts = PARAMS[\'adducts\']',
1301
- ' ',
1302
- ' # Step 4: Add sample5 files to study',
1357
+ " study = Study(folder=PARAMS['folder'])",
1358
+ " study.polarity = PARAMS['polarity']",
1359
+ " study.adducts = PARAMS['adducts']",
1360
+ " ",
1361
+ " # Step 4: Add sample5 files to study",
1303
1362
  ' print("\\nStep 4/7: Adding samples to study...")',
1304
- ' study.add(str(Path(PARAMS[\'folder\']) / "*.sample5"))',
1305
- ' study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))',
1306
- ' ',
1307
- ' # Step 5: Core processing',
1363
+ " study.add(str(Path(PARAMS['folder']) / \"*.sample5\"))",
1364
+ " study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))",
1365
+ " ",
1366
+ " # Step 5: Core processing",
1308
1367
  ' print("\\nStep 5/7: Processing...")',
1309
- ' study.align(',
1310
- ' algorithm=PARAMS[\'alignment_method\'],',
1311
- ' rt_tol=PARAMS[\'rt_tol\']',
1312
- ' )',
1313
- ' ',
1314
- ' study.merge(',
1368
+ " study.align(",
1369
+ " algorithm=PARAMS['alignment_method'],",
1370
+ " rt_tol=PARAMS['rt_tol']",
1371
+ " )",
1372
+ " ",
1373
+ " study.merge(",
1315
1374
  ' method="qt",',
1316
- ' min_samples=PARAMS[\'min_samples_per_feature\'],',
1317
- ' threads=PARAMS[\'num_cores\'],',
1318
- ' rt_tol=PARAMS[\'rt_tol\'],'
1319
- ' )',
1320
- ' study.find_iso()',
1321
- ' study.fill()',
1322
- ' study.integrate()',
1323
- ' ',
1324
- ' # Step 6/7: Saving results',
1375
+ " min_samples=PARAMS['min_samples_per_feature'],",
1376
+ " threads=PARAMS['num_cores'],",
1377
+ " rt_tol=PARAMS['rt_tol'],",
+ " )",
1378
+ " study.find_iso()",
1379
+ " study.fill()",
1380
+ " study.integrate()",
1381
+ " ",
1382
+ " # Step 6/7: Saving results",
1325
1383
  ' print("\\nStep 6/7: Saving results...")',
1326
- ' study.save()',
1327
- ' study.export_xlsx()',
1328
- ' study.export_mgf()',
1329
- ' study.export_mztab()',
1330
- ' ',
1331
- ' # Step 7: Plots',
1384
+ " study.save()",
1385
+ " study.export_xlsx()",
1386
+ " study.export_mgf()",
1387
+ " study.export_mztab()",
1388
+ " ",
1389
+ " # Step 7: Plots",
1332
1390
  ' print("\\nStep 7/7: Exporting plots...")',
1333
1391
  ' study.plot_consensus_2d(filename="consensus.html")',
1334
1392
  ' study.plot_consensus_2d(filename="consensus.png")',
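The hunk above assembles the generated script as a list of source lines; the conversion stage that script performs fans raw files out across worker processes, with the worker defined at module level so it can be pickled for multiprocessing. A runnable sketch of that fan-out pattern, with a placeholder worker standing in for the real Sample conversion:

import concurrent.futures
from pathlib import Path

def process_single_file(args):
    """Placeholder worker: pretend to convert one raw file, return output path."""
    raw_file, output_folder = args
    return str(Path(output_folder) / f"{raw_file.stem}.sample5")

if __name__ == "__main__":  # guard required for ProcessPoolExecutor on Windows
    raw_files = [Path("a.mzML"), Path("b.mzML")]  # placeholder inputs
    file_args = [(f, "output") for f in raw_files]
    sample5_files = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(process_single_file, a) for a in file_args]
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if result:  # failed conversions return None and are dropped
                sample5_files.append(result)
    print(sample5_files)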
@@ -1340,11 +1398,10 @@ class Wizard:
1340
1398
  ' study.plot_bpc(filename="bpc.png")',
1341
1399
  ' study.plot_rt_correction(filename="rt_correction.html")',
1342
1400
  ' study.plot_rt_correction(filename="rt_correction.png")',
1343
-
1344
- ' ',
1345
- ' # Print summary',
1346
- ' study.info()',
1347
- ' total_time = time.time() - start_time',
1401
+ " ",
1402
+ " # Print summary",
1403
+ " study.info()",
1404
+ " total_time = time.time() - start_time",
1348
1405
  ' print("\\n" + "=" * 70)',
1349
1406
  ' print("ANALYSIS COMPLETE")',
1350
1407
  ' print("=" * 70)',
@@ -1354,41 +1411,41 @@ class Wizard:
1354
1411
  ' if hasattr(study, "consensus_df"):',
1355
1412
  ' print(f"Consensus features generated: {len(study.consensus_df)}")',
1356
1413
  ' print("=" * 70)',
1357
- ' ',
1358
- ' return True',
1359
- ' ',
1360
- ' except KeyboardInterrupt:',
1414
+ " ",
1415
+ " return True",
1416
+ " ",
1417
+ " except KeyboardInterrupt:",
1361
1418
  ' print("\\nAnalysis interrupted by user")',
1362
- ' return False',
1363
- ' except Exception as e:',
1419
+ " return False",
1420
+ " except Exception as e:",
1364
1421
  ' print(f"Analysis failed with error: {e}")',
1365
- ' import traceback',
1366
- ' traceback.print_exc()',
1367
- ' return False',
1368
- '',
1369
- '',
1422
+ " import traceback",
1423
+ " traceback.print_exc()",
1424
+ " return False",
1425
+ "",
1426
+ "",
1370
1427
  'if __name__ == "__main__":',
1371
- ' success = main()',
1372
- ' sys.exit(0 if success else 1)',
1428
+ " success = main()",
1429
+ " sys.exit(0 if success else 1)",
1373
1430
  ])
1374
-
1375
- return '\n'.join(script_lines)
1431
+
1432
+ return "\n".join(script_lines)
1376
1433
 
1377
1434
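Taken end to end, _generate_script_content emits a script that converts each raw file to a .sample5 and then assembles the results into a study. A condensed sketch of that pipeline using the masster calls shown in the diff (paths are placeholders; the numeric values are the defaults documented in the PARAMS comments):

from masster.sample import Sample
from masster.study import Study

# Per-file conversion, condensed from the generated process_single_file()
sample = Sample(log_label="example")
sample.load(filename="data/example.mzML")  # placeholder raw file
sample.find_features(noise=50.0, chrom_fwhm=0.5, chrom_peak_snr=5.0)
sample.find_adducts(adducts=[])            # adducts come from PARAMS
sample.find_ms2()
sample.find_iso()                          # newly enabled in this version
sample.save("output/example.sample5")

# Study assembly, condensed from the generated main()
study = Study(folder="output")
study.polarity = "positive"
study.add("output/*.sample5")
study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))
study.align(algorithm="kd", rt_tol=2.0)
study.merge(method="qt", min_samples=1, threads=4, rt_tol=2.0)
study.find_iso()
study.fill()
study.integrate()
study.save()
study.export_xlsx()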
 
1378
1435
  def create_scripts(
1379
- source: str = "",
1380
- folder: str = "",
1436
+ source: str = "",
1437
+ folder: str = "",
1381
1438
  polarity: Optional[str] = None,
1382
1439
  adducts: Optional[List[str]] = None,
1383
1440
  num_cores: int = 0,
1384
- **kwargs
1441
+ **kwargs,
1385
1442
  ) -> Dict[str, Any]:
1386
1443
  """
1387
1444
  Create analysis scripts without explicitly instantiating a Wizard.
1388
-
1445
+
1389
1446
  This is a convenience function that creates a Wizard instance internally
1390
1447
  and calls its create_scripts() method.
1391
-
1448
+
1392
1449
  Parameters:
1393
1450
  source: Directory containing raw data files
1394
1451
  folder: Output directory for processed study
@@ -1396,7 +1453,7 @@ def create_scripts(
1396
1453
  adducts: List of adduct specifications (auto-set if None)
1397
1454
  num_cores: Number of CPU cores (0 = auto-detect)
1398
1455
  **kwargs: Additional parameters
1399
-
1456
+
1400
1457
  Returns:
1401
1458
  Dictionary containing:
1402
1459
  - status: "success" or "error"
@@ -1404,42 +1461,35 @@ def create_scripts(
1404
1461
  - instructions: List of next steps
1405
1462
  - files_created: List of created file paths
1406
1463
  - source_info: Metadata about source files
1407
-
1464
+
1408
1465
  Example:
1409
1466
  >>> import masster.wizard
1410
1467
  >>> result = masster.wizard.create_scripts(
1411
1468
  ... source=r'D:\\Data\\raw_files',
1412
- ... folder=r'D:\\Data\\output',
1469
+ ... folder=r'D:\\Data\\output',
1413
1470
  ... polarity='negative'
1414
1471
  ... )
1415
1472
  >>> print("Status:", result["status"])
1416
1473
  """
1417
-
1474
+
1418
1475
  try:
1419
1476
  # Auto-detect optimal number of cores if not specified
1420
1477
  if num_cores <= 0:
1421
1478
  num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
1422
-
1479
+
1423
1480
  # Create Wizard instance
1424
- wizard = Wizard(
1425
- source=source,
1426
- folder=folder,
1427
- polarity=polarity,
1428
- adducts=adducts,
1429
- num_cores=num_cores,
1430
- **kwargs
1431
- )
1432
-
1481
+ wizard = Wizard(source=source, folder=folder, polarity=polarity, adducts=adducts, num_cores=num_cores, **kwargs)
1482
+
1433
1483
  # Call the instance method
1434
1484
  return wizard.create_scripts()
1435
-
1485
+
1436
1486
  except Exception as e:
1437
1487
  return {
1438
1488
  "status": "error",
1439
1489
  "message": f"Failed to create scripts: {e}",
1440
1490
  "instructions": [],
1441
1491
  "files_created": [],
1442
- "source_info": {}
1492
+ "source_info": {},
1443
1493
  }
1444
1494
 
1445
1495
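Finally, the module-level create_scripts wrapper can be called directly, mirroring the docstring example above. Passing num_cores=0 triggers the auto-detection shown in the hunk (roughly 75% of available CPU cores), and the returned dict always carries the keys listed in the docstring; paths below are placeholders:

import masster.wizard

result = masster.wizard.create_scripts(
    source=r"D:\Data\raw_files",  # placeholder input directory
    folder=r"D:\Data\output",     # placeholder output directory
    polarity="negative",
    num_cores=0,                  # 0 = auto-detect (~75% of CPU cores)
)
print("Status:", result["status"])
if result["status"] == "error":
    print(result["message"])
    print(result["instructions"])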