masster 0.5.14__py3-none-any.whl → 0.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/adducts.py +8 -5
- masster/sample/processing.py +6 -0
- masster/study/id.py +4 -3
- masster/wizard/__init__.py +2 -2
- masster/wizard/wizard.py +736 -343
- {masster-0.5.14.dist-info → masster-0.5.15.dist-info}/METADATA +1 -1
- {masster-0.5.14.dist-info → masster-0.5.15.dist-info}/RECORD +11 -11
- {masster-0.5.14.dist-info → masster-0.5.15.dist-info}/WHEEL +0 -0
- {masster-0.5.14.dist-info → masster-0.5.15.dist-info}/entry_points.txt +0 -0
- {masster-0.5.14.dist-info → masster-0.5.15.dist-info}/licenses/LICENSE +0 -0
masster/wizard/wizard.py
CHANGED
@@ -229,8 +229,7 @@ class Wizard:
     detection, sample processing, study assembly, alignment, merging, and export.
 
     This simplified version focuses on three core functions:
-    -
-    - create_notebook(): Generate marimo interactive notebooks
+    - create_scripts(): Generate workflow and interactive analysis scripts
     - analyze(): Create and run analysis scripts with interactive notebook
     """
 
@@ -291,6 +290,14 @@ class Wizard:
         self.folder_path = Path(self.params.folder)
         self.folder_path.mkdir(parents=True, exist_ok=True)
 
+        # Initialize logger
+        self.logger = MassterLogger(
+            instance_type="wizard",
+            level="INFO",
+            label="Wizard",
+            sink=None
+        )
+
         # Auto-infer polarity from the first file if not explicitly set by user
         if polarity == "positive" and "polarity" not in kwargs:
             inferred_polarity = self._infer_polarity_from_first_file()
@@ -299,6 +306,164 @@ class Wizard:
             # Update adducts based on inferred polarity
             self.params.__post_init__()
 
+    def _analyze_source_files(self) -> Dict[str, Any]:
+        """
+        Analyze source files to extract metadata: number of files, file type, polarity, and acquisition length.
+
+        Returns:
+            Dictionary containing:
+            - number_of_files: Total count of data files found
+            - file_types: List of file extensions found
+            - polarity: Detected polarity ("positive" or "negative")
+            - length_minutes: Acquisition length in minutes
+            - first_file: Path to first file analyzed
+        """
+        result = {
+            'number_of_files': 0,
+            'file_types': [],
+            'polarity': 'positive',
+            'length_minutes': 0.0,
+            'first_file': None
+        }
+
+        try:
+            # Find all data files
+            all_files = []
+            file_types_found = set()
+
+            for extension in self.params.file_extensions:
+                if self.params.search_subfolders:
+                    pattern = f"**/*{extension}"
+                    files = list(self.source_path.rglob(pattern))
+                else:
+                    pattern = f"*{extension}"
+                    files = list(self.source_path.glob(pattern))
+
+                if files:
+                    all_files.extend(files)
+                    file_types_found.add(extension)
+
+            result['number_of_files'] = len(all_files)
+            result['file_types'] = list(file_types_found)
+
+            if not all_files:
+                return result
+
+            # Analyze first file for polarity and acquisition length
+            first_file = all_files[0]
+            result['first_file'] = str(first_file)
+
+            # Extract metadata based on file type
+            if first_file.suffix.lower() == '.wiff':
+                metadata = self._analyze_wiff_file(first_file)
+            elif first_file.suffix.lower() == '.mzml':
+                metadata = self._analyze_mzml_file(first_file)
+            elif first_file.suffix.lower() == '.raw':
+                metadata = self._analyze_raw_file(first_file)
+            else:
+                metadata = {'polarity': 'positive', 'length_minutes': 0.0}
+
+            result['polarity'] = metadata.get('polarity', 'positive')
+            result['length_minutes'] = metadata.get('length_minutes', 0.0)
+
+        except Exception as e:
+            self.logger.warning(f"Failed to analyze source files: {e}")
+
+        return result
+
+    def _analyze_wiff_file(self, file_path: Path) -> Dict[str, Any]:
+        """Analyze WIFF file to extract polarity and acquisition length."""
+        try:
+            from masster.sample.load import _wiff_to_dict
+
+            # Extract metadata from WIFF file
+            metadata_df = _wiff_to_dict(str(file_path))
+
+            result = {'polarity': 'positive', 'length_minutes': 0.0}
+
+            if not metadata_df.empty:
+                # Get polarity from first experiment
+                if 'polarity' in metadata_df.columns:
+                    first_polarity = metadata_df['polarity'].iloc[0]
+
+                    # Convert numeric polarity codes to string
+                    if first_polarity == 1 or str(first_polarity).lower() in ['positive', 'pos', '+']:
+                        result['polarity'] = "positive"
+                    elif first_polarity == -1 or str(first_polarity).lower() in ['negative', 'neg', '-']:
+                        result['polarity'] = "negative"
+
+            # Estimate acquisition length by loading the file briefly
+            # For a rough estimate, we'll load just the scan info
+            from masster.sample import Sample
+            sample = Sample()
+            sample.logger_update(level="ERROR")  # Suppress logs
+            sample.load(str(file_path))
+
+            if hasattr(sample, 'scans_df') and sample.scans_df is not None:
+                if not sample.scans_df.is_empty():
+                    rt_values = sample.scans_df.select('rt').to_numpy().flatten()
+                    if len(rt_values) > 0:
+                        # RT is in seconds, convert to minutes
+                        result['length_minutes'] = float(rt_values.max()) / 60.0
+
+            return result
+
+        except Exception as e:
+            self.logger.debug(f"Failed to analyze WIFF file {file_path}: {e}")
+            return {'polarity': 'positive', 'length_minutes': 0.0}
+
+    def _analyze_mzml_file(self, file_path: Path) -> Dict[str, Any]:
+        """Analyze mzML file to extract polarity and acquisition length."""
+        try:
+            from masster.sample import Sample
+
+            sample = Sample()
+            sample.logger_update(level="ERROR")  # Suppress logs
+            sample.load(str(file_path))
+
+            result = {'polarity': 'positive', 'length_minutes': 0.0}
+
+            if hasattr(sample, 'scans_df') and sample.scans_df is not None:
+                if not sample.scans_df.is_empty():
+                    rt_values = sample.scans_df.select('rt').to_numpy().flatten()
+                    if len(rt_values) > 0:
+                        # RT is in seconds, convert to minutes
+                        result['length_minutes'] = float(rt_values.max()) / 60.0
+
+            # For mzML, polarity detection would require more detailed parsing
+            # For now, use default
+            return result
+
+        except Exception as e:
+            self.logger.debug(f"Failed to analyze mzML file {file_path}: {e}")
+            return {'polarity': 'positive', 'length_minutes': 0.0}
+
+    def _analyze_raw_file(self, file_path: Path) -> Dict[str, Any]:
+        """Analyze RAW file to extract polarity and acquisition length."""
+        try:
+            from masster.sample import Sample
+
+            sample = Sample()
+            sample.logger_update(level="ERROR")  # Suppress logs
+            sample.load(str(file_path))
+
+            result = {'polarity': 'positive', 'length_minutes': 0.0}
+
+            if hasattr(sample, 'scans_df') and sample.scans_df is not None:
+                if not sample.scans_df.is_empty():
+                    rt_values = sample.scans_df.select('rt').to_numpy().flatten()
+                    if len(rt_values) > 0:
+                        # RT is in seconds, convert to minutes
+                        result['length_minutes'] = float(rt_values.max()) / 60.0
+
+            # For RAW files, polarity detection would require more detailed parsing
+            # For now, use default
+            return result
+
+        except Exception as e:
+            self.logger.debug(f"Failed to analyze RAW file {file_path}: {e}")
+            return {'polarity': 'positive', 'length_minutes': 0.0}
+
     def _infer_polarity_from_first_file(self) -> str:
         """
         Infer polarity from the first available raw data file.
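Note: every `_analyze_*` helper in this hunk returns the same `{'polarity': ..., 'length_minutes': ...}` shape, and `_analyze_source_files()` folds the result into one summary dictionary with safe defaults. A minimal sketch of what a caller sees; the `wizard_def`/`Wizard` construction mirrors the `create_analysis` hunk further down, the paths are placeholders, and `_analyze_source_files` is a private helper shown only for orientation, not a supported API:

    from masster.wizard import Wizard, wizard_def

    params = wizard_def(source=r"D:\Data\raw_files", folder=r"D:\Data\output")  # placeholder paths
    wiz = Wizard(params=params)
    info = wiz._analyze_source_files()  # called internally by create_scripts()

    # Keys are always present, even when analysis fails:
    print(f"{info['number_of_files']} file(s) {info['file_types']}, "
          f"{info['polarity']} mode, ~{info['length_minutes']:.1f} min per file")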
@@ -350,101 +515,498 @@ class Wizard:
         """Get the adduct specifications."""
         return self.params.adducts
 
-    def
+    def create_scripts(self) -> Dict[str, Any]:
         """
-        Generate
+        Generate analysis scripts based on source file analysis.
+
+        This method:
+        1. Analyzes the source files to extract metadata
+        2. Creates 1_masster_workflow.py with sample processing logic
+        3. Creates 2_interactive_analysis.py marimo notebook for study exploration
+        4. Returns instructions for next steps
 
-        Parameters:
-            filename: Name for the generated script file
-
         Returns:
-
+            Dictionary containing:
+            - status: "success" or "error"
+            - message: Status message
+            - instructions: List of next steps
+            - files_created: List of created file paths
+            - source_info: Metadata about source files
         """
         try:
-
-
+            # Step 1: Analyze source files to extract metadata
+            source_info = self._analyze_source_files()
 
-
-
+            # Update wizard parameters based on detected metadata
+            if source_info['polarity'] != 'positive':  # Only update if different from default
+                self.params.polarity = source_info['polarity']
+                # Update adducts based on detected polarity
+                self.params.__post_init__()
 
-
-            return True
+            files_created = []
 
-
-
-
-
-    def create_notebook(self, filename: str = "interactive.py") -> bool:
-        """
-        Generate a marimo notebook for interactive analysis of the study.
-
-        Parameters:
-            filename: Name for the generated notebook file
+            # Step 2: Create 1_masster_workflow.py
+            workflow_script_path = self.folder_path / "1_masster_workflow.py"
+            workflow_content = self._generate_workflow_script_content(source_info)
 
-
-
-
-
-
-
+            with open(workflow_script_path, 'w', encoding='utf-8') as f:
+                f.write(workflow_content)
+            files_created.append(str(workflow_script_path))
+
+            # Step 3: Create 2_interactive_analysis.py marimo notebook
+            notebook_path = self.folder_path / "2_interactive_analysis.py"
+            notebook_content = self._generate_interactive_notebook_content(source_info)
 
             with open(notebook_path, 'w', encoding='utf-8') as f:
                 f.write(notebook_content)
+            files_created.append(str(notebook_path))
+
+            # Step 4: Generate instructions
+            instructions = self._generate_instructions(source_info, files_created)
 
-
-
+            return {
+                "status": "success",
+                "message": f"Successfully created {len(files_created)} script files",
+                "instructions": instructions,
+                "files_created": files_created,
+                "source_info": source_info
+            }
 
         except Exception as e:
-
-            return
+            self.logger.error(f"Failed to create scripts: {e}")
+            return {
+                "status": "error",
+                "message": f"Failed to create scripts: {e}",
+                "instructions": [],
+                "files_created": [],
+                "source_info": {}
+            }
+
+    def _generate_workflow_script_content(self, source_info: Dict[str, Any]) -> str:
+        """Generate the content for 1_masster_workflow.py script."""
+
+        # Convert Path objects to strings for JSON serialization
+        params_dict = {}
+        for key, value in self.params.__dict__.items():
+            if key == '_param_metadata':  # Skip metadata in generated script
+                continue
+            if isinstance(value, Path):
+                params_dict[key] = str(value)
+            else:
+                params_dict[key] = value
+
+        # Create readable PARAMS dict with comments including discovered info
+        params_lines = []
+        params_lines.append('# Analysis parameters (auto-detected from source files)')
+        params_lines.append('PARAMS = {')
+
+        # File Discovery Summary
+        params_lines.append('    # === Source File Analysis ===')
+        params_lines.append(f'    "number_of_files": {source_info.get("number_of_files", 0)},  # Total raw data files found')
+        params_lines.append(f'    "file_types": {source_info.get("file_types", [])!r},  # Detected file extensions')
+        params_lines.append(f'    "length_minutes": {source_info.get("length_minutes", 0.0):.1f},  # Estimated acquisition length per file (minutes)')
+        if source_info.get('first_file'):
+            params_lines.append(f'    "first_file": {source_info["first_file"]!r},  # First file analyzed for metadata')
+        params_lines.append('')
+
+        # Core Configuration
+        params_lines.append('    # === Core Configuration ===')
+        params_lines.append(f'    "source": {params_dict.get("source", "")!r},  # Directory containing raw data files')
+        params_lines.append(f'    "folder": {params_dict.get("folder", "")!r},  # Output directory for processed study')
+        params_lines.append(f'    "polarity": {params_dict.get("polarity", "positive")!r},  # Ion polarity mode (auto-detected)')
+        params_lines.append(f'    "num_cores": {params_dict.get("num_cores", 4)},  # Number of CPU cores for parallel processing')
+        params_lines.append('')
+
+        # File Discovery
+        params_lines.append('    # === File Discovery ===')
+        params_lines.append(f'    "file_extensions": {params_dict.get("file_extensions", [".wiff", ".raw", ".mzML"])!r},  # File extensions to search for')
+        params_lines.append(f'    "search_subfolders": {params_dict.get("search_subfolders", True)},  # Whether to search subdirectories recursively')
+        params_lines.append(f'    "skip_patterns": {params_dict.get("skip_patterns", ["blank", "condition"])!r},  # Filename patterns to skip')
+        params_lines.append('')
+
+        # Processing Parameters - Critical values to review
+        params_lines.append('    # === Processing Parameters (REVIEW THESE VALUES) ===')
+        params_lines.append(f'    "adducts": {params_dict.get("adducts", [])!r},  # Adduct specifications for feature detection and annotation')
+        params_lines.append(f'    "detector_type": {params_dict.get("detector_type", "unknown")!r},  # MS detector type ("orbitrap", "tof", "unknown")')
+        params_lines.append(f'    "noise": {params_dict.get("noise", 50.0)},  # REVIEW: Noise threshold for feature detection. Set to 1e5 for Orbitraps')
+        params_lines.append(f'    "chrom_fwhm": {params_dict.get("chrom_fwhm", 0.5)},  # REVIEW: Chromatographic peak FWHM (seconds)')
+        params_lines.append(f'    "chrom_peak_snr": {params_dict.get("chrom_peak_snr", 5.0)},  # Minimum signal-to-noise ratio for chromatographic peaks')
+        params_lines.append('')
+
+        # Other parameters...
+        params_lines.append('    # === Alignment & Merging ===')
+        params_lines.append(f'    "rt_tol": {params_dict.get("rt_tol", 5.0)},  # Retention time tolerance for alignment (seconds)')
+        params_lines.append(f'    "mz_tol": {params_dict.get("mz_tol", 0.01)},  # Mass-to-charge ratio tolerance for alignment (Da)')
+        params_lines.append(f'    "alignment_method": {params_dict.get("alignment_method", "kd")!r},  # Algorithm for sample alignment')
+        params_lines.append(f'    "min_samples_per_feature": {params_dict.get("min_samples_per_feature", 1)},  # Minimum samples required per consensus feature')
+        params_lines.append(f'    "merge_method": {params_dict.get("merge_method", "qt")!r},  # Method for merging consensus features')
+        params_lines.append('')
+
+        # Other params
+        params_lines.append('    # === Sample Processing ===')
+        params_lines.append(f'    "batch_size": {params_dict.get("batch_size", 8)},  # Number of files to process per batch')
+        params_lines.append(f'    "memory_limit_gb": {params_dict.get("memory_limit_gb", 16.0)},  # Memory limit for processing (GB)')
+        params_lines.append('')
+
+        params_lines.append('    # === Script Options ===')
+        params_lines.append(f'    "resume_enabled": {params_dict.get("resume_enabled", True)},  # Enable automatic resume capability')
+        params_lines.append(f'    "force_reprocess": {params_dict.get("force_reprocess", False)},  # Force reprocessing of existing files')
+        params_lines.append(f'    "cleanup_temp_files": {params_dict.get("cleanup_temp_files", True)},  # Clean up temporary files after processing')
+
+        params_lines.append('}')
+
+        # Create script lines
+        script_lines = [
+            '#!/usr/bin/env python3',
+            '"""',
+            'MASSter Workflow Script - Sample Processing',
+            f'Generated by masster wizard v{version}',
+            '',
+            'Source Analysis:',
+            f'  - Files found: {source_info.get("number_of_files", 0)}',
+            f'  - File types: {", ".join(source_info.get("file_types", []))}',
+            f'  - Polarity detected: {source_info.get("polarity", "unknown")}',
+            f'  - Acquisition length: ~{source_info.get("length_minutes", 0.0):.1f} minutes per file',
+            '',
+            'This script processes raw MS data files into sample5 format.',
+            'Review the NOISE and CHROM_FWHM parameters below before running.',
+            '"""',
+            '',
+            'import sys',
+            'import time',
+            'from pathlib import Path',
+            'import concurrent.futures',
+            'import os',
+            '',
+            '# Import masster modules',
+            'from masster.sample import Sample',
+            'from masster import __version__',
+            '',
+        ]
+
+        # Add the formatted PARAMS
+        script_lines.extend(params_lines)
+
+        # Add the functions
+        script_lines.extend([
+            '',
+            '',
+            'def discover_raw_files(source_folder, file_extensions, search_subfolders=True, skip_patterns=None):',
+            '    """Discover raw data files in the source folder."""',
+            '    source_path = Path(source_folder)',
+            '    raw_files = []',
+            '    skip_patterns = skip_patterns or []',
+            '    ',
+            '    for ext in file_extensions:',
+            '        if search_subfolders:',
+            '            pattern = f"**/*{ext}"',
+            '            files = list(source_path.rglob(pattern))',
+            '        else:',
+            '            pattern = f"*{ext}"',
+            '            files = list(source_path.glob(pattern))',
+            '        ',
+            '        # Filter out files matching skip patterns',
+            '        for file in files:',
+            '            skip_file = False',
+            '            for skip_pattern in skip_patterns:',
+            '                if skip_pattern.lower() in file.name.lower():',
+            '                    skip_file = True',
+            '                    break',
+            '            if not skip_file:',
+            '                raw_files.append(file)',
+            '    ',
+            '    return raw_files',
+            '',
+            '',
+            'def process_single_file(args):',
+            '    """Process a single raw file to sample5 format - designed for multiprocessing."""',
+            '    raw_file, output_folder, params = args',
+            '    ',
+            '    try:',
+            '        # Create sample5 filename',
+            '        sample_name = raw_file.stem',
+            '        sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
+            '        ',
+            '        # Skip if sample5 already exists and resume is enabled',
+            '        if sample5_path.exists() and params["resume_enabled"]:',
+            '            print(f"  ✓ Skipping {raw_file.name} (sample5 already exists)")',
+            '            return {"status": "skipped", "file": str(sample5_path), "message": "Already exists"}',
+            '        ',
+            '        print(f"  🔄 Processing {raw_file.name}...")',
+            '        start_time = time.time()',
+            '        ',
+            '        # Load and process raw file with full pipeline',
+            '        sample = Sample(log_label=sample_name)',
+            '        sample.load(filename=str(raw_file))',
+            '        sample.find_features(',
+            '            noise=params["noise"],',
+            '            chrom_fwhm=params["chrom_fwhm"],',
+            '            chrom_peak_snr=params["chrom_peak_snr"]',
+            '        )',
+            '        # sample.find_adducts(adducts=params["adducts"])',
+            '        sample.find_ms2()',
+            '        # sample.find_iso()  # Optional - can be uncommented if needed',
+            '        sample.save(str(sample5_path))',
+            '        ',
+            '        elapsed = time.time() - start_time',
+            '        print(f"  ✅ Completed {raw_file.name} -> {sample5_path.name} ({elapsed:.1f}s)")',
+            '        ',
+            '        return {"status": "success", "file": str(sample5_path), "elapsed": elapsed}',
+            '        ',
+            '    except Exception as e:',
+            '        print(f"  ❌ ERROR processing {raw_file.name}: {e}")',
+            '        return {"status": "error", "file": str(raw_file), "error": str(e)}',
+            '',
+            '',
+            'def convert_raw_to_sample5_parallel(raw_files, output_folder, params):',
+            '    """Convert raw data files to sample5 format with parallel processing and progress tracking."""',
+            '    import concurrent.futures',
+            '    import os',
+            '    ',
+            '    # Create output directory',
+            '    os.makedirs(output_folder, exist_ok=True)',
+            '    ',
+            '    print(f"\\n🚀 Processing {len(raw_files)} files using {params[\'num_cores\']} CPU cores...")',
+            '    print("=" * 70)',
+            '    ',
+            '    # Prepare arguments for multiprocessing',
+            '    file_args = [(raw_file, output_folder, params) for raw_file in raw_files]',
+            '    ',
+            '    # Process files in parallel with progress tracking',
+            '    results = []',
+            '    successful = 0',
+            '    skipped = 0',
+            '    failed = 0',
+            '    total_elapsed = 0',
+            '    ',
+            '    with concurrent.futures.ProcessPoolExecutor(max_workers=params["num_cores"]) as executor:',
+            '        # Submit all jobs',
+            '        future_to_file = {executor.submit(process_single_file, args): args[0] for args in file_args}',
+            '        ',
+            '        # Collect results as they complete',
+            '        for i, future in enumerate(concurrent.futures.as_completed(future_to_file), 1):',
+            '            result = future.result()',
+            '            results.append(result)',
+            '            ',
+            '            if result["status"] == "success":',
+            '                successful += 1',
+            '                total_elapsed += result.get("elapsed", 0)',
+            '            elif result["status"] == "skipped":',
+            '                skipped += 1',
+            '            else:',
+            '                failed += 1',
+            '            ',
+            '            # Progress update',
+            '            print(f"\\r  Progress: {i}/{len(raw_files)} files completed ({successful} success, {skipped} skipped, {failed} failed)", end="", flush=True)',
+            '    ',
+            '    print()  # New line after progress',
+            '    print("=" * 70)',
+            '    ',
+            '    # Summary',
+            '    if successful > 0:',
+            '        avg_time = total_elapsed / successful',
+            '        print(f"✅ Successfully processed {successful} files (avg: {avg_time:.1f}s per file)")',
+            '    if skipped > 0:',
+            '        print(f"⏩ Skipped {skipped} files (already exist)")',
+            '    if failed > 0:',
+            '        print(f"❌ Failed to process {failed} files")',
+            '        for result in results:',
+            '            if result["status"] == "error":',
+            '                print(f"  - {Path(result[\'file\']).name}: {result[\'error\']}")',
+            '    ',
+            '    # Return list of successful sample5 files',
+            '    sample5_files = [result["file"] for result in results if result["status"] in ["success", "skipped"]]',
+            '    return sample5_files',
+            '',
+            '',
+            'def main():',
+            '    """Main sample processing workflow."""',
+            '    try:',
+            '        print("=" * 70)',
+            f'        print("MASSter {version} - Sample Processing Workflow")',
+            '        print("=" * 70)',
+            '        print(f"Source: {PARAMS[\'source\']}")',
+            '        print(f"Output: {PARAMS[\'folder\']}")',
+            '        print(f"Polarity: {PARAMS[\'polarity\']} (detected)")',
+            '        print(f"CPU Cores: {PARAMS[\'num_cores\']}")',
+            '        print("=" * 70)',
+            '        print("\\n⚙️ IMPORTANT: Review these parameters before processing:")',
+            '        print(f"  NOISE threshold: {PARAMS[\'noise\']} (adjust based on your instrument)")',
+            '        print(f"  CHROM_FWHM: {PARAMS[\'chrom_fwhm\']}s (adjust based on your chromatography)")',
+            '        print("  You can edit these values in the PARAMS section above.")',
+            '        print("=" * 70)',
+            '        ',
+            '        start_time = time.time()',
+            '        ',
+            '        # Step 1: Discover raw data files',
+            '        print("\\n📁 Step 1/2: Discovering raw data files...")',
+            '        raw_files = discover_raw_files(',
+            '            PARAMS[\'source\'],',
+            '            PARAMS[\'file_extensions\'],',
+            '            PARAMS[\'search_subfolders\'],',
+            '            PARAMS[\'skip_patterns\']',
+            '        )',
+            '        ',
+            '        if not raw_files:',
+            '            print("❌ No raw data files found!")',
+            '            return False',
+            '        ',
+            '        print(f"Found {len(raw_files)} raw data files")',
+            '        for i, f in enumerate(raw_files[:5]):  # Show first 5 files',
+            '            print(f"  {i+1}. {f.name}")',
+            '        if len(raw_files) > 5:',
+            '            print(f"  ... and {len(raw_files) - 5} more files")',
+            '        ',
+            '        # Step 2: Process raw files to sample5',
+            '        print("\\n🔄 Step 2/2: Processing raw files to sample5 format...")',
+            '        sample5_files = convert_raw_to_sample5_parallel(',
+            '            raw_files,',
+            '            PARAMS[\'folder\'],',
+            '            PARAMS',
+            '        )',
+            '        ',
+            '        if not sample5_files:',
+            '            print("❌ No sample5 files were created!")',
+            '            return False',
+            '        ',
+            '        # Summary',
+            '        total_time = time.time() - start_time',
+            '        print("\\n" + "=" * 70)',
+            '        print("🎉 SAMPLE PROCESSING COMPLETE")',
+            '        print("=" * 70)',
+            '        print(f"Processing time: {total_time/60:.1f} minutes")',
+            '        print(f"Raw files found: {len(raw_files)}")',
+            '        print(f"Sample5 files created: {len(sample5_files)}")',
+            '        print("\\nNext steps:")',
+            '        print("1. Run the interactive analysis: uv run marimo edit 2_interactive_analysis.py")',
+            '        print("2. Or use the sample5 files in your own analysis scripts")',
+            '        print("=" * 70)',
+            '        ',
+            '        return True',
+            '        ',
+            '    except KeyboardInterrupt:',
+            '        print("\\n❌ Processing interrupted by user")',
+            '        return False',
+            '    except Exception as e:',
+            '        print(f"❌ Processing failed with error: {e}")',
+            '        import traceback',
+            '        traceback.print_exc()',
+            '        return False',
+            '',
+            '',
+            'if __name__ == "__main__":',
+            '    success = main()',
+            '    sys.exit(0 if success else 1)',
+        ])
+
+        return '\n'.join(script_lines)
 
-    def analyze(self
+    def analyze(self) -> Dict[str, Any]:
         """
-
+        Execute the complete analysis workflow.
+
+        This method:
+        1. Checks if 1_masster_workflow.py exists and runs it
+        2. If not, creates scripts first then runs the workflow
+        3. Provides clear feedback about next steps
 
-        Parameters:
-            filename: Name for the generated script file
-
         Returns:
-
+            Dictionary containing execution results and instructions
         """
-
-        if not self.create_analysis(filename):
-            return False
-
-        # Create interactive notebook
-        if not self.create_notebook("interactive.py"):
-            print("Warning: Failed to create interactive notebook")
-
-        # Then execute the analysis script
-        script_path = self.folder_path / filename
+        workflow_script = self.folder_path / "1_masster_workflow.py"
 
         try:
-
+            # Check if workflow script exists
+            if workflow_script.exists():
+                print("📋 Found existing workflow script, executing...")
+                return self._execute_workflow_script(workflow_script)
+            else:
+                print("📝 Creating analysis scripts...")
+                # Create scripts first
+                result = self.create_scripts()
+
+                if result["status"] != "success":
+                    return result
+
+                # Print instructions
+                print("\n" + "="*70)
+                for instruction in result["instructions"]:
+                    print(instruction)
+                print("="*70)
+
+                # Ask user if they want to proceed with execution
+                print("\n🤔 Would you like to proceed with sample processing now?")
+                print("   This will execute 1_masster_workflow.py")
+                response = input("   Proceed? [y/N]: ").strip().lower()
+
+                if response in ['y', 'yes']:
+                    return self._execute_workflow_script(workflow_script)
+                else:
+                    print("✋ Processing paused. Run the scripts manually when ready.")
+                    return {
+                        "status": "scripts_created",
+                        "message": "Scripts created successfully, execution deferred",
+                        "instructions": result["instructions"],
+                        "files_created": result["files_created"]
+                    }
+
+        except Exception as e:
+            self.logger.error(f"Analysis failed: {e}")
+            return {
+                "status": "error",
+                "message": f"Analysis failed: {e}",
+                "instructions": [],
+                "files_created": []
+            }
+
+    def _execute_workflow_script(self, script_path: Path) -> Dict[str, Any]:
+        """Execute the workflow script and return results."""
+        try:
+            print(f"🚀 Executing {script_path.name}...")
 
             import subprocess
             result = subprocess.run([
                 sys.executable, str(script_path)
-            ], cwd=str(self.folder_path),
+            ], cwd=str(self.folder_path), capture_output=False, text=True)
 
             success = result.returncode == 0
 
             if success:
-                print("=
-                print("
-                print("=
-                print("
-                print(
-                print("=
+                print("="*70)
+                print("✅ Workflow execution completed successfully!")
+                print("="*70)
+                print("Next step: Run interactive analysis")
+                print("   uv run marimo edit 2_interactive_analysis.py")
+                print("="*70)
+
+                return {
+                    "status": "success",
+                    "message": "Workflow completed successfully",
+                    "instructions": [
+                        "✅ Sample processing completed",
+                        "Next: uv run marimo edit 2_interactive_analysis.py"
+                    ],
+                    "files_created": []
+                }
             else:
-
-
+                return {
+                    "status": "error",
+                    "message": f"Workflow execution failed with code {result.returncode}",
+                    "instructions": [
+                        "❌ Check the error messages above",
+                        "Review parameters in 1_masster_workflow.py",
+                        "Try running: python 1_masster_workflow.py"
+                    ],
+                    "files_created": []
+                }
 
-            return success
-
         except Exception as e:
-
-
+            return {
+                "status": "error",
+                "message": f"Failed to execute workflow: {e}",
+                "instructions": [],
+                "files_created": []
+            }
 
     def _generate_script_content(self) -> str:
         """Generate the complete analysis script content."""
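Net effect of the hunk above: `create_scripts()` and `analyze()` return status dictionaries instead of booleans, and `analyze()` reuses an existing `1_masster_workflow.py` when present. A minimal sketch of driving `create_scripts()` directly, assuming a `wiz` instance as in the earlier sketch:

    result = wiz.create_scripts()
    if result["status"] == "success":
        for line in result["instructions"]:  # human-readable next steps
            print(line)
        print(result["files_created"])  # paths to 1_masster_workflow.py and 2_interactive_analysis.py
    else:
        print(result["message"])  # error detail on failure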
@@ -787,7 +1349,7 @@ class Wizard:
             '@app.cell',
             'def __(masster):',
             '    # Load the processed study',
-
+            '    study = masster.Study(folder=".")',
             '    study.load()',
             '    return (study,)',
             '',
@@ -909,16 +1471,14 @@ def create_analysis(
     **kwargs
 ) -> bool:
     """
-    Create
+    Create standalone analysis scripts without initializing a Wizard instance.
 
-    This function generates
-    steps with the specified configuration. The script can be executed independently
-    to perform the same analysis.
+    This function generates analysis scripts with the specified configuration.
 
     Parameters:
         source: Directory containing raw data files
         folder: Output directory for processed study
-        filename: Filename for the generated script (
+        filename: Filename for the generated script (deprecated, will create standard files)
         polarity: Ion polarity mode ("positive" or "negative")
         adducts: List of adduct specifications (auto-set if None)
         params: Custom wizard_def parameters (optional)
@@ -926,14 +1486,13 @@ def create_analysis(
         **kwargs: Additional parameters to override defaults
 
     Returns:
-        True if
+        True if scripts were generated successfully, False otherwise
 
     Example:
         >>> from masster.wizard import create_analysis
         >>> create_analysis(
         ...     source=r'D:\\Data\\raw_files',
         ...     folder=r'D:\\Data\\output',
-        ...     filename='run_masster.py',
         ...     polarity='positive'
         ... )
     """
@@ -973,16 +1532,21 @@ def create_analysis(
         study_path = Path(folder)
         study_path.mkdir(parents=True, exist_ok=True)
 
-        # Create a temporary Wizard instance to generate the
+        # Create a temporary Wizard instance to generate the scripts
         temp_wizard = Wizard(params=wizard_params)
 
-        # Generate the
-
+        # Generate the scripts using the new method
+        result = temp_wizard.create_scripts()
 
-
+        if result["status"] == "success":
+            print("Scripts created successfully!")
+            for instruction in result["instructions"]:
+                print(instruction)
+
+        return result["status"] == "success"
 
     except Exception as e:
-        print(f"Failed to create
+        print(f"Failed to create scripts: {e}")
        import traceback
        traceback.print_exc()
        return False
@@ -1028,122 +1592,18 @@ def analyze(
     """
 
     try:
-        #
-        script_created = create_analysis(
-            source=source,
-            folder=folder,
-            filename=filename,
-            polarity=polarity,
-            adducts=adducts,
-            params=params,
-            num_cores=num_cores,
-            **kwargs
-        )
-
-        if not script_created:
-            print("Failed to create analysis script")
-            return False
-
-        # Get the full path to the created script
-        study_path = Path(folder)
-        script_path = study_path / Path(filename).name
-
-        if not script_path.exists():
-            print(f"Script file not found: {script_path}")
-            return False
-
-        print(f"Executing...")
-        #print("=" * 70)
-
-        # Execute the script using subprocess with real-time output
-        import subprocess
-
-        # Run the script with Python, letting it inherit our stdout/stderr
-        try:
-            # Use subprocess.run for direct output inheritance - no capturing/re-printing
-            result = subprocess.run([
-                sys.executable, str(script_path)
-            ], cwd=str(study_path))
-
-            return_code = result.returncode
-
-        except Exception as e:
-            print(f"Error during script execution: {e}")
-            return False
-
-        success = return_code == 0
-
-        if success:
-            print("=" * 70)
-            print("Script execution completed successfully")
-        else:
-            print("=" * 70)
-            print(f"Script execution failed with return code: {return_code}")
-
-        return success
-
-    except Exception as e:
-        print(f"Failed to execute script: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-
-def create_notebook(
-    source: str,
-    folder: str,
-    filename: str = 'interactive.py',
-    polarity: str = "positive",
-    adducts: Optional[List[str]] = None,
-    params: Optional[wizard_def] = None,
-    num_cores: int = 0,
-    **kwargs
-) -> bool:
-    """
-    Create a marimo interactive notebook for analysis without initializing a Wizard instance.
-
-    This function generates a marimo notebook file that provides interactive exploration
-    of mass spectrometry study results with the specified configuration.
-
-    Parameters:
-        source: Directory containing raw data files
-        folder: Output directory for processed study
-        filename: Filename for the generated notebook (should end with .py)
-        polarity: Ion polarity mode ("positive" or "negative")
-        adducts: List of adduct specifications (auto-set if None)
-        params: Custom wizard_def parameters (optional)
-        num_cores: Number of CPU cores (0 = auto-detect)
-        **kwargs: Additional parameters to override defaults
-
-    Returns:
-        True if notebook was generated successfully, False otherwise
-
-    Example:
-        >>> from masster.wizard import create_notebook
-        >>> create_notebook(
-        ...     source=r'D:\\Data\\raw_files',
-        ...     folder=r'D:\\Data\\output',
-        ...     filename='interactive.py',
-        ...     polarity='positive'
-        ... )
-    """
-
-    try:
-        # Create parameters
+        # Create parameters (same logic as create_analysis)
         if params is not None:
-            # Use provided params as base
             wizard_params = params
-            # Update with provided values
             wizard_params.source = source
             wizard_params.folder = folder
-            if polarity != "positive":
+            if polarity != "positive":
                 wizard_params.polarity = polarity
             if num_cores > 0:
                 wizard_params.num_cores = num_cores
             if adducts is not None:
                 wizard_params.adducts = adducts
         else:
-            # Create new params with provided values
             wizard_params = wizard_def(
                 source=source,
                 folder=folder,
@@ -1159,174 +1619,107 @@ def create_notebook(
             if hasattr(wizard_params, key):
                 setattr(wizard_params, key, value)
 
-        #
-
-
-
-        # Generate notebook content
-        notebook_content = _generate_notebook_content(wizard_params)
+        # Create Wizard instance and run analysis
+        wizard = Wizard(params=wizard_params)
+        result = wizard.analyze()
 
-        #
-
-        with open(notebook_path, 'w', encoding='utf-8') as f:
-            f.write(notebook_content)
-
-        print(f"Interactive notebook created: {notebook_path}")
-        return True
+        # Return success status
+        return result.get("status") in ["success", "scripts_created"]
 
     except Exception as e:
-        print(f"Failed to
+        print(f"Failed to execute script: {e}")
         import traceback
         traceback.print_exc()
         return False
 
 
-def
    [… old lines 1185-1269: body of the removed notebook-template helper, elided by the diff viewer …]
-        '    else:',
-        '        print("Not enough samples for PCA analysis")',
-        '',
-        '',
-        '@app.cell',
-        'def __(mo):',
-        '    mo.md(r"""',
-        '    ## Data Export',
-        '    ',
-        '    Export your processed data in various formats.',
-        '    """)',
-        '',
-        '',
-        '@app.cell',
-        'def __(study):',
-        '    # Export options',
-        '    if hasattr(study, "consensus_df"):',
-        '        # Export to Excel',
-        '        study.export_xlsx(filename="study_results.xlsx")',
-        '        print("✓ Results exported to: study_results.xlsx")',
-        '        ',
-        '        # Export to MGF',
-        '        study.export_mgf(filename="study_spectra.mgf")',
-        '        print("✓ Spectra exported to: study_spectra.mgf")',
-        '    else:',
-        '        print("No data available for export")',
-        '',
-        '',
-        '@app.cell',
-        'def __(mo):',
-        '    mo.md(r"""',
-        '    ## Custom Analysis',
-        '    ',
-        '    Add your own analysis code in the cells below.',
-        '    """)',
-        '',
-        '',
-        '@app.cell',
-        'def __(study):',
-        '    # Access consensus features dataframe',
-        '    if hasattr(study, "consensus_df"):',
-        '        df = study.consensus_df',
-        '        print(f"Consensus features shape: {df.shape}")',
-        '        print("\\nFirst 5 features:")',
-        '        print(df.head())',
-        '    return (df,) if "df" in locals() else ()',
-        '',
-        '',
-        '@app.cell',
-        'def __():',
-        '    # Your custom analysis here',
-        '    pass',
-        '',
-        '',
-        'if __name__ == "__main__":',
-        '    app.run()',
-    ]
-
-    return '\n'.join(notebook_lines)
+def _generate_interactive_notebook_content(self, source_info: Dict[str, Any]) -> str:
+    """Generate the content for 2_interactive_analysis.py marimo notebook."""
+
+    notebook_lines = [
+        'import marimo',
+        '',
+        '__generated_with = "0.9.14"',
+        'app = marimo.App(width="medium")',
+        '',
+        '',
+        '@app.cell',
+        'def __():',
+        '    import marimo as mo',
+        '    return (mo,)',
+        '',
+        '',
+        '@app.cell',
+        'def __(mo):',
+        '    mo.md(r"""',
+        '    # MASSter Interactive Analysis',
+        '    ',
+        f'    **Source:** {source_info.get("number_of_files", 0)} files ({", ".join(source_info.get("file_types", []))}) detected',
+        f'    **Polarity:** {source_info.get("polarity", "unknown")} (auto-detected)',
+        f'    **Acquisition length:** ~{source_info.get("length_minutes", 0.0):.1f} minutes per file',
+        '    ',
+        '    This notebook provides interactive exploration of your processed mass spectrometry study.',
+        '    Make sure you have run `python 1_masster_workflow.py` first to create the sample5 files.',
+        '    """)',
+        '',
+        '',
+        '@app.cell',
+        'def __():',
+        '    # Import masster',
+        '    import masster',
+        '    return (masster,)',
+        '',
+        '',
+        '@app.cell',
+        'def __(masster):',
+        '    # Load the study from sample5 files',
+        '    study = masster.Study(folder=".")',
+        '    return (study,)',
+        '',
+        '',
+        '@app.cell',
+        'def __(mo, study):',
+        '    # Display study information',
+        '    study.info()',
+        '    return ()',
+        '',
+        '',
+        'if __name__ == "__main__":',
+        '    app.run()',
+    ]
+
+    return '\n'.join(notebook_lines)
+
+def _generate_instructions(self, source_info: Dict[str, Any], files_created: List[str]) -> List[str]:
+    """Generate usage instructions for the created scripts."""
+    instructions = [
+        "🎯 NEXT STEPS:",
+        "",
+        f"Source analysis completed: {source_info.get('number_of_files', 0)} files found",
+        f"Polarity detected: {source_info.get('polarity', 'unknown')}",
+        f"Estimated processing time: {source_info.get('number_of_files', 0) * source_info.get('length_minutes', 0.0) * 0.1:.1f} minutes",
+        "",
+        "1. REVIEW PARAMETERS:",
+        "   Edit 1_masster_workflow.py and verify these key settings:",
+        "   - NOISE threshold (adjust based on your instrument sensitivity)",
+        "   - CHROM_FWHM (adjust based on your chromatography peak width)",
+        "",
+        "2. EXECUTE SAMPLE PROCESSING:",
+        "   python 1_masster_workflow.py",
+        "   (This will process all raw files to sample5 format)",
+        "",
+        "3. INTERACTIVE ANALYSIS:",
+        "   uv run marimo edit 2_interactive_analysis.py",
+        "   (This opens an interactive notebook for data exploration)",
+        "",
+        "FILES CREATED:"
+    ]
+
+    for file_path in files_created:
+        instructions.append(f"   ✅ {Path(file_path).name}")
+
+    return instructions
 
 
 # Export the main classes and functions
-__all__ = ["Wizard", "wizard_def", "create_analysis", "
+__all__ = ["Wizard", "wizard_def", "create_analysis", "analyze"]